From be6c6415467d09da6109d27cfa218868abc1f9db Mon Sep 17 00:00:00 2001 From: Axel Huebl Date: Thu, 26 Oct 2023 09:50:01 -0700 Subject: [PATCH 01/14] `AMReXBuildInfo.cmake`: AMReX_DIR (#3609) ## Summary The `AMReX_DIR` points if set to the CMake module path root. The old logic did not work for me in a situation (ImpactX) where: - AMReX is pre-installed but - found through a superbuild of another transient lib (ABLASTR) ## Additional background Follow-up to #3599 ## Checklist The proposed changes: - [x] fix a bug or incorrect behavior in AMReX - [ ] add new capabilities to AMReX - [ ] changes answers in the test suite to more than roundoff level - [ ] are likely to significantly affect the results of downstream AMReX users - [ ] include documentation in the code and/or rst files, if appropriate --- Tools/CMake/AMReXBuildInfo.cmake | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Tools/CMake/AMReXBuildInfo.cmake b/Tools/CMake/AMReXBuildInfo.cmake index 9afbc2ceaf4..0b87f053c2e 100644 --- a/Tools/CMake/AMReXBuildInfo.cmake +++ b/Tools/CMake/AMReXBuildInfo.cmake @@ -38,7 +38,7 @@ include(AMReXTargetHelpers) # # Set paths # -if (AMReX_FOUND) +if (AMReX_DIR) # AMReX is pre-installed and used as a library if (WIN32) # see AMReXInstallHelpers.cmake string(REPLACE "/cmake/AMReXCMakeModules" "" @@ -56,7 +56,7 @@ else () endif () set(AMREX_TOP_DIR "${AMREX_TOP_DIR_DEFAULT}" CACHE INTERNAL "Top level AMReX directory") -if (AMReX_FOUND) +if (AMReX_DIR) # AMReX is pre-installed and used as a library set(AMREX_C_SCRIPTS_DIR "${AMREX_TOP_DIR}/share/amrex/C_scripts" CACHE INTERNAL "Path to AMReX' C_scripts dir") From 601cc4ee80e0d7435a5b099202459bbb1f16b7a6 Mon Sep 17 00:00:00 2001 From: Klaus Date: Tue, 31 Oct 2023 14:07:41 -0500 Subject: [PATCH 02/14] Give FlashFluxRegisters ways to accumulate data in registers (#3597) I introduce variant methods that allow adding to previously stored data in Flash(-X) flux registers, rather than just copying over the older 
data. For Flash-X'ers: Something like this is needed to get flux correction working in the nontelescoping variant of the current Spark Hydro implementation. ## Summary ## Additional background ## Checklist The proposed changes: - [ ] fix a bug or incorrect behavior in AMReX - [x] add new capabilities to AMReX - [ ] changes answers in the test suite to more than roundoff level - [ ] are likely to significantly affect the results of downstream AMReX users - [ ] include documentation in the code and/or rst files, if appropriate --------- Co-authored-by: Weiqun Zhang --- .../AmrCore/AMReX_FlashFluxRegister.H | 416 ++++++++++++++++++ .../AmrCore/AMReX_FlashFluxRegister.cpp | 280 +----------- .../AmrCore/AMReX_flash_fluxregister_fi.cpp | 35 ++ .../AmrCore/AMReX_flash_fluxregister_mod.F90 | 85 +++- 4 files changed, 544 insertions(+), 272 deletions(-) diff --git a/Src/F_Interfaces/AmrCore/AMReX_FlashFluxRegister.H b/Src/F_Interfaces/AmrCore/AMReX_FlashFluxRegister.H index d946d33fdbc..3756a4bd1f2 100644 --- a/Src/F_Interfaces/AmrCore/AMReX_FlashFluxRegister.H +++ b/Src/F_Interfaces/AmrCore/AMReX_FlashFluxRegister.H @@ -5,6 +5,7 @@ #include #include #include +#include #include @@ -42,6 +43,18 @@ public: void store (int fine_global_index, int dir, FArrayBox const& fine_flux, FArrayBox const& area, const int* isFluxDensity, Real sf); + // flux_in_register += scaling_factor * \sum{fine_flux} / (ref_ratio[0]*ref_ratio[1]*ref_ratio[2]) + void add (int fine_global_index, int dir, FArrayBox const& fine_flux, Real sf); + + // flux_in_register += scaling_factor * \sum{fine_flux * area} + void add (int fine_global_index, int dir, FArrayBox const& fine_flux, FArrayBox const& area, + Real sf); + + // flux_in_register += scaling_factor * \sum{fine_flux * area}, if the component is flux density + // scaling_factor * \sum{fine_flux} , otherwise + void add (int fine_global_index, int dir, FArrayBox const& fine_flux, FArrayBox const& area, + const int* isFluxDensity, Real sf); + void 
communicate (); // crse_flux = flux_in_register * scaling_factor @@ -63,6 +76,20 @@ public: void load (int crse_global_index, int dir, FArrayBox& crse_flux, FArrayBox const& cflux, FArrayBox const& area, const int* isFluxDensity, Real sf_f, Real sf_c) const; + enum struct OpType { Store, Add }; + + template + void store_or_add (int fine_global_index, int dir, FArrayBox const& fine_flux, Real sf); + + template + void store_or_add (int fine_global_index, int dir, FArrayBox const& fine_flux, FArrayBox const& area, + Real sf); + + template + void store_or_add (int fine_global_index, int dir, FArrayBox const& fine_flux, FArrayBox const& area, + const int* isFluxDensity, Real sf); + + protected: BoxArray m_fine_grids; @@ -86,6 +113,395 @@ protected: mutable Vector > m_d_ifd; }; +template +void FlashFluxRegister::store_or_add (int fine_global_index, int dir, + FArrayBox const& fine_flux, Real sf) +{ + AMREX_ASSERT(dir < AMREX_SPACEDIM); + auto found = m_fine_map.find(fine_global_index); + if (found != m_fine_map.end()) { + const int ncomp = m_ncomp; + Array const& fab_a = found->second; + if (fab_a[dir]) { + Box const& b = fab_a[dir]->box(); + Array4 const& dest = fab_a[dir]->array(); + Array4 const& src = fine_flux.const_array(); + if (dir == 0) { +#if (AMREX_SPACEDIM == 1) + AMREX_HOST_DEVICE_PARALLEL_FOR_4D (b, ncomp, i, j, k, n, + { + amrex::ignore_unused(j,k,dest); + auto rhs = src(2*i,0,0,n)*sf; + if constexpr (op == OpType::Store) { + dest(i,0,0,n) = rhs; + } else { + dest(i,0,0,n) += rhs; + } + }); +#endif +#if (AMREX_SPACEDIM == 2) + AMREX_HOST_DEVICE_PARALLEL_FOR_4D (b, ncomp, i, j, k, n, + { + amrex::ignore_unused(k,dest); + auto rhs = (src(2*i,2*j ,0,n) + + src(2*i,2*j+1,0,n)) * (Real(0.5)*sf); + if constexpr (op == OpType::Store) { + dest(i,j,0,n) = rhs; + } else { + dest(i,j,0,n) += rhs; + } + }); +#endif +#if (AMREX_SPACEDIM == 3) + AMREX_HOST_DEVICE_PARALLEL_FOR_4D (b, ncomp, i, j, k, n, + { + auto rhs = (src(2*i,2*j ,2*k ,n) + + src(2*i,2*j+1,2*k ,n) + + 
src(2*i,2*j ,2*k+1,n) + + src(2*i,2*j+1,2*k+1,n)) * (Real(0.25)*sf); + amrex::ignore_unused(dest); // for cuda + if constexpr (op == OpType::Store) { + dest(i,j,k,n) = rhs; + } else { + dest(i,j,k,n) += rhs; + } + }); +#endif + } +#if (AMREX_SPACEDIM >= 2) + else if (dir == 1) { +#if (AMREX_SPACEDIM == 2) + AMREX_HOST_DEVICE_PARALLEL_FOR_4D (b, ncomp, i, j, k, n, + { + amrex::ignore_unused(k,dest); + auto rhs = (src(2*i ,2*j,0,n) + + src(2*i+1,2*j,0,n)) * (Real(0.5)*sf); + if constexpr (op == OpType::Store) { + dest(i,j,0,n) = rhs; + } else { + dest(i,j,0,n) += rhs; + } + }); +#endif +#if (AMREX_SPACEDIM == 3) + AMREX_HOST_DEVICE_PARALLEL_FOR_4D (b, ncomp, i, j, k, n, + { + auto rhs = (src(2*i ,2*j,2*k ,n) + + src(2*i+1,2*j,2*k ,n) + + src(2*i ,2*j,2*k+1,n) + + src(2*i+1,2*j,2*k+1,n)) * (Real(0.25)*sf); + amrex::ignore_unused(dest); // for cuda + if constexpr (op == OpType::Store) { + dest(i,j,k,n) = rhs; + } else { + dest(i,j,k,n) += rhs; + } + }); +#endif + } +#if (AMREX_SPACEDIM == 3) + else { + AMREX_HOST_DEVICE_PARALLEL_FOR_4D (b, ncomp, i, j, k, n, + { + auto rhs = (src(2*i ,2*j ,2*k,n) + + src(2*i+1,2*j ,2*k,n) + + src(2*i ,2*j+1,2*k,n) + + src(2*i+1,2*j+1,2*k,n)) * (Real(0.25)*sf); + amrex::ignore_unused(dest); // for cuda + if constexpr (op == OpType::Store) { + dest(i,j,k,n) = rhs; + } else { + dest(i,j,k,n) += rhs; + } + }); + } +#endif +#endif + } + } +} + +template +void FlashFluxRegister::store_or_add (int fine_global_index, int dir, + FArrayBox const& fine_flux, + FArrayBox const& fine_area, Real sf) +{ + AMREX_ASSERT(dir < AMREX_SPACEDIM); + auto found = m_fine_map.find(fine_global_index); + if (found != m_fine_map.end()) { + const int ncomp = m_ncomp; + Array const& fab_a = found->second; + if (fab_a[dir]) { + Box const& b = fab_a[dir]->box(); + Array4 const& dest = fab_a[dir]->array(); + Array4 const& src = fine_flux.const_array(); + Array4 const& area = fine_area.const_array(); + if (dir == 0) { +#if (AMREX_SPACEDIM == 1) + 
AMREX_HOST_DEVICE_PARALLEL_FOR_4D (b, ncomp, i, j, k, n, + { + amrex::ignore_unused(j,k,dest); + auto rhs = src(2*i,0,0,n)*area(2*i,0,0)*sf; + if constexpr (op == OpType::Store) { + dest(i,0,0,n) = rhs; + } else { + dest(i,0,0,n) += rhs; + } + }); +#endif +#if (AMREX_SPACEDIM == 2) + AMREX_HOST_DEVICE_PARALLEL_FOR_4D (b, ncomp, i, j, k, n, + { + amrex::ignore_unused(k,dest); + auto rhs = (src(2*i,2*j ,0,n)*area(2*i,2*j ,0) + + src(2*i,2*j+1,0,n)*area(2*i,2*j+1,0)) * sf; + if constexpr (op == OpType::Store) { + dest(i,j,0,n) = rhs; + } else { + dest(i,j,0,n) += rhs; + } + }); +#endif +#if (AMREX_SPACEDIM == 3) + AMREX_HOST_DEVICE_PARALLEL_FOR_4D (b, ncomp, i, j, k, n, + { + auto rhs = (src(2*i,2*j ,2*k ,n)*area(2*i,2*j ,2*k ) + + src(2*i,2*j+1,2*k ,n)*area(2*i,2*j+1,2*k ) + + src(2*i,2*j ,2*k+1,n)*area(2*i,2*j ,2*k+1) + + src(2*i,2*j+1,2*k+1,n)*area(2*i,2*j+1,2*k+1)) * sf; + amrex::ignore_unused(dest); // for cuda + if constexpr (op == OpType::Store) { + dest(i,j,k,n) = rhs; + } else { + dest(i,j,k,n) += rhs; + } + }); +#endif + } +#if (AMREX_SPACEDIM >= 2) + else if (dir == 1) { +#if (AMREX_SPACEDIM == 2) + AMREX_HOST_DEVICE_PARALLEL_FOR_4D (b, ncomp, i, j, k, n, + { + amrex::ignore_unused(k,dest); + auto rhs = (src(2*i ,2*j,0,n)*area(2*i ,2*j,0) + + src(2*i+1,2*j,0,n)*area(2*i+1,2*j,0)) * sf; + if constexpr (op == OpType::Store) { + dest(i,j,0,n) = rhs; + } else { + dest(i,j,0,n) += rhs; + } + }); +#endif +#if (AMREX_SPACEDIM == 3) + AMREX_HOST_DEVICE_PARALLEL_FOR_4D (b, ncomp, i, j, k, n, + { + auto rhs = (src(2*i ,2*j,2*k ,n)*area(2*i ,2*j,2*k ) + + src(2*i+1,2*j,2*k ,n)*area(2*i+1,2*j,2*k ) + + src(2*i ,2*j,2*k+1,n)*area(2*i ,2*j,2*k+1) + + src(2*i+1,2*j,2*k+1,n)*area(2*i+1,2*j,2*k+1)) * sf; + amrex::ignore_unused(dest); // for cuda + if constexpr (op == OpType::Store) { + dest(i,j,k,n) = rhs; + } else { + dest(i,j,k,n) += rhs; + } + }); +#endif + } +#if (AMREX_SPACEDIM == 3) + else { + AMREX_HOST_DEVICE_PARALLEL_FOR_4D (b, ncomp, i, j, k, n, + { + auto rhs = 
(src(2*i ,2*j ,2*k,n)*area(2*i ,2*j ,2*k) + + src(2*i+1,2*j ,2*k,n)*area(2*i+1,2*j ,2*k) + + src(2*i ,2*j+1,2*k,n)*area(2*i ,2*j+1,2*k) + + src(2*i+1,2*j+1,2*k,n)*area(2*i+1,2*j+1,2*k)) * sf; + amrex::ignore_unused(dest); // for cuda + if constexpr (op == OpType::Store) { + dest(i,j,k,n) = rhs; + } else { + dest(i,j,k,n) += rhs; + } + }); + } +#endif +#endif + } + } +} + +template +void FlashFluxRegister::store_or_add (int fine_global_index, int dir, + FArrayBox const& fine_flux, + FArrayBox const& fine_area, + const int* isFluxDensity, Real sf) +{ + auto& h_ifd = m_h_ifd[OpenMP::get_thread_num()]; + auto& d_ifd = m_d_ifd[OpenMP::get_thread_num()]; + + AMREX_ASSERT(dir < AMREX_SPACEDIM); + auto found = m_fine_map.find(fine_global_index); + if (found != m_fine_map.end()) { + const int ncomp = m_ncomp; + Array const& fab_a = found->second; + if (fab_a[dir]) { + bool allsame = true; + for (int n = 0; n < m_ncomp; ++n) { + if (h_ifd[n] != isFluxDensity[n]) { + allsame = false; + h_ifd[n] = isFluxDensity[n]; + } + } + if (d_ifd.empty()) { + allsame = false; + d_ifd.resize(m_ncomp); + } + if (! 
allsame) { + Gpu::copyAsync(Gpu::HostToDevice(), h_ifd.begin(), h_ifd.end(), d_ifd.begin()); + } + + Box const& b = fab_a[dir]->box(); + Array4 const& dest = fab_a[dir]->array(); + Array4 const& src = fine_flux.const_array(); + Array4 const& area = fine_area.const_array(); + const int* ifd = d_ifd.data(); + if (dir == 0) { +#if (AMREX_SPACEDIM == 1) + AMREX_HOST_DEVICE_PARALLEL_FOR_4D (b, ncomp, i, j, k, n, + { + amrex::ignore_unused(j,k,dest); + Real rhs; + if (ifd[n]) { + rhs = src(2*i,0,0,n)*area(2*i,0,0)*sf; + } else { + rhs = src(2*i,0,0,n)*sf; + } + if constexpr (op == OpType::Store) { + dest(i,0,0,n) = rhs; + } else { + dest(i,0,0,n) += rhs; + } + }); +#endif +#if (AMREX_SPACEDIM == 2) + AMREX_HOST_DEVICE_PARALLEL_FOR_4D (b, ncomp, i, j, k, n, + { + amrex::ignore_unused(k,dest); + Real rhs; + if (ifd[n]) { + rhs = (src(2*i,2*j ,0,n)*area(2*i,2*j ,0) + + src(2*i,2*j+1,0,n)*area(2*i,2*j+1,0)) * sf; + } else { + rhs = (src(2*i,2*j ,0,n) + + src(2*i,2*j+1,0,n)) * sf; + } + if constexpr (op == OpType::Store) { + dest(i,j,0,n) = rhs; + } else { + dest(i,j,0,n) += rhs; + } + }); +#endif +#if (AMREX_SPACEDIM == 3) + AMREX_HOST_DEVICE_PARALLEL_FOR_4D (b, ncomp, i, j, k, n, + { + Real rhs; + if (ifd[n]) { + rhs = (src(2*i,2*j ,2*k ,n)*area(2*i,2*j ,2*k ) + + src(2*i,2*j+1,2*k ,n)*area(2*i,2*j+1,2*k ) + + src(2*i,2*j ,2*k+1,n)*area(2*i,2*j ,2*k+1) + + src(2*i,2*j+1,2*k+1,n)*area(2*i,2*j+1,2*k+1)) * sf; + } else { + rhs = (src(2*i,2*j ,2*k ,n) + + src(2*i,2*j+1,2*k ,n) + + src(2*i,2*j ,2*k+1,n) + + src(2*i,2*j+1,2*k+1,n)) * sf; + } + amrex::ignore_unused(dest); // for cuda + if constexpr (op == OpType::Store) { + dest(i,j,k,n) = rhs; + } else { + dest(i,j,k,n) += rhs; + } + }); +#endif + } +#if (AMREX_SPACEDIM >= 2) + else if (dir == 1) { +#if (AMREX_SPACEDIM == 2) + AMREX_HOST_DEVICE_PARALLEL_FOR_4D (b, ncomp, i, j, k, n, + { + amrex::ignore_unused(k,dest); + Real rhs; + if (ifd[n]) { + rhs = (src(2*i ,2*j,0,n)*area(2*i ,2*j,0) + + src(2*i+1,2*j,0,n)*area(2*i+1,2*j,0)) 
* sf; + } else { + rhs = (src(2*i ,2*j,0,n) + + src(2*i+1,2*j,0,n)) * sf; + } + if constexpr (op == OpType::Store) { + dest(i,j,0,n) = rhs; + } else { + dest(i,j,0,n) += rhs; + } + }); +#endif +#if (AMREX_SPACEDIM == 3) + AMREX_HOST_DEVICE_PARALLEL_FOR_4D (b, ncomp, i, j, k, n, + { + Real rhs; + if (ifd[n]) { + rhs = (src(2*i ,2*j,2*k ,n)*area(2*i ,2*j,2*k ) + + src(2*i+1,2*j,2*k ,n)*area(2*i+1,2*j,2*k ) + + src(2*i ,2*j,2*k+1,n)*area(2*i ,2*j,2*k+1) + + src(2*i+1,2*j,2*k+1,n)*area(2*i+1,2*j,2*k+1)) * sf; + } else { + rhs = (src(2*i ,2*j,2*k ,n) + + src(2*i+1,2*j,2*k ,n) + + src(2*i ,2*j,2*k+1,n) + + src(2*i+1,2*j,2*k+1,n)) * sf; + } + amrex::ignore_unused(dest); // for cuda + if constexpr (op == OpType::Store) { + dest(i,j,k,n) = rhs; + } else { + dest(i,j,k,n) += rhs; + } + }); +#endif + } +#if (AMREX_SPACEDIM == 3) + else { + AMREX_HOST_DEVICE_PARALLEL_FOR_4D (b, ncomp, i, j, k, n, + { + Real rhs; + if (ifd[n]) { + rhs = (src(2*i ,2*j ,2*k,n)*area(2*i ,2*j ,2*k) + + src(2*i+1,2*j ,2*k,n)*area(2*i+1,2*j ,2*k) + + src(2*i ,2*j+1,2*k,n)*area(2*i ,2*j+1,2*k) + + src(2*i+1,2*j+1,2*k,n)*area(2*i+1,2*j+1,2*k)) * sf; + } else { + rhs = (src(2*i ,2*j ,2*k,n) + + src(2*i+1,2*j ,2*k,n) + + src(2*i ,2*j+1,2*k,n) + + src(2*i+1,2*j+1,2*k,n)) * sf; + } + amrex::ignore_unused(dest); // for cuda + if constexpr (op == OpType::Store) { + dest(i,j,k,n) = rhs; + } else { + dest(i,j,k,n) += rhs; + } + }); + } +#endif +#endif + } + } +} + } #endif diff --git a/Src/F_Interfaces/AmrCore/AMReX_FlashFluxRegister.cpp b/Src/F_Interfaces/AmrCore/AMReX_FlashFluxRegister.cpp index abe74c41125..0e804f76a12 100644 --- a/Src/F_Interfaces/AmrCore/AMReX_FlashFluxRegister.cpp +++ b/Src/F_Interfaces/AmrCore/AMReX_FlashFluxRegister.cpp @@ -1,5 +1,4 @@ #include -#include namespace amrex { @@ -178,279 +177,36 @@ void FlashFluxRegister::define (const BoxArray& fba, const BoxArray& cba, void FlashFluxRegister::store (int fine_global_index, int dir, FArrayBox const& fine_flux, Real sf) { - AMREX_ASSERT(dir 
< AMREX_SPACEDIM); - auto found = m_fine_map.find(fine_global_index); - if (found != m_fine_map.end()) { - const int ncomp = m_ncomp; - Array const& fab_a = found->second; - if (fab_a[dir]) { - Box const& b = fab_a[dir]->box(); - Array4 const& dest = fab_a[dir]->array(); - Array4 const& src = fine_flux.const_array(); - if (dir == 0) { -#if (AMREX_SPACEDIM == 1) - AMREX_HOST_DEVICE_PARALLEL_FOR_4D (b, ncomp, i, j, k, n, - { - amrex::ignore_unused(j,k); - dest(i,0,0,n) = src(2*i,0,0,n)*sf; - }); -#endif -#if (AMREX_SPACEDIM == 2) - AMREX_HOST_DEVICE_PARALLEL_FOR_4D (b, ncomp, i, j, k, n, - { - amrex::ignore_unused(k); - dest(i,j,0,n) = (src(2*i,2*j ,0,n) + - src(2*i,2*j+1,0,n)) * (Real(0.5)*sf); - }); -#endif -#if (AMREX_SPACEDIM == 3) - AMREX_HOST_DEVICE_PARALLEL_FOR_4D (b, ncomp, i, j, k, n, - { - dest(i,j,k,n) = (src(2*i,2*j ,2*k ,n) + - src(2*i,2*j+1,2*k ,n) + - src(2*i,2*j ,2*k+1,n) + - src(2*i,2*j+1,2*k+1,n)) * (Real(0.25)*sf); - }); -#endif - } -#if (AMREX_SPACEDIM >= 2) - else if (dir == 1) { -#if (AMREX_SPACEDIM == 2) - AMREX_HOST_DEVICE_PARALLEL_FOR_4D (b, ncomp, i, j, k, n, - { - amrex::ignore_unused(k); - dest(i,j,0,n) = (src(2*i ,2*j,0,n) + - src(2*i+1,2*j,0,n)) * (Real(0.5)*sf); - }); -#endif -#if (AMREX_SPACEDIM == 3) - AMREX_HOST_DEVICE_PARALLEL_FOR_4D (b, ncomp, i, j, k, n, - { - dest(i,j,k,n) = (src(2*i ,2*j,2*k ,n) + - src(2*i+1,2*j,2*k ,n) + - src(2*i ,2*j,2*k+1,n) + - src(2*i+1,2*j,2*k+1,n)) * (Real(0.25)*sf); - }); -#endif - } -#if (AMREX_SPACEDIM == 3) - else { - AMREX_HOST_DEVICE_PARALLEL_FOR_4D (b, ncomp, i, j, k, n, - { - dest(i,j,k,n) = (src(2*i ,2*j ,2*k,n) + - src(2*i+1,2*j ,2*k,n) + - src(2*i ,2*j+1,2*k,n) + - src(2*i+1,2*j+1,2*k,n)) * (Real(0.25)*sf); - }); - } -#endif -#endif - } - } + store_or_add(fine_global_index, dir, fine_flux, sf); } void FlashFluxRegister::store (int fine_global_index, int dir, FArrayBox const& fine_flux, FArrayBox const& fine_area, Real sf) { - AMREX_ASSERT(dir < AMREX_SPACEDIM); - auto found = 
m_fine_map.find(fine_global_index); - if (found != m_fine_map.end()) { - const int ncomp = m_ncomp; - Array const& fab_a = found->second; - if (fab_a[dir]) { - Box const& b = fab_a[dir]->box(); - Array4 const& dest = fab_a[dir]->array(); - Array4 const& src = fine_flux.const_array(); - Array4 const& area = fine_area.const_array(); - if (dir == 0) { -#if (AMREX_SPACEDIM == 1) - AMREX_HOST_DEVICE_PARALLEL_FOR_4D (b, ncomp, i, j, k, n, - { - amrex::ignore_unused(j,k); - dest(i,0,0,n) = src(2*i,0,0,n)*area(2*i,0,0)*sf; - }); -#endif -#if (AMREX_SPACEDIM == 2) - AMREX_HOST_DEVICE_PARALLEL_FOR_4D (b, ncomp, i, j, k, n, - { - amrex::ignore_unused(k); - dest(i,j,0,n) = (src(2*i,2*j ,0,n)*area(2*i,2*j ,0) + - src(2*i,2*j+1,0,n)*area(2*i,2*j+1,0)) * sf; - }); -#endif -#if (AMREX_SPACEDIM == 3) - AMREX_HOST_DEVICE_PARALLEL_FOR_4D (b, ncomp, i, j, k, n, - { - dest(i,j,k,n) = (src(2*i,2*j ,2*k ,n)*area(2*i,2*j ,2*k ) + - src(2*i,2*j+1,2*k ,n)*area(2*i,2*j+1,2*k ) + - src(2*i,2*j ,2*k+1,n)*area(2*i,2*j ,2*k+1) + - src(2*i,2*j+1,2*k+1,n)*area(2*i,2*j+1,2*k+1)) * sf; - }); -#endif - } -#if (AMREX_SPACEDIM >= 2) - else if (dir == 1) { -#if (AMREX_SPACEDIM == 2) - AMREX_HOST_DEVICE_PARALLEL_FOR_4D (b, ncomp, i, j, k, n, - { - amrex::ignore_unused(k); - dest(i,j,0,n) = (src(2*i ,2*j,0,n)*area(2*i ,2*j,0) + - src(2*i+1,2*j,0,n)*area(2*i+1,2*j,0)) * sf; - }); -#endif -#if (AMREX_SPACEDIM == 3) - AMREX_HOST_DEVICE_PARALLEL_FOR_4D (b, ncomp, i, j, k, n, - { - dest(i,j,k,n) = (src(2*i ,2*j,2*k ,n)*area(2*i ,2*j,2*k ) + - src(2*i+1,2*j,2*k ,n)*area(2*i+1,2*j,2*k ) + - src(2*i ,2*j,2*k+1,n)*area(2*i ,2*j,2*k+1) + - src(2*i+1,2*j,2*k+1,n)*area(2*i+1,2*j,2*k+1)) * sf; - }); -#endif - } -#if (AMREX_SPACEDIM == 3) - else { - AMREX_HOST_DEVICE_PARALLEL_FOR_4D (b, ncomp, i, j, k, n, - { - dest(i,j,k,n) = (src(2*i ,2*j ,2*k,n)*area(2*i ,2*j ,2*k) + - src(2*i+1,2*j ,2*k,n)*area(2*i+1,2*j ,2*k) + - src(2*i ,2*j+1,2*k,n)*area(2*i ,2*j+1,2*k) + - src(2*i+1,2*j+1,2*k,n)*area(2*i+1,2*j+1,2*k)) * sf; - 
}); - } -#endif -#endif - } - } + store_or_add(fine_global_index, dir, fine_flux, fine_area, sf); } void FlashFluxRegister::store (int fine_global_index, int dir, FArrayBox const& fine_flux, FArrayBox const& fine_area, const int* isFluxDensity, Real sf) { - auto& h_ifd = m_h_ifd[OpenMP::get_thread_num()]; - auto& d_ifd = m_d_ifd[OpenMP::get_thread_num()]; + store_or_add(fine_global_index, dir, fine_flux, fine_area, isFluxDensity, sf); +} - AMREX_ASSERT(dir < AMREX_SPACEDIM); - auto found = m_fine_map.find(fine_global_index); - if (found != m_fine_map.end()) { - const int ncomp = m_ncomp; - Array const& fab_a = found->second; - if (fab_a[dir]) { - bool allsame = true; - for (int n = 0; n < m_ncomp; ++n) { - if (h_ifd[n] != isFluxDensity[n]) { - allsame = false; - h_ifd[n] = isFluxDensity[n]; - } - } - if (d_ifd.empty()) { - allsame = false; - d_ifd.resize(m_ncomp); - } - if (! allsame) { - Gpu::copyAsync(Gpu::HostToDevice(), h_ifd.begin(), h_ifd.end(), d_ifd.begin()); - } +void FlashFluxRegister::add (int fine_global_index, int dir, FArrayBox const& fine_flux, Real sf) +{ + store_or_add(fine_global_index, dir, fine_flux, sf); +} - Box const& b = fab_a[dir]->box(); - Array4 const& dest = fab_a[dir]->array(); - Array4 const& src = fine_flux.const_array(); - Array4 const& area = fine_area.const_array(); - const int* ifd = d_ifd.data(); - if (dir == 0) { -#if (AMREX_SPACEDIM == 1) - AMREX_HOST_DEVICE_PARALLEL_FOR_4D (b, ncomp, i, j, k, n, - { - amrex::ignore_unused(j,k); - if (ifd[n]) { - dest(i,0,0,n) = src(2*i,0,0,n)*area(2*i,0,0)*sf; - } else { - dest(i,0,0,n) = src(2*i,0,0,n)*sf; - } - }); -#endif -#if (AMREX_SPACEDIM == 2) - AMREX_HOST_DEVICE_PARALLEL_FOR_4D (b, ncomp, i, j, k, n, - { - amrex::ignore_unused(k); - if (ifd[n]) { - dest(i,j,0,n) = (src(2*i,2*j ,0,n)*area(2*i,2*j ,0) + - src(2*i,2*j+1,0,n)*area(2*i,2*j+1,0)) * sf; - } else { - dest(i,j,0,n) = (src(2*i,2*j ,0,n) + - src(2*i,2*j+1,0,n)) * sf; - } - }); -#endif -#if (AMREX_SPACEDIM == 3) - 
AMREX_HOST_DEVICE_PARALLEL_FOR_4D (b, ncomp, i, j, k, n, - { - if (ifd[n]) { - dest(i,j,k,n) = (src(2*i,2*j ,2*k ,n)*area(2*i,2*j ,2*k ) + - src(2*i,2*j+1,2*k ,n)*area(2*i,2*j+1,2*k ) + - src(2*i,2*j ,2*k+1,n)*area(2*i,2*j ,2*k+1) + - src(2*i,2*j+1,2*k+1,n)*area(2*i,2*j+1,2*k+1)) * sf; - } else { - dest(i,j,k,n) = (src(2*i,2*j ,2*k ,n) + - src(2*i,2*j+1,2*k ,n) + - src(2*i,2*j ,2*k+1,n) + - src(2*i,2*j+1,2*k+1,n)) * sf; - } - }); -#endif - } -#if (AMREX_SPACEDIM >= 2) - else if (dir == 1) { -#if (AMREX_SPACEDIM == 2) - AMREX_HOST_DEVICE_PARALLEL_FOR_4D (b, ncomp, i, j, k, n, - { - amrex::ignore_unused(k); - if (ifd[n]) { - dest(i,j,0,n) = (src(2*i ,2*j,0,n)*area(2*i ,2*j,0) + - src(2*i+1,2*j,0,n)*area(2*i+1,2*j,0)) * sf; - } else { - dest(i,j,0,n) = (src(2*i ,2*j,0,n) + - src(2*i+1,2*j,0,n)) * sf; - } - }); -#endif -#if (AMREX_SPACEDIM == 3) - AMREX_HOST_DEVICE_PARALLEL_FOR_4D (b, ncomp, i, j, k, n, - { - if (ifd[n]) { - dest(i,j,k,n) = (src(2*i ,2*j,2*k ,n)*area(2*i ,2*j,2*k ) + - src(2*i+1,2*j,2*k ,n)*area(2*i+1,2*j,2*k ) + - src(2*i ,2*j,2*k+1,n)*area(2*i ,2*j,2*k+1) + - src(2*i+1,2*j,2*k+1,n)*area(2*i+1,2*j,2*k+1)) * sf; - } else { - dest(i,j,k,n) = (src(2*i ,2*j,2*k ,n) + - src(2*i+1,2*j,2*k ,n) + - src(2*i ,2*j,2*k+1,n) + - src(2*i+1,2*j,2*k+1,n)) * sf; - } - }); -#endif - } -#if (AMREX_SPACEDIM == 3) - else { - AMREX_HOST_DEVICE_PARALLEL_FOR_4D (b, ncomp, i, j, k, n, - { - if (ifd[n]) { - dest(i,j,k,n) = (src(2*i ,2*j ,2*k,n)*area(2*i ,2*j ,2*k) + - src(2*i+1,2*j ,2*k,n)*area(2*i+1,2*j ,2*k) + - src(2*i ,2*j+1,2*k,n)*area(2*i ,2*j+1,2*k) + - src(2*i+1,2*j+1,2*k,n)*area(2*i+1,2*j+1,2*k)) * sf; - } else { - dest(i,j,k,n) = (src(2*i ,2*j ,2*k,n) + - src(2*i+1,2*j ,2*k,n) + - src(2*i ,2*j+1,2*k,n) + - src(2*i+1,2*j+1,2*k,n)) * sf; - } - }); - } -#endif -#endif - } - } +void FlashFluxRegister::add (int fine_global_index, int dir, FArrayBox const& fine_flux, + FArrayBox const& fine_area, Real sf) +{ + store_or_add(fine_global_index, dir, fine_flux, fine_area, sf); 
+} + +void FlashFluxRegister::add (int fine_global_index, int dir, FArrayBox const& fine_flux, + FArrayBox const& fine_area, const int* isFluxDensity, Real sf) +{ + store_or_add(fine_global_index, dir, fine_flux, fine_area, isFluxDensity, sf); } void FlashFluxRegister::communicate () diff --git a/Src/F_Interfaces/AmrCore/AMReX_flash_fluxregister_fi.cpp b/Src/F_Interfaces/AmrCore/AMReX_flash_fluxregister_fi.cpp index e8c2bace6d5..562405a889e 100644 --- a/Src/F_Interfaces/AmrCore/AMReX_flash_fluxregister_fi.cpp +++ b/Src/F_Interfaces/AmrCore/AMReX_flash_fluxregister_fi.cpp @@ -119,6 +119,41 @@ extern "C" { flux_reg->store(fgid, dir, fab, areafab, ifd, scaling_factor); } + void amrex_fi_flash_fluxregister_add (FlashFluxRegister* flux_reg, int fgid, int dir, + Real const* flux, const int* flo, const int* fhi, int nc, + Real scaling_factor) + { + Box bx; + bx = Box(IntVect(flo), IntVect(fhi)); + bx.shiftHalf(dir,-1); + const FArrayBox fab(bx,nc,const_cast(flux)); + flux_reg->add(fgid, dir, fab, scaling_factor); + } + + void amrex_fi_flash_fluxregister_add_area (FlashFluxRegister* flux_reg, int fgid, int dir, + Real const* flux, const int* flo, const int* fhi, int nc, + Real const* area, Real scaling_factor) + { + Box bx; + bx = Box(IntVect(flo), IntVect(fhi)); + bx.shiftHalf(dir,-1); + const FArrayBox fab(bx,nc,const_cast(flux)); + const FArrayBox areafab(bx,1,const_cast(area)); + flux_reg->add(fgid, dir, fab, areafab, scaling_factor); + } + + void amrex_fi_flash_fluxregister_add_area_ifd (FlashFluxRegister* flux_reg, int fgid, int dir, + Real const* flux, const int* flo, const int* fhi, int nc, + Real const* area, const int* ifd, Real scaling_factor) + { + Box bx; + bx = Box(IntVect(flo), IntVect(fhi)); + bx.shiftHalf(dir,-1); + const FArrayBox fab(bx,nc,const_cast(flux)); + const FArrayBox areafab(bx,1,const_cast(area)); + flux_reg->add(fgid, dir, fab, areafab, ifd, scaling_factor); + } + void amrex_fi_flash_fluxregister_communicate (FlashFluxRegister* flux_reg) { 
flux_reg->communicate(); diff --git a/Src/F_Interfaces/AmrCore/AMReX_flash_fluxregister_mod.F90 b/Src/F_Interfaces/AmrCore/AMReX_flash_fluxregister_mod.F90 index e13cc0e2543..dc0f00fdd0b 100644 --- a/Src/F_Interfaces/AmrCore/AMReX_flash_fluxregister_mod.F90 +++ b/Src/F_Interfaces/AmrCore/AMReX_flash_fluxregister_mod.F90 @@ -138,6 +138,36 @@ subroutine amrex_fi_flash_fluxregister_store_area_ifd (fr,cgid,dir,flux,flo,fhi, real(amrex_real), value :: scale end subroutine amrex_fi_flash_fluxregister_store_area_ifd + subroutine amrex_fi_flash_fluxregister_add (fr,cgid,dir,flux,flo,fhi,nc,scale) bind(c) + import + implicit none + type(c_ptr), value :: fr + real(amrex_real), intent(in) :: flux(*) + integer(c_int), value, intent(in) :: cgid, dir, nc + integer(c_int), intent(in) :: flo(*), fhi(*) + real(amrex_real), value :: scale + end subroutine amrex_fi_flash_fluxregister_add + + subroutine amrex_fi_flash_fluxregister_add_area (fr,cgid,dir,flux,flo,fhi,nc,area,scale) bind(c) + import + implicit none + type(c_ptr), value :: fr + real(amrex_real), intent(in) :: flux(*), area(*) + integer(c_int), value, intent(in) :: cgid, dir, nc + integer(c_int), intent(in) :: flo(*), fhi(*) + real(amrex_real), value :: scale + end subroutine amrex_fi_flash_fluxregister_add_area + + subroutine amrex_fi_flash_fluxregister_add_area_ifd (fr,cgid,dir,flux,flo,fhi,nc,area,ifd,scale) bind(c) + import + implicit none + type(c_ptr), value :: fr + real(amrex_real), intent(in) :: flux(*), area(*) + integer(c_int), value, intent(in) :: cgid, dir, nc + integer(c_int), intent(in) :: flo(*), fhi(*), ifd(*) + real(amrex_real), value :: scale + end subroutine amrex_fi_flash_fluxregister_add_area_ifd + subroutine amrex_fi_flash_fluxregister_communicate (fr) bind(c) import implicit none @@ -186,50 +216,80 @@ subroutine amrex_flash_fluxregister_communicate (this) call amrex_fi_flash_fluxregister_communicate(this%p) end subroutine amrex_flash_fluxregister_communicate - subroutine 
amrex_flash_fluxregister_store (this, flux, flo, fhi, grid_idx, dir, scale) + subroutine amrex_flash_fluxregister_store (this, flux, flo, fhi, grid_idx, dir, addit, scale) class(amrex_flash_fluxregister), intent(inout) :: this integer, intent(in) :: flo(*), fhi(*), grid_idx, dir real(amrex_real), intent(in) :: flux(flo(1):fhi(1),flo(2):fhi(2),flo(3):fhi(3),flo(4):fhi(4)) + logical, optional, intent(in) :: addit real(amrex_real), optional, intent(in) :: scale - ! + logical :: my_addit real(amrex_real) :: my_scale + if (present(addit)) then + my_addit = addit + else + my_addit = .FALSE. + end if if (present(scale)) then my_scale = scale else my_scale = 1._amrex_real end if - call amrex_fi_flash_fluxregister_store(this%p, grid_idx, dir, & - flux, flo, fhi, fhi(4)-flo(4)+1, my_scale) + if (my_addit) then + call amrex_fi_flash_fluxregister_add(this%p, grid_idx, dir, & + flux, flo, fhi, fhi(4)-flo(4)+1, my_scale) + else + call amrex_fi_flash_fluxregister_store(this%p, grid_idx, dir, & + flux, flo, fhi, fhi(4)-flo(4)+1, my_scale) + end if end subroutine amrex_flash_fluxregister_store - subroutine amrex_flash_fluxregister_store_area (this, flux, area, flo, fhi, grid_idx, dir, scale) + subroutine amrex_flash_fluxregister_store_area (this, flux, area, flo, fhi, grid_idx, dir, addit, scale) class(amrex_flash_fluxregister), intent(inout) :: this integer, intent(in) :: flo(*), fhi(*), grid_idx, dir real(amrex_real), intent(in) :: flux(flo(1):fhi(1),flo(2):fhi(2),flo(3):fhi(3),flo(4):fhi(4)) real(amrex_real), intent(in) :: area(flo(1):fhi(1),flo(2):fhi(2),flo(3):fhi(3)) + logical, optional, intent(in) :: addit real(amrex_real), optional, intent(in) :: scale ! + logical :: my_addit real(amrex_real) :: my_scale + if (present(addit)) then + my_addit = addit + else + my_addit = .FALSE. 
+ end if if (present(scale)) then my_scale = scale else my_scale = 1._amrex_real end if - call amrex_fi_flash_fluxregister_store_area(this%p, grid_idx, dir, & - flux, flo, fhi, fhi(4)-flo(4)+1, area, my_scale) + if (my_addit) then + call amrex_fi_flash_fluxregister_add_area(this%p, grid_idx, dir, & + flux, flo, fhi, fhi(4)-flo(4)+1, area, my_scale) + else + call amrex_fi_flash_fluxregister_store_area(this%p, grid_idx, dir, & + flux, flo, fhi, fhi(4)-flo(4)+1, area, my_scale) + end if end subroutine amrex_flash_fluxregister_store_area subroutine amrex_flash_fluxregister_store_area_ifd (this, flux, area, flo, fhi, & - isFluxDensity, grid_idx, dir, scale) + isFluxDensity, grid_idx, dir, addit, scale) class(amrex_flash_fluxregister), intent(inout) :: this integer, intent(in) :: flo(*), fhi(*), grid_idx, dir real(amrex_real), intent(in) :: flux(flo(1):fhi(1),flo(2):fhi(2),flo(3):fhi(3),flo(4):fhi(4)) real(amrex_real), intent(in) :: area(flo(1):fhi(1),flo(2):fhi(2),flo(3):fhi(3)) logical, intent(in) :: isFluxDensity(flo(4):fhi(4)) + logical, optional, intent(in) :: addit real(amrex_real), optional, intent(in) :: scale ! + logical :: my_addit real(amrex_real) :: my_scale integer(c_int) :: ifd(flo(4):fhi(4)) + if (present(addit)) then + my_addit = addit + else + my_addit = .FALSE. 
+ end if if (present(scale)) then my_scale = scale else @@ -240,8 +300,13 @@ subroutine amrex_flash_fluxregister_store_area_ifd (this, flux, area, flo, fhi, elsewhere ifd = 0 endwhere - call amrex_fi_flash_fluxregister_store_area_ifd(this%p, grid_idx, dir, & - flux, flo, fhi, fhi(4)-flo(4)+1, area, ifd, my_scale) + if (my_addit) then + call amrex_fi_flash_fluxregister_add_area_ifd(this%p, grid_idx, dir, & + flux, flo, fhi, fhi(4)-flo(4)+1, area, ifd, my_scale) + else + call amrex_fi_flash_fluxregister_store_area_ifd(this%p, grid_idx, dir, & + flux, flo, fhi, fhi(4)-flo(4)+1, area, ifd, my_scale) + end if end subroutine amrex_flash_fluxregister_store_area_ifd subroutine amrex_flash_fluxregister_load_1 (this, flux, flo, fhi, grid_idx, dir, scale) From ae7b64bcf6a4ed36dd03e17357cae83b0a394912 Mon Sep 17 00:00:00 2001 From: Weiqun Zhang Date: Wed, 1 Nov 2023 08:55:58 -0700 Subject: [PATCH 03/14] Update CHANGES for 23.11 (#3613) --- CHANGES | 49 +++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 49 insertions(+) diff --git a/CHANGES b/CHANGES index ae56cdd5ffe..a7a22fd97bd 100644 --- a/CHANGES +++ b/CHANGES @@ -1,3 +1,52 @@ +# 23.11 + + -- Give FlashFluxRegisters ways to accumulate data in registers (#3597) + + -- `AMReXBuildInfo.cmake`: AMReX_DIR (#3609) + + -- update doc for amrex::Abort on GPU (#3605) + + -- Add runtime particle components to HDF5 wrapper (#3596) + + -- Windows: Fix Installed AMReXBuildInfo.cmake (#3606) + + -- Print AMReX version at the beginning of Initialize (#3604) + + -- Install Move Tools to `shared/amrex` (#3599) + + -- Revert "Add ability for GCC 8 in CMake to build fgradient which uses std::filesystem" (#3601) + + -- Avoid std::filesystem (#3602) + + -- Fix Assertion in MLEBNodeFDLaplacian (#3594) + + -- Fix a memory "leak" in VisMF's persistent streams (#3592) + + -- RealVect Static: Export (#3589) + + -- change MaxCnt from 4 to max(4,max_level+1) for how many iterations we… (#3588) + … allow in creation of the initial grid 
hierarchy + + -- Add Bittree CI (#3577) + + -- BCType::ext_dir_cc (#3581) + + -- Disable CCache in Windows CIs (#3566) + + -- Fix ICC CI by Freeing up Disk Space (#3583) + + -- Docs: Link pyAMReX (#3582) + + -- NodeABecLaplacian: Reuse (#3579) + + -- simplify how 2d surface integrals are computed (#3571) + + -- Adding bittree interface to improve regridding performance in octree mode (#3555) + + -- MLNodeABecLaplacian (#3559) + + -- Fix Boundary Centroid in a Corner Case in 2D (#3568) + # 23.10 -- Bugfix typo in AMReX_SundialsIntegrator.H Nvar vs NVar, the From 1269a5bf41a8c4f0a832581532fda6e723d9aadc Mon Sep 17 00:00:00 2001 From: Rakesh Roy <137397847+rakesroy@users.noreply.github.com> Date: Thu, 2 Nov 2023 00:17:30 +0530 Subject: [PATCH 04/14] use hipPointerAttribute_t.type as HIP is removing hipPointerAttribute_t.memoryType (#3610) ## Summary This replaces hipPointerAttribute_t.memoryType with hipPointerAttribute_t.type. ## Additional background In ROCm6.0 hipPointerAttribute_t.memoryType will be removed from HIP. Instead, hipPointerAttribute_t.type is to be used. This is causing build failure in https://github.com/Exawind/amr-wind.git. hipPointerAttribute_t.type has existed since ROCm 5.5, so this change will be backward compatible back to ROCm 5.5.
## Checklist The proposed changes: - [x] fix a bug or incorrect behavior in AMReX - [ ] add new capabilities to AMReX - [ ] changes answers in the test suite to more than roundoff level - [ ] are likely to significantly affect the results of downstream AMReX users - [ ] include documentation in the code and/or rst files, if appropriate Co-authored-by: Rakesh Roy --- Src/Base/AMReX_GpuUtility.H | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/Src/Base/AMReX_GpuUtility.H b/Src/Base/AMReX_GpuUtility.H index a1fa3cdd9dc..ce98556fc5c 100644 --- a/Src/Base/AMReX_GpuUtility.H +++ b/Src/Base/AMReX_GpuUtility.H @@ -63,7 +63,11 @@ namespace Gpu { #if defined(AMREX_USE_HIP) hipPointerAttribute_t attrib; hipError_t r = hipPointerGetAttributes(&attrib, p); +#if defined(HIP_VERSION_MAJOR) && (HIP_VERSION_MAJOR < 6) return r == hipSuccess && attrib.memoryType == hipMemoryTypeDevice; +#else + return r == hipSuccess && attrib.type == hipMemoryTypeDevice; +#endif // (HIP_VERSION_MAJOR) && (HIP_VERSION_MAJOR < 6) #elif defined(AMREX_USE_CUDA) CUpointer_attribute attrib = CU_POINTER_ATTRIBUTE_MEMORY_TYPE; CUmemorytype mem_type = static_cast(0); @@ -83,7 +87,11 @@ namespace Gpu { #if defined(AMREX_USE_HIP) hipPointerAttribute_t attrib; hipError_t r = hipPointerGetAttributes(&attrib, p); +#if defined(HIP_VERSION_MAJOR) && (HIP_VERSION_MAJOR < 6) return r == hipSuccess && attrib.memoryType == hipMemoryTypeHost; +#else + return r == hipSuccess && attrib.type == hipMemoryTypeHost; +#endif // (HIP_VERSION_MAJOR) && (HIP_VERSION_MAJOR < 6) #elif defined(AMREX_USE_CUDA) CUpointer_attribute attrib = CU_POINTER_ATTRIBUTE_MEMORY_TYPE; CUmemorytype mem_type = static_cast(0); @@ -106,9 +114,15 @@ namespace Gpu { } else { hipPointerAttribute_t attrib; hipError_t r = hipPointerGetAttributes(&attrib, p); +#if defined(HIP_VERSION_MAJOR) && (HIP_VERSION_MAJOR < 6) return r == hipSuccess && (attrib.memoryType == hipMemoryTypeHost || attrib.memoryType == hipMemoryTypeDevice); +#else + 
return r == hipSuccess && + (attrib.type == hipMemoryTypeHost || + attrib.type == hipMemoryTypeDevice); +#endif // (HIP_VERSION_MAJOR) && (HIP_VERSION_MAJOR < 6) } #elif defined(AMREX_USE_CUDA) CUpointer_attribute attrib = CU_POINTER_ATTRIBUTE_MEMORY_TYPE; From f5400421a4b1ba84be3dda98040a49ff2ef89149 Mon Sep 17 00:00:00 2001 From: Weiqun Zhang Date: Wed, 1 Nov 2023 12:02:02 -0700 Subject: [PATCH 05/14] FillRandom: Use MKL host API (#3536) --- Src/Base/AMReX_Random.cpp | 40 ++++++++++++----------------------- Tools/CMake/AMReXSYCL.cmake | 2 +- Tools/GNUMake/comps/dpcpp.mak | 2 +- 3 files changed, 16 insertions(+), 28 deletions(-) diff --git a/Src/Base/AMReX_Random.cpp b/Src/Base/AMReX_Random.cpp index 9e1059e6798..cc791a11fef 100644 --- a/Src/Base/AMReX_Random.cpp +++ b/Src/Base/AMReX_Random.cpp @@ -19,7 +19,7 @@ namespace namespace amrex { #ifdef AMREX_USE_SYCL sycl_rng_descr* rand_engine_descr = nullptr; -//xxxxx oneapi::mkl::rng::philox4x32x10* gpu_rand_generator = nullptr; + oneapi::mkl::rng::philox4x32x10* gpu_rand_generator = nullptr; #else amrex::randState_t* gpu_rand_state = nullptr; amrex::randGenerator_t gpu_rand_generator = nullptr; @@ -44,8 +44,8 @@ void ResizeRandomSeed (amrex::ULong gpu_seed) rand_engine_descr = new sycl_rng_descr (Gpu::Device::streamQueue(), sycl::range<1>(N), gpu_seed, 1); -//xxxxx gpu_rand_generator = new std::remove_pointer_t -// (Gpu::Device::streamQueue(), gpu_seed+1234ULL); + gpu_rand_generator = new std::remove_pointer_t + (Gpu::Device::streamQueue(), gpu_seed+1234ULL); #elif defined(AMREX_USE_CUDA) || defined(AMREX_USE_HIP) @@ -212,11 +212,11 @@ DeallocateRandomSeedDevArray () Gpu::streamSynchronize(); rand_engine_descr = nullptr; } -//xxxxx if (gpu_rand_generator != nullptr) { -// delete gpu_rand_generator; -// Gpu::streamSynchronize(); -// gpu_rand_generator = nullptr; -// } + if (gpu_rand_generator != nullptr) { + delete gpu_rand_generator; + Gpu::streamSynchronize(); + gpu_rand_generator = nullptr; + } #else if 
(gpu_rand_state != nullptr) { @@ -258,15 +258,9 @@ void FillRandom (Real* p, Long N) #elif defined(AMREX_USE_SYCL) -//xxxxx oneapi::mkl::rng::uniform distr; -// auto event = oneapi::mkl::rng::generate(distr, gpu_rand_generator, N, p); -// event.wait(); - - amrex::ParallelForRNG(N, [=] AMREX_GPU_DEVICE (Long i, RandomEngine const& eng) - { - p[i] = Random(eng); - }); - Gpu::streamSynchronize(); + oneapi::mkl::rng::uniform distr; + auto event = oneapi::mkl::rng::generate(distr, *gpu_rand_generator, N, p); + event.wait(); #else std::uniform_real_distribution distribution(Real(0.0), Real(1.0)); @@ -299,15 +293,9 @@ void FillRandomNormal (Real* p, Long N, Real mean, Real stddev) #elif defined(AMREX_USE_SYCL) -//xxxxx oneapi::mkl::rng::gaussian distr(mean, stddev); -// auto event = oneapi::mkl::rng::generate(distr, gpu_rand_generator, N, p); -// event.wait(); - - amrex::ParallelForRNG(N, [=] AMREX_GPU_DEVICE (Long i, RandomEngine const& eng) - { - p[i] = RandomNormal(mean, stddev, eng); - }); - Gpu::streamSynchronize(); + oneapi::mkl::rng::gaussian distr(mean, stddev); + auto event = oneapi::mkl::rng::generate(distr, *gpu_rand_generator, N, p); + event.wait(); #else diff --git a/Tools/CMake/AMReXSYCL.cmake b/Tools/CMake/AMReXSYCL.cmake index 42eb5c4802b..a67571dc412 100644 --- a/Tools/CMake/AMReXSYCL.cmake +++ b/Tools/CMake/AMReXSYCL.cmake @@ -53,7 +53,7 @@ endif() # target_link_options( SYCL INTERFACE - $<${_cxx_sycl}:-fsycl -fsycl-device-lib=libc,libm-fp32,libm-fp64> ) + $<${_cxx_sycl}:-qmkl=sequential -fsycl -fsycl-device-lib=libc,libm-fp32,libm-fp64> ) # TODO: use $ genex for CMake >=3.17 diff --git a/Tools/GNUMake/comps/dpcpp.mak b/Tools/GNUMake/comps/dpcpp.mak index 6e490d9c063..3bcf5cb4372 100644 --- a/Tools/GNUMake/comps/dpcpp.mak +++ b/Tools/GNUMake/comps/dpcpp.mak @@ -123,7 +123,7 @@ ifneq ($(BL_NO_FORT),TRUE) endif endif -LDFLAGS += -fsycl-device-lib=libc,libm-fp32,libm-fp64 +LDFLAGS += -qmkl=sequential -fsycl-device-lib=libc,libm-fp32,libm-fp64 ifdef 
SYCL_PARALLEL_LINK_JOBS LDFLAGS += -fsycl-max-parallel-link-jobs=$(SYCL_PARALLEL_LINK_JOBS) From 606a94c69d640a03d5fbf35d11f1c46bf9676e47 Mon Sep 17 00:00:00 2001 From: Axel Huebl Date: Wed, 1 Nov 2023 15:11:03 -0700 Subject: [PATCH 06/14] [Breaking] Prefix `amrex_` to each plotfile Tool (#3600) ## Summary Packaging AMReX for Conda, we received feedback that our plotfile tool names are too generic and can collide with other packages. Thus, we propose to rename them with a common prefix. - [x] decide prefix or suffix (suffix: `` completion will work; prefix: easier for new users) - [x] decide name: `plt`/`amrex`/`amr`/... - Weiqun and Axel prefer `amrex_` so far - [ ] also change for GNUmake - [ ] update [test harness](https://github.com/AMReX-Codes/regression_testing) - [ ] update user-facing documentation for tools - [x] ~update [tutorials](https://github.com/AMReX-Codes/amrex-tutorials)?~ ## Additional background https://github.com/conda-forge/staged-recipes/pull/24294 ## Checklist The proposed changes: - [ ] fix a bug or incorrect behavior in AMReX - [ ] add new capabilities to AMReX - [ ] changes answers in the test suite to more than roundoff level - [ ] are likely to significantly affect the results of downstream AMReX users - [ ] include documentation in the code and/or rst files, if appropriate --------- Co-authored-by: Weiqun Zhang --- .github/workflows/clang.yml | 2 +- .github/workflows/gcc.yml | 2 +- Tools/Plotfile/CMakeLists.txt | 5 +++++ 3 files changed, 7 insertions(+), 2 deletions(-) diff --git a/.github/workflows/clang.yml b/.github/workflows/clang.yml index eea7e576af6..ec469bb5de6 100644 --- a/.github/workflows/clang.yml +++ b/.github/workflows/clang.yml @@ -64,7 +64,7 @@ jobs: CLANG_TIDY_ARGS="--config-file=${{github.workspace}}/.clang-tidy --warnings-as-errors=*" export PATH=/tmp/my-amrex/bin:$PATH - which fcompare + which amrex_fcompare ctest --output-on-failure diff --git a/.github/workflows/gcc.yml b/.github/workflows/gcc.yml index 
6915a246018..afc2044bdd7 100644 --- a/.github/workflows/gcc.yml +++ b/.github/workflows/gcc.yml @@ -60,7 +60,7 @@ jobs: CLANG_TIDY_ARGS="--config-file=${{github.workspace}}/.clang-tidy --warnings-as-errors=*" export PATH=/tmp/my-amrex/bin:$PATH - which fcompare + which amrex_fcompare ctest --output-on-failure diff --git a/Tools/Plotfile/CMakeLists.txt b/Tools/Plotfile/CMakeLists.txt index 0efc55c9a16..f462445c952 100644 --- a/Tools/Plotfile/CMakeLists.txt +++ b/Tools/Plotfile/CMakeLists.txt @@ -27,6 +27,11 @@ foreach( _exe IN LISTS _exe_names) if (AMReX_CUDA) set_source_files_properties(${_exe}.cpp PROPERTIES LANGUAGE CUDA) endif() + + # Add prefix to each tool's name to make them unique when installed. + # This avoids potential collisions of names on user systems, e.g., in + # software packages (Spack/Conda/Debian/...). + set_target_properties(${_exe} PROPERTIES OUTPUT_NAME "amrex_${_exe}") endforeach() From a7afcba3cffd86acc748edf39a8a2f33c973bf5d Mon Sep 17 00:00:00 2001 From: Axel Huebl Date: Thu, 2 Nov 2023 08:45:19 -0700 Subject: [PATCH 07/14] `amrex.omp_threads`: Can Avoid SMT (#3607) ## Summary In all our applications in BLAST, the OpenMP default of using all [logical cores on modern CPUs](https://en.wikipedia.org/wiki/Simultaneous_multithreading) results in significantly slower performance than just using the physical cores with AMReX. Thus, we introduce a new option `amrex.omp_threads` that enables control over the OpenMP threads at startup and has - for most popular systems - an implementation to find out the actual number of physical threads and default to it. For codes and users that change the default to `amrex.omp_threads = nosmt`, the `OMP_NUM_THREADS` variable will still take precedence.
This is a bit unusual (because CLI options usually have higher precedence than env vars - and they do if the user provides a number here), but done intentionally: this way, codes like WarpX can set the `nosmt` default and HPC job scripts will set the exact, preferably benchmarked number of threads as usual without surprises. - [x] document ## Tests Performed for AMReX OMP Backend Tests were performed with very small examples, WarpX 3D LWFA test as checked in or AMReX AMRCore 3d test. - [x] Ubuntu 22.04 Laptop w/ 12th Gen Intel i9-12900H: @ax3l - 20 logical cores; the first 12 logical cores use 2x SMT/HT - 20 virtual (default) -> 14 physical (`amrex.omp_threads = nosmt`) - faster runtime! - [x] Perlmutter (SUSE Linux Enterprise 15.4, kernel 5.14.21) - [CPU node](https://docs.nersc.gov/systems/perlmutter/architecture/) with 2x [AMD EPYC 7763](https://www.amd.com/en/products/cpu/amd-epyc-7763) - 2x SMT - 256 default, 128 with `amrex.omp_threads = nosmt` - faster runtime! - [x] Frontier (SUSE Linux Enterprise 15.4, kernel 5.14.21) - 1x AMD EPYC 7763 64-Core Processor (w/ 2x SMT enabled) - 2x SMT - 128 default - 64 with `amrex.omp_threads = nosmt` - faster runtime! - The ideal result might also be lower, due to first cores used by OS and [low-noise cores](https://docs.olcf.ornl.gov/systems/frontier_user_guide.html#low-noise-mode-layout) after that. But that is an orthogonal question and should be set in job scripts: `#SBATCH --ntasks-per-node=8` `#SBATCH --cpus-per-task=7` `#SBATCH --gpus-per-task=1` - [x] Summit (RHEL 8.2, kernel 4.18.0) - 2x IBM Power9 (each 22 physical cores each, each 6 disabled/hidden for OS?, 4x SMT enabled; cpuinfo says 128 total) - 4x SMT - 128 default, 32 with `amrex.omp_threads = nosmt` - faster runtime! 
- [x] [Lassen](https://hpc.llnl.gov/hardware/compute-platforms/lassen) (RHEL 7.9, kernel 4.14.0) - 2x IBM Power9 (each 22 physical cores, each 2 reserved for OS?, 4x SMT enabled) - 4x SMT - 160 default, 44 with `amrex.omp_threads = nosmt` - faster runtime! - The ideal result might be even down to 40, but that is an orthogonal question and should be set in job scripts. - [x] macOS M1 (arm64/aarch64) mini: - no SMT/HT - 8 default, 8 with `amrex.omp_threads = nosmt` - [x] macOS (OSX Ventura 13.5.2, 2.8 GHz Quad-Core Intel Core i7-8569U) Intel x86_64 @n01r - 2x SMT - 8 default, 4 with `amrex.omp_threads = nosmt` - faster runtime! - [x] macOS (OSX Ventura 13.5.2) M1 Max on mac studio @RTSandberg - no SMT/HT - 10 default, 10 with `amrex.omp_threads = nosmt` - [ ] some BSD/FreeBSD system? - no user requests - low priority, we just keep the default for now - [ ] Windows... looking for a system ## Additional background ## Checklist The proposed changes: - [ ] fix a bug or incorrect behavior in AMReX - [x] add new capabilities to AMReX - [ ] changes answers in the test suite to more than roundoff level - [ ] are likely to significantly affect the results of downstream AMReX users - [ ] include documentation in the code and/or rst files, if appropriate --------- Co-authored-by: Weiqun Zhang --- .../source/InputsComputeBackends.rst | 21 +++ .../source/Inputs_Chapter.rst | 1 + Src/Base/AMReX.cpp | 15 +- Src/Base/AMReX_OpenMP.H | 15 +- Src/Base/AMReX_OpenMP.cpp | 177 ++++++++++++++++++ Src/Base/CMakeLists.txt | 1 + Src/Base/Make.package | 1 + 7 files changed, 224 insertions(+), 7 deletions(-) create mode 100644 Docs/sphinx_documentation/source/InputsComputeBackends.rst create mode 100644 Src/Base/AMReX_OpenMP.cpp diff --git a/Docs/sphinx_documentation/source/InputsComputeBackends.rst b/Docs/sphinx_documentation/source/InputsComputeBackends.rst new file mode 100644 index 00000000000..26e5d527508 --- /dev/null +++ b/Docs/sphinx_documentation/source/InputsComputeBackends.rst @@ 
-0,0 +1,21 @@ +.. _Chap:InputsComputeBackends: + +Compute Backends +================ + +The following inputs must be preceded by ``amrex.`` and determine runtime options of CPU or GPU compute implementations. + ++------------------------+-----------------------------------------------------------------------+-------------+------------+ +| Parameter | Description | Type | Default | ++========================+=======================================================================+=============+============+ +| ``omp_threads`` | If OpenMP is enabled, this can be used to set the default number of | String | ``system`` | +| | threads. The special value ``nosmt`` can be used to avoid using | or Int | | +| | threads for virtual cores (aka Hyperthreading or SMT), as is default | | | +| | in OpenMP, and instead only spawns threads equal to the number of | | | +| | physical cores in the system. | | | +| | For the values ``system`` and ``nosmt``, the environment variable | | | +| | ``OMP_NUM_THREADS`` takes precedence. For Integer values, | | | +| | ``OMP_NUM_THREADS`` is ignored. | | | ++------------------------+-----------------------------------------------------------------------+-------------+------------+ + +For GPU-specific parameters, see also the :ref:`GPU chapter `. 
diff --git a/Docs/sphinx_documentation/source/Inputs_Chapter.rst b/Docs/sphinx_documentation/source/Inputs_Chapter.rst index 0a64aeb492c..43ead40b3c6 100644 --- a/Docs/sphinx_documentation/source/Inputs_Chapter.rst +++ b/Docs/sphinx_documentation/source/Inputs_Chapter.rst @@ -9,6 +9,7 @@ Run-time Inputs InputsProblemDefinition InputsTimeStepping InputsLoadBalancing + InputsComputeBackends InputsPlotFiles InputsCheckpoint diff --git a/Src/Base/AMReX.cpp b/Src/Base/AMReX.cpp index 147f8275c57..4449dab1955 100644 --- a/Src/Base/AMReX.cpp +++ b/Src/Base/AMReX.cpp @@ -52,6 +52,7 @@ #endif #ifdef AMREX_USE_OMP +#include #include #endif @@ -72,7 +73,9 @@ #include #include #include +#include #include +#include #include #include #include @@ -459,15 +462,17 @@ amrex::Initialize (int& argc, char**& argv, bool build_parm_parse, #endif #ifdef AMREX_USE_OMP + amrex::OpenMP::init_threads(); + + // status output if (system::verbose > 0) { // static_assert(_OPENMP >= 201107, "OpenMP >= 3.1 is required."); amrex::Print() << "OMP initialized with " << omp_get_max_threads() << " OMP threads\n"; } -#endif -#if defined(AMREX_USE_MPI) && defined(AMREX_USE_OMP) + // warn if over-subscription is detected if (system::verbose > 0) { auto ncores = int(std::thread::hardware_concurrency()); if (ncores != 0 && // It might be zero according to the C++ standard. 
@@ -476,8 +481,10 @@ amrex::Initialize (int& argc, char**& argv, bool build_parm_parse, amrex::Print(amrex::ErrorStream()) << "AMReX Warning: You might be oversubscribing CPU cores with OMP threads.\n" << " There are " << ncores << " cores per node.\n" - << " There are " << ParallelDescriptor::NProcsPerNode() << " MPI ranks per node.\n" - << " But OMP is initialized with " << omp_get_max_threads() << " threads per rank.\n" +#if defined(AMREX_USE_MPI) + << " There are " << ParallelDescriptor::NProcsPerNode() << " MPI ranks (processes) per node.\n" +#endif + << " But OMP is initialized with " << omp_get_max_threads() << " threads per process.\n" << " You should consider setting OMP_NUM_THREADS=" << ncores/ParallelDescriptor::NProcsPerNode() << " or less in the environment.\n"; } diff --git a/Src/Base/AMReX_OpenMP.H b/Src/Base/AMReX_OpenMP.H index 8eb8ada4513..ce267b9be73 100644 --- a/Src/Base/AMReX_OpenMP.H +++ b/Src/Base/AMReX_OpenMP.H @@ -11,10 +11,12 @@ namespace amrex::OpenMP { inline int get_max_threads () { return omp_get_max_threads(); } inline int get_thread_num () { return omp_get_thread_num(); } inline int in_parallel () { return omp_in_parallel(); } + inline void set_num_threads (int num) { omp_set_num_threads(num); } + void init_threads (); } -#else +#else // AMREX_USE_OMP namespace amrex::OpenMP { @@ -22,9 +24,16 @@ namespace amrex::OpenMP { constexpr int get_max_threads () { return 1; } constexpr int get_thread_num () { return 0; } constexpr int in_parallel () { return false; } - + constexpr void set_num_threads (int) { /* nothing */ } + constexpr void init_threads () { /* nothing */ } } -#endif +#endif // AMREX_USE_OMP + +namespace amrex { + /** ... 
*/ + int + numUniquePhysicalCores(); +} #endif diff --git a/Src/Base/AMReX_OpenMP.cpp b/Src/Base/AMReX_OpenMP.cpp new file mode 100644 index 00000000000..5ddd9944411 --- /dev/null +++ b/Src/Base/AMReX_OpenMP.cpp @@ -0,0 +1,177 @@ +#include +#include +#include +#include + +#if defined(__APPLE__) +#include +#include +#endif + +#if defined(_WIN32) +#include +#endif + +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +namespace amrex +{ + int + numUniquePhysicalCores () + { + int ncores; + +#if defined(__APPLE__) + size_t len = sizeof(ncores); + // See hw.physicalcpu and hw.physicalcpu_max + // https://developer.apple.com/documentation/kernel/1387446-sysctlbyname/determining_system_capabilities/ + // https://developer.apple.com/documentation/kernel/1387446-sysctlbyname + if (sysctlbyname("hw.physicalcpu", &ncores, &len, NULL, 0) == -1) { + if (system::verbose > 0) { + amrex::Print() << "numUniquePhysicalCores(): Error receiving hw.physicalcpu! " + << "Defaulting to visible cores.\n"; + } + ncores = int(std::thread::hardware_concurrency()); + } +#elif defined(__linux__) + std::set> uniqueThreadSets; + int cpuIndex = 0; + + while (true) { + // for each logical CPU in cpuIndex from 0...N-1 + std::string path = "/sys/devices/system/cpu/cpu" + std::to_string(cpuIndex) + "/topology/thread_siblings_list"; + std::ifstream file(path); + if (!file.is_open()) { + break; // no further CPUs to check + } + + // find its siblings + std::vector siblings; + std::string line; + if (std::getline(file, line)) { + std::stringstream ss(line); + std::string token; + + // Possible syntax: 0-3, 8-11, 14,17 + // https://github.com/torvalds/linux/blob/v6.5/Documentation/ABI/stable/sysfs-devices-system-cpu#L68-L72 + while (std::getline(ss, token, ',')) { + size_t dashPos = token.find('-'); + if (dashPos != std::string::npos) { + // Range detected + int start = std::stoi(token.substr(0, dashPos)); + int end = std::stoi(token.substr(dashPos + 1)); + for 
(int i = start; i <= end; ++i) { + siblings.push_back(i); + } + } else { + siblings.push_back(std::stoi(token)); + } + } + } + + // and record the siblings group + // (assumes: ascending and unique sets per cpuIndex) + uniqueThreadSets.insert(siblings); + cpuIndex++; + } + + if (cpuIndex == 0) { + if (system::verbose > 0) { + amrex::Print() << "numUniquePhysicalCores(): Error reading CPU info.\n"; + } + ncores = int(std::thread::hardware_concurrency()); + } else { + ncores = int(uniqueThreadSets.size()); + } +#elif defined(_WIN32) + DWORD length = 0; + bool result = GetLogicalProcessorInformation(NULL, &length); + + if (!result) { + if (system::verbose > 0) { + amrex::Print() << "numUniquePhysicalCores(): Failed to get logical processor information! " + << "Defaulting to visible cores.\n"; + } + ncores = int(std::thread::hardware_concurrency()); + } + else { + std::vector buffer(length / sizeof(SYSTEM_LOGICAL_PROCESSOR_INFORMATION)); + if (!GetLogicalProcessorInformation(&buffer[0], &length)) { + if (system::verbose > 0) { + amrex::Print() << "numUniquePhysicalCores(): Failed to get logical processor information! " + << "Defaulting to visible cores.\n"; + } + ncores = int(std::thread::hardware_concurrency()); + } else { + ncores = 0; + for (const auto& info : buffer) { + if (info.Relationship == RelationProcessorCore) { + ncores++; + } + } + } + } +#else + // TODO: + // BSD + if (system::verbose > 0) { + amrex::Print() << "numUniquePhysicalCores(): Unknown system. Defaulting to visible cores.\n"; + } + ncores = int(std::thread::hardware_concurrency()); +#endif + return ncores; + } +} // namespace amrex + +#ifdef AMREX_USE_OMP +namespace amrex::OpenMP +{ + void init_threads () + { + amrex::ParmParse pp("amrex"); + std::string omp_threads = "system"; + pp.queryAdd("omp_threads", omp_threads); + + auto to_int = [](std::string const & str_omp_threads) { + std::optional num; + try { num = std::stoi(str_omp_threads); } + catch (...) 
{ /* nothing */ } + return num; + }; + + if (omp_threads == "system") { + // default or OMP_NUM_THREADS environment variable + } else if (omp_threads == "nosmt") { + char const *env_omp_num_threads = std::getenv("OMP_NUM_THREADS"); + if (env_omp_num_threads != nullptr && amrex::system::verbose > 1) { + amrex::Print() << "amrex.omp_threads was set to nosmt," + << "but OMP_NUM_THREADS was set. Will keep " + << "OMP_NUM_THREADS=" << env_omp_num_threads << ".\n"; + } else { + omp_set_num_threads(numUniquePhysicalCores()); + } + } else { + std::optional num_omp_threads = to_int(omp_threads); + if (num_omp_threads.has_value()) { + omp_set_num_threads(num_omp_threads.value()); + } + else { + if (amrex::system::verbose > 0) { + amrex::Print() << "amrex.omp_threads has an unknown value: " + << omp_threads + << " (try system, nosmt, or a positive integer)\n"; + } + } + } + } +} // namespace amrex::OpenMP +#endif // AMREX_USE_OMP diff --git a/Src/Base/CMakeLists.txt b/Src/Base/CMakeLists.txt index 544de3aed8c..459ec3bd7c4 100644 --- a/Src/Base/CMakeLists.txt +++ b/Src/Base/CMakeLists.txt @@ -53,6 +53,7 @@ foreach(D IN LISTS AMReX_SPACEDIM) AMReX_ParallelDescriptor.H AMReX_ParallelDescriptor.cpp AMReX_OpenMP.H + AMReX_OpenMP.cpp AMReX_ParallelReduce.H AMReX_ForkJoin.H AMReX_ForkJoin.cpp diff --git a/Src/Base/Make.package b/Src/Base/Make.package index 29b4c25dc84..276887ebd79 100644 --- a/Src/Base/Make.package +++ b/Src/Base/Make.package @@ -38,6 +38,7 @@ C$(AMREX_BASE)_headers += AMReX_REAL.H AMReX_INT.H AMReX_CONSTANTS.H AMReX_SPACE C$(AMREX_BASE)_sources += AMReX_DistributionMapping.cpp AMReX_ParallelDescriptor.cpp C$(AMREX_BASE)_headers += AMReX_DistributionMapping.H AMReX_ParallelDescriptor.H C$(AMREX_BASE)_headers += AMReX_OpenMP.H +C$(AMREX_BASE)_sources += AMReX_OpenMP.cpp C$(AMREX_BASE)_headers += AMReX_ParallelReduce.H From d36463103daed09a40cdea235041a6ab79ff280c Mon Sep 17 00:00:00 2001 From: Ben Wibking Date: Tue, 7 Nov 2023 00:19:10 -0500 Subject: [PATCH 08/14] 
replace AMREX_DEVICE_COMPILE with AMREX_IF_ON_DEVICE and AMREX_IF_ON_HOST (#3591) ## Summary This adds the macros `AMREX_IF_ON_DEVICE((code_for_device))` and `AMREX_IF_ON_HOST((code_for_host))` that are compatible with single-pass host/device compilation (as used by `nvc++ -cuda`), as well as backward compatible with all other compilers. This also replaces all uses of `AMREX_DEVICE_COMPILE` with these macros. Fixes https://github.com/AMReX-Codes/amrex/issues/3586. ## Additional background Single-pass compilation evalutes the preprocessor macros once for each source file. This means that preprocessor conditionals cannot be used to choose between host and device code. In particular, NVHPC with `-cuda` does not support `__CUDA_ARCH__`, instead requiring the use of the `if target` construct. This creates portable macros that work for either single-pass or two-pass compilation, but requires restructuring of any code that uses AMREX_DEVICE_COMPILE so that the code appears as a macro argument. This PR will allow using NVHPC with `-cuda` as the unified host/device compiler for AMReX. In the future, single-pass compilers for other backends may be available, e.g., SYCL (https://dl.acm.org/doi/abs/10.1145/3585341.3585351). AMReX can be configured to build with `nvc++ -cuda` using CMake: ``` cmake .. -DAMReX_GPU_BACKEND=CUDA -DCMAKE_C_COMPILER=nvc -DCMAKE_CXX_COMPILER=nvc++ -DCMAKE_CUDA_COMPILER=nvc++ -DCMAKE_CUDA_COMPILER_ID=NVCXX -DCMAKE_CUDA_ARCHITECTURES=80 -DCMAKE_CUDA_COMPILER_FORCED=ON -DCMAKE_CUDA_COMPILE_FEATURES=cuda_std_17 -DAMReX_GPU_RDC=OFF -DCMAKE_CXX_FLAGS="-cuda --gcc-toolchain=$(which gcc)" -DCMAKE_CUDA_FLAGS="-cuda --gcc-toolchain=$(which gcc)" -DAMReX_ENABLE_TESTS=ON -DCMAKE_CUDA_HOST_LINK_LAUNCHER=nvc++ -DCMAKE_CUDA_LINK_EXECUTABLE=" -o " ``` CMake hacks (https://github.com/NVIDIA/cub/blob/0fc3c3701632a4be906765b73be20a9ad0da603d/cmake/CubCompilerHacks.cmake) are tested with CMake 3.22.1 and NVHPC 23.5, 23.7, and 23.9 (earlier versions do not work). 
However, it currently fails to link the executables for the tests due to a [compiler/linker bug](https://forums.developer.nvidia.com/t/nvc-cuda-fails-to-link-code-when-using-device-curand-functions/270401/5). (Note that by default, `nvcc` preserves denormals, whereas `nvc++` does not. Also, `nvc++` generates relocatable device code by default, whereas `nvcc` does not.) ## Checklist The proposed changes: - [ ] fix a bug or incorrect behavior in AMReX - [ ] add new capabilities to AMReX - [ ] changes answers in the test suite to more than roundoff level - [ ] are likely to significantly affect the results of downstream AMReX users - [ ] include documentation in the code and/or rst files, if appropriate --------- Co-authored-by: Weiqun Zhang --- Docs/sphinx_documentation/source/GPU.rst | 9 +- Src/Base/AMReX.H | 60 +++---- Src/Base/AMReX_Algorithm.H | 178 +++++++++++--------- Src/Base/AMReX_Array4.H | 46 ++--- Src/Base/AMReX_FabArray.H | 7 +- Src/Base/AMReX_GpuAtomic.H | 204 ++++++++++++----------- Src/Base/AMReX_GpuLaunch.H | 29 ++-- Src/Base/AMReX_GpuQualifiers.H | 29 ++++ Src/Base/AMReX_GpuRange.H | 52 +++--- Src/Base/AMReX_GpuUtility.H | 10 +- Src/Base/AMReX_Math.H | 76 +++++---- Src/Base/AMReX_Random.H | 109 ++++++------ Src/Base/AMReX_TableData.H | 100 +++++------ Src/Base/Parser/AMReX_IParser.H | 28 +--- Src/Base/Parser/AMReX_Parser.H | 35 ++-- Src/Base/Parser/AMReX_Parser_Y.H | 4 +- Src/EB/AMReX_EB2_GeometryShop.H | 13 +- 17 files changed, 518 insertions(+), 471 deletions(-) diff --git a/Docs/sphinx_documentation/source/GPU.rst b/Docs/sphinx_documentation/source/GPU.rst index 90dbcc9f26f..aff060e9166 100644 --- a/Docs/sphinx_documentation/source/GPU.rst +++ b/Docs/sphinx_documentation/source/GPU.rst @@ -489,11 +489,10 @@ GPU support. When AMReX is compiled with ``USE_OMP_OFFLOAD=TRUE``, ``AMREX_USE_OMP_OFFLOAD`` is defined. -In addition to AMReX's preprocessor macros, CUDA provides the -``__CUDA_ARCH__`` macro which is only defined when in device code. 
-HIP and Sycl provide similar macros. -``AMREX_DEVICE_COMPILE`` should be used when a ``__host__ __device__`` -function requires separate code for the CPU and GPU implementations. +The macros ``AMREX_IF_ON_DEVICE((code_for_device))`` and +``AMREX_IF_ON_HOST((code_for_host))`` should be used when a +``__host__ __device__`` function requires separate code for the +CPU and GPU implementations. .. =================================================================== diff --git a/Src/Base/AMReX.H b/Src/Base/AMReX.H index c539a1d8e75..2b88553bcdf 100644 --- a/Src/Base/AMReX.H +++ b/Src/Base/AMReX.H @@ -113,16 +113,15 @@ namespace amrex AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE void Error (const char* msg = nullptr) { -#if AMREX_DEVICE_COMPILE #if defined(NDEBUG) - amrex::ignore_unused(msg); + AMREX_IF_ON_DEVICE((amrex::ignore_unused(msg);)) #else - if (msg) { AMREX_DEVICE_PRINTF("Error %s\n", msg); } - AMREX_DEVICE_ASSERT(0); -#endif -#else - Error_host("Error", msg); + AMREX_IF_ON_DEVICE(( + if (msg) { AMREX_DEVICE_PRINTF("Error %s\n", msg); } + AMREX_DEVICE_ASSERT(0); + )) #endif + AMREX_IF_ON_HOST((Error_host("Error", msg);)) } //! Print out warning message to cerr. @@ -132,15 +131,12 @@ namespace amrex AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE void Warning (const char * msg) { -#if AMREX_DEVICE_COMPILE #if defined(NDEBUG) - amrex::ignore_unused(msg); -#else - if (msg) { AMREX_DEVICE_PRINTF("Warning %s\n", msg); } -#endif + AMREX_IF_ON_DEVICE((amrex::ignore_unused(msg);)) #else - Warning_host(msg); + AMREX_IF_ON_DEVICE((if (msg) { AMREX_DEVICE_PRINTF("Warning %s\n", msg); })) #endif + AMREX_IF_ON_HOST((Warning_host(msg);)) } //! Print out message to cerr and exit via abort(). 
@@ -148,16 +144,15 @@ namespace amrex AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE void Abort (const char * msg = nullptr) { -#if AMREX_DEVICE_COMPILE #if defined(NDEBUG) - amrex::ignore_unused(msg); + AMREX_IF_ON_DEVICE((amrex::ignore_unused(msg);)) #else - if (msg) { AMREX_DEVICE_PRINTF("Abort %s\n", msg); } - AMREX_DEVICE_ASSERT(0); -#endif -#else - Error_host("Abort", msg); + AMREX_IF_ON_DEVICE(( + if (msg) { AMREX_DEVICE_PRINTF("Abort %s\n", msg); } + AMREX_DEVICE_ASSERT(0); + )) #endif + AMREX_IF_ON_HOST((Error_host("Abort", msg);)) } /** @@ -170,22 +165,21 @@ namespace amrex AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE void Assert (const char* EX, const char* file, int line, const char* msg = nullptr) { -#if AMREX_DEVICE_COMPILE #if defined(NDEBUG) - amrex::ignore_unused(EX,file,line,msg); -#else - if (msg) { - AMREX_DEVICE_PRINTF("Assertion `%s' failed, file \"%s\", line %d, Msg: %s", - EX, file, line, msg); - } else { - AMREX_DEVICE_PRINTF("Assertion `%s' failed, file \"%s\", line %d", - EX, file, line); - } - AMREX_DEVICE_ASSERT(0); -#endif + AMREX_IF_ON_DEVICE((amrex::ignore_unused(EX,file,line,msg);)) #else - Assert_host(EX,file,line,msg); + AMREX_IF_ON_DEVICE(( + if (msg) { + AMREX_DEVICE_PRINTF("Assertion `%s' failed, file \"%s\", line %d, Msg: %s", + EX, file, line, msg); + } else { + AMREX_DEVICE_PRINTF("Assertion `%s' failed, file \"%s\", line %d", + EX, file, line); + } + AMREX_DEVICE_ASSERT(0); + )) #endif + AMREX_IF_ON_HOST((Assert_host(EX,file,line,msg);)) } /** diff --git a/Src/Base/AMReX_Algorithm.H b/Src/Base/AMReX_Algorithm.H index b418f3cc1c0..31889df4425 100644 --- a/Src/Base/AMReX_Algorithm.H +++ b/Src/Base/AMReX_Algorithm.H @@ -161,51 +161,52 @@ namespace amrex AMREX_GPU_HOST_DEVICE ItType upper_bound (ItType first, ItType last, const ValType& val) { -#if AMREX_DEVICE_COMPILE - std::ptrdiff_t count = last-first; - while(count>0){ - auto it = first; - const auto step = count/2; - it += step; - if (!(val < *it)){ - first = ++it; - count -= step + 
1; + AMREX_IF_ON_DEVICE(( + std::ptrdiff_t count = last-first; + while(count>0){ + auto it = first; + const auto step = count/2; + it += step; + if (!(val < *it)){ + first = ++it; + count -= step + 1; + } + else{ + count = step; + } } - else{ - count = step; - } - } - - return first; -#else - return std::upper_bound(first, last, val); -#endif + return first; + )) + AMREX_IF_ON_HOST(( + return std::upper_bound(first, last, val); + )) } template AMREX_GPU_HOST_DEVICE ItType lower_bound (ItType first, ItType last, const ValType& val) { -#ifdef AMREX_DEVICE_COMPILE - std::ptrdiff_t count = last-first; - while(count>0) - { - auto it = first; - const auto step = count/2; - it += step; - if (*it < val){ - first = ++it; - count -= step + 1; - } - else{ - count = step; + AMREX_IF_ON_DEVICE(( + std::ptrdiff_t count = last-first; + while(count>0) + { + auto it = first; + const auto step = count/2; + it += step; + if (*it < val){ + first = ++it; + count -= step + 1; + } + else{ + count = step; + } } - } - return first; -#else - return std::lower_bound(first, last, val); -#endif + return first; + )) + AMREX_IF_ON_HOST(( + return std::lower_bound(first, last, val); + )) } namespace detail { @@ -239,83 +240,100 @@ int builtin_clz_wrapper (clzll_tag, T x) noexcept return static_cast(__builtin_clzll(x) - (sizeof(unsigned long long) * CHAR_BIT - sizeof(T) * CHAR_BIT)); } -#ifdef AMREX_USE_CUDA - -// likewise with CUDA, there are __clz functions that take (signed) int and long long int -template ::type> -AMREX_GPU_DEVICE AMREX_FORCE_INLINE -int clz_wrapper (clz_tag, T x) noexcept -{ - return __clz((int) x) - (sizeof(int) * CHAR_BIT - sizeof(T) * CHAR_BIT); -} - -template ::type> -AMREX_GPU_DEVICE AMREX_FORCE_INLINE -int clz_wrapper (clzll_tag, T x) noexcept -{ - return __clzll((long long int) x) - (sizeof(long long int) * CHAR_BIT - sizeof(T) * CHAR_BIT); } -#endif -} +template ,std::uint8_t> || + std::is_same_v,std::uint16_t> || + std::is_same_v,std::uint32_t> || + 
std::is_same_v,std::uint64_t>, int> = 0> +AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE +int clz (T x) noexcept; AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE -int clz (std::uint8_t x) noexcept +int clz_generic (std::uint8_t x) noexcept { -#if (AMREX_DEVICE_COMPILE && defined(AMREX_USE_CUDA)) // all supported cuda versions have __clz - return detail::clz_wrapper(detail::clz_tag{}, x); -#elif (!AMREX_DEVICE_COMPILE && AMREX_HAS_BUILTIN_CLZ) - return detail::builtin_clz_wrapper(detail::clz_tag{}, x); -#else static constexpr int clz_lookup[16] = { 4, 3, 2, 2, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0 }; auto upper = x >> 4; auto lower = x & 0xF; return upper ? clz_lookup[upper] : 4 + clz_lookup[lower]; -#endif } AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE -int clz (std::uint16_t x) noexcept +int clz_generic (std::uint16_t x) noexcept { -#if (AMREX_DEVICE_COMPILE && defined(AMREX_USE_CUDA)) // all supported cuda versions have __clz - return detail::clz_wrapper(detail::clz_tag{}, x); -#elif (!AMREX_DEVICE_COMPILE && AMREX_HAS_BUILTIN_CLZ) - return detail::builtin_clz_wrapper(detail::clz_tag{}, x); -#else auto upper = std::uint8_t(x >> 8); auto lower = std::uint8_t(x & 0xFF); return upper ? clz(upper) : 8 + clz(lower); -#endif } AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE -int clz (std::uint32_t x) noexcept +int clz_generic (std::uint32_t x) noexcept { -#if (AMREX_DEVICE_COMPILE && defined(AMREX_USE_CUDA)) // all supported cuda versions have __clz - return detail::clz_wrapper(detail::clz_tag{}, x); -#elif (!AMREX_DEVICE_COMPILE && AMREX_HAS_BUILTIN_CLZ) - return detail::builtin_clz_wrapper(detail::clz_tag{}, x); -#else auto upper = std::uint16_t(x >> 16); auto lower = std::uint16_t(x & 0xFFFF); return upper ? 
clz(upper) : 16 + clz(lower); -#endif } AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE -int clz (std::uint64_t x) noexcept +int clz_generic (std::uint64_t x) noexcept { -#if (AMREX_DEVICE_COMPILE && defined(AMREX_USE_CUDA)) // all supported cuda versions have __clz - return detail::clz_wrapper(detail::clz_tag{}, x); -#elif (!AMREX_DEVICE_COMPILE && AMREX_HAS_BUILTIN_CLZ) - return detail::builtin_clz_wrapper(detail::clz_tag{}, x); -#else auto upper = std::uint32_t(x >> 32); auto lower = std::uint32_t(x & 0xFFFFFFFF); return upper ? clz(upper) : 32 + clz(lower); +} + +#if defined AMREX_USE_CUDA + +namespace detail { + // likewise with CUDA, there are __clz functions that take (signed) int and long long int + template ::type> + AMREX_GPU_DEVICE AMREX_FORCE_INLINE + int clz_wrapper (clz_tag, T x) noexcept + { + return __clz((int) x) - (sizeof(int) * CHAR_BIT - sizeof(T) * CHAR_BIT); + } + + template ::type> + AMREX_GPU_DEVICE AMREX_FORCE_INLINE + int clz_wrapper (clzll_tag, T x) noexcept + { + return __clzll((long long int) x) - (sizeof(long long int) * CHAR_BIT - sizeof(T) * CHAR_BIT); + } +} + +template ,std::uint8_t> || + std::is_same_v,std::uint16_t> || + std::is_same_v,std::uint32_t> || + std::is_same_v,std::uint64_t>, int> > +AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE +int clz (T x) noexcept +{ + AMREX_IF_ON_DEVICE((return detail::clz_wrapper(detail::clz_tag{}, x);)) +#if AMREX_HAS_BUILTIN_CLZ + AMREX_IF_ON_HOST((return detail::builtin_clz_wrapper(detail::clz_tag{}, x);)) +#else + AMREX_IF_ON_HOST((return clz_generic(x);)) #endif } +#else // !defined AMREX_USE_CUDA + +template ,std::uint8_t> || + std::is_same_v,std::uint16_t> || + std::is_same_v,std::uint32_t> || + std::is_same_v,std::uint64_t>, int> > +AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE +int clz (T x) noexcept +{ +#if (!AMREX_DEVICE_COMPILE && AMREX_HAS_BUILTIN_CLZ) + return detail::builtin_clz_wrapper(detail::clz_tag{}, x); +#else + return clz_generic(x); +#endif +} + +#endif // defined AMREX_USE_CUDA + } #endif 
diff --git a/Src/Base/AMReX_Array4.H b/Src/Base/AMReX_Array4.H index b2ff0fcb549..8d7a4a44f30 100644 --- a/Src/Base/AMReX_Array4.H +++ b/Src/Base/AMReX_Array4.H @@ -42,13 +42,14 @@ namespace amrex { U& operator[] (int n) const noexcept { #if defined(AMREX_DEBUG) || defined(AMREX_BOUND_CHECK) if (n < 0 || n >= ncomp) { -#if AMREX_DEVICE_COMPILE - AMREX_DEVICE_PRINTF(" %d is out of bound (0:%d)", n, ncomp-1); -#else - std::stringstream ss; - ss << " " << n << " is out of bound: (0:" << ncomp-1 << ")"; - amrex::Abort(ss.str()); -#endif + AMREX_IF_ON_DEVICE(( + AMREX_DEVICE_PRINTF(" %d is out of bound (0:%d)", n, ncomp-1); + )) + AMREX_IF_ON_HOST(( + std::stringstream ss; + ss << " " << n << " is out of bound: (0:" << ncomp-1 << ")"; + amrex::Abort(ss.str()); + )) } #endif return p[n*stride]; @@ -233,21 +234,22 @@ namespace amrex { { if (i=end.x || j=end.y || k=end.z || n < 0 || n >= ncomp) { -#if AMREX_DEVICE_COMPILE - AMREX_DEVICE_PRINTF(" (%d,%d,%d,%d) is out of bound (%d:%d,%d:%d,%d:%d,0:%d)\n", - i, j, k, n, begin.x, end.x-1, begin.y, end.y-1, - begin.z, end.z-1, ncomp-1); - amrex::Abort(); -#else - std::stringstream ss; - ss << " (" << i << "," << j << "," << k << "," << n - << ") is out of bound (" - << begin.x << ":" << end.x-1 << "," - << begin.y << ":" << end.y-1 << "," - << begin.z << ":" << end.z-1 << "," - << "0:" << ncomp-1 << ")"; - amrex::Abort(ss.str()); -#endif + AMREX_IF_ON_DEVICE(( + AMREX_DEVICE_PRINTF(" (%d,%d,%d,%d) is out of bound (%d:%d,%d:%d,%d:%d,0:%d)\n", + i, j, k, n, begin.x, end.x-1, begin.y, end.y-1, + begin.z, end.z-1, ncomp-1); + amrex::Abort(); + )) + AMREX_IF_ON_HOST(( + std::stringstream ss; + ss << " (" << i << "," << j << "," << k << "," << n + << ") is out of bound (" + << begin.x << ":" << end.x-1 << "," + << begin.y << ":" << end.y-1 << "," + << begin.z << ":" << end.z-1 << "," + << "0:" << ncomp-1 << ")"; + amrex::Abort(ss.str()); + )) } } #endif diff --git a/Src/Base/AMReX_FabArray.H b/Src/Base/AMReX_FabArray.H index 
a8839a4bcc0..e507dab153b 100644 --- a/Src/Base/AMReX_FabArray.H +++ b/Src/Base/AMReX_FabArray.H @@ -198,11 +198,8 @@ struct MultiArray4 { AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE Array4 const& operator[] (int li) const noexcept { -#if AMREX_DEVICE_COMPILE - return dp[li]; -#else - return hp[li]; -#endif + AMREX_IF_ON_DEVICE((return dp[li];)) + AMREX_IF_ON_HOST((return hp[li];)) } #ifdef AMREX_USE_GPU diff --git a/Src/Base/AMReX_GpuAtomic.H b/Src/Base/AMReX_GpuAtomic.H index 55fc351156a..deea6ae932e 100644 --- a/Src/Base/AMReX_GpuAtomic.H +++ b/Src/Base/AMReX_GpuAtomic.H @@ -132,17 +132,17 @@ namespace detail { AMREX_GPU_DEVICE AMREX_FORCE_INLINE T Add_device (T* const sum, T const value) noexcept { -#if defined(__CUDA_ARCH__) && defined(AMREX_USE_CUDA) || \ - defined(__HIP_DEVICE_COMPILE__) && defined(AMREX_USE_HIP) - return atomicAdd(sum, value); -#elif defined(__SYCL_DEVICE_ONLY__) +#if defined(__SYCL_DEVICE_ONLY__) constexpr auto mo = sycl::memory_order::relaxed; constexpr auto ms = sycl::memory_scope::device; sycl::atomic_ref a{*sum}; return a.fetch_add(value); #else - amrex::ignore_unused(sum, value); - return T(); // should never get here, but have to return something + AMREX_IF_ON_DEVICE(( return atomicAdd(sum, value); )) + AMREX_IF_ON_HOST(( + amrex::ignore_unused(sum, value); + return T(); // should never get here, but have to return something + )) #endif } @@ -175,7 +175,7 @@ namespace detail { #endif -#if defined(AMREX_USE_CUDA) && (__CUDA_ARCH__ < 600) +#if defined(AMREX_USE_CUDA) && defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 600) AMREX_GPU_DEVICE AMREX_FORCE_INLINE double Add_device (double* const sum, double const value) noexcept @@ -195,17 +195,16 @@ namespace detail { AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE T Add (T* sum, T value) noexcept { -#if AMREX_DEVICE_COMPILE #ifdef AMREX_USE_SYCL - return Add_device(sum, value); -#else - return Add_device(sum, value); -#endif + AMREX_IF_ON_DEVICE((return Add_device(sum, value);)) #else - auto old = *sum; - 
*sum += value; - return old; + AMREX_IF_ON_DEVICE((return Add_device(sum, value);)) #endif + AMREX_IF_ON_HOST(( + auto old = *sum; + *sum += value; + return old; + )) } //////////////////////////////////////////////////////////////////////// @@ -252,18 +251,19 @@ namespace detail { AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE bool If (T* const add, T const value, Op&& op, Cond&& cond) noexcept { -#if AMREX_DEVICE_COMPILE - return If_device(add, value, std::forward(op), std::forward(cond)); -#else - T old = *add; - T const tmp = op(old, value); - if (cond(tmp)) { - *add = tmp; - return true; - } else { - return false; - } -#endif + AMREX_IF_ON_DEVICE(( + return If_device(add, value, std::forward(op), std::forward(cond)); + )) + AMREX_IF_ON_HOST(( + T old = *add; + T const tmp = op(old, value); + if (cond(tmp)) { + *add = tmp; + return true; + } else { + return false; + } + )) } //////////////////////////////////////////////////////////////////////// @@ -278,14 +278,11 @@ namespace detail { AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE void AddNoRet (T* sum, T value) noexcept { -#if AMREX_DEVICE_COMPILE -#ifdef AMREX_USE_SYCL +#if defined(__SYCL_DEVICE_ONLY__) Add_device(sum, value); #else - Add_device(sum, value); -#endif -#else - *sum += value; + AMREX_IF_ON_DEVICE((Add_device(sum, value);)) + AMREX_IF_ON_HOST((*sum += value;)) #endif } @@ -293,14 +290,11 @@ namespace detail { AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE void AddNoRet (float* const sum, float const value) noexcept { -#if AMREX_DEVICE_COMPILE #pragma clang diagnostic push #pragma clang diagnostic ignored "-Wdeprecated-declarations" - atomicAddNoRet(sum, value); + AMREX_IF_ON_DEVICE((atomicAddNoRet(sum, value);)) #pragma clang diagnostic pop -#else - *sum += value; -#endif + AMREX_IF_ON_HOST((*sum += value;)) } #endif @@ -314,18 +308,18 @@ namespace detail { AMREX_GPU_DEVICE AMREX_FORCE_INLINE T Min_device (T* const m, T const value) noexcept { -#if defined(__CUDA_ARCH__) && defined(AMREX_USE_CUDA) || \ - 
defined(__HIP_DEVICE_COMPILE__) && defined(AMREX_USE_HIP) - return atomicMin(m, value); -#elif defined(__SYCL_DEVICE_ONLY__) +#if defined(__SYCL_DEVICE_ONLY__) constexpr auto mo = sycl::memory_order::relaxed; constexpr auto ms = sycl::memory_scope::device; constexpr auto as = sycl::access::address_space::global_space; sycl::atomic_ref a{*m}; return a.fetch_min(value); #else - amrex::ignore_unused(m,value); - return T(); // should never get here, but have to return something + AMREX_IF_ON_DEVICE(( return atomicMin(m, value); )) + AMREX_IF_ON_HOST(( + amrex::ignore_unused(m,value); + return T(); // should never get here, but have to return something + )) #endif } @@ -357,13 +351,14 @@ namespace detail { AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE T Min (T* const m, T const value) noexcept { -#if AMREX_DEVICE_COMPILE - return Min_device(m, value); -#else - auto const old = *m; - *m = (*m) < value ? (*m) : value; - return old; -#endif + AMREX_IF_ON_DEVICE(( + return Min_device(m, value); + )) + AMREX_IF_ON_HOST(( + auto const old = *m; + *m = (*m) < value ? 
(*m) : value; + return old; + )) } //////////////////////////////////////////////////////////////////////// @@ -376,18 +371,18 @@ namespace detail { AMREX_GPU_DEVICE AMREX_FORCE_INLINE T Max_device (T* const m, T const value) noexcept { -#if defined(__CUDA_ARCH__) && defined(AMREX_USE_CUDA) || \ - defined(__HIP_DEVICE_COMPILE__) && defined(AMREX_USE_HIP) - return atomicMax(m, value); -#elif defined(__SYCL_DEVICE_ONLY__) +#if defined(__SYCL_DEVICE_ONLY__) constexpr auto mo = sycl::memory_order::relaxed; constexpr auto ms = sycl::memory_scope::device; constexpr auto as = sycl::access::address_space::global_space; sycl::atomic_ref a{*m}; return a.fetch_max(value); #else - amrex::ignore_unused(m,value); - return T(); // should never get here, but have to return something + AMREX_IF_ON_DEVICE(( return atomicMax(m, value); )) + AMREX_IF_ON_HOST(( + amrex::ignore_unused(m,value); + return T(); // should never get here, but have to return something + )) #endif } @@ -419,13 +414,14 @@ namespace detail { AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE T Max (T* const m, T const value) noexcept { -#if AMREX_DEVICE_COMPILE - return Max_device(m, value); -#else - auto const old = *m; - *m = (*m) > value ? (*m) : value; - return old; -#endif + AMREX_IF_ON_DEVICE(( + return Max_device(m, value); + )) + AMREX_IF_ON_HOST(( + auto const old = *m; + *m = (*m) > value ? 
(*m) : value; + return old; + )) } //////////////////////////////////////////////////////////////////////// @@ -435,19 +431,21 @@ namespace detail { AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE int LogicalOr (int* const m, int const value) noexcept { -#if defined(__CUDA_ARCH__) && defined(AMREX_USE_CUDA) || \ - defined(__HIP_DEVICE_COMPILE__) && defined(AMREX_USE_HIP) - return atomicOr(m, value); -#elif defined(__SYCL_DEVICE_ONLY__) +#if defined(__SYCL_DEVICE_ONLY__) constexpr auto mo = sycl::memory_order::relaxed; constexpr auto ms = sycl::memory_scope::device; constexpr auto as = sycl::access::address_space::global_space; sycl::atomic_ref a{*m}; return a.fetch_or(value); #else - int const old = *m; - *m = (*m) || value; - return old; + AMREX_IF_ON_DEVICE(( + return atomicOr(m, value); + )) + AMREX_IF_ON_HOST(( + int const old = *m; + *m = (*m) || value; + return old; + )) #endif } @@ -458,19 +456,21 @@ namespace detail { AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE int LogicalAnd (int* const m, int const value) noexcept { -#if defined(__CUDA_ARCH__) && defined(AMREX_USE_CUDA) || \ - defined(__HIP_DEVICE_COMPILE__) && defined(AMREX_USE_HIP) - return atomicAnd(m, value ? ~0x0 : 0); -#elif defined(__SYCL_DEVICE_ONLY__) +#if defined(__SYCL_DEVICE_ONLY__) constexpr auto mo = sycl::memory_order::relaxed; constexpr auto ms = sycl::memory_scope::device; constexpr auto as = sycl::access::address_space::global_space; sycl::atomic_ref a{*m}; return a.fetch_and(value ? ~0x0 : 0); #else - int const old = *m; - *m = (*m) && value; - return old; + AMREX_IF_ON_DEVICE(( + return atomicAnd(m, value ? 
~0x0 : 0); + )) + AMREX_IF_ON_HOST(( + int const old = *m; + *m = (*m) && value; + return old; + )) #endif } @@ -482,19 +482,21 @@ namespace detail { AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE T Exch (T* address, T val) noexcept { -#if defined(__CUDA_ARCH__) && defined(AMREX_USE_CUDA) || \ - defined(__HIP_DEVICE_COMPILE__) && defined(AMREX_USE_HIP) - return atomicExch(address, val); -#elif defined(__SYCL_DEVICE_ONLY__) +#if defined(__SYCL_DEVICE_ONLY__) constexpr auto mo = sycl::memory_order::relaxed; constexpr auto ms = sycl::memory_scope::device; constexpr auto as = sycl::access::address_space::global_space; sycl::atomic_ref a{*address}; return a.exchange(val); #else - auto const old = *address; - *address = val; - return old; + AMREX_IF_ON_DEVICE(( + return atomicExch(address, val); + )) + AMREX_IF_ON_HOST(( + auto const old = *address; + *address = val; + return old; + )) #endif } @@ -506,10 +508,7 @@ namespace detail { AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE T CAS (T* const address, T compare, T const val) noexcept { // cannot be T const compare because of compare_exchange_strong -#if defined(__CUDA_ARCH__) && defined(AMREX_USE_CUDA) || \ - defined(__HIP_DEVICE_COMPILE__) && defined(AMREX_USE_HIP) - return atomicCAS(address, compare, val); -#elif defined(__SYCL_DEVICE_ONLY__) +#if defined(__SYCL_DEVICE_ONLY__) constexpr auto mo = sycl::memory_order::relaxed; constexpr auto ms = sycl::memory_scope::device; constexpr auto as = sycl::access::address_space::global_space; @@ -517,9 +516,14 @@ namespace detail { a.compare_exchange_strong(compare, val); return compare; #else - auto const old = *address; - *address = (old == compare ? val : old); - return old; + AMREX_IF_ON_DEVICE(( + return atomicCAS(address, compare, val); + )) + AMREX_IF_ON_HOST(( + auto const old = *address; + *address = (old == compare ? 
val : old); + return old; + )) #endif } } @@ -527,17 +531,21 @@ namespace detail { namespace HostDevice::Atomic { template - AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE - void Add (T* const sum, T const value) noexcept + AMREX_FORCE_INLINE + void Add_Host (T* const sum, T const value) noexcept { -#if AMREX_DEVICE_COMPILE - Gpu::Atomic::AddNoRet(sum,value); -#else #ifdef AMREX_USE_OMP #pragma omp atomic update #endif *sum += value; -#endif + } + + template + AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE + void Add (T* const sum, T const value) noexcept + { + AMREX_IF_ON_DEVICE((Gpu::Atomic::AddNoRet(sum,value);)) + AMREX_IF_ON_HOST((Add_Host(sum,value);)) } } diff --git a/Src/Base/AMReX_GpuLaunch.H b/Src/Base/AMReX_GpuLaunch.H index c1870d2ef58..c4ba7dd86bc 100644 --- a/Src/Base/AMReX_GpuLaunch.H +++ b/Src/Base/AMReX_GpuLaunch.H @@ -103,20 +103,21 @@ namespace Gpu { inline Box getThreadBox (const Box& bx, Long offset) noexcept { -#if AMREX_DEVICE_COMPILE - const auto len = bx.length3d(); - Long k = offset / (len[0]*len[1]); - Long j = (offset - k*(len[0]*len[1])) / len[0]; - Long i = (offset - k*(len[0]*len[1])) - j*len[0]; - IntVect iv{AMREX_D_DECL(static_cast(i), - static_cast(j), - static_cast(k))}; - iv += bx.smallEnd(); - return (bx & Box(iv,iv,bx.type())); -#else - amrex::ignore_unused(offset); - return bx; -#endif + AMREX_IF_ON_DEVICE(( + const auto len = bx.length3d(); + Long k = offset / (len[0]*len[1]); + Long j = (offset - k*(len[0]*len[1])) / len[0]; + Long i = (offset - k*(len[0]*len[1])) - j*len[0]; + IntVect iv{AMREX_D_DECL(static_cast(i), + static_cast(j), + static_cast(k))}; + iv += bx.smallEnd(); + return (bx & Box(iv,iv,bx.type())); + )) + AMREX_IF_ON_HOST(( + amrex::ignore_unused(offset); + return bx; + )) } // ************************************************ diff --git a/Src/Base/AMReX_GpuQualifiers.H b/Src/Base/AMReX_GpuQualifiers.H index 1c0b5731762..4fba23a849a 100644 --- a/Src/Base/AMReX_GpuQualifiers.H +++ b/Src/Base/AMReX_GpuQualifiers.H @@ -8,6 
+8,12 @@ #include #endif +#if defined(AMREX_USE_CUDA) && (defined(AMREX_CXX_PGI) || defined(AMREX_CXX_NVHPC)) +#include +#define AMREX_IF_ON_DEVICE(CODE) NV_IF_TARGET(NV_IS_DEVICE, CODE) +#define AMREX_IF_ON_HOST(CODE) NV_IF_TARGET(NV_IS_HOST, CODE) +#endif + #define AMREX_GPU_HOST __host__ #define AMREX_GPU_DEVICE __device__ #define AMREX_GPU_GLOBAL __global__ @@ -31,6 +37,29 @@ #define AMREX_DEVICE_COMPILE (__CUDA_ARCH__ || __HIP_DEVICE_COMPILE__ || __SYCL_DEVICE_ONLY__) +// Remove surrounding parentheses if present +#define AMREX_IMPL_STRIP_PARENS(X) AMREX_IMPL_ESC(AMREX_IMPL_ISH X) +#define AMREX_IMPL_ISH(...) AMREX_IMPL_ISH __VA_ARGS__ +#define AMREX_IMPL_ESC(...) AMREX_IMPL_ESC_(__VA_ARGS__) +#define AMREX_IMPL_ESC_(...) AMREX_IMPL_VAN_##__VA_ARGS__ +#define AMREX_IMPL_VAN_AMREX_IMPL_ISH + +#if !defined(AMREX_IF_ON_DEVICE) && !defined(AMREX_IF_ON_HOST) +#if (defined(AMREX_USE_CUDA) && defined(__CUDA_ARCH__)) || \ + (defined(AMREX_USE_HIP) && defined(__HIP_DEVICE_COMPILE__)) || \ + (defined(AMREX_USE_SYCL) && defined(__SYCL_DEVICE_ONLY__)) +#define AMREX_IF_ON_DEVICE(CODE) \ + { AMREX_IMPL_STRIP_PARENS(CODE) } +#define AMREX_IF_ON_HOST(CODE) \ + {} +#else +#define AMREX_IF_ON_DEVICE(CODE) \ + {} +#define AMREX_IF_ON_HOST(CODE) \ + { AMREX_IMPL_STRIP_PARENS(CODE) } +#endif +#endif + #ifdef AMREX_USE_SYCL # include #endif diff --git a/Src/Base/AMReX_GpuRange.H b/Src/Base/AMReX_GpuRange.H index b8d2ab89d08..be5071dbf8a 100644 --- a/Src/Base/AMReX_GpuRange.H +++ b/Src/Base/AMReX_GpuRange.H @@ -32,31 +32,31 @@ Long at (T const& /*b*/, Long offset) noexcept { return offset; } AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE Long size (Box const& b) noexcept { -#if AMREX_DEVICE_COMPILE - return b.numPts(); -#else - amrex::ignore_unused(b); - return 1; -#endif + AMREX_IF_ON_DEVICE((return b.numPts();)) + AMREX_IF_ON_HOST(( + amrex::ignore_unused(b); + return 1; + )) } AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE Box at (Box const& b, Long offset) noexcept { -#if 
AMREX_DEVICE_COMPILE - auto len = b.length3d(); - Long k = offset / (len[0]*len[1]); - Long j = (offset - k*(len[0]*len[1])) / len[0]; - Long i = (offset - k*(len[0]*len[1])) - j*len[0]; - IntVect iv{AMREX_D_DECL(static_cast(i), - static_cast(j), - static_cast(k))}; - iv += b.smallEnd(); - return Box(iv,iv,b.type()); -#else - amrex::ignore_unused(offset); - return b; -#endif + AMREX_IF_ON_DEVICE(( + auto len = b.length3d(); + Long k = offset / (len[0]*len[1]); + Long j = (offset - k*(len[0]*len[1])) / len[0]; + Long i = (offset - k*(len[0]*len[1])) - j*len[0]; + IntVect iv{AMREX_D_DECL(static_cast(i), + static_cast(j), + static_cast(k))}; + iv += b.smallEnd(); + return Box(iv,iv,b.type()); + )) + AMREX_IF_ON_HOST(( + amrex::ignore_unused(offset); + return b; + )) } template @@ -92,13 +92,15 @@ struct range_impl [[nodiscard]] AMREX_GPU_HOST_DEVICE iterator begin () const noexcept { -#if defined(__CUDA_ARCH__) && defined(AMREX_USE_CUDA) || \ - defined(__HIP_DEVICE_COMPILE__) && defined(AMREX_USE_HIP) - return iterator(m_b, blockDim.x*blockIdx.x+threadIdx.x, blockDim.x*gridDim.x); -#elif defined (__SYCL_DEVICE_ONLY__) +#if defined (__SYCL_DEVICE_ONLY__) return iterator(m_b, m_gid, m_grange); #else - return iterator(m_b,0,1); + AMREX_IF_ON_DEVICE(( + return iterator(m_b, blockDim.x*blockIdx.x+threadIdx.x, blockDim.x*gridDim.x); + )) + AMREX_IF_ON_HOST(( + return iterator(m_b,0,1); + )) #endif } diff --git a/Src/Base/AMReX_GpuUtility.H b/Src/Base/AMReX_GpuUtility.H index ce98556fc5c..4adc111f5e2 100644 --- a/Src/Base/AMReX_GpuUtility.H +++ b/Src/Base/AMReX_GpuUtility.H @@ -26,8 +26,9 @@ namespace Gpu { template AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE T LDG (Array4 const& a, int i, int j, int k) noexcept { -#if defined(__CUDA_ARCH__) && defined(AMREX_USE_CUDA) - return __ldg(a.ptr(i,j,k)); +#if defined(AMREX_USE_CUDA) + AMREX_IF_ON_DEVICE((return __ldg(a.ptr(i,j,k));)) + AMREX_IF_ON_HOST((return a(i,j,k);)) #else return a(i,j,k); #endif @@ -36,8 +37,9 @@ namespace Gpu { 
template AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE T LDG (Array4 const& a, int i, int j, int k, int n) noexcept { -#if defined(__CUDA_ARCH__) && defined(AMREX_USE_CUDA) - return __ldg(a.ptr(i,j,k,n)); +#if defined(AMREX_USE_CUDA) + AMREX_IF_ON_DEVICE((return __ldg(a.ptr(i,j,k,n));)) + AMREX_IF_ON_HOST((return a(i,j,k,n);)) #else return a(i,j,k,n); #endif diff --git a/Src/Base/AMReX_Math.H b/Src/Base/AMReX_Math.H index 769b9bf50f4..506289d03d5 100644 --- a/Src/Base/AMReX_Math.H +++ b/Src/Base/AMReX_Math.H @@ -68,11 +68,9 @@ double cospi (double x) { #if defined(AMREX_USE_SYCL) return sycl::cospi(x); -#elif defined(__CUDA_ARCH__) && defined(AMREX_USE_CUDA) || \ - defined(__HIP_DEVICE_COMPILE__) && defined(AMREX_USE_HIP) - return ::cospi(x); #else - return std::cos(pi()*x); + AMREX_IF_ON_DEVICE(( return ::cospi(x); )) + AMREX_IF_ON_HOST(( return std::cos(pi()*x); )) #endif } @@ -82,11 +80,9 @@ float cospi (float x) { #if defined(AMREX_USE_SYCL) return sycl::cospi(x); -#elif defined(__CUDA_ARCH__) && defined(AMREX_USE_CUDA) || \ - defined(__HIP_DEVICE_COMPILE__) && defined(AMREX_USE_HIP) - return ::cospif(x); #else - return std::cos(pi()*x); + AMREX_IF_ON_DEVICE(( return ::cospif(x); )) + AMREX_IF_ON_HOST(( return std::cos(pi()*x); )) #endif } @@ -96,11 +92,9 @@ double sinpi (double x) { #if defined(AMREX_USE_SYCL) return sycl::sinpi(x); -#elif defined(__CUDA_ARCH__) && defined(AMREX_USE_CUDA) || \ - defined(__HIP_DEVICE_COMPILE__) && defined(AMREX_USE_HIP) - return ::sinpi(x); #else - return std::sin(pi()*x); + AMREX_IF_ON_DEVICE(( return ::sinpi(x); )) + AMREX_IF_ON_HOST(( return std::sin(pi()*x); )) #endif } @@ -110,14 +104,32 @@ float sinpi (float x) { #if defined(AMREX_USE_SYCL) return sycl::sinpi(x); -#elif defined(__CUDA_ARCH__) && defined(AMREX_USE_CUDA) || \ - defined(__HIP_DEVICE_COMPILE__) && defined(AMREX_USE_HIP) - return ::sinpif(x); #else - return std::sin(pi()*x); + AMREX_IF_ON_DEVICE(( return ::sinpif(x); )) + AMREX_IF_ON_HOST(( return std::sin(pi()*x); 
)) #endif } +namespace detail { + AMREX_FORCE_INLINE void sincos (double x, double* sinx, double* cosx) { +#if defined(_GNU_SOURCE) && !defined(__APPLE__) + ::sincos(x, sinx, cosx); +#else + *sinx = std::sin(x); + *cosx = std::cos(x); +#endif + } + + AMREX_FORCE_INLINE void sincosf (float x, float* sinx, float* cosx) { +#if defined(_GNU_SOURCE) && !defined(__APPLE__) + ::sincosf(x, sinx, cosx); +#else + *sinx = std::sin(x); + *cosx = std::cos(x); +#endif + } +} + //! Return sine and cosine of given number AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE std::pair sincos (double x) @@ -125,13 +137,9 @@ std::pair sincos (double x) std::pair r; #if defined(AMREX_USE_SYCL) r.first = sycl::sincos(x, sycl::private_ptr(&r.second)); -#elif defined(__CUDA_ARCH__) && defined(AMREX_USE_CUDA) || \ - defined(__HIP_DEVICE_COMPILE__) && defined(AMREX_USE_HIP) || \ - (defined(_GNU_SOURCE) && !defined(__APPLE__)) - ::sincos(x, &r.first, &r.second); #else - r.first = std::sin(x); - r.second = std::cos(x); + AMREX_IF_ON_DEVICE(( ::sincos(x, &r.first, &r.second); )) + AMREX_IF_ON_HOST(( detail::sincos(x, &r.first, &r.second); )) #endif return r; } @@ -143,13 +151,9 @@ std::pair sincos (float x) std::pair r; #if defined(AMREX_USE_SYCL) r.first = sycl::sincos(x, sycl::private_ptr(&r.second)); -#elif defined(__CUDA_ARCH__) && defined(AMREX_USE_CUDA) || \ - defined(__HIP_DEVICE_COMPILE__) && defined(AMREX_USE_HIP) || \ - (defined(_GNU_SOURCE) && !defined(__APPLE__)) - ::sincosf(x, &r.first, &r.second); #else - r.first = std::sin(x); - r.second = std::cos(x); + AMREX_IF_ON_DEVICE(( ::sincosf(x, &r.first, &r.second); )) + AMREX_IF_ON_HOST(( detail::sincosf(x, &r.first, &r.second); )) #endif return r; } @@ -159,11 +163,11 @@ AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE std::pair sincospi (double x) { std::pair r; -#if defined(__CUDA_ARCH__) && defined(AMREX_USE_CUDA) || \ - defined(__HIP_DEVICE_COMPILE__) && defined(AMREX_USE_HIP) - ::sincospi(x, &r.first, &r.second); -#else +#if defined(AMREX_USE_SYCL) 
r = sincos(pi()*x); +#else + AMREX_IF_ON_DEVICE(( ::sincospi(x, &r.first, &r.second); )) + AMREX_IF_ON_HOST(( r = sincos(pi()*x); )) #endif return r; } @@ -173,11 +177,11 @@ AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE std::pair sincospi (float x) { std::pair r; -#if defined(__CUDA_ARCH__) && defined(AMREX_USE_CUDA) || \ - defined(__HIP_DEVICE_COMPILE__) && defined(AMREX_USE_HIP) - ::sincospif(x, &r.first, &r.second); -#else +#if defined(AMREX_USE_SYCL) r = sincos(pi()*x); +#else + AMREX_IF_ON_DEVICE(( ::sincospif(x, &r.first, &r.second); )) + AMREX_IF_ON_HOST(( r = sincos(pi()*x); )) #endif return r; } diff --git a/Src/Base/AMReX_Random.H b/Src/Base/AMReX_Random.H index 675c12082d5..50b2c2693b0 100644 --- a/Src/Base/AMReX_Random.H +++ b/Src/Base/AMReX_Random.H @@ -23,24 +23,29 @@ namespace amrex AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE Real Random (RandomEngine const& random_engine) { -#if defined(__CUDA_ARCH__) && defined(AMREX_USE_CUDA) -#ifdef BL_USE_FLOAT - return 1.0f - curand_uniform(random_engine.rand_state); +#if defined (__SYCL_DEVICE_ONLY__) + mkl::rng::device::uniform distr; + return mkl::rng::device::generate(distr, *random_engine.engine); #else - return 1.0 - curand_uniform_double(random_engine.rand_state); -#endif -#elif defined(__HIP_DEVICE_COMPILE__) && defined(AMREX_USE_HIP) #ifdef BL_USE_FLOAT - return 1.0f - hiprand_uniform(random_engine.rand_state); + AMREX_IF_ON_DEVICE(( + AMREX_HIP_OR_CUDA( + return 1.0f - hiprand_uniform(random_engine.rand_state); , + return 1.0f - curand_uniform(random_engine.rand_state); + ) + )) #else - return 1.0 - hiprand_uniform_double(random_engine.rand_state); + AMREX_IF_ON_DEVICE(( + AMREX_HIP_OR_CUDA( + return 1.0 - hiprand_uniform_double(random_engine.rand_state); , + return 1.0 - curand_uniform_double(random_engine.rand_state); + ) + )) #endif -#elif defined (__SYCL_DEVICE_ONLY__) - mkl::rng::device::uniform distr; - return mkl::rng::device::generate(distr, *random_engine.engine); -#else - 
amrex::ignore_unused(random_engine); - return Random(); + AMREX_IF_ON_HOST(( + amrex::ignore_unused(random_engine); + return Random(); + )) #endif } @@ -56,24 +61,29 @@ namespace amrex AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE Real RandomNormal (Real mean, Real stddev, RandomEngine const& random_engine) { -#if defined(__CUDA_ARCH__) && defined(AMREX_USE_CUDA) -#ifdef BL_USE_FLOAT - return stddev * curand_normal(random_engine.rand_state) + mean; +#if defined (__SYCL_DEVICE_ONLY__) + mkl::rng::device::gaussian distr(mean, stddev); + return mkl::rng::device::generate(distr, *random_engine.engine); #else - return stddev * curand_normal_double(random_engine.rand_state) + mean; -#endif -#elif defined(__HIP_DEVICE_COMPILE__) && defined(AMREX_USE_HIP) #ifdef BL_USE_FLOAT - return stddev * hiprand_normal(random_engine.rand_state) + mean; + AMREX_IF_ON_DEVICE(( + AMREX_HIP_OR_CUDA( + return stddev * hiprand_normal(random_engine.rand_state) + mean; , + return stddev * curand_normal(random_engine.rand_state) + mean; + ) + )) #else - return stddev * hiprand_normal_double(random_engine.rand_state) + mean; + AMREX_IF_ON_DEVICE(( + AMREX_HIP_OR_CUDA( + return stddev * hiprand_normal_double(random_engine.rand_state) + mean; , + return stddev * curand_normal_double(random_engine.rand_state) + mean; + ) + )) #endif -#elif defined (__SYCL_DEVICE_ONLY__) - mkl::rng::device::gaussian distr(mean, stddev); - return mkl::rng::device::generate(distr, *random_engine.engine); -#else - amrex::ignore_unused(random_engine); - return RandomNormal(mean, stddev); + AMREX_IF_ON_HOST(( + amrex::ignore_unused(random_engine); + return RandomNormal(mean, stddev); + )) #endif } @@ -91,16 +101,20 @@ namespace amrex AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE unsigned int RandomPoisson (Real lambda, RandomEngine const& random_engine) { -#if defined(__CUDA_ARCH__) && defined(AMREX_USE_CUDA) - return curand_poisson(random_engine.rand_state, lambda); -#elif defined(__HIP_DEVICE_COMPILE__) && 
defined(AMREX_USE_HIP) - return hiprand_poisson(random_engine.rand_state, lambda); -#elif defined (__SYCL_DEVICE_ONLY__) +#if defined (__SYCL_DEVICE_ONLY__) mkl::rng::device::poisson distr(lambda); return mkl::rng::device::generate(distr, *random_engine.engine); #else - amrex::ignore_unused(random_engine); - return RandomPoisson(lambda); + AMREX_IF_ON_DEVICE(( + AMREX_HIP_OR_CUDA( + return hiprand_poisson(random_engine.rand_state, lambda); , + return curand_poisson(random_engine.rand_state, lambda); + ) + )) + AMREX_IF_ON_HOST(( + amrex::ignore_unused(random_engine); + return RandomPoisson(lambda); + )) #endif } @@ -116,22 +130,23 @@ namespace amrex AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE unsigned int Random_int (unsigned int n, RandomEngine const& random_engine) { -#if AMREX_DEVICE_COMPILE #if defined(__SYCL_DEVICE_ONLY__) mkl::rng::device::uniform distr(0,n); return mkl::rng::device::generate(distr, *random_engine.engine); #else - unsigned int rand; - constexpr unsigned int RAND_M = 4294967295; // 2**32-1 - do { - AMREX_HIP_OR_CUDA( rand = hiprand(random_engine.rand_state);, - rand = curand(random_engine.rand_state) ); - } while (rand > (RAND_M - RAND_M % n)); - return rand % n; -#endif -#else - amrex::ignore_unused(random_engine); - return Random_int(n); + AMREX_IF_ON_DEVICE(( + unsigned int rand; + constexpr unsigned int RAND_M = 4294967295; // 2**32-1 + do { + AMREX_HIP_OR_CUDA( rand = hiprand(random_engine.rand_state);, + rand = curand(random_engine.rand_state) ); + } while (rand > (RAND_M - RAND_M % n)); + return rand % n; + )) + AMREX_IF_ON_HOST(( + amrex::ignore_unused(random_engine); + return Random_int(n); + )) #endif } diff --git a/Src/Base/AMReX_TableData.H b/Src/Base/AMReX_TableData.H index b7572e2a1cf..842225e53f4 100644 --- a/Src/Base/AMReX_TableData.H +++ b/Src/Base/AMReX_TableData.H @@ -57,16 +57,17 @@ struct Table1D void index_assert (int i) const { if (i < begin || i >= end) { -#if AMREX_DEVICE_COMPILE - AMREX_DEVICE_PRINTF(" (%d) is out of 
bound (%d:%d)\n", - i, begin, end-1); - amrex::Abort(); -#else - std::stringstream ss; - ss << " (" << i << ") is out of bound (" - << begin << ":" << end-1 << ")"; - amrex::Abort(ss.str()); -#endif + AMREX_IF_ON_DEVICE(( + AMREX_DEVICE_PRINTF(" (%d) is out of bound (%d:%d)\n", + i, begin, end-1); + amrex::Abort(); + )) + AMREX_IF_ON_HOST(( + std::stringstream ss; + ss << " (" << i << ") is out of bound (" + << begin << ":" << end-1 << ")"; + amrex::Abort(ss.str()); + )) } } #endif @@ -120,17 +121,18 @@ struct Table2D { if (i < begin[0] || i >= end[0] || j < begin[1] || j >= end[1]) { -#if AMREX_DEVICE_COMPILE - AMREX_DEVICE_PRINTF(" (%d,%d) is out of bound (%d:%d,%d:%d)\n", - i, j, begin[0], end[0]-1, begin[1], end[1]-1); - amrex::Abort(); -#else - std::stringstream ss; - ss << " (" << i << "," << j << ") is out of bound (" - << begin[0] << ":" << end[0]-1 - << "," << begin[1] << ":" << end[1]-1 << ")"; - amrex::Abort(ss.str()); -#endif + AMREX_IF_ON_DEVICE(( + AMREX_DEVICE_PRINTF(" (%d,%d) is out of bound (%d:%d,%d:%d)\n", + i, j, begin[0], end[0]-1, begin[1], end[1]-1); + amrex::Abort(); + )) + AMREX_IF_ON_HOST(( + std::stringstream ss; + ss << " (" << i << "," << j << ") is out of bound (" + << begin[0] << ":" << end[0]-1 + << "," << begin[1] << ":" << end[1]-1 << ")"; + amrex::Abort(ss.str()); + )) } } #endif @@ -188,19 +190,20 @@ struct Table3D if (i < begin[0] || i >= end[0] || j < begin[1] || j >= end[1] || k < begin[2] || k >= end[2]) { -#if AMREX_DEVICE_COMPILE - AMREX_DEVICE_PRINTF(" (%d,%d,%d) is out of bound (%d:%d,%d:%d,%d:%d)\n", - i, j, k, begin[0], end[0]-1, begin[1], end[1]-1, - begin[2], end[2]-1); - amrex::Abort(); -#else - std::stringstream ss; - ss << " (" << i << "," << j << "," << k << ") is out of bound (" - << begin[0] << ":" << end[0]-1 - << "," << begin[1] << ":" << end[1]-1 - << "," << begin[2] << ":" << end[2]-1 << ")"; - amrex::Abort(ss.str()); -#endif + AMREX_IF_ON_DEVICE(( + AMREX_DEVICE_PRINTF(" (%d,%d,%d) is out of bound 
(%d:%d,%d:%d,%d:%d)\n", + i, j, k, begin[0], end[0]-1, begin[1], end[1]-1, + begin[2], end[2]-1); + amrex::Abort(); + )) + AMREX_IF_ON_HOST(( + std::stringstream ss; + ss << " (" << i << "," << j << "," << k << ") is out of bound (" + << begin[0] << ":" << end[0]-1 + << "," << begin[1] << ":" << end[1]-1 + << "," << begin[2] << ":" << end[2]-1 << ")"; + amrex::Abort(ss.str()); + )) } } #endif @@ -262,20 +265,21 @@ struct Table4D j < begin[1] || j >= end[1] || k < begin[2] || k >= end[2] || n < begin[3] || n >= end[3]) { -#if AMREX_DEVICE_COMPILE - AMREX_DEVICE_PRINTF(" (%d,%d,%d,%d) is out of bound (%d:%d,%d:%d,%d:%d,%d:%d)\n", - i, j, k, n, begin[0], end[0]-1, begin[1], end[1]-1, - begin[2], end[2]-1, begin[3], end[3]-1); - amrex::Abort(); -#else - std::stringstream ss; - ss << " (" << i << "," << j << "," << k << "," << n << ") is out of bound (" - << begin[0] << ":" << end[0]-1 - << "," << begin[1] << ":" << end[1]-1 - << "," << begin[2] << ":" << end[2]-1 - << "," << begin[3] << ":" << end[3]-1 << ")"; - amrex::Abort(ss.str()); -#endif + AMREX_IF_ON_DEVICE(( + AMREX_DEVICE_PRINTF(" (%d,%d,%d,%d) is out of bound (%d:%d,%d:%d,%d:%d,%d:%d)\n", + i, j, k, n, begin[0], end[0]-1, begin[1], end[1]-1, + begin[2], end[2]-1, begin[3], end[3]-1); + amrex::Abort(); + )) + AMREX_IF_ON_HOST(( + std::stringstream ss; + ss << " (" << i << "," << j << "," << k << "," << n << ") is out of bound (" + << begin[0] << ":" << end[0]-1 + << "," << begin[1] << ":" << end[1]-1 + << "," << begin[2] << ":" << end[2]-1 + << "," << begin[3] << ":" << end[3]-1 << ")"; + amrex::Abort(ss.str()); + )) } } #endif diff --git a/Src/Base/Parser/AMReX_IParser.H b/Src/Base/Parser/AMReX_IParser.H index 69f40252b0c..025da853c28 100644 --- a/Src/Base/Parser/AMReX_IParser.H +++ b/Src/Base/Parser/AMReX_IParser.H @@ -20,11 +20,8 @@ struct IParserExecutor [[nodiscard]] AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE int operator() () const noexcept { -#if AMREX_DEVICE_COMPILE - return 
iparser_exe_eval(m_device_executor, nullptr); -#else - return iparser_exe_eval(m_host_executor, nullptr); -#endif + AMREX_IF_ON_DEVICE((return iparser_exe_eval(m_device_executor, nullptr);)) + AMREX_IF_ON_HOST((return iparser_exe_eval(m_host_executor, nullptr);)) } template @@ -33,30 +30,21 @@ struct IParserExecutor operator() (Ts... var) const noexcept { amrex::GpuArray l_var{var...}; -#if AMREX_DEVICE_COMPILE - return iparser_exe_eval(m_device_executor, l_var.data()); -#else - return iparser_exe_eval(m_host_executor, l_var.data()); -#endif + AMREX_IF_ON_DEVICE((return iparser_exe_eval(m_device_executor, l_var.data());)) + AMREX_IF_ON_HOST((return iparser_exe_eval(m_host_executor, l_var.data());)) } [[nodiscard]] AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE int operator() (GpuArray const& var) const noexcept { -#if AMREX_DEVICE_COMPILE - return iparser_exe_eval(m_device_executor, var.data()); -#else - return iparser_exe_eval(m_host_executor, var.data()); -#endif + AMREX_IF_ON_DEVICE((return iparser_exe_eval(m_device_executor, var.data());)) + AMREX_IF_ON_HOST((return iparser_exe_eval(m_host_executor, var.data());)) } [[nodiscard]] AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE explicit operator bool () const { -#if AMREX_DEVICE_COMPILE - return m_device_executor != nullptr; -#else - return m_host_executor != nullptr; -#endif + AMREX_IF_ON_DEVICE((return m_device_executor != nullptr;)) + AMREX_IF_ON_HOST((return m_host_executor != nullptr;)) } char* m_host_executor = nullptr; diff --git a/Src/Base/Parser/AMReX_Parser.H b/Src/Base/Parser/AMReX_Parser.H index b74de941950..456910f8736 100644 --- a/Src/Base/Parser/AMReX_Parser.H +++ b/Src/Base/Parser/AMReX_Parser.H @@ -21,11 +21,8 @@ struct ParserExecutor [[nodiscard]] AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE double operator() () const noexcept { -#if AMREX_DEVICE_COMPILE - return parser_exe_eval(m_device_executor, nullptr); -#else - return parser_exe_eval(m_host_executor, nullptr); -#endif + AMREX_IF_ON_DEVICE((return 
parser_exe_eval(m_device_executor, nullptr);)) + AMREX_IF_ON_HOST((return parser_exe_eval(m_host_executor, nullptr);)) } template @@ -34,11 +31,8 @@ struct ParserExecutor operator() (Ts... var) const noexcept { amrex::GpuArray l_var{var...}; -#if AMREX_DEVICE_COMPILE - return parser_exe_eval(m_device_executor, l_var.data()); -#else - return parser_exe_eval(m_host_executor, l_var.data()); -#endif + AMREX_IF_ON_DEVICE((return parser_exe_eval(m_device_executor, l_var.data());)) + AMREX_IF_ON_HOST((return parser_exe_eval(m_host_executor, l_var.data());)) } template @@ -47,30 +41,21 @@ struct ParserExecutor operator() (Ts... var) const noexcept { amrex::GpuArray l_var{var...}; -#if AMREX_DEVICE_COMPILE - return static_cast(parser_exe_eval(m_device_executor, l_var.data())); -#else - return static_cast(parser_exe_eval(m_host_executor, l_var.data())); -#endif + AMREX_IF_ON_DEVICE((return static_cast(parser_exe_eval(m_device_executor, l_var.data()));)) + AMREX_IF_ON_HOST((return static_cast(parser_exe_eval(m_host_executor, l_var.data()));)) } [[nodiscard]] AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE double operator() (GpuArray const& var) const noexcept { -#if AMREX_DEVICE_COMPILE - return parser_exe_eval(m_device_executor, var.data()); -#else - return parser_exe_eval(m_host_executor, var.data()); -#endif + AMREX_IF_ON_DEVICE((return parser_exe_eval(m_device_executor, var.data());)) + AMREX_IF_ON_HOST((return parser_exe_eval(m_host_executor, var.data());)) } AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE explicit operator bool () const { -#if AMREX_DEVICE_COMPILE - return m_device_executor != nullptr; -#else - return m_host_executor != nullptr; -#endif + AMREX_IF_ON_DEVICE((return m_device_executor != nullptr;)) + AMREX_IF_ON_HOST((return m_host_executor != nullptr;)) } char* m_host_executor = nullptr; diff --git a/Src/Base/Parser/AMReX_Parser_Y.H b/Src/Base/Parser/AMReX_Parser_Y.H index 792f796554e..e84cf9e0d59 100644 --- a/Src/Base/Parser/AMReX_Parser_Y.H +++ 
b/Src/Base/Parser/AMReX_Parser_Y.H @@ -350,7 +350,7 @@ template AMREX_GPU_HOST_DEVICE AMREX_NO_INLINE T parser_math_comp_ellint_1 (T a) { -#if defined(__GNUC__) && !defined(__clang__) && !defined(__CUDA_ARCH__) +#if defined(__GNUC__) && !defined(__clang__) && !defined(__CUDA_ARCH__) && !defined(__NVCOMPILER) return std::comp_ellint_1(a); #else amrex::ignore_unused(a); @@ -363,7 +363,7 @@ template AMREX_GPU_HOST_DEVICE AMREX_NO_INLINE T parser_math_comp_ellint_2 (T a) { -#if defined(__GNUC__) && !defined(__clang__) && !defined(__CUDA_ARCH__) +#if defined(__GNUC__) && !defined(__clang__) && !defined(__CUDA_ARCH__) && !defined(__NVCOMPILER) return std::comp_ellint_2(a); #else amrex::ignore_unused(a); diff --git a/Src/EB/AMReX_EB2_GeometryShop.H b/Src/EB/AMReX_EB2_GeometryShop.H index ee353c13952..33931b28c5f 100644 --- a/Src/EB/AMReX_EB2_GeometryShop.H +++ b/Src/EB/AMReX_EB2_GeometryShop.H @@ -28,13 +28,12 @@ AMREX_GPU_HOST_DEVICE Real IF_f (F const& f, GpuArray const& p) noexcept { -#if AMREX_DEVICE_COMPILE - amrex::ignore_unused(f,p); - amrex::Error("EB2::GeometryShop: how did this happen?"); - return 0.0; -#else - return f({AMREX_D_DECL(p[0],p[1],p[2])}); -#endif + AMREX_IF_ON_DEVICE(( + amrex::ignore_unused(f,p); + amrex::Error("EB2::GeometryShop: how did this happen?"); + return 0.0; + )) + AMREX_IF_ON_HOST((return f({AMREX_D_DECL(p[0],p[1],p[2])});)) } template From 6e2b831245f6fdac9a714c64417bd5ab21fb613d Mon Sep 17 00:00:00 2001 From: Edward Basso Date: Mon, 13 Nov 2023 15:00:36 -0800 Subject: [PATCH 09/14] solve_bicgstab: use linop.make instead of MF constructor (#3619) ## Summary This PR replaces the explicit use of MF constructors in ```MLCGSolverT::solve_bicgstab``` with calls to the `make` method of the linear operator associated with the MLCGSolverT object. 
## Additional background The use of `MLLinOpT::make` allows for inheritance of MLCGSolverT without an override of `solve_bicgstab` even if the MF class lacks a constructor with the same arguments as those MultiFab. For the MLMG template classes, `make` should generally be used instead of explicit MF constructors. Another PR to change this in `solve_cg` will follow once this is fully vetted and approved. --- Src/LinearSolvers/MLMG/AMReX_MLCGSolver.H | 22 +++++++++------------- 1 file changed, 9 insertions(+), 13 deletions(-) diff --git a/Src/LinearSolvers/MLMG/AMReX_MLCGSolver.H b/Src/LinearSolvers/MLMG/AMReX_MLCGSolver.H index 3764fa38f8a..fce7b1d5005 100644 --- a/Src/LinearSolvers/MLMG/AMReX_MLCGSolver.H +++ b/Src/LinearSolvers/MLMG/AMReX_MLCGSolver.H @@ -90,22 +90,18 @@ MLCGSolverT::solve_bicgstab (MF& sol, const MF& rhs, RT eps_rel, RT eps_abs) const int ncomp = sol.nComp(); - const BoxArray& ba = sol.boxArray(); - const DistributionMapping& dm = sol.DistributionMap(); - const auto& factory = sol.Factory(); - - MF ph(ba, dm, ncomp, sol.nGrowVect(), MFInfo(), factory); - MF sh(ba, dm, ncomp, sol.nGrowVect(), MFInfo(), factory); + MF ph = Lp.make(amrlev, mglev, sol.nGrowVect()); + MF sh = Lp.make(amrlev, mglev, sol.nGrowVect()); ph.setVal(RT(0.0)); sh.setVal(RT(0.0)); - MF sorig(ba, dm, ncomp, nghost, MFInfo(), factory); - MF p (ba, dm, ncomp, nghost, MFInfo(), factory); - MF r (ba, dm, ncomp, nghost, MFInfo(), factory); - MF s (ba, dm, ncomp, nghost, MFInfo(), factory); - MF rh (ba, dm, ncomp, nghost, MFInfo(), factory); - MF v (ba, dm, ncomp, nghost, MFInfo(), factory); - MF t (ba, dm, ncomp, nghost, MFInfo(), factory); + MF sorig = Lp.make(amrlev, mglev, nghost); + MF p = Lp.make(amrlev, mglev, nghost); + MF r = Lp.make(amrlev, mglev, nghost); + MF s = Lp.make(amrlev, mglev, nghost); + MF rh = Lp.make(amrlev, mglev, nghost); + MF v = Lp.make(amrlev, mglev, nghost); + MF t = Lp.make(amrlev, mglev, nghost); Lp.correctionResidual(amrlev, mglev, r, sol, rhs, 
MLLinOpT::BCMode::Homogeneous); From fa3743fd1fdd5e3f1b12d431b0f6bb4b15bb7b95 Mon Sep 17 00:00:00 2001 From: Weiqun Zhang Date: Mon, 13 Nov 2023 15:00:56 -0800 Subject: [PATCH 10/14] CArena: shrink_in_place and operator<< (#3621) ## Summary Implement CArena::shrink_in_place, which is used by PODVector::shrink_to_fit. It avoids a new memory allocation and data movement. Add operator<< to CArena. This helps debugging. ## Additional background Follow-up on #3426. ## Checklist The proposed changes: - [ ] fix a bug or incorrect behavior in AMReX - [x] add new capabilities to AMReX - [ ] changes answers in the test suite to more than roundoff level - [ ] are likely to significantly affect the results of downstream AMReX users - [ ] include documentation in the code and/or rst files, if appropriate --- Src/Base/AMReX_CArena.H | 19 ++++---- Src/Base/AMReX_CArena.cpp | 96 ++++++++++++++++++++++++++++++++++++++- 2 files changed, 105 insertions(+), 10 deletions(-) diff --git a/Src/Base/AMReX_CArena.H b/Src/Base/AMReX_CArena.H index d68285bc878..163039df2ef 100644 --- a/Src/Base/AMReX_CArena.H +++ b/Src/Base/AMReX_CArena.H @@ -5,13 +5,14 @@ #include #include -#include -#include +#include +#include #include #include -#include -#include +#include #include +#include +#include namespace amrex { @@ -57,7 +58,7 @@ public: * Try to shrink in-place */ [[nodiscard]] void* - shrink_in_place (void* pt, std::size_t sz) final; + shrink_in_place (void* pt, std::size_t new_size) final; /** * \brief Free up allocated memory. Merge neighboring free memory chunks @@ -164,15 +165,15 @@ protected: MemStat* m_stat; }; + //! The list of blocks allocated via ::operator new(). + std::vector > m_alloc; + /** * \brief The type of our freelist and blocklist. * We use a set sorted from lo to hi memory addresses. */ using NL = std::set; - //! The list of blocks allocated via ::operator new(). - std::vector > m_alloc; - /** * \brief The free list of allocated but not currently used blocks. 
* Maintained in lo to hi memory sorted order. @@ -198,6 +199,8 @@ protected: std::mutex carena_mutex; + + friend std::ostream& operator<< (std::ostream& os, const CArena& arena); }; } diff --git a/Src/Base/AMReX_CArena.cpp b/Src/Base/AMReX_CArena.cpp index 6f7979d4750..c47f8f5ed26 100644 --- a/Src/Base/AMReX_CArena.cpp +++ b/Src/Base/AMReX_CArena.cpp @@ -14,6 +14,7 @@ namespace amrex { #include #include +#include namespace amrex { @@ -203,9 +204,61 @@ CArena::alloc_in_place (void* pt, std::size_t szmin, std::size_t szmax) } void* -CArena::shrink_in_place (void* /*pt*/, std::size_t sz) +CArena::shrink_in_place (void* pt, std::size_t new_size) { - return alloc(sz); // xxxxx TODO + if ((pt == nullptr) || (new_size == 0)) { return nullptr; } + + new_size = Arena::align(new_size); + + std::lock_guard lock(carena_mutex); + + auto busy_it = m_busylist.find(Node(pt,nullptr,0)); + if (busy_it == m_busylist.end()) { + amrex::Abort("CArena::shrink_in_place: unknown pointer"); + return nullptr; + } + AMREX_ASSERT(m_freelist.find(*busy_it) == m_freelist.end()); + + auto const old_size = busy_it->size(); + + if (new_size > old_size) { + amrex::Abort("CArena::shrink_in_place: wrong size. Cannot shrink to a larger size."); + return nullptr; + } else if (new_size == old_size) { + return pt; + } else { + auto const leftover_size = old_size - new_size; + + void* pt2 = static_cast(pt) + new_size; + Node new_free_node(pt2, busy_it->owner(), leftover_size); + + void* pt_end = static_cast(pt) + old_size; + auto free_it = m_freelist.find(Node(pt_end,nullptr,0)); + if ((free_it == m_freelist.end()) || ! new_free_node.coalescable(*free_it)) { + m_freelist.insert(free_it, new_free_node); + } else { + auto& node = const_cast(*free_it); + // This is safe because the free list is std::set and the + // modification of `block` does not change the order of elements + // in the container, even though Node's operator< uses block. 
+ node.block(pt2); + node.size(leftover_size + node.size()); + } + + const_cast(*busy_it).size(new_size); + + m_actually_used -= leftover_size; + +#ifdef AMREX_TINY_PROFILING + if (m_do_profiling) { + TinyProfiler::memory_free(old_size, busy_it->mem_stat()); + auto* stat = TinyProfiler::memory_alloc(new_size, m_profiling_stats); + const_cast(*busy_it).mem_stat(stat); + } +#endif + + return pt; + } } void @@ -439,4 +492,43 @@ CArena::PrintUsage (std::ostream& os, std::string const& name, std::string const << m_busylist.size() << " busy blocks, " << m_freelist.size() << " free blocks\n"; } +std::ostream& operator<< (std::ostream& os, const CArena& arena) +{ + os << "CArea:\n" + << " Hunk size: " << arena.m_hunk << "\n" + << " Memory allocated: " << arena.m_used << "\n" + << " Memory actually used: " << arena.m_actually_used << "\n"; + + if (arena.m_alloc.empty()) { + os << " No memory allocations\n"; + } else { + os << " List of memory alloations: (address, size)\n"; + for (auto const& a : arena.m_alloc) { + os << " " << a.first << ", " << a.second << "\n"; + } + } + + if (arena.m_freelist.empty()) { + os << " No free nodes\n"; + } else { + os << " List of free nodes: (address, owner, size)\n"; + for (auto const& a : arena.m_freelist) { + os << " " << a.block() << ", " << a.owner() << ", " + << a.size() << "\n"; + } + } + + if (arena.m_busylist.empty()) { + os << " No busy nodes\n"; + } else { + os << " List of busy nodes: (address, owner, size)\n"; + for (auto const& a : arena.m_busylist) { + os << " " << a.block() << ", " << a.owner() << ", " + << a.size() << "\n"; + } + } + + return os; +} + } From b7408ea6e8feca7eab2f7cf30606d066b6699814 Mon Sep 17 00:00:00 2001 From: Edward Basso Date: Mon, 13 Nov 2023 18:56:02 -0800 Subject: [PATCH 11/14] solve_cg: use linop.make instead of MF constructor (#3627) ## Summary This PR replaces the explicit use of `MF` constructors in `MLCGSolverT::solve_cg` with calls to the make method of the linear operator associated with the 
`MLCGSolverT` object. ## Additional background This is a similar to the PR on `solve_bicgstab`. --- Src/LinearSolvers/MLMG/AMReX_MLCGSolver.H | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) diff --git a/Src/LinearSolvers/MLMG/AMReX_MLCGSolver.H b/Src/LinearSolvers/MLMG/AMReX_MLCGSolver.H index fce7b1d5005..c99d7b319bd 100644 --- a/Src/LinearSolvers/MLMG/AMReX_MLCGSolver.H +++ b/Src/LinearSolvers/MLMG/AMReX_MLCGSolver.H @@ -261,17 +261,13 @@ MLCGSolverT::solve_cg (MF& sol, const MF& rhs, RT eps_rel, RT eps_abs) const int ncomp = sol.nComp(); - const BoxArray& ba = sol.boxArray(); - const DistributionMapping& dm = sol.DistributionMap(); - const auto& factory = sol.Factory(); - - MF p(ba, dm, ncomp, sol.nGrowVect(), MFInfo(), factory); + MF p = Lp.make(amrlev, mglev, sol.nGrowVect()); p.setVal(RT(0.0)); - MF sorig(ba, dm, ncomp, nghost, MFInfo(), factory); - MF r (ba, dm, ncomp, nghost, MFInfo(), factory); - MF z (ba, dm, ncomp, nghost, MFInfo(), factory); - MF q (ba, dm, ncomp, nghost, MFInfo(), factory); + MF sorig = Lp.make(amrlev, mglev, nghost); + MF r = Lp.make(amrlev, mglev, nghost); + MF z = Lp.make(amrlev, mglev, nghost); + MF q = Lp.make(amrlev, mglev, nghost); sorig.LocalCopy(sol,0,0,ncomp,nghost); From af1e1be8d7d41de9b999b4973b2024281a28f23e Mon Sep 17 00:00:00 2001 From: Weiqun Zhang Date: Tue, 14 Nov 2023 11:13:47 -0800 Subject: [PATCH 12/14] Plotfile Tools: GPU support (#3626) ## Summary Add GPU support for fcompare, fextrema, fsnapshot and fvolumesum. 
## Checklist The proposed changes: - [x] fix a bug or incorrect behavior in AMReX - [x] add new capabilities to AMReX - [ ] changes answers in the test suite to more than roundoff level - [ ] are likely to significantly affect the results of downstream AMReX users - [ ] include documentation in the code and/or rst files, if appropriate --- Src/Base/AMReX_PlotFileDataImpl.cpp | 2 +- Tools/Plotfile/fextrema.cpp | 35 ++++---- Tools/Plotfile/fsnapshot.cpp | 4 +- Tools/Plotfile/fvolumesum.cpp | 130 ++++++++++------------------ 4 files changed, 66 insertions(+), 105 deletions(-) diff --git a/Src/Base/AMReX_PlotFileDataImpl.cpp b/Src/Base/AMReX_PlotFileDataImpl.cpp index 1fbf5044a50..b85c17ad93c 100644 --- a/Src/Base/AMReX_PlotFileDataImpl.cpp +++ b/Src/Base/AMReX_PlotFileDataImpl.cpp @@ -141,7 +141,7 @@ PlotFileDataImpl::get (int level, std::string const& varname) noexcept int gid = mfi.index(); FArrayBox& dstfab = mf[mfi]; std::unique_ptr srcfab(m_vismf[level]->readFAB(gid, icomp)); - dstfab.copy(*srcfab); + dstfab.copy(*srcfab); } } return mf; diff --git a/Tools/Plotfile/fextrema.cpp b/Tools/Plotfile/fextrema.cpp index 44cfaf161d4..55596bf952d 100644 --- a/Tools/Plotfile/fextrema.cpp +++ b/Tools/Plotfile/fextrema.cpp @@ -1,5 +1,6 @@ #include #include +#include #include #include #include @@ -80,23 +81,23 @@ void main_main() pf.boxArray(ilev+1), ratio); for (int ivar = 0; ivar < var_names.size(); ++ivar) { const MultiFab& mf = pf.get(ilev, var_names[ivar]); - for (MFIter mfi(mf); mfi.isValid(); ++mfi) { - const Box& bx = mfi.validbox(); - const auto lo = amrex::lbound(bx); - const auto hi = amrex::ubound(bx); - const auto& ifab = mask.array(mfi); - const auto& fab = mf.array(mfi); - for (int k = lo.z; k <= hi.z; ++k) { - for (int j = lo.y; j <= hi.y; ++j) { - for (int i = lo.x; i <= hi.x; ++i) { - if (ifab(i,j,k) == 0) { - vvmin[ivar] = std::min(fab(i,j,k),vvmin[ivar]); - vvmax[ivar] = std::max(fab(i,j,k),vvmax[ivar]); - } - } - } - } - } + auto const& ma = 
mf.const_arrays(); + auto const& ima = mask.const_arrays(); + auto rr = ParReduce(TypeList{}, + TypeList{}, mf, + [=] AMREX_GPU_DEVICE (int bno, int i, int j, int k) + -> GpuTuple + { + if (ima[bno](i,j,k) == 0) { + auto x = ma[bno](i,j,k); + return {x,x}; + } else { + return {std::numeric_limits::max(), + std::numeric_limits::lowest()}; + } + }); + vvmin[ivar] = std::min(amrex::get<0>(rr), vvmin[ivar]); + vvmax[ivar] = std::max(amrex::get<1>(rr), vvmax[ivar]); } } } diff --git a/Tools/Plotfile/fsnapshot.cpp b/Tools/Plotfile/fsnapshot.cpp index e68f8a33b6d..c4b9fd3f361 100644 --- a/Tools/Plotfile/fsnapshot.cpp +++ b/Tools/Plotfile/fsnapshot.cpp @@ -278,7 +278,7 @@ void main_main() gmx = std::log10(gmx); } - BaseFab intdat; + BaseFab intdat(The_Pinned_Arena()); for (int idir = ndir_begin; idir < ndir_end; ++idir) { intdat.resize(finebox[idir],1); const int width = (idir == 0) ? finebox[idir].length(1) : finebox[idir].length(0); @@ -286,7 +286,7 @@ void main_main() const auto& intarr = intdat.array(); const auto& realarr = datamf[idir].array(0); Real fac = Real(253.999) / (gmx-gmn); - amrex::LoopOnCpu(finebox[idir], [=] (int i, int j, int k) + amrex::ParallelFor(finebox[idir], [=] AMREX_GPU_DEVICE (int i, int j, int k) { int jj = (idir == 2) ? height - 1 - j : j; // flip the data in second image direction int kk = (idir == 2) ? 
k : height - 1 - k; diff --git a/Tools/Plotfile/fvolumesum.cpp b/Tools/Plotfile/fvolumesum.cpp index 2f9f03cc522..ec6e461dcc7 100644 --- a/Tools/Plotfile/fvolumesum.cpp +++ b/Tools/Plotfile/fvolumesum.cpp @@ -2,6 +2,7 @@ #include #include #include +#include #include #include #include @@ -80,7 +81,6 @@ void main_main() const int dim = pf.spaceDim(); - int fine_level = pf.finestLevel(); Vector pos; @@ -95,6 +95,35 @@ void main_main() Array dx = pf.cellSize(ilev); + Real volfac = AMREX_D_TERM(dx[0], *dx[1], *dx[2]); + + if (coord == 1) { + AMREX_ALWAYS_ASSERT(AMREX_SPACEDIM == 2); + // axisymmetric V = pi (r_r**2 - r_l**2) * dz + // = pi dr * dz * (r_r + r_l) + // = 2 pi r dr dz + volfac *= 2 * pi; // 2 * pi * dr * dz part here + } else if (coord == 2) { + AMREX_ALWAYS_ASSERT(AMREX_SPACEDIM == 1); + // 1-d spherical V = 4/3 pi (r_r**3 - r_l**3) + volfac *= (4.0_rt/3.0_rt) * pi; // 4/3 * pi * dr part here + } + + auto xlo = problo[0]; + auto dx0 = dx[0]; + AMREX_ASSERT(coord == 0 || coord == 1 || coord == 2); + auto f_vol = [=] AMREX_GPU_DEVICE (int i) { + if (coord == 0) { + return volfac; + } else if (coord == 1) { + return volfac * (xlo + (Real(i)+0.5_rt)*dx0); + } else { + Real r_r = xlo + Real(i+1)*dx0; + Real r_l = xlo + Real(i )*dx0; + return volfac * (r_r*r_r + r_l*r_r + r_l*r_l); + } + }; + if (ilev < fine_level) { IntVect ratio{pf.refRatio(ilev)}; for (int idim = dim; idim < AMREX_SPACEDIM; ++idim) { @@ -103,97 +132,28 @@ void main_main() const iMultiFab mask = makeFineMask(pf.boxArray(ilev), pf.DistributionMap(ilev), pf.boxArray(ilev+1), ratio); const MultiFab& mf = pf.get(ilev, var_name); - for (MFIter mfi(mf); mfi.isValid(); ++mfi) { - const Box& bx = mfi.validbox(); - if (bx.ok()) { - const auto& m = mask.array(mfi); - const auto& fab = mf.array(mfi); - const auto lo = amrex::lbound(bx); - const auto hi = amrex::ubound(bx); - for (int k = lo.z; k <= hi.z; ++k) { - for (int j = lo.y; j <= hi.y; ++j) { - for (int i = lo.x; i <= hi.x; ++i) { - if (m(i,j,k) 
== 0) { // not covered by fine - Array p - = {AMREX_D_DECL(problo[0]+static_cast(i+0.5)*dx[0], - problo[1]+static_cast(j+0.5)*dx[1], - problo[2]+static_cast(k+0.5)*dx[2])}; - - // compute the volume - Real vol = std::numeric_limits::quiet_NaN(); - if (coord == 0) { - // Cartesian - vol = 1.0_rt; - for (int idim = 0; idim < AMREX_SPACEDIM; ++idim) { - vol *= dx[idim]; - } - } else if (coord == 1) { - // axisymmetric V = pi (r_r**2 - r_l**2) * dz - // = pi dr * dz * (r_r + r_l) - // = 2 pi r dr dz - vol = 2 * pi * p[0] * dx[0] * dx[1]; - } else if (coord == 2) { - // 1-d spherical V = 4/3 pi (r_r**3 - r_l**3) - Real r_r = problo[0]+static_cast(i+1)*dx[0]; - Real r_l = problo[0]+static_cast(i)*dx[0]; - vol = (4.0_rt/3.0_rt) * pi * dx[0] * (r_r*r_r + r_l*r_r + r_l*r_l); - } - - lsum += fab(i,j,k) * vol; - } - } - } - } - } - } + auto const& ima = mask.const_arrays(); + auto const& ma = mf.const_arrays(); + lsum += ParReduce(TypeList{}, TypeList{}, mf, + [=] AMREX_GPU_DEVICE (int bno, int i, int j, int k) + -> GpuTuple + { + return { (ima[bno](i,j,k) == 0) ? 
ma[bno](i,j,k)*f_vol(i) : 0._rt }; + }); } else { const MultiFab& mf = pf.get(ilev, var_name); - for (MFIter mfi(mf); mfi.isValid(); ++mfi) { - const Box& bx = mfi.validbox(); - if (bx.ok()) { - const auto& fab = mf.array(mfi); - const auto lo = amrex::lbound(bx); - const auto hi = amrex::ubound(bx); - for (int k = lo.z; k <= hi.z; ++k) { - for (int j = lo.y; j <= hi.y; ++j) { - for (int i = lo.x; i <= hi.x; ++i) { - Array p - = {AMREX_D_DECL(problo[0]+static_cast(i+0.5)*dx[0], - problo[1]+static_cast(j+0.5)*dx[1], - problo[2]+static_cast(k+0.5)*dx[2])}; - - // compute the volume - Real vol = std::numeric_limits::quiet_NaN(); - if (coord == 0) { - // Cartesian - vol = 1.0_rt; - for (int idim = 0; idim < AMREX_SPACEDIM; ++idim) { - vol *= dx[idim]; - } - } else if (coord == 1) { - // axisymmetric V = pi (r_r**2 - r_l**2) * dz - // = pi dr * dz * (r_r + r_l) - // = 2 pi r dr dz - vol = 2 * pi * p[0] * dx[0] * dx[1]; - } else if (coord == 2) { - // 1-d spherical V = 4/3 pi (r_r**3 - r_l**3) - Real r_r = problo[0]+static_cast(i+1)*dx[0]; - Real r_l = problo[0]+static_cast(i)*dx[0]; - vol = (4.0_rt/3.0_rt) * pi * dx[0] * (r_r*r_r + r_l*r_r + r_l*r_l); - } - - lsum += fab(i,j,k) * vol; - } - } - } - } - } + auto const& ma = mf.const_arrays(); + lsum += ParReduce(TypeList{}, TypeList{}, mf, + [=] AMREX_GPU_DEVICE (int bno, int i, int j, int k) + -> GpuTuple + { + return { ma[bno](i,j,k)*f_vol(i) }; + }); } } ParallelDescriptor::ReduceRealSum(lsum); - if (ParallelDescriptor::IOProcessor()) { std::cout << "integral of " << var_name << " = " << lsum << std::endl; From 15a0bb9a8c1b34136632b16c5511375e9d56b184 Mon Sep 17 00:00:00 2001 From: Weiqun Zhang Date: Tue, 14 Nov 2023 19:36:40 -0800 Subject: [PATCH 13/14] SYCL: Use get_multi_ptr instead of get_pointer (#3630) The latter has been deprecated in SYCL 2020. 
--- .github/workflows/intel.yml | 3 +-- Src/Base/AMReX_GpuLaunchFunctsG.H | 10 +++++----- 2 files changed, 6 insertions(+), 7 deletions(-) diff --git a/.github/workflows/intel.yml b/.github/workflows/intel.yml index 6133e666fad..d86035d916e 100644 --- a/.github/workflows/intel.yml +++ b/.github/workflows/intel.yml @@ -226,10 +226,9 @@ jobs: -DCMAKE_VERBOSE_MAKEFILE=ON \ -DAMReX_EB=ON \ -DAMReX_ENABLE_TESTS=ON \ - -DAMReX_FORTRAN=ON \ + -DAMReX_FORTRAN=OFF \ -DCMAKE_C_COMPILER=$(which icc) \ -DCMAKE_CXX_COMPILER=$(which icpc) \ - -DCMAKE_Fortran_COMPILER=$(which ifort) \ -DCMAKE_CXX_COMPILER_LAUNCHER=ccache cmake --build build --parallel 2 cmake --build build --target install diff --git a/Src/Base/AMReX_GpuLaunchFunctsG.H b/Src/Base/AMReX_GpuLaunchFunctsG.H index aea0c030152..78e9e856535 100644 --- a/Src/Base/AMReX_GpuLaunchFunctsG.H +++ b/Src/Base/AMReX_GpuLaunchFunctsG.H @@ -36,7 +36,7 @@ void launch (int nblocks, int nthreads_per_block, std::size_t shared_mem_bytes, [=] (sycl::nd_item<1> item) [[sycl::reqd_sub_group_size(Gpu::Device::warp_size)]] { - f(Gpu::Handler{&item,shared_data.get_pointer()}); + f(Gpu::Handler{&item,shared_data.get_multi_ptr().get()}); }); }); } catch (sycl::exception const& ex) { @@ -82,7 +82,7 @@ void launch (int nblocks, std::size_t shared_mem_bytes, gpuStream_t stream, [[sycl::reqd_work_group_size(1,1,MT)]] [[sycl::reqd_sub_group_size(Gpu::Device::warp_size)]] { - f(Gpu::Handler{&item,shared_data.get_pointer()}); + f(Gpu::Handler{&item,shared_data.get_multi_ptr().get()}); }); }); } catch (sycl::exception const& ex) { @@ -210,7 +210,7 @@ void ParallelFor (Gpu::KernelInfo const& info, T n, L&& f) noexcept i < n; i += stride) { int n_active_threads = amrex::min(n-i+(T)item.get_local_id(0), (T)item.get_local_range(0)); - detail::call_f(f, i, Gpu::Handler{&item, shared_data.get_pointer(), + detail::call_f(f, i, Gpu::Handler{&item, shared_data.get_multi_ptr().get(), n_active_threads}); } }); @@ -269,7 +269,7 @@ void ParallelFor 
(Gpu::KernelInfo const& info, Box const& box, L&& f) noexcept k += lo.z; int n_active_threads = amrex::min(ncells-icell+(int)item.get_local_id(0), (int)item.get_local_range(0)); - detail::call_f(f, i, j, k, Gpu::Handler{&item, shared_data.get_pointer(), + detail::call_f(f, i, j, k, Gpu::Handler{&item, shared_data.get_multi_ptr().get(), n_active_threads}); } }); @@ -335,7 +335,7 @@ void ParallelFor (Gpu::KernelInfo const& info, Box const& box, T ncomp, L&& f) n int n_active_threads = amrex::min(ncells-icell+(int)item.get_local_id(0), (int)item.get_local_range(0)); detail::call_f(f, i, j, k, ncomp, - Gpu::Handler{&item, shared_data.get_pointer(), + Gpu::Handler{&item, shared_data.get_multi_ptr().get(), n_active_threads}); } }); From a9da2a54de5d704dda746c8b036de469ea19f4e9 Mon Sep 17 00:00:00 2001 From: Weiqun Zhang Date: Wed, 15 Nov 2023 12:56:44 -0800 Subject: [PATCH 14/14] SUNDIALS: Use sunrealtype instead of realtype (#3632) The latter has been deprecated. --- Src/Extern/SUNDIALS/AMReX_Sundials.H | 4 +- .../SUNDIALS/AMReX_SundialsIntegrator.H | 44 +++++++++---------- 2 files changed, 24 insertions(+), 24 deletions(-) diff --git a/Src/Extern/SUNDIALS/AMReX_Sundials.H b/Src/Extern/SUNDIALS/AMReX_Sundials.H index 7bd09795d50..141291e4a21 100644 --- a/Src/Extern/SUNDIALS/AMReX_Sundials.H +++ b/Src/Extern/SUNDIALS/AMReX_Sundials.H @@ -6,7 +6,7 @@ #include #include -static_assert(std::is_same::value, - "amrex::Real must be the same as SUNDIALS realtype"); +static_assert(std::is_same::value, + "amrex::Real must be the same as SUNDIALS sunrealtype"); #endif diff --git a/Src/Extern/SUNDIALS/AMReX_SundialsIntegrator.H b/Src/Extern/SUNDIALS/AMReX_SundialsIntegrator.H index 2623f832590..8066e9e8840 100644 --- a/Src/Extern/SUNDIALS/AMReX_SundialsIntegrator.H +++ b/Src/Extern/SUNDIALS/AMReX_SundialsIntegrator.H @@ -15,48 +15,48 @@ #include /* access to SPGMR SUNLinearSolver */ #include /* access to SPGMR SUNLinearSolver */ #include /* access to FixedPoint SUNNonlinearSolver */ 
-#include /* defs. of realtype, sunindextype, etc */ +#include /* defs. of sunrealtype, sunindextype, etc */ namespace amrex { struct SundialsUserData { - std::function f0; - std::function f_fast; - std::function f; - /* std::function StoreStage; */ - std::function ProcessStage; - std::function PostStoreStage; + std::function f0; + std::function f_fast; + std::function f; + /* std::function StoreStage; */ + std::function ProcessStage; + std::function PostStoreStage; }; namespace SundialsUserFun { - static int f0 (realtype t, N_Vector y, N_Vector ydot, void *user_data) { + static int f0 (sunrealtype t, N_Vector y, N_Vector ydot, void *user_data) { SundialsUserData* udata = static_cast(user_data); return udata->f0(t, y, ydot, user_data); } - static int f_fast (realtype t, N_Vector y_data, N_Vector y_rhs, void *user_data) { + static int f_fast (sunrealtype t, N_Vector y_data, N_Vector y_rhs, void *user_data) { SundialsUserData* udata = static_cast(user_data); return udata->f_fast(t, y_data, y_rhs, user_data); } - static int f (realtype t, N_Vector y_data, N_Vector y_rhs, void *user_data) { + static int f (sunrealtype t, N_Vector y_data, N_Vector y_rhs, void *user_data) { SundialsUserData* udata = static_cast(user_data); return udata->f(t, y_data, y_rhs, user_data); } /* - static int StoreStage (realtype t, N_Vector* f_data, int nvecs, void *user_data) { + static int StoreStage (sunrealtype t, N_Vector* f_data, int nvecs, void *user_data) { SundialsUserData* udata = static_cast(user_data); return udata->StoreStage(t, f_data, nvecs, user_data); } */ - static int ProcessStage (realtype t, N_Vector y_data, void *user_data) { + static int ProcessStage (sunrealtype t, N_Vector y_data, void *user_data) { SundialsUserData* udata = static_cast(user_data); return udata->ProcessStage(t, y_data, user_data); } - static int PostStoreStage(realtype t, N_Vector y_data, void *user_data) { + static int PostStoreStage(sunrealtype t, N_Vector y_data, void *user_data) { SundialsUserData* 
udata = static_cast(user_data); return udata->PostStoreStage(t, y_data, user_data); } @@ -245,7 +245,7 @@ public: /* Begin Section: SUNDIALS FUNCTION HOOKS */ /* f routine to compute the ODE RHS function f(t,y). */ - udata.f = [&](realtype rhs_time, N_Vector y_data, N_Vector y_rhs, void * /* user_data */) -> int { + udata.f = [&](sunrealtype rhs_time, N_Vector y_data, N_Vector y_rhs, void * /* user_data */) -> int { amrex::Vector S_data; amrex::Vector S_rhs; @@ -265,7 +265,7 @@ public: return 0; }; - udata.ProcessStage = [&](realtype rhs_time, N_Vector y_data, void * /* user_data */) -> int { + udata.ProcessStage = [&](sunrealtype rhs_time, N_Vector y_data, void * /* user_data */) -> int { amrex::Vector S_data; const int num_vecs = N_VGetNumSubvectors_ManyVector(y_data); @@ -421,14 +421,14 @@ public: /* Begin Section: SUNDIALS FUNCTION HOOKS */ /* f0 routine to compute a zero-valued ODE RHS function f(t,y). */ - udata.f0 = [&](realtype /* rhs_time */, N_Vector /* y */, N_Vector ydot, void * /* user_data */) -> int { + udata.f0 = [&](sunrealtype /* rhs_time */, N_Vector /* y */, N_Vector ydot, void * /* user_data */) -> int { // Initialize ydot to zero and return N_VConst(0.0, ydot); return 0; }; /* f routine to compute the ODE RHS function f(t,y). */ - udata.f_fast = [&](realtype rhs_time, N_Vector y_data, N_Vector y_rhs, void * /* user_data */) -> int { + udata.f_fast = [&](sunrealtype rhs_time, N_Vector y_data, N_Vector y_rhs, void * /* user_data */) -> int { amrex::Vector S_data; amrex::Vector S_rhs; amrex::Vector S_stage_data; @@ -456,7 +456,7 @@ public: }; /* f routine to compute the ODE RHS function f(t,y). 
*/ - udata.f = [&](realtype rhs_time, N_Vector y_data, N_Vector y_rhs, void * /* user_data */) -> int { + udata.f = [&](sunrealtype rhs_time, N_Vector y_data, N_Vector y_rhs, void * /* user_data */) -> int { amrex::Vector S_data; amrex::Vector S_rhs; @@ -476,7 +476,7 @@ public: return 0; }; - udata.ProcessStage = [&](realtype rhs_time, N_Vector y_data, void * /* user_data */) -> int { + udata.ProcessStage = [&](sunrealtype rhs_time, N_Vector y_data, void * /* user_data */) -> int { amrex::Vector S_data; const int num_vecs = N_VGetNumSubvectors_ManyVector(y_data); @@ -492,7 +492,7 @@ public: return 0; }; - udata.PostStoreStage = [&](realtype rhs_time, N_Vector y_data, void *user_data) -> int { + udata.PostStoreStage = [&](sunrealtype rhs_time, N_Vector y_data, void *user_data) -> int { udata.ProcessStage(rhs_time, y_data, user_data); for(int i=0; i int { + udata.f = [&](sunrealtype rhs_time, N_Vector y_data, N_Vector y_rhs, void * /* user_data */) -> int { amrex::Vector S_data; amrex::Vector S_rhs; @@ -713,7 +713,7 @@ public: return 0; }; - udata.ProcessStage = [&](realtype rhs_time, N_Vector y_data, void * /* user_data */) -> int { + udata.ProcessStage = [&](sunrealtype rhs_time, N_Vector y_data, void * /* user_data */) -> int { amrex::Vector S_data; const int num_vecs = N_VGetNumSubvectors_ManyVector(y_data);