From be6c6415467d09da6109d27cfa218868abc1f9db Mon Sep 17 00:00:00 2001 From: Axel Huebl Date: Thu, 26 Oct 2023 09:50:01 -0700 Subject: [PATCH 01/14] `AMReXBuildInfo.cmake`: AMReX_DIR (#3609) ## Summary The `AMReX_DIR` points if set to the CMake module path root. The old logic did not work for me in a situation (ImpactX) where: - AMReX is pre-installed but - found through a superbuild of another transient lib (ABLASTR) ## Additional background Follow-up to #3599 ## Checklist The proposed changes: - [x] fix a bug or incorrect behavior in AMReX - [ ] add new capabilities to AMReX - [ ] changes answers in the test suite to more than roundoff level - [ ] are likely to significantly affect the results of downstream AMReX users - [ ] include documentation in the code and/or rst files, if appropriate --- Tools/CMake/AMReXBuildInfo.cmake | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Tools/CMake/AMReXBuildInfo.cmake b/Tools/CMake/AMReXBuildInfo.cmake index 9afbc2ceaf4..0b87f053c2e 100644 --- a/Tools/CMake/AMReXBuildInfo.cmake +++ b/Tools/CMake/AMReXBuildInfo.cmake @@ -38,7 +38,7 @@ include(AMReXTargetHelpers) # # Set paths # -if (AMReX_FOUND) +if (AMReX_DIR) # AMReX is pre-installed and used as a library if (WIN32) # see AMReXInstallHelpers.cmake string(REPLACE "/cmake/AMReXCMakeModules" "" @@ -56,7 +56,7 @@ else () endif () set(AMREX_TOP_DIR "${AMREX_TOP_DIR_DEFAULT}" CACHE INTERNAL "Top level AMReX directory") -if (AMReX_FOUND) +if (AMReX_DIR) # AMReX is pre-installed and used as a library set(AMREX_C_SCRIPTS_DIR "${AMREX_TOP_DIR}/share/amrex/C_scripts" CACHE INTERNAL "Path to AMReX' C_scripts dir") From 601cc4ee80e0d7435a5b099202459bbb1f16b7a6 Mon Sep 17 00:00:00 2001 From: Klaus Date: Tue, 31 Oct 2023 14:07:41 -0500 Subject: [PATCH 02/14] Give FlashFluxRegisters ways to accumulate data in registers (#3597) I introduce variant methods that allow adding to previously stored data in Flash(-X) flux registers, rather than just copying over the older 
data. For Flash-X'ers: Something like this is needed to get flux correction working in the nontelescoping variant of the current Spark Hydro implementation. ## Summary ## Additional background ## Checklist The proposed changes: - [ ] fix a bug or incorrect behavior in AMReX - [x] add new capabilities to AMReX - [ ] changes answers in the test suite to more than roundoff level - [ ] are likely to significantly affect the results of downstream AMReX users - [ ] include documentation in the code and/or rst files, if appropriate --------- Co-authored-by: Weiqun Zhang --- .../AmrCore/AMReX_FlashFluxRegister.H | 416 ++++++++++++++++++ .../AmrCore/AMReX_FlashFluxRegister.cpp | 280 +----------- .../AmrCore/AMReX_flash_fluxregister_fi.cpp | 35 ++ .../AmrCore/AMReX_flash_fluxregister_mod.F90 | 85 +++- 4 files changed, 544 insertions(+), 272 deletions(-) diff --git a/Src/F_Interfaces/AmrCore/AMReX_FlashFluxRegister.H b/Src/F_Interfaces/AmrCore/AMReX_FlashFluxRegister.H index d946d33fdbc..3756a4bd1f2 100644 --- a/Src/F_Interfaces/AmrCore/AMReX_FlashFluxRegister.H +++ b/Src/F_Interfaces/AmrCore/AMReX_FlashFluxRegister.H @@ -5,6 +5,7 @@ #include #include #include +#include #include @@ -42,6 +43,18 @@ public: void store (int fine_global_index, int dir, FArrayBox const& fine_flux, FArrayBox const& area, const int* isFluxDensity, Real sf); + // flux_in_register += scaling_factor * \sum{fine_flux} / (ref_ratio[0]*ref_ratio[1]*ref_ratio[2]) + void add (int fine_global_index, int dir, FArrayBox const& fine_flux, Real sf); + + // flux_in_register += scaling_factor * \sum{fine_flux * area} + void add (int fine_global_index, int dir, FArrayBox const& fine_flux, FArrayBox const& area, + Real sf); + + // flux_in_register += scaling_factor * \sum{fine_flux * area}, if the component is flux density + // scaling_factor * \sum{fine_flux} , otherwise + void add (int fine_global_index, int dir, FArrayBox const& fine_flux, FArrayBox const& area, + const int* isFluxDensity, Real sf); + void 
communicate (); // crse_flux = flux_in_register * scaling_factor @@ -63,6 +76,20 @@ public: void load (int crse_global_index, int dir, FArrayBox& crse_flux, FArrayBox const& cflux, FArrayBox const& area, const int* isFluxDensity, Real sf_f, Real sf_c) const; + enum struct OpType { Store, Add }; + + template + void store_or_add (int fine_global_index, int dir, FArrayBox const& fine_flux, Real sf); + + template + void store_or_add (int fine_global_index, int dir, FArrayBox const& fine_flux, FArrayBox const& area, + Real sf); + + template + void store_or_add (int fine_global_index, int dir, FArrayBox const& fine_flux, FArrayBox const& area, + const int* isFluxDensity, Real sf); + + protected: BoxArray m_fine_grids; @@ -86,6 +113,395 @@ protected: mutable Vector > m_d_ifd; }; +template +void FlashFluxRegister::store_or_add (int fine_global_index, int dir, + FArrayBox const& fine_flux, Real sf) +{ + AMREX_ASSERT(dir < AMREX_SPACEDIM); + auto found = m_fine_map.find(fine_global_index); + if (found != m_fine_map.end()) { + const int ncomp = m_ncomp; + Array const& fab_a = found->second; + if (fab_a[dir]) { + Box const& b = fab_a[dir]->box(); + Array4 const& dest = fab_a[dir]->array(); + Array4 const& src = fine_flux.const_array(); + if (dir == 0) { +#if (AMREX_SPACEDIM == 1) + AMREX_HOST_DEVICE_PARALLEL_FOR_4D (b, ncomp, i, j, k, n, + { + amrex::ignore_unused(j,k,dest); + auto rhs = src(2*i,0,0,n)*sf; + if constexpr (op == OpType::Store) { + dest(i,0,0,n) = rhs; + } else { + dest(i,0,0,n) += rhs; + } + }); +#endif +#if (AMREX_SPACEDIM == 2) + AMREX_HOST_DEVICE_PARALLEL_FOR_4D (b, ncomp, i, j, k, n, + { + amrex::ignore_unused(k,dest); + auto rhs = (src(2*i,2*j ,0,n) + + src(2*i,2*j+1,0,n)) * (Real(0.5)*sf); + if constexpr (op == OpType::Store) { + dest(i,j,0,n) = rhs; + } else { + dest(i,j,0,n) += rhs; + } + }); +#endif +#if (AMREX_SPACEDIM == 3) + AMREX_HOST_DEVICE_PARALLEL_FOR_4D (b, ncomp, i, j, k, n, + { + auto rhs = (src(2*i,2*j ,2*k ,n) + + src(2*i,2*j+1,2*k ,n) + + 
src(2*i,2*j ,2*k+1,n) + + src(2*i,2*j+1,2*k+1,n)) * (Real(0.25)*sf); + amrex::ignore_unused(dest); // for cuda + if constexpr (op == OpType::Store) { + dest(i,j,k,n) = rhs; + } else { + dest(i,j,k,n) += rhs; + } + }); +#endif + } +#if (AMREX_SPACEDIM >= 2) + else if (dir == 1) { +#if (AMREX_SPACEDIM == 2) + AMREX_HOST_DEVICE_PARALLEL_FOR_4D (b, ncomp, i, j, k, n, + { + amrex::ignore_unused(k,dest); + auto rhs = (src(2*i ,2*j,0,n) + + src(2*i+1,2*j,0,n)) * (Real(0.5)*sf); + if constexpr (op == OpType::Store) { + dest(i,j,0,n) = rhs; + } else { + dest(i,j,0,n) += rhs; + } + }); +#endif +#if (AMREX_SPACEDIM == 3) + AMREX_HOST_DEVICE_PARALLEL_FOR_4D (b, ncomp, i, j, k, n, + { + auto rhs = (src(2*i ,2*j,2*k ,n) + + src(2*i+1,2*j,2*k ,n) + + src(2*i ,2*j,2*k+1,n) + + src(2*i+1,2*j,2*k+1,n)) * (Real(0.25)*sf); + amrex::ignore_unused(dest); // for cuda + if constexpr (op == OpType::Store) { + dest(i,j,k,n) = rhs; + } else { + dest(i,j,k,n) += rhs; + } + }); +#endif + } +#if (AMREX_SPACEDIM == 3) + else { + AMREX_HOST_DEVICE_PARALLEL_FOR_4D (b, ncomp, i, j, k, n, + { + auto rhs = (src(2*i ,2*j ,2*k,n) + + src(2*i+1,2*j ,2*k,n) + + src(2*i ,2*j+1,2*k,n) + + src(2*i+1,2*j+1,2*k,n)) * (Real(0.25)*sf); + amrex::ignore_unused(dest); // for cuda + if constexpr (op == OpType::Store) { + dest(i,j,k,n) = rhs; + } else { + dest(i,j,k,n) += rhs; + } + }); + } +#endif +#endif + } + } +} + +template +void FlashFluxRegister::store_or_add (int fine_global_index, int dir, + FArrayBox const& fine_flux, + FArrayBox const& fine_area, Real sf) +{ + AMREX_ASSERT(dir < AMREX_SPACEDIM); + auto found = m_fine_map.find(fine_global_index); + if (found != m_fine_map.end()) { + const int ncomp = m_ncomp; + Array const& fab_a = found->second; + if (fab_a[dir]) { + Box const& b = fab_a[dir]->box(); + Array4 const& dest = fab_a[dir]->array(); + Array4 const& src = fine_flux.const_array(); + Array4 const& area = fine_area.const_array(); + if (dir == 0) { +#if (AMREX_SPACEDIM == 1) + 
AMREX_HOST_DEVICE_PARALLEL_FOR_4D (b, ncomp, i, j, k, n, + { + amrex::ignore_unused(j,k,dest); + auto rhs = src(2*i,0,0,n)*area(2*i,0,0)*sf; + if constexpr (op == OpType::Store) { + dest(i,0,0,n) = rhs; + } else { + dest(i,0,0,n) += rhs; + } + }); +#endif +#if (AMREX_SPACEDIM == 2) + AMREX_HOST_DEVICE_PARALLEL_FOR_4D (b, ncomp, i, j, k, n, + { + amrex::ignore_unused(k,dest); + auto rhs = (src(2*i,2*j ,0,n)*area(2*i,2*j ,0) + + src(2*i,2*j+1,0,n)*area(2*i,2*j+1,0)) * sf; + if constexpr (op == OpType::Store) { + dest(i,j,0,n) = rhs; + } else { + dest(i,j,0,n) += rhs; + } + }); +#endif +#if (AMREX_SPACEDIM == 3) + AMREX_HOST_DEVICE_PARALLEL_FOR_4D (b, ncomp, i, j, k, n, + { + auto rhs = (src(2*i,2*j ,2*k ,n)*area(2*i,2*j ,2*k ) + + src(2*i,2*j+1,2*k ,n)*area(2*i,2*j+1,2*k ) + + src(2*i,2*j ,2*k+1,n)*area(2*i,2*j ,2*k+1) + + src(2*i,2*j+1,2*k+1,n)*area(2*i,2*j+1,2*k+1)) * sf; + amrex::ignore_unused(dest); // for cuda + if constexpr (op == OpType::Store) { + dest(i,j,k,n) = rhs; + } else { + dest(i,j,k,n) += rhs; + } + }); +#endif + } +#if (AMREX_SPACEDIM >= 2) + else if (dir == 1) { +#if (AMREX_SPACEDIM == 2) + AMREX_HOST_DEVICE_PARALLEL_FOR_4D (b, ncomp, i, j, k, n, + { + amrex::ignore_unused(k,dest); + auto rhs = (src(2*i ,2*j,0,n)*area(2*i ,2*j,0) + + src(2*i+1,2*j,0,n)*area(2*i+1,2*j,0)) * sf; + if constexpr (op == OpType::Store) { + dest(i,j,0,n) = rhs; + } else { + dest(i,j,0,n) += rhs; + } + }); +#endif +#if (AMREX_SPACEDIM == 3) + AMREX_HOST_DEVICE_PARALLEL_FOR_4D (b, ncomp, i, j, k, n, + { + auto rhs = (src(2*i ,2*j,2*k ,n)*area(2*i ,2*j,2*k ) + + src(2*i+1,2*j,2*k ,n)*area(2*i+1,2*j,2*k ) + + src(2*i ,2*j,2*k+1,n)*area(2*i ,2*j,2*k+1) + + src(2*i+1,2*j,2*k+1,n)*area(2*i+1,2*j,2*k+1)) * sf; + amrex::ignore_unused(dest); // for cuda + if constexpr (op == OpType::Store) { + dest(i,j,k,n) = rhs; + } else { + dest(i,j,k,n) += rhs; + } + }); +#endif + } +#if (AMREX_SPACEDIM == 3) + else { + AMREX_HOST_DEVICE_PARALLEL_FOR_4D (b, ncomp, i, j, k, n, + { + auto rhs = 
(src(2*i ,2*j ,2*k,n)*area(2*i ,2*j ,2*k) + + src(2*i+1,2*j ,2*k,n)*area(2*i+1,2*j ,2*k) + + src(2*i ,2*j+1,2*k,n)*area(2*i ,2*j+1,2*k) + + src(2*i+1,2*j+1,2*k,n)*area(2*i+1,2*j+1,2*k)) * sf; + amrex::ignore_unused(dest); // for cuda + if constexpr (op == OpType::Store) { + dest(i,j,k,n) = rhs; + } else { + dest(i,j,k,n) += rhs; + } + }); + } +#endif +#endif + } + } +} + +template +void FlashFluxRegister::store_or_add (int fine_global_index, int dir, + FArrayBox const& fine_flux, + FArrayBox const& fine_area, + const int* isFluxDensity, Real sf) +{ + auto& h_ifd = m_h_ifd[OpenMP::get_thread_num()]; + auto& d_ifd = m_d_ifd[OpenMP::get_thread_num()]; + + AMREX_ASSERT(dir < AMREX_SPACEDIM); + auto found = m_fine_map.find(fine_global_index); + if (found != m_fine_map.end()) { + const int ncomp = m_ncomp; + Array const& fab_a = found->second; + if (fab_a[dir]) { + bool allsame = true; + for (int n = 0; n < m_ncomp; ++n) { + if (h_ifd[n] != isFluxDensity[n]) { + allsame = false; + h_ifd[n] = isFluxDensity[n]; + } + } + if (d_ifd.empty()) { + allsame = false; + d_ifd.resize(m_ncomp); + } + if (! 
allsame) { + Gpu::copyAsync(Gpu::HostToDevice(), h_ifd.begin(), h_ifd.end(), d_ifd.begin()); + } + + Box const& b = fab_a[dir]->box(); + Array4 const& dest = fab_a[dir]->array(); + Array4 const& src = fine_flux.const_array(); + Array4 const& area = fine_area.const_array(); + const int* ifd = d_ifd.data(); + if (dir == 0) { +#if (AMREX_SPACEDIM == 1) + AMREX_HOST_DEVICE_PARALLEL_FOR_4D (b, ncomp, i, j, k, n, + { + amrex::ignore_unused(j,k,dest); + Real rhs; + if (ifd[n]) { + rhs = src(2*i,0,0,n)*area(2*i,0,0)*sf; + } else { + rhs = src(2*i,0,0,n)*sf; + } + if constexpr (op == OpType::Store) { + dest(i,0,0,n) = rhs; + } else { + dest(i,0,0,n) += rhs; + } + }); +#endif +#if (AMREX_SPACEDIM == 2) + AMREX_HOST_DEVICE_PARALLEL_FOR_4D (b, ncomp, i, j, k, n, + { + amrex::ignore_unused(k,dest); + Real rhs; + if (ifd[n]) { + rhs = (src(2*i,2*j ,0,n)*area(2*i,2*j ,0) + + src(2*i,2*j+1,0,n)*area(2*i,2*j+1,0)) * sf; + } else { + rhs = (src(2*i,2*j ,0,n) + + src(2*i,2*j+1,0,n)) * sf; + } + if constexpr (op == OpType::Store) { + dest(i,j,0,n) = rhs; + } else { + dest(i,j,0,n) += rhs; + } + }); +#endif +#if (AMREX_SPACEDIM == 3) + AMREX_HOST_DEVICE_PARALLEL_FOR_4D (b, ncomp, i, j, k, n, + { + Real rhs; + if (ifd[n]) { + rhs = (src(2*i,2*j ,2*k ,n)*area(2*i,2*j ,2*k ) + + src(2*i,2*j+1,2*k ,n)*area(2*i,2*j+1,2*k ) + + src(2*i,2*j ,2*k+1,n)*area(2*i,2*j ,2*k+1) + + src(2*i,2*j+1,2*k+1,n)*area(2*i,2*j+1,2*k+1)) * sf; + } else { + rhs = (src(2*i,2*j ,2*k ,n) + + src(2*i,2*j+1,2*k ,n) + + src(2*i,2*j ,2*k+1,n) + + src(2*i,2*j+1,2*k+1,n)) * sf; + } + amrex::ignore_unused(dest); // for cuda + if constexpr (op == OpType::Store) { + dest(i,j,k,n) = rhs; + } else { + dest(i,j,k,n) += rhs; + } + }); +#endif + } +#if (AMREX_SPACEDIM >= 2) + else if (dir == 1) { +#if (AMREX_SPACEDIM == 2) + AMREX_HOST_DEVICE_PARALLEL_FOR_4D (b, ncomp, i, j, k, n, + { + amrex::ignore_unused(k,dest); + Real rhs; + if (ifd[n]) { + rhs = (src(2*i ,2*j,0,n)*area(2*i ,2*j,0) + + src(2*i+1,2*j,0,n)*area(2*i+1,2*j,0)) 
* sf; + } else { + rhs = (src(2*i ,2*j,0,n) + + src(2*i+1,2*j,0,n)) * sf; + } + if constexpr (op == OpType::Store) { + dest(i,j,0,n) = rhs; + } else { + dest(i,j,0,n) += rhs; + } + }); +#endif +#if (AMREX_SPACEDIM == 3) + AMREX_HOST_DEVICE_PARALLEL_FOR_4D (b, ncomp, i, j, k, n, + { + Real rhs; + if (ifd[n]) { + rhs = (src(2*i ,2*j,2*k ,n)*area(2*i ,2*j,2*k ) + + src(2*i+1,2*j,2*k ,n)*area(2*i+1,2*j,2*k ) + + src(2*i ,2*j,2*k+1,n)*area(2*i ,2*j,2*k+1) + + src(2*i+1,2*j,2*k+1,n)*area(2*i+1,2*j,2*k+1)) * sf; + } else { + rhs = (src(2*i ,2*j,2*k ,n) + + src(2*i+1,2*j,2*k ,n) + + src(2*i ,2*j,2*k+1,n) + + src(2*i+1,2*j,2*k+1,n)) * sf; + } + amrex::ignore_unused(dest); // for cuda + if constexpr (op == OpType::Store) { + dest(i,j,k,n) = rhs; + } else { + dest(i,j,k,n) += rhs; + } + }); +#endif + } +#if (AMREX_SPACEDIM == 3) + else { + AMREX_HOST_DEVICE_PARALLEL_FOR_4D (b, ncomp, i, j, k, n, + { + Real rhs; + if (ifd[n]) { + rhs = (src(2*i ,2*j ,2*k,n)*area(2*i ,2*j ,2*k) + + src(2*i+1,2*j ,2*k,n)*area(2*i+1,2*j ,2*k) + + src(2*i ,2*j+1,2*k,n)*area(2*i ,2*j+1,2*k) + + src(2*i+1,2*j+1,2*k,n)*area(2*i+1,2*j+1,2*k)) * sf; + } else { + rhs = (src(2*i ,2*j ,2*k,n) + + src(2*i+1,2*j ,2*k,n) + + src(2*i ,2*j+1,2*k,n) + + src(2*i+1,2*j+1,2*k,n)) * sf; + } + amrex::ignore_unused(dest); // for cuda + if constexpr (op == OpType::Store) { + dest(i,j,k,n) = rhs; + } else { + dest(i,j,k,n) += rhs; + } + }); + } +#endif +#endif + } + } +} + } #endif diff --git a/Src/F_Interfaces/AmrCore/AMReX_FlashFluxRegister.cpp b/Src/F_Interfaces/AmrCore/AMReX_FlashFluxRegister.cpp index abe74c41125..0e804f76a12 100644 --- a/Src/F_Interfaces/AmrCore/AMReX_FlashFluxRegister.cpp +++ b/Src/F_Interfaces/AmrCore/AMReX_FlashFluxRegister.cpp @@ -1,5 +1,4 @@ #include -#include namespace amrex { @@ -178,279 +177,36 @@ void FlashFluxRegister::define (const BoxArray& fba, const BoxArray& cba, void FlashFluxRegister::store (int fine_global_index, int dir, FArrayBox const& fine_flux, Real sf) { - AMREX_ASSERT(dir 
< AMREX_SPACEDIM); - auto found = m_fine_map.find(fine_global_index); - if (found != m_fine_map.end()) { - const int ncomp = m_ncomp; - Array const& fab_a = found->second; - if (fab_a[dir]) { - Box const& b = fab_a[dir]->box(); - Array4 const& dest = fab_a[dir]->array(); - Array4 const& src = fine_flux.const_array(); - if (dir == 0) { -#if (AMREX_SPACEDIM == 1) - AMREX_HOST_DEVICE_PARALLEL_FOR_4D (b, ncomp, i, j, k, n, - { - amrex::ignore_unused(j,k); - dest(i,0,0,n) = src(2*i,0,0,n)*sf; - }); -#endif -#if (AMREX_SPACEDIM == 2) - AMREX_HOST_DEVICE_PARALLEL_FOR_4D (b, ncomp, i, j, k, n, - { - amrex::ignore_unused(k); - dest(i,j,0,n) = (src(2*i,2*j ,0,n) + - src(2*i,2*j+1,0,n)) * (Real(0.5)*sf); - }); -#endif -#if (AMREX_SPACEDIM == 3) - AMREX_HOST_DEVICE_PARALLEL_FOR_4D (b, ncomp, i, j, k, n, - { - dest(i,j,k,n) = (src(2*i,2*j ,2*k ,n) + - src(2*i,2*j+1,2*k ,n) + - src(2*i,2*j ,2*k+1,n) + - src(2*i,2*j+1,2*k+1,n)) * (Real(0.25)*sf); - }); -#endif - } -#if (AMREX_SPACEDIM >= 2) - else if (dir == 1) { -#if (AMREX_SPACEDIM == 2) - AMREX_HOST_DEVICE_PARALLEL_FOR_4D (b, ncomp, i, j, k, n, - { - amrex::ignore_unused(k); - dest(i,j,0,n) = (src(2*i ,2*j,0,n) + - src(2*i+1,2*j,0,n)) * (Real(0.5)*sf); - }); -#endif -#if (AMREX_SPACEDIM == 3) - AMREX_HOST_DEVICE_PARALLEL_FOR_4D (b, ncomp, i, j, k, n, - { - dest(i,j,k,n) = (src(2*i ,2*j,2*k ,n) + - src(2*i+1,2*j,2*k ,n) + - src(2*i ,2*j,2*k+1,n) + - src(2*i+1,2*j,2*k+1,n)) * (Real(0.25)*sf); - }); -#endif - } -#if (AMREX_SPACEDIM == 3) - else { - AMREX_HOST_DEVICE_PARALLEL_FOR_4D (b, ncomp, i, j, k, n, - { - dest(i,j,k,n) = (src(2*i ,2*j ,2*k,n) + - src(2*i+1,2*j ,2*k,n) + - src(2*i ,2*j+1,2*k,n) + - src(2*i+1,2*j+1,2*k,n)) * (Real(0.25)*sf); - }); - } -#endif -#endif - } - } + store_or_add(fine_global_index, dir, fine_flux, sf); } void FlashFluxRegister::store (int fine_global_index, int dir, FArrayBox const& fine_flux, FArrayBox const& fine_area, Real sf) { - AMREX_ASSERT(dir < AMREX_SPACEDIM); - auto found = 
m_fine_map.find(fine_global_index); - if (found != m_fine_map.end()) { - const int ncomp = m_ncomp; - Array const& fab_a = found->second; - if (fab_a[dir]) { - Box const& b = fab_a[dir]->box(); - Array4 const& dest = fab_a[dir]->array(); - Array4 const& src = fine_flux.const_array(); - Array4 const& area = fine_area.const_array(); - if (dir == 0) { -#if (AMREX_SPACEDIM == 1) - AMREX_HOST_DEVICE_PARALLEL_FOR_4D (b, ncomp, i, j, k, n, - { - amrex::ignore_unused(j,k); - dest(i,0,0,n) = src(2*i,0,0,n)*area(2*i,0,0)*sf; - }); -#endif -#if (AMREX_SPACEDIM == 2) - AMREX_HOST_DEVICE_PARALLEL_FOR_4D (b, ncomp, i, j, k, n, - { - amrex::ignore_unused(k); - dest(i,j,0,n) = (src(2*i,2*j ,0,n)*area(2*i,2*j ,0) + - src(2*i,2*j+1,0,n)*area(2*i,2*j+1,0)) * sf; - }); -#endif -#if (AMREX_SPACEDIM == 3) - AMREX_HOST_DEVICE_PARALLEL_FOR_4D (b, ncomp, i, j, k, n, - { - dest(i,j,k,n) = (src(2*i,2*j ,2*k ,n)*area(2*i,2*j ,2*k ) + - src(2*i,2*j+1,2*k ,n)*area(2*i,2*j+1,2*k ) + - src(2*i,2*j ,2*k+1,n)*area(2*i,2*j ,2*k+1) + - src(2*i,2*j+1,2*k+1,n)*area(2*i,2*j+1,2*k+1)) * sf; - }); -#endif - } -#if (AMREX_SPACEDIM >= 2) - else if (dir == 1) { -#if (AMREX_SPACEDIM == 2) - AMREX_HOST_DEVICE_PARALLEL_FOR_4D (b, ncomp, i, j, k, n, - { - amrex::ignore_unused(k); - dest(i,j,0,n) = (src(2*i ,2*j,0,n)*area(2*i ,2*j,0) + - src(2*i+1,2*j,0,n)*area(2*i+1,2*j,0)) * sf; - }); -#endif -#if (AMREX_SPACEDIM == 3) - AMREX_HOST_DEVICE_PARALLEL_FOR_4D (b, ncomp, i, j, k, n, - { - dest(i,j,k,n) = (src(2*i ,2*j,2*k ,n)*area(2*i ,2*j,2*k ) + - src(2*i+1,2*j,2*k ,n)*area(2*i+1,2*j,2*k ) + - src(2*i ,2*j,2*k+1,n)*area(2*i ,2*j,2*k+1) + - src(2*i+1,2*j,2*k+1,n)*area(2*i+1,2*j,2*k+1)) * sf; - }); -#endif - } -#if (AMREX_SPACEDIM == 3) - else { - AMREX_HOST_DEVICE_PARALLEL_FOR_4D (b, ncomp, i, j, k, n, - { - dest(i,j,k,n) = (src(2*i ,2*j ,2*k,n)*area(2*i ,2*j ,2*k) + - src(2*i+1,2*j ,2*k,n)*area(2*i+1,2*j ,2*k) + - src(2*i ,2*j+1,2*k,n)*area(2*i ,2*j+1,2*k) + - src(2*i+1,2*j+1,2*k,n)*area(2*i+1,2*j+1,2*k)) * sf; - 
}); - } -#endif -#endif - } - } + store_or_add(fine_global_index, dir, fine_flux, fine_area, sf); } void FlashFluxRegister::store (int fine_global_index, int dir, FArrayBox const& fine_flux, FArrayBox const& fine_area, const int* isFluxDensity, Real sf) { - auto& h_ifd = m_h_ifd[OpenMP::get_thread_num()]; - auto& d_ifd = m_d_ifd[OpenMP::get_thread_num()]; + store_or_add(fine_global_index, dir, fine_flux, fine_area, isFluxDensity, sf); +} - AMREX_ASSERT(dir < AMREX_SPACEDIM); - auto found = m_fine_map.find(fine_global_index); - if (found != m_fine_map.end()) { - const int ncomp = m_ncomp; - Array const& fab_a = found->second; - if (fab_a[dir]) { - bool allsame = true; - for (int n = 0; n < m_ncomp; ++n) { - if (h_ifd[n] != isFluxDensity[n]) { - allsame = false; - h_ifd[n] = isFluxDensity[n]; - } - } - if (d_ifd.empty()) { - allsame = false; - d_ifd.resize(m_ncomp); - } - if (! allsame) { - Gpu::copyAsync(Gpu::HostToDevice(), h_ifd.begin(), h_ifd.end(), d_ifd.begin()); - } +void FlashFluxRegister::add (int fine_global_index, int dir, FArrayBox const& fine_flux, Real sf) +{ + store_or_add(fine_global_index, dir, fine_flux, sf); +} - Box const& b = fab_a[dir]->box(); - Array4 const& dest = fab_a[dir]->array(); - Array4 const& src = fine_flux.const_array(); - Array4 const& area = fine_area.const_array(); - const int* ifd = d_ifd.data(); - if (dir == 0) { -#if (AMREX_SPACEDIM == 1) - AMREX_HOST_DEVICE_PARALLEL_FOR_4D (b, ncomp, i, j, k, n, - { - amrex::ignore_unused(j,k); - if (ifd[n]) { - dest(i,0,0,n) = src(2*i,0,0,n)*area(2*i,0,0)*sf; - } else { - dest(i,0,0,n) = src(2*i,0,0,n)*sf; - } - }); -#endif -#if (AMREX_SPACEDIM == 2) - AMREX_HOST_DEVICE_PARALLEL_FOR_4D (b, ncomp, i, j, k, n, - { - amrex::ignore_unused(k); - if (ifd[n]) { - dest(i,j,0,n) = (src(2*i,2*j ,0,n)*area(2*i,2*j ,0) + - src(2*i,2*j+1,0,n)*area(2*i,2*j+1,0)) * sf; - } else { - dest(i,j,0,n) = (src(2*i,2*j ,0,n) + - src(2*i,2*j+1,0,n)) * sf; - } - }); -#endif -#if (AMREX_SPACEDIM == 3) - 
AMREX_HOST_DEVICE_PARALLEL_FOR_4D (b, ncomp, i, j, k, n, - { - if (ifd[n]) { - dest(i,j,k,n) = (src(2*i,2*j ,2*k ,n)*area(2*i,2*j ,2*k ) + - src(2*i,2*j+1,2*k ,n)*area(2*i,2*j+1,2*k ) + - src(2*i,2*j ,2*k+1,n)*area(2*i,2*j ,2*k+1) + - src(2*i,2*j+1,2*k+1,n)*area(2*i,2*j+1,2*k+1)) * sf; - } else { - dest(i,j,k,n) = (src(2*i,2*j ,2*k ,n) + - src(2*i,2*j+1,2*k ,n) + - src(2*i,2*j ,2*k+1,n) + - src(2*i,2*j+1,2*k+1,n)) * sf; - } - }); -#endif - } -#if (AMREX_SPACEDIM >= 2) - else if (dir == 1) { -#if (AMREX_SPACEDIM == 2) - AMREX_HOST_DEVICE_PARALLEL_FOR_4D (b, ncomp, i, j, k, n, - { - amrex::ignore_unused(k); - if (ifd[n]) { - dest(i,j,0,n) = (src(2*i ,2*j,0,n)*area(2*i ,2*j,0) + - src(2*i+1,2*j,0,n)*area(2*i+1,2*j,0)) * sf; - } else { - dest(i,j,0,n) = (src(2*i ,2*j,0,n) + - src(2*i+1,2*j,0,n)) * sf; - } - }); -#endif -#if (AMREX_SPACEDIM == 3) - AMREX_HOST_DEVICE_PARALLEL_FOR_4D (b, ncomp, i, j, k, n, - { - if (ifd[n]) { - dest(i,j,k,n) = (src(2*i ,2*j,2*k ,n)*area(2*i ,2*j,2*k ) + - src(2*i+1,2*j,2*k ,n)*area(2*i+1,2*j,2*k ) + - src(2*i ,2*j,2*k+1,n)*area(2*i ,2*j,2*k+1) + - src(2*i+1,2*j,2*k+1,n)*area(2*i+1,2*j,2*k+1)) * sf; - } else { - dest(i,j,k,n) = (src(2*i ,2*j,2*k ,n) + - src(2*i+1,2*j,2*k ,n) + - src(2*i ,2*j,2*k+1,n) + - src(2*i+1,2*j,2*k+1,n)) * sf; - } - }); -#endif - } -#if (AMREX_SPACEDIM == 3) - else { - AMREX_HOST_DEVICE_PARALLEL_FOR_4D (b, ncomp, i, j, k, n, - { - if (ifd[n]) { - dest(i,j,k,n) = (src(2*i ,2*j ,2*k,n)*area(2*i ,2*j ,2*k) + - src(2*i+1,2*j ,2*k,n)*area(2*i+1,2*j ,2*k) + - src(2*i ,2*j+1,2*k,n)*area(2*i ,2*j+1,2*k) + - src(2*i+1,2*j+1,2*k,n)*area(2*i+1,2*j+1,2*k)) * sf; - } else { - dest(i,j,k,n) = (src(2*i ,2*j ,2*k,n) + - src(2*i+1,2*j ,2*k,n) + - src(2*i ,2*j+1,2*k,n) + - src(2*i+1,2*j+1,2*k,n)) * sf; - } - }); - } -#endif -#endif - } - } +void FlashFluxRegister::add (int fine_global_index, int dir, FArrayBox const& fine_flux, + FArrayBox const& fine_area, Real sf) +{ + store_or_add(fine_global_index, dir, fine_flux, fine_area, sf); 
+} + +void FlashFluxRegister::add (int fine_global_index, int dir, FArrayBox const& fine_flux, + FArrayBox const& fine_area, const int* isFluxDensity, Real sf) +{ + store_or_add(fine_global_index, dir, fine_flux, fine_area, isFluxDensity, sf); } void FlashFluxRegister::communicate () diff --git a/Src/F_Interfaces/AmrCore/AMReX_flash_fluxregister_fi.cpp b/Src/F_Interfaces/AmrCore/AMReX_flash_fluxregister_fi.cpp index e8c2bace6d5..562405a889e 100644 --- a/Src/F_Interfaces/AmrCore/AMReX_flash_fluxregister_fi.cpp +++ b/Src/F_Interfaces/AmrCore/AMReX_flash_fluxregister_fi.cpp @@ -119,6 +119,41 @@ extern "C" { flux_reg->store(fgid, dir, fab, areafab, ifd, scaling_factor); } + void amrex_fi_flash_fluxregister_add (FlashFluxRegister* flux_reg, int fgid, int dir, + Real const* flux, const int* flo, const int* fhi, int nc, + Real scaling_factor) + { + Box bx; + bx = Box(IntVect(flo), IntVect(fhi)); + bx.shiftHalf(dir,-1); + const FArrayBox fab(bx,nc,const_cast(flux)); + flux_reg->add(fgid, dir, fab, scaling_factor); + } + + void amrex_fi_flash_fluxregister_add_area (FlashFluxRegister* flux_reg, int fgid, int dir, + Real const* flux, const int* flo, const int* fhi, int nc, + Real const* area, Real scaling_factor) + { + Box bx; + bx = Box(IntVect(flo), IntVect(fhi)); + bx.shiftHalf(dir,-1); + const FArrayBox fab(bx,nc,const_cast(flux)); + const FArrayBox areafab(bx,1,const_cast(area)); + flux_reg->add(fgid, dir, fab, areafab, scaling_factor); + } + + void amrex_fi_flash_fluxregister_add_area_ifd (FlashFluxRegister* flux_reg, int fgid, int dir, + Real const* flux, const int* flo, const int* fhi, int nc, + Real const* area, const int* ifd, Real scaling_factor) + { + Box bx; + bx = Box(IntVect(flo), IntVect(fhi)); + bx.shiftHalf(dir,-1); + const FArrayBox fab(bx,nc,const_cast(flux)); + const FArrayBox areafab(bx,1,const_cast(area)); + flux_reg->add(fgid, dir, fab, areafab, ifd, scaling_factor); + } + void amrex_fi_flash_fluxregister_communicate (FlashFluxRegister* flux_reg) { 
flux_reg->communicate(); diff --git a/Src/F_Interfaces/AmrCore/AMReX_flash_fluxregister_mod.F90 b/Src/F_Interfaces/AmrCore/AMReX_flash_fluxregister_mod.F90 index e13cc0e2543..dc0f00fdd0b 100644 --- a/Src/F_Interfaces/AmrCore/AMReX_flash_fluxregister_mod.F90 +++ b/Src/F_Interfaces/AmrCore/AMReX_flash_fluxregister_mod.F90 @@ -138,6 +138,36 @@ subroutine amrex_fi_flash_fluxregister_store_area_ifd (fr,cgid,dir,flux,flo,fhi, real(amrex_real), value :: scale end subroutine amrex_fi_flash_fluxregister_store_area_ifd + subroutine amrex_fi_flash_fluxregister_add (fr,cgid,dir,flux,flo,fhi,nc,scale) bind(c) + import + implicit none + type(c_ptr), value :: fr + real(amrex_real), intent(in) :: flux(*) + integer(c_int), value, intent(in) :: cgid, dir, nc + integer(c_int), intent(in) :: flo(*), fhi(*) + real(amrex_real), value :: scale + end subroutine amrex_fi_flash_fluxregister_add + + subroutine amrex_fi_flash_fluxregister_add_area (fr,cgid,dir,flux,flo,fhi,nc,area,scale) bind(c) + import + implicit none + type(c_ptr), value :: fr + real(amrex_real), intent(in) :: flux(*), area(*) + integer(c_int), value, intent(in) :: cgid, dir, nc + integer(c_int), intent(in) :: flo(*), fhi(*) + real(amrex_real), value :: scale + end subroutine amrex_fi_flash_fluxregister_add_area + + subroutine amrex_fi_flash_fluxregister_add_area_ifd (fr,cgid,dir,flux,flo,fhi,nc,area,ifd,scale) bind(c) + import + implicit none + type(c_ptr), value :: fr + real(amrex_real), intent(in) :: flux(*), area(*) + integer(c_int), value, intent(in) :: cgid, dir, nc + integer(c_int), intent(in) :: flo(*), fhi(*), ifd(*) + real(amrex_real), value :: scale + end subroutine amrex_fi_flash_fluxregister_add_area_ifd + subroutine amrex_fi_flash_fluxregister_communicate (fr) bind(c) import implicit none @@ -186,50 +216,80 @@ subroutine amrex_flash_fluxregister_communicate (this) call amrex_fi_flash_fluxregister_communicate(this%p) end subroutine amrex_flash_fluxregister_communicate - subroutine 
amrex_flash_fluxregister_store (this, flux, flo, fhi, grid_idx, dir, scale) + subroutine amrex_flash_fluxregister_store (this, flux, flo, fhi, grid_idx, dir, addit, scale) class(amrex_flash_fluxregister), intent(inout) :: this integer, intent(in) :: flo(*), fhi(*), grid_idx, dir real(amrex_real), intent(in) :: flux(flo(1):fhi(1),flo(2):fhi(2),flo(3):fhi(3),flo(4):fhi(4)) + logical, optional, intent(in) :: addit real(amrex_real), optional, intent(in) :: scale - ! + logical :: my_addit real(amrex_real) :: my_scale + if (present(addit)) then + my_addit = addit + else + my_addit = .FALSE. + end if if (present(scale)) then my_scale = scale else my_scale = 1._amrex_real end if - call amrex_fi_flash_fluxregister_store(this%p, grid_idx, dir, & - flux, flo, fhi, fhi(4)-flo(4)+1, my_scale) + if (my_addit) then + call amrex_fi_flash_fluxregister_add(this%p, grid_idx, dir, & + flux, flo, fhi, fhi(4)-flo(4)+1, my_scale) + else + call amrex_fi_flash_fluxregister_store(this%p, grid_idx, dir, & + flux, flo, fhi, fhi(4)-flo(4)+1, my_scale) + end if end subroutine amrex_flash_fluxregister_store - subroutine amrex_flash_fluxregister_store_area (this, flux, area, flo, fhi, grid_idx, dir, scale) + subroutine amrex_flash_fluxregister_store_area (this, flux, area, flo, fhi, grid_idx, dir, addit, scale) class(amrex_flash_fluxregister), intent(inout) :: this integer, intent(in) :: flo(*), fhi(*), grid_idx, dir real(amrex_real), intent(in) :: flux(flo(1):fhi(1),flo(2):fhi(2),flo(3):fhi(3),flo(4):fhi(4)) real(amrex_real), intent(in) :: area(flo(1):fhi(1),flo(2):fhi(2),flo(3):fhi(3)) + logical, optional, intent(in) :: addit real(amrex_real), optional, intent(in) :: scale ! + logical :: my_addit real(amrex_real) :: my_scale + if (present(addit)) then + my_addit = addit + else + my_addit = .FALSE. 
+ end if if (present(scale)) then my_scale = scale else my_scale = 1._amrex_real end if - call amrex_fi_flash_fluxregister_store_area(this%p, grid_idx, dir, & - flux, flo, fhi, fhi(4)-flo(4)+1, area, my_scale) + if (my_addit) then + call amrex_fi_flash_fluxregister_add_area(this%p, grid_idx, dir, & + flux, flo, fhi, fhi(4)-flo(4)+1, area, my_scale) + else + call amrex_fi_flash_fluxregister_store_area(this%p, grid_idx, dir, & + flux, flo, fhi, fhi(4)-flo(4)+1, area, my_scale) + end if end subroutine amrex_flash_fluxregister_store_area subroutine amrex_flash_fluxregister_store_area_ifd (this, flux, area, flo, fhi, & - isFluxDensity, grid_idx, dir, scale) + isFluxDensity, grid_idx, dir, addit, scale) class(amrex_flash_fluxregister), intent(inout) :: this integer, intent(in) :: flo(*), fhi(*), grid_idx, dir real(amrex_real), intent(in) :: flux(flo(1):fhi(1),flo(2):fhi(2),flo(3):fhi(3),flo(4):fhi(4)) real(amrex_real), intent(in) :: area(flo(1):fhi(1),flo(2):fhi(2),flo(3):fhi(3)) logical, intent(in) :: isFluxDensity(flo(4):fhi(4)) + logical, optional, intent(in) :: addit real(amrex_real), optional, intent(in) :: scale ! + logical :: my_addit real(amrex_real) :: my_scale integer(c_int) :: ifd(flo(4):fhi(4)) + if (present(addit)) then + my_addit = addit + else + my_addit = .FALSE. 
+ end if if (present(scale)) then my_scale = scale else @@ -240,8 +300,13 @@ subroutine amrex_flash_fluxregister_store_area_ifd (this, flux, area, flo, fhi, elsewhere ifd = 0 endwhere - call amrex_fi_flash_fluxregister_store_area_ifd(this%p, grid_idx, dir, & - flux, flo, fhi, fhi(4)-flo(4)+1, area, ifd, my_scale) + if (my_addit) then + call amrex_fi_flash_fluxregister_add_area_ifd(this%p, grid_idx, dir, & + flux, flo, fhi, fhi(4)-flo(4)+1, area, ifd, my_scale) + else + call amrex_fi_flash_fluxregister_store_area_ifd(this%p, grid_idx, dir, & + flux, flo, fhi, fhi(4)-flo(4)+1, area, ifd, my_scale) + end if end subroutine amrex_flash_fluxregister_store_area_ifd subroutine amrex_flash_fluxregister_load_1 (this, flux, flo, fhi, grid_idx, dir, scale) From ae7b64bcf6a4ed36dd03e17357cae83b0a394912 Mon Sep 17 00:00:00 2001 From: Weiqun Zhang Date: Wed, 1 Nov 2023 08:55:58 -0700 Subject: [PATCH 03/14] Update CHANGES for 23.11 (#3613) --- CHANGES | 49 +++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 49 insertions(+) diff --git a/CHANGES b/CHANGES index ae56cdd5ffe..a7a22fd97bd 100644 --- a/CHANGES +++ b/CHANGES @@ -1,3 +1,52 @@ +# 23.11 + + -- Give FlashFluxRegisters ways to accumulate data in registers (#3597) + + -- `AMReXBuildInfo.cmake`: AMReX_DIR (#3609) + + -- update doc for amrex::Abort on GPU (#3605) + + -- Add runtime particle components to HDF5 wrapper (#3596) + + -- Windows: Fix Installed AMReXBuildInfo.cmake (#3606) + + -- Print AMReX version at the beginning of Initialize (#3604) + + -- Install Move Tools to `shared/amrex` (#3599) + + -- Revert "Add ability for GCC 8 in CMake to build fgradient which uses std::filesystem" (#3601) + + -- Avoid std::filesystem (#3602) + + -- Fix Assertion in MLEBNodeFDLaplacian (#3594) + + -- Fix a memory "leak" in VisMF's persistent streams (#3592) + + -- RealVect Static: Export (#3589) + + -- change MaxCnt from 4 to max(4,max_level+1) for how many iterations we… (#3588) + … allow in creation of the initial grid 
hierarchy + + -- Add Bittree CI (#3577) + + -- BCType::ext_dir_cc (#3581) + + -- Disable CCache in Windows CIs (#3566) + + -- Fix ICC CI by Freeing up Disk Space (#3583) + + -- Docs: Link pyAMReX (#3582) + + -- NodeABecLaplacian: Reuse (#3579) + + -- simplify how 2d surface integrals are computed (#3571) + + -- Adding bittree interface to improve regridding performance in octree mode (#3555) + + -- MLNodeABecLaplacian (#3559) + + -- Fix Boundary Centroid in a Corner Case in 2D (#3568) + # 23.10 -- Bugfix typo in AMReX_SundialsIntegrator.H Nvar vs NVar, the From 1269a5bf41a8c4f0a832581532fda6e723d9aadc Mon Sep 17 00:00:00 2001 From: Rakesh Roy <137397847+rakesroy@users.noreply.github.com> Date: Thu, 2 Nov 2023 00:17:30 +0530 Subject: [PATCH 04/14] use hipPointerAttribute_t.type as HIP is removing hipPointerAttribute_t.memoryType (#3610) ## Summary This replaces hipPointerAttribute_t.memoryType with hipPointerAttribute_t.type. ## Additional background In ROCm6.0 hipPointerAttribute_t.memoryType will be removed from HIP. Instead, hipPointerAttribute_t.type is to be used. This is causing build failure in https://github.com/Exawind/amr-wind.git. hipPointerAttribute_t.type has existed since ROCm 5.5, so this change will be backward compatible back to ROCm 5.5.
## Checklist The proposed changes: - [x] fix a bug or incorrect behavior in AMReX - [ ] add new capabilities to AMReX - [ ] changes answers in the test suite to more than roundoff level - [ ] are likely to significantly affect the results of downstream AMReX users - [ ] include documentation in the code and/or rst files, if appropriate Co-authored-by: Rakesh Roy --- Src/Base/AMReX_GpuUtility.H | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/Src/Base/AMReX_GpuUtility.H b/Src/Base/AMReX_GpuUtility.H index a1fa3cdd9dc..ce98556fc5c 100644 --- a/Src/Base/AMReX_GpuUtility.H +++ b/Src/Base/AMReX_GpuUtility.H @@ -63,7 +63,11 @@ namespace Gpu { #if defined(AMREX_USE_HIP) hipPointerAttribute_t attrib; hipError_t r = hipPointerGetAttributes(&attrib, p); +#if defined(HIP_VERSION_MAJOR) && (HIP_VERSION_MAJOR < 6) return r == hipSuccess && attrib.memoryType == hipMemoryTypeDevice; +#else + return r == hipSuccess && attrib.type == hipMemoryTypeDevice; +#endif // (HIP_VERSION_MAJOR) && (HIP_VERSION_MAJOR < 6) #elif defined(AMREX_USE_CUDA) CUpointer_attribute attrib = CU_POINTER_ATTRIBUTE_MEMORY_TYPE; CUmemorytype mem_type = static_cast(0); @@ -83,7 +87,11 @@ namespace Gpu { #if defined(AMREX_USE_HIP) hipPointerAttribute_t attrib; hipError_t r = hipPointerGetAttributes(&attrib, p); +#if defined(HIP_VERSION_MAJOR) && (HIP_VERSION_MAJOR < 6) return r == hipSuccess && attrib.memoryType == hipMemoryTypeHost; +#else + return r == hipSuccess && attrib.type == hipMemoryTypeHost; +#endif // (HIP_VERSION_MAJOR) && (HIP_VERSION_MAJOR < 6) #elif defined(AMREX_USE_CUDA) CUpointer_attribute attrib = CU_POINTER_ATTRIBUTE_MEMORY_TYPE; CUmemorytype mem_type = static_cast(0); @@ -106,9 +114,15 @@ namespace Gpu { } else { hipPointerAttribute_t attrib; hipError_t r = hipPointerGetAttributes(&attrib, p); +#if defined(HIP_VERSION_MAJOR) && (HIP_VERSION_MAJOR < 6) return r == hipSuccess && (attrib.memoryType == hipMemoryTypeHost || attrib.memoryType == hipMemoryTypeDevice); +#else + 
return r == hipSuccess && + (attrib.type == hipMemoryTypeHost || + attrib.type == hipMemoryTypeDevice); +#endif // (HIP_VERSION_MAJOR) && (HIP_VERSION_MAJOR < 6) } #elif defined(AMREX_USE_CUDA) CUpointer_attribute attrib = CU_POINTER_ATTRIBUTE_MEMORY_TYPE; From f5400421a4b1ba84be3dda98040a49ff2ef89149 Mon Sep 17 00:00:00 2001 From: Weiqun Zhang Date: Wed, 1 Nov 2023 12:02:02 -0700 Subject: [PATCH 05/14] FillRandom: Use MKL host API (#3536) --- Src/Base/AMReX_Random.cpp | 40 ++++++++++++----------------------- Tools/CMake/AMReXSYCL.cmake | 2 +- Tools/GNUMake/comps/dpcpp.mak | 2 +- 3 files changed, 16 insertions(+), 28 deletions(-) diff --git a/Src/Base/AMReX_Random.cpp b/Src/Base/AMReX_Random.cpp index 9e1059e6798..cc791a11fef 100644 --- a/Src/Base/AMReX_Random.cpp +++ b/Src/Base/AMReX_Random.cpp @@ -19,7 +19,7 @@ namespace namespace amrex { #ifdef AMREX_USE_SYCL sycl_rng_descr* rand_engine_descr = nullptr; -//xxxxx oneapi::mkl::rng::philox4x32x10* gpu_rand_generator = nullptr; + oneapi::mkl::rng::philox4x32x10* gpu_rand_generator = nullptr; #else amrex::randState_t* gpu_rand_state = nullptr; amrex::randGenerator_t gpu_rand_generator = nullptr; @@ -44,8 +44,8 @@ void ResizeRandomSeed (amrex::ULong gpu_seed) rand_engine_descr = new sycl_rng_descr (Gpu::Device::streamQueue(), sycl::range<1>(N), gpu_seed, 1); -//xxxxx gpu_rand_generator = new std::remove_pointer_t -// (Gpu::Device::streamQueue(), gpu_seed+1234ULL); + gpu_rand_generator = new std::remove_pointer_t + (Gpu::Device::streamQueue(), gpu_seed+1234ULL); #elif defined(AMREX_USE_CUDA) || defined(AMREX_USE_HIP) @@ -212,11 +212,11 @@ DeallocateRandomSeedDevArray () Gpu::streamSynchronize(); rand_engine_descr = nullptr; } -//xxxxx if (gpu_rand_generator != nullptr) { -// delete gpu_rand_generator; -// Gpu::streamSynchronize(); -// gpu_rand_generator = nullptr; -// } + if (gpu_rand_generator != nullptr) { + delete gpu_rand_generator; + Gpu::streamSynchronize(); + gpu_rand_generator = nullptr; + } #else if 
(gpu_rand_state != nullptr) { @@ -258,15 +258,9 @@ void FillRandom (Real* p, Long N) #elif defined(AMREX_USE_SYCL) -//xxxxx oneapi::mkl::rng::uniform distr; -// auto event = oneapi::mkl::rng::generate(distr, gpu_rand_generator, N, p); -// event.wait(); - - amrex::ParallelForRNG(N, [=] AMREX_GPU_DEVICE (Long i, RandomEngine const& eng) - { - p[i] = Random(eng); - }); - Gpu::streamSynchronize(); + oneapi::mkl::rng::uniform distr; + auto event = oneapi::mkl::rng::generate(distr, *gpu_rand_generator, N, p); + event.wait(); #else std::uniform_real_distribution distribution(Real(0.0), Real(1.0)); @@ -299,15 +293,9 @@ void FillRandomNormal (Real* p, Long N, Real mean, Real stddev) #elif defined(AMREX_USE_SYCL) -//xxxxx oneapi::mkl::rng::gaussian distr(mean, stddev); -// auto event = oneapi::mkl::rng::generate(distr, gpu_rand_generator, N, p); -// event.wait(); - - amrex::ParallelForRNG(N, [=] AMREX_GPU_DEVICE (Long i, RandomEngine const& eng) - { - p[i] = RandomNormal(mean, stddev, eng); - }); - Gpu::streamSynchronize(); + oneapi::mkl::rng::gaussian distr(mean, stddev); + auto event = oneapi::mkl::rng::generate(distr, *gpu_rand_generator, N, p); + event.wait(); #else diff --git a/Tools/CMake/AMReXSYCL.cmake b/Tools/CMake/AMReXSYCL.cmake index 42eb5c4802b..a67571dc412 100644 --- a/Tools/CMake/AMReXSYCL.cmake +++ b/Tools/CMake/AMReXSYCL.cmake @@ -53,7 +53,7 @@ endif() # target_link_options( SYCL INTERFACE - $<${_cxx_sycl}:-fsycl -fsycl-device-lib=libc,libm-fp32,libm-fp64> ) + $<${_cxx_sycl}:-qmkl=sequential -fsycl -fsycl-device-lib=libc,libm-fp32,libm-fp64> ) # TODO: use $ genex for CMake >=3.17 diff --git a/Tools/GNUMake/comps/dpcpp.mak b/Tools/GNUMake/comps/dpcpp.mak index 6e490d9c063..3bcf5cb4372 100644 --- a/Tools/GNUMake/comps/dpcpp.mak +++ b/Tools/GNUMake/comps/dpcpp.mak @@ -123,7 +123,7 @@ ifneq ($(BL_NO_FORT),TRUE) endif endif -LDFLAGS += -fsycl-device-lib=libc,libm-fp32,libm-fp64 +LDFLAGS += -qmkl=sequential -fsycl-device-lib=libc,libm-fp32,libm-fp64 ifdef 
SYCL_PARALLEL_LINK_JOBS LDFLAGS += -fsycl-max-parallel-link-jobs=$(SYCL_PARALLEL_LINK_JOBS) From 606a94c69d640a03d5fbf35d11f1c46bf9676e47 Mon Sep 17 00:00:00 2001 From: Axel Huebl Date: Wed, 1 Nov 2023 15:11:03 -0700 Subject: [PATCH 06/14] [Breaking] Prefix `amrex_` to each plotfile Tool (#3600) ## Summary Packaging AMReX for Conda, we received feedback that our plotfile tool names are too generic and can collide with other packages. Thus, we propose to rename them with a common prefix. - [x] decide prefix or suffix (suffix: `` completion will work; prefix: easier for new users) - [x] decide name: `plt`/`amrex`/`amr`/... - Weiqun and Axel prefer `amrex_` so far - [ ] also change for GNUmake - [ ] update [test harness](https://github.com/AMReX-Codes/regression_testing) - [ ] update user-facing documentation for tools - [x] ~update [tutorials](https://github.com/AMReX-Codes/amrex-tutorials)?~ ## Additional background https://github.com/conda-forge/staged-recipes/pull/24294 ## Checklist The proposed changes: - [ ] fix a bug or incorrect behavior in AMReX - [ ] add new capabilities to AMReX - [ ] changes answers in the test suite to more than roundoff level - [ ] are likely to significantly affect the results of downstream AMReX users - [ ] include documentation in the code and/or rst files, if appropriate --------- Co-authored-by: Weiqun Zhang --- .github/workflows/clang.yml | 2 +- .github/workflows/gcc.yml | 2 +- Tools/Plotfile/CMakeLists.txt | 5 +++++ 3 files changed, 7 insertions(+), 2 deletions(-) diff --git a/.github/workflows/clang.yml b/.github/workflows/clang.yml index eea7e576af6..ec469bb5de6 100644 --- a/.github/workflows/clang.yml +++ b/.github/workflows/clang.yml @@ -64,7 +64,7 @@ jobs: CLANG_TIDY_ARGS="--config-file=${{github.workspace}}/.clang-tidy --warnings-as-errors=*" export PATH=/tmp/my-amrex/bin:$PATH - which fcompare + which amrex_fcompare ctest --output-on-failure diff --git a/.github/workflows/gcc.yml b/.github/workflows/gcc.yml index 
6915a246018..afc2044bdd7 100644 --- a/.github/workflows/gcc.yml +++ b/.github/workflows/gcc.yml @@ -60,7 +60,7 @@ jobs: CLANG_TIDY_ARGS="--config-file=${{github.workspace}}/.clang-tidy --warnings-as-errors=*" export PATH=/tmp/my-amrex/bin:$PATH - which fcompare + which amrex_fcompare ctest --output-on-failure diff --git a/Tools/Plotfile/CMakeLists.txt b/Tools/Plotfile/CMakeLists.txt index 0efc55c9a16..f462445c952 100644 --- a/Tools/Plotfile/CMakeLists.txt +++ b/Tools/Plotfile/CMakeLists.txt @@ -27,6 +27,11 @@ foreach( _exe IN LISTS _exe_names) if (AMReX_CUDA) set_source_files_properties(${_exe}.cpp PROPERTIES LANGUAGE CUDA) endif() + + # Add prefix to each tool's name to make them unique when installed. + # This avoids potential collisions of names on user systems, e.g., in + # software packages (Spack/Conda/Debian/...). + set_target_properties(${_exe} PROPERTIES OUTPUT_NAME "amrex_${_exe}") endforeach() From a7afcba3cffd86acc748edf39a8a2f33c973bf5d Mon Sep 17 00:00:00 2001 From: Axel Huebl Date: Thu, 2 Nov 2023 08:45:19 -0700 Subject: [PATCH 07/14] `amrex.omp_threads`: Can Avoid SMT (#3607) ## Summary In all our applications in BLAST, the OpenMP default of using all [logical cores on modern CPUs](https://en.wikipedia.org/wiki/Simultaneous_multithreading) results in significantly slower performance than just using the physical cores with AMReX. Thus, we introduce a new option `amrex.omp_threads` that enables control over the OpenMP threads at startup and has - for most popular systems - an implementation to find out the actual number of physical threads and default to it. For codes and users that change the default to `amrex.omp_threads = nosmt`, the `OMP_NUM_THREADS` variable will still take precedence.
This is a bit unusual (because CLI options usually have higher precedence than env vars - and they do if the user provides a number here), but done intentionally: this way, codes like WarpX can set the `nosmt` default and HPC job scripts will set the exact, preferably benchmarked number of threads as usual without surprises. - [x] document ## Tests Performed for AMReX OMP Backend Tests were performed with very small examples, WarpX 3D LWFA test as checked in or AMReX AMRCore 3d test. - [x] Ubuntu 22.04 Laptop w/ 12th Gen Intel i9-12900H: @ax3l - 20 logical cores; the first 12 logical cores use 2x SMT/HT - 20 virtual (default) -> 14 physical (`amrex.omp_threads = nosmt`) - faster runtime! - [x] Perlmutter (SUSE Linux Enterprise 15.4, kernel 5.14.21) - [CPU node](https://docs.nersc.gov/systems/perlmutter/architecture/) with 2x [AMD EPYC 7763](https://www.amd.com/en/products/cpu/amd-epyc-7763) - 2x SMT - 256 default, 128 with `amrex.omp_threads = nosmt` - faster runtime! - [x] Frontier (SUSE Linux Enterprise 15.4, kernel 5.14.21) - 1x AMD EPYC 7763 64-Core Processor (w/ 2x SMT enabled) - 2x SMT - 128 default - 64 with `amrex.omp_threads = nosmt` - faster runtime! - The ideal result might also be lower, due to first cores used by OS and [low-noise cores](https://docs.olcf.ornl.gov/systems/frontier_user_guide.html#low-noise-mode-layout) after that. But that is an orthogonal question and should be set in job scripts: `#SBATCH --ntasks-per-node=8` `#SBATCH --cpus-per-task=7` `#SBATCH --gpus-per-task=1` - [x] Summit (RHEL 8.2, kernel 4.18.0) - 2x IBM Power9 (each 22 physical cores each, each 6 disabled/hidden for OS?, 4x SMT enabled; cpuinfo says 128 total) - 4x SMT - 128 default, 32 with `amrex.omp_threads = nosmt` - faster runtime! 
- [x] [Lassen](https://hpc.llnl.gov/hardware/compute-platforms/lassen) (RHEL 7.9, kernel 4.14.0) - 2x IBM Power9 (each 22 physical cores, each 2 reserved for OS?, 4x SMT enabled) - 4x SMT - 160 default, 44 with `amrex.omp_threads = nosmt` - faster runtime! - The ideal result might be even down to 40, but that is an orthogonal question and should be set in job scripts. - [x] macOS M1 (arm64/aarch64) mini: - no SMT/HT - 8 default, 8 with `amrex.omp_threads = nosmt` - [x] macOS (OSX Ventura 13.5.2, 2.8 GHz Quad-Core Intel Core i7-8569U) Intel x86_64 @n01r - 2x SMT - 8 default, 4 with `amrex.omp_threads = nosmt` - faster runtime! - [x] macOS (OSX Ventura 13.5.2) M1 Max on mac studio @RTSandberg - no SMT/HT - 10 default, 10 with `amrex.omp_threads = nosmt` - [ ] some BSD/FreeBSD system? - no user requests - low priority, we just keep the default for now - [ ] Windows... looking for a system ## Additional background ## Checklist The proposed changes: - [ ] fix a bug or incorrect behavior in AMReX - [x] add new capabilities to AMReX - [ ] changes answers in the test suite to more than roundoff level - [ ] are likely to significantly affect the results of downstream AMReX users - [ ] include documentation in the code and/or rst files, if appropriate --------- Co-authored-by: Weiqun Zhang --- .../source/InputsComputeBackends.rst | 21 +++ .../source/Inputs_Chapter.rst | 1 + Src/Base/AMReX.cpp | 15 +- Src/Base/AMReX_OpenMP.H | 15 +- Src/Base/AMReX_OpenMP.cpp | 177 ++++++++++++++++++ Src/Base/CMakeLists.txt | 1 + Src/Base/Make.package | 1 + 7 files changed, 224 insertions(+), 7 deletions(-) create mode 100644 Docs/sphinx_documentation/source/InputsComputeBackends.rst create mode 100644 Src/Base/AMReX_OpenMP.cpp diff --git a/Docs/sphinx_documentation/source/InputsComputeBackends.rst b/Docs/sphinx_documentation/source/InputsComputeBackends.rst new file mode 100644 index 00000000000..26e5d527508 --- /dev/null +++ b/Docs/sphinx_documentation/source/InputsComputeBackends.rst @@ 
-0,0 +1,21 @@ +.. _Chap:InputsComputeBackends: + +Compute Backends +================ + +The following inputs must be preceded by ``amrex.`` and determine runtime options of CPU or GPU compute implementations. + ++------------------------+-----------------------------------------------------------------------+-------------+------------+ +| Parameter | Description | Type | Default | ++========================+=======================================================================+=============+============+ +| ``omp_threads`` | If OpenMP is enabled, this can be used to set the default number of | String | ``system`` | +| | threads. The special value ``nosmt`` can be used to avoid using | or Int | | +| | threads for virtual cores (aka Hyperthreading or SMT), as is default | | | +| | in OpenMP, and instead only spawns threads equal to the number of | | | +| | physical cores in the system. | | | +| | For the values ``system`` and ``nosmt``, the environment variable | | | +| | ``OMP_NUM_THREADS`` takes precedence. For Integer values, | | | +| | ``OMP_NUM_THREADS`` is ignored. | | | ++------------------------+-----------------------------------------------------------------------+-------------+------------+ + +For GPU-specific parameters, see also the :ref:`GPU chapter `. 
diff --git a/Docs/sphinx_documentation/source/Inputs_Chapter.rst b/Docs/sphinx_documentation/source/Inputs_Chapter.rst index 0a64aeb492c..43ead40b3c6 100644 --- a/Docs/sphinx_documentation/source/Inputs_Chapter.rst +++ b/Docs/sphinx_documentation/source/Inputs_Chapter.rst @@ -9,6 +9,7 @@ Run-time Inputs InputsProblemDefinition InputsTimeStepping InputsLoadBalancing + InputsComputeBackends InputsPlotFiles InputsCheckpoint diff --git a/Src/Base/AMReX.cpp b/Src/Base/AMReX.cpp index 147f8275c57..4449dab1955 100644 --- a/Src/Base/AMReX.cpp +++ b/Src/Base/AMReX.cpp @@ -52,6 +52,7 @@ #endif #ifdef AMREX_USE_OMP +#include #include #endif @@ -72,7 +73,9 @@ #include #include #include +#include #include +#include #include #include #include @@ -459,15 +462,17 @@ amrex::Initialize (int& argc, char**& argv, bool build_parm_parse, #endif #ifdef AMREX_USE_OMP + amrex::OpenMP::init_threads(); + + // status output if (system::verbose > 0) { // static_assert(_OPENMP >= 201107, "OpenMP >= 3.1 is required."); amrex::Print() << "OMP initialized with " << omp_get_max_threads() << " OMP threads\n"; } -#endif -#if defined(AMREX_USE_MPI) && defined(AMREX_USE_OMP) + // warn if over-subscription is detected if (system::verbose > 0) { auto ncores = int(std::thread::hardware_concurrency()); if (ncores != 0 && // It might be zero according to the C++ standard. 
@@ -476,8 +481,10 @@ amrex::Initialize (int& argc, char**& argv, bool build_parm_parse, amrex::Print(amrex::ErrorStream()) << "AMReX Warning: You might be oversubscribing CPU cores with OMP threads.\n" << " There are " << ncores << " cores per node.\n" - << " There are " << ParallelDescriptor::NProcsPerNode() << " MPI ranks per node.\n" - << " But OMP is initialized with " << omp_get_max_threads() << " threads per rank.\n" +#if defined(AMREX_USE_MPI) + << " There are " << ParallelDescriptor::NProcsPerNode() << " MPI ranks (processes) per node.\n" +#endif + << " But OMP is initialized with " << omp_get_max_threads() << " threads per process.\n" << " You should consider setting OMP_NUM_THREADS=" << ncores/ParallelDescriptor::NProcsPerNode() << " or less in the environment.\n"; } diff --git a/Src/Base/AMReX_OpenMP.H b/Src/Base/AMReX_OpenMP.H index 8eb8ada4513..ce267b9be73 100644 --- a/Src/Base/AMReX_OpenMP.H +++ b/Src/Base/AMReX_OpenMP.H @@ -11,10 +11,12 @@ namespace amrex::OpenMP { inline int get_max_threads () { return omp_get_max_threads(); } inline int get_thread_num () { return omp_get_thread_num(); } inline int in_parallel () { return omp_in_parallel(); } + inline void set_num_threads (int num) { omp_set_num_threads(num); } + void init_threads (); } -#else +#else // AMREX_USE_OMP namespace amrex::OpenMP { @@ -22,9 +24,16 @@ namespace amrex::OpenMP { constexpr int get_max_threads () { return 1; } constexpr int get_thread_num () { return 0; } constexpr int in_parallel () { return false; } - + constexpr void set_num_threads (int) { /* nothing */ } + constexpr void init_threads () { /* nothing */ } } -#endif +#endif // AMREX_USE_OMP + +namespace amrex { + /** ... 
*/ + int + numUniquePhysicalCores(); +} #endif diff --git a/Src/Base/AMReX_OpenMP.cpp b/Src/Base/AMReX_OpenMP.cpp new file mode 100644 index 00000000000..5ddd9944411 --- /dev/null +++ b/Src/Base/AMReX_OpenMP.cpp @@ -0,0 +1,177 @@ +#include +#include +#include +#include + +#if defined(__APPLE__) +#include +#include +#endif + +#if defined(_WIN32) +#include +#endif + +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +namespace amrex +{ + int + numUniquePhysicalCores () + { + int ncores; + +#if defined(__APPLE__) + size_t len = sizeof(ncores); + // See hw.physicalcpu and hw.physicalcpu_max + // https://developer.apple.com/documentation/kernel/1387446-sysctlbyname/determining_system_capabilities/ + // https://developer.apple.com/documentation/kernel/1387446-sysctlbyname + if (sysctlbyname("hw.physicalcpu", &ncores, &len, NULL, 0) == -1) { + if (system::verbose > 0) { + amrex::Print() << "numUniquePhysicalCores(): Error receiving hw.physicalcpu! " + << "Defaulting to visible cores.\n"; + } + ncores = int(std::thread::hardware_concurrency()); + } +#elif defined(__linux__) + std::set> uniqueThreadSets; + int cpuIndex = 0; + + while (true) { + // for each logical CPU in cpuIndex from 0...N-1 + std::string path = "/sys/devices/system/cpu/cpu" + std::to_string(cpuIndex) + "/topology/thread_siblings_list"; + std::ifstream file(path); + if (!file.is_open()) { + break; // no further CPUs to check + } + + // find its siblings + std::vector siblings; + std::string line; + if (std::getline(file, line)) { + std::stringstream ss(line); + std::string token; + + // Possible syntax: 0-3, 8-11, 14,17 + // https://github.com/torvalds/linux/blob/v6.5/Documentation/ABI/stable/sysfs-devices-system-cpu#L68-L72 + while (std::getline(ss, token, ',')) { + size_t dashPos = token.find('-'); + if (dashPos != std::string::npos) { + // Range detected + int start = std::stoi(token.substr(0, dashPos)); + int end = std::stoi(token.substr(dashPos + 1)); + for 
(int i = start; i <= end; ++i) { + siblings.push_back(i); + } + } else { + siblings.push_back(std::stoi(token)); + } + } + } + + // and record the siblings group + // (assumes: ascending and unique sets per cpuIndex) + uniqueThreadSets.insert(siblings); + cpuIndex++; + } + + if (cpuIndex == 0) { + if (system::verbose > 0) { + amrex::Print() << "numUniquePhysicalCores(): Error reading CPU info.\n"; + } + ncores = int(std::thread::hardware_concurrency()); + } else { + ncores = int(uniqueThreadSets.size()); + } +#elif defined(_WIN32) + DWORD length = 0; + bool result = GetLogicalProcessorInformation(NULL, &length); + + if (!result) { + if (system::verbose > 0) { + amrex::Print() << "numUniquePhysicalCores(): Failed to get logical processor information! " + << "Defaulting to visible cores.\n"; + } + ncores = int(std::thread::hardware_concurrency()); + } + else { + std::vector buffer(length / sizeof(SYSTEM_LOGICAL_PROCESSOR_INFORMATION)); + if (!GetLogicalProcessorInformation(&buffer[0], &length)) { + if (system::verbose > 0) { + amrex::Print() << "numUniquePhysicalCores(): Failed to get logical processor information! " + << "Defaulting to visible cores.\n"; + } + ncores = int(std::thread::hardware_concurrency()); + } else { + ncores = 0; + for (const auto& info : buffer) { + if (info.Relationship == RelationProcessorCore) { + ncores++; + } + } + } + } +#else + // TODO: + // BSD + if (system::verbose > 0) { + amrex::Print() << "numUniquePhysicalCores(): Unknown system. Defaulting to visible cores.\n"; + } + ncores = int(std::thread::hardware_concurrency()); +#endif + return ncores; + } +} // namespace amrex + +#ifdef AMREX_USE_OMP +namespace amrex::OpenMP +{ + void init_threads () + { + amrex::ParmParse pp("amrex"); + std::string omp_threads = "system"; + pp.queryAdd("omp_threads", omp_threads); + + auto to_int = [](std::string const & str_omp_threads) { + std::optional num; + try { num = std::stoi(str_omp_threads); } + catch (...) 
{ /* nothing */ } + return num; + }; + + if (omp_threads == "system") { + // default or OMP_NUM_THREADS environment variable + } else if (omp_threads == "nosmt") { + char const *env_omp_num_threads = std::getenv("OMP_NUM_THREADS"); + if (env_omp_num_threads != nullptr && amrex::system::verbose > 1) { + amrex::Print() << "amrex.omp_threads was set to nosmt," + << "but OMP_NUM_THREADS was set. Will keep " + << "OMP_NUM_THREADS=" << env_omp_num_threads << ".\n"; + } else { + omp_set_num_threads(numUniquePhysicalCores()); + } + } else { + std::optional num_omp_threads = to_int(omp_threads); + if (num_omp_threads.has_value()) { + omp_set_num_threads(num_omp_threads.value()); + } + else { + if (amrex::system::verbose > 0) { + amrex::Print() << "amrex.omp_threads has an unknown value: " + << omp_threads + << " (try system, nosmt, or a positive integer)\n"; + } + } + } + } +} // namespace amrex::OpenMP +#endif // AMREX_USE_OMP diff --git a/Src/Base/CMakeLists.txt b/Src/Base/CMakeLists.txt index 544de3aed8c..459ec3bd7c4 100644 --- a/Src/Base/CMakeLists.txt +++ b/Src/Base/CMakeLists.txt @@ -53,6 +53,7 @@ foreach(D IN LISTS AMReX_SPACEDIM) AMReX_ParallelDescriptor.H AMReX_ParallelDescriptor.cpp AMReX_OpenMP.H + AMReX_OpenMP.cpp AMReX_ParallelReduce.H AMReX_ForkJoin.H AMReX_ForkJoin.cpp diff --git a/Src/Base/Make.package b/Src/Base/Make.package index 29b4c25dc84..276887ebd79 100644 --- a/Src/Base/Make.package +++ b/Src/Base/Make.package @@ -38,6 +38,7 @@ C$(AMREX_BASE)_headers += AMReX_REAL.H AMReX_INT.H AMReX_CONSTANTS.H AMReX_SPACE C$(AMREX_BASE)_sources += AMReX_DistributionMapping.cpp AMReX_ParallelDescriptor.cpp C$(AMREX_BASE)_headers += AMReX_DistributionMapping.H AMReX_ParallelDescriptor.H C$(AMREX_BASE)_headers += AMReX_OpenMP.H +C$(AMREX_BASE)_sources += AMReX_OpenMP.cpp C$(AMREX_BASE)_headers += AMReX_ParallelReduce.H From d36463103daed09a40cdea235041a6ab79ff280c Mon Sep 17 00:00:00 2001 From: Ben Wibking Date: Tue, 7 Nov 2023 00:19:10 -0500 Subject: [PATCH 08/14] 
replace AMREX_DEVICE_COMPILE with AMREX_IF_ON_DEVICE and AMREX_IF_ON_HOST (#3591) ## Summary This adds the macros `AMREX_IF_ON_DEVICE((code_for_device))` and `AMREX_IF_ON_HOST((code_for_host))` that are compatible with single-pass host/device compilation (as used by `nvc++ -cuda`), as well as backward compatible with all other compilers. This also replaces all uses of `AMREX_DEVICE_COMPILE` with these macros. Fixes https://github.com/AMReX-Codes/amrex/issues/3586. ## Additional background Single-pass compilation evalutes the preprocessor macros once for each source file. This means that preprocessor conditionals cannot be used to choose between host and device code. In particular, NVHPC with `-cuda` does not support `__CUDA_ARCH__`, instead requiring the use of the `if target` construct. This creates portable macros that work for either single-pass or two-pass compilation, but requires restructuring of any code that uses AMREX_DEVICE_COMPILE so that the code appears as a macro argument. This PR will allow using NVHPC with `-cuda` as the unified host/device compiler for AMReX. In the future, single-pass compilers for other backends may be available, e.g., SYCL (https://dl.acm.org/doi/abs/10.1145/3585341.3585351). AMReX can be configured to build with `nvc++ -cuda` using CMake: ``` cmake .. -DAMReX_GPU_BACKEND=CUDA -DCMAKE_C_COMPILER=nvc -DCMAKE_CXX_COMPILER=nvc++ -DCMAKE_CUDA_COMPILER=nvc++ -DCMAKE_CUDA_COMPILER_ID=NVCXX -DCMAKE_CUDA_ARCHITECTURES=80 -DCMAKE_CUDA_COMPILER_FORCED=ON -DCMAKE_CUDA_COMPILE_FEATURES=cuda_std_17 -DAMReX_GPU_RDC=OFF -DCMAKE_CXX_FLAGS="-cuda --gcc-toolchain=$(which gcc)" -DCMAKE_CUDA_FLAGS="-cuda --gcc-toolchain=$(which gcc)" -DAMReX_ENABLE_TESTS=ON -DCMAKE_CUDA_HOST_LINK_LAUNCHER=nvc++ -DCMAKE_CUDA_LINK_EXECUTABLE=" -o " ``` CMake hacks (https://github.com/NVIDIA/cub/blob/0fc3c3701632a4be906765b73be20a9ad0da603d/cmake/CubCompilerHacks.cmake) are tested with CMake 3.22.1 and NVHPC 23.5, 23.7, and 23.9 (earlier versions do not work). 
However, it currently fails to link the executables for the tests due to a [compiler/linker bug](https://forums.developer.nvidia.com/t/nvc-cuda-fails-to-link-code-when-using-device-curand-functions/270401/5). (Note that by default, `nvcc` preserves denormals, whereas `nvc++` does not. Also, `nvc++` generates relocatable device code by default, whereas `nvcc` does not.) ## Checklist The proposed changes: - [ ] fix a bug or incorrect behavior in AMReX - [ ] add new capabilities to AMReX - [ ] changes answers in the test suite to more than roundoff level - [ ] are likely to significantly affect the results of downstream AMReX users - [ ] include documentation in the code and/or rst files, if appropriate --------- Co-authored-by: Weiqun Zhang --- Docs/sphinx_documentation/source/GPU.rst | 9 +- Src/Base/AMReX.H | 60 +++---- Src/Base/AMReX_Algorithm.H | 178 +++++++++++--------- Src/Base/AMReX_Array4.H | 46 ++--- Src/Base/AMReX_FabArray.H | 7 +- Src/Base/AMReX_GpuAtomic.H | 204 ++++++++++++----------- Src/Base/AMReX_GpuLaunch.H | 29 ++-- Src/Base/AMReX_GpuQualifiers.H | 29 ++++ Src/Base/AMReX_GpuRange.H | 52 +++--- Src/Base/AMReX_GpuUtility.H | 10 +- Src/Base/AMReX_Math.H | 76 +++++---- Src/Base/AMReX_Random.H | 109 ++++++------ Src/Base/AMReX_TableData.H | 100 +++++------ Src/Base/Parser/AMReX_IParser.H | 28 +--- Src/Base/Parser/AMReX_Parser.H | 35 ++-- Src/Base/Parser/AMReX_Parser_Y.H | 4 +- Src/EB/AMReX_EB2_GeometryShop.H | 13 +- 17 files changed, 518 insertions(+), 471 deletions(-) diff --git a/Docs/sphinx_documentation/source/GPU.rst b/Docs/sphinx_documentation/source/GPU.rst index 90dbcc9f26f..aff060e9166 100644 --- a/Docs/sphinx_documentation/source/GPU.rst +++ b/Docs/sphinx_documentation/source/GPU.rst @@ -489,11 +489,10 @@ GPU support. When AMReX is compiled with ``USE_OMP_OFFLOAD=TRUE``, ``AMREX_USE_OMP_OFFLOAD`` is defined. -In addition to AMReX's preprocessor macros, CUDA provides the -``__CUDA_ARCH__`` macro which is only defined when in device code. 
-HIP and Sycl provide similar macros. -``AMREX_DEVICE_COMPILE`` should be used when a ``__host__ __device__`` -function requires separate code for the CPU and GPU implementations. +The macros ``AMREX_IF_ON_DEVICE((code_for_device))`` and +``AMREX_IF_ON_HOST((code_for_host))`` should be used when a +``__host__ __device__`` function requires separate code for the +CPU and GPU implementations. .. =================================================================== diff --git a/Src/Base/AMReX.H b/Src/Base/AMReX.H index c539a1d8e75..2b88553bcdf 100644 --- a/Src/Base/AMReX.H +++ b/Src/Base/AMReX.H @@ -113,16 +113,15 @@ namespace amrex AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE void Error (const char* msg = nullptr) { -#if AMREX_DEVICE_COMPILE #if defined(NDEBUG) - amrex::ignore_unused(msg); + AMREX_IF_ON_DEVICE((amrex::ignore_unused(msg);)) #else - if (msg) { AMREX_DEVICE_PRINTF("Error %s\n", msg); } - AMREX_DEVICE_ASSERT(0); -#endif -#else - Error_host("Error", msg); + AMREX_IF_ON_DEVICE(( + if (msg) { AMREX_DEVICE_PRINTF("Error %s\n", msg); } + AMREX_DEVICE_ASSERT(0); + )) #endif + AMREX_IF_ON_HOST((Error_host("Error", msg);)) } //! Print out warning message to cerr. @@ -132,15 +131,12 @@ namespace amrex AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE void Warning (const char * msg) { -#if AMREX_DEVICE_COMPILE #if defined(NDEBUG) - amrex::ignore_unused(msg); -#else - if (msg) { AMREX_DEVICE_PRINTF("Warning %s\n", msg); } -#endif + AMREX_IF_ON_DEVICE((amrex::ignore_unused(msg);)) #else - Warning_host(msg); + AMREX_IF_ON_DEVICE((if (msg) { AMREX_DEVICE_PRINTF("Warning %s\n", msg); })) #endif + AMREX_IF_ON_HOST((Warning_host(msg);)) } //! Print out message to cerr and exit via abort(). 
@@ -148,16 +144,15 @@ namespace amrex AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE void Abort (const char * msg = nullptr) { -#if AMREX_DEVICE_COMPILE #if defined(NDEBUG) - amrex::ignore_unused(msg); + AMREX_IF_ON_DEVICE((amrex::ignore_unused(msg);)) #else - if (msg) { AMREX_DEVICE_PRINTF("Abort %s\n", msg); } - AMREX_DEVICE_ASSERT(0); -#endif -#else - Error_host("Abort", msg); + AMREX_IF_ON_DEVICE(( + if (msg) { AMREX_DEVICE_PRINTF("Abort %s\n", msg); } + AMREX_DEVICE_ASSERT(0); + )) #endif + AMREX_IF_ON_HOST((Error_host("Abort", msg);)) } /** @@ -170,22 +165,21 @@ namespace amrex AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE void Assert (const char* EX, const char* file, int line, const char* msg = nullptr) { -#if AMREX_DEVICE_COMPILE #if defined(NDEBUG) - amrex::ignore_unused(EX,file,line,msg); -#else - if (msg) { - AMREX_DEVICE_PRINTF("Assertion `%s' failed, file \"%s\", line %d, Msg: %s", - EX, file, line, msg); - } else { - AMREX_DEVICE_PRINTF("Assertion `%s' failed, file \"%s\", line %d", - EX, file, line); - } - AMREX_DEVICE_ASSERT(0); -#endif + AMREX_IF_ON_DEVICE((amrex::ignore_unused(EX,file,line,msg);)) #else - Assert_host(EX,file,line,msg); + AMREX_IF_ON_DEVICE(( + if (msg) { + AMREX_DEVICE_PRINTF("Assertion `%s' failed, file \"%s\", line %d, Msg: %s", + EX, file, line, msg); + } else { + AMREX_DEVICE_PRINTF("Assertion `%s' failed, file \"%s\", line %d", + EX, file, line); + } + AMREX_DEVICE_ASSERT(0); + )) #endif + AMREX_IF_ON_HOST((Assert_host(EX,file,line,msg);)) } /** diff --git a/Src/Base/AMReX_Algorithm.H b/Src/Base/AMReX_Algorithm.H index b418f3cc1c0..31889df4425 100644 --- a/Src/Base/AMReX_Algorithm.H +++ b/Src/Base/AMReX_Algorithm.H @@ -161,51 +161,52 @@ namespace amrex AMREX_GPU_HOST_DEVICE ItType upper_bound (ItType first, ItType last, const ValType& val) { -#if AMREX_DEVICE_COMPILE - std::ptrdiff_t count = last-first; - while(count>0){ - auto it = first; - const auto step = count/2; - it += step; - if (!(val < *it)){ - first = ++it; - count -= step + 
1; + AMREX_IF_ON_DEVICE(( + std::ptrdiff_t count = last-first; + while(count>0){ + auto it = first; + const auto step = count/2; + it += step; + if (!(val < *it)){ + first = ++it; + count -= step + 1; + } + else{ + count = step; + } } - else{ - count = step; - } - } - - return first; -#else - return std::upper_bound(first, last, val); -#endif + return first; + )) + AMREX_IF_ON_HOST(( + return std::upper_bound(first, last, val); + )) } template AMREX_GPU_HOST_DEVICE ItType lower_bound (ItType first, ItType last, const ValType& val) { -#ifdef AMREX_DEVICE_COMPILE - std::ptrdiff_t count = last-first; - while(count>0) - { - auto it = first; - const auto step = count/2; - it += step; - if (*it < val){ - first = ++it; - count -= step + 1; - } - else{ - count = step; + AMREX_IF_ON_DEVICE(( + std::ptrdiff_t count = last-first; + while(count>0) + { + auto it = first; + const auto step = count/2; + it += step; + if (*it < val){ + first = ++it; + count -= step + 1; + } + else{ + count = step; + } } - } - return first; -#else - return std::lower_bound(first, last, val); -#endif + return first; + )) + AMREX_IF_ON_HOST(( + return std::lower_bound(first, last, val); + )) } namespace detail { @@ -239,83 +240,100 @@ int builtin_clz_wrapper (clzll_tag, T x) noexcept return static_cast(__builtin_clzll(x) - (sizeof(unsigned long long) * CHAR_BIT - sizeof(T) * CHAR_BIT)); } -#ifdef AMREX_USE_CUDA - -// likewise with CUDA, there are __clz functions that take (signed) int and long long int -template ::type> -AMREX_GPU_DEVICE AMREX_FORCE_INLINE -int clz_wrapper (clz_tag, T x) noexcept -{ - return __clz((int) x) - (sizeof(int) * CHAR_BIT - sizeof(T) * CHAR_BIT); -} - -template ::type> -AMREX_GPU_DEVICE AMREX_FORCE_INLINE -int clz_wrapper (clzll_tag, T x) noexcept -{ - return __clzll((long long int) x) - (sizeof(long long int) * CHAR_BIT - sizeof(T) * CHAR_BIT); } -#endif -} +template ,std::uint8_t> || + std::is_same_v,std::uint16_t> || + std::is_same_v,std::uint32_t> || + 
std::is_same_v,std::uint64_t>, int> = 0> +AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE +int clz (T x) noexcept; AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE -int clz (std::uint8_t x) noexcept +int clz_generic (std::uint8_t x) noexcept { -#if (AMREX_DEVICE_COMPILE && defined(AMREX_USE_CUDA)) // all supported cuda versions have __clz - return detail::clz_wrapper(detail::clz_tag{}, x); -#elif (!AMREX_DEVICE_COMPILE && AMREX_HAS_BUILTIN_CLZ) - return detail::builtin_clz_wrapper(detail::clz_tag{}, x); -#else static constexpr int clz_lookup[16] = { 4, 3, 2, 2, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0 }; auto upper = x >> 4; auto lower = x & 0xF; return upper ? clz_lookup[upper] : 4 + clz_lookup[lower]; -#endif } AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE -int clz (std::uint16_t x) noexcept +int clz_generic (std::uint16_t x) noexcept { -#if (AMREX_DEVICE_COMPILE && defined(AMREX_USE_CUDA)) // all supported cuda versions have __clz - return detail::clz_wrapper(detail::clz_tag{}, x); -#elif (!AMREX_DEVICE_COMPILE && AMREX_HAS_BUILTIN_CLZ) - return detail::builtin_clz_wrapper(detail::clz_tag{}, x); -#else auto upper = std::uint8_t(x >> 8); auto lower = std::uint8_t(x & 0xFF); return upper ? clz(upper) : 8 + clz(lower); -#endif } AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE -int clz (std::uint32_t x) noexcept +int clz_generic (std::uint32_t x) noexcept { -#if (AMREX_DEVICE_COMPILE && defined(AMREX_USE_CUDA)) // all supported cuda versions have __clz - return detail::clz_wrapper(detail::clz_tag{}, x); -#elif (!AMREX_DEVICE_COMPILE && AMREX_HAS_BUILTIN_CLZ) - return detail::builtin_clz_wrapper(detail::clz_tag{}, x); -#else auto upper = std::uint16_t(x >> 16); auto lower = std::uint16_t(x & 0xFFFF); return upper ? 
clz(upper) : 16 + clz(lower); -#endif } AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE -int clz (std::uint64_t x) noexcept +int clz_generic (std::uint64_t x) noexcept { -#if (AMREX_DEVICE_COMPILE && defined(AMREX_USE_CUDA)) // all supported cuda versions have __clz - return detail::clz_wrapper(detail::clz_tag{}, x); -#elif (!AMREX_DEVICE_COMPILE && AMREX_HAS_BUILTIN_CLZ) - return detail::builtin_clz_wrapper(detail::clz_tag{}, x); -#else auto upper = std::uint32_t(x >> 32); auto lower = std::uint32_t(x & 0xFFFFFFFF); return upper ? clz(upper) : 32 + clz(lower); +} + +#if defined AMREX_USE_CUDA + +namespace detail { + // likewise with CUDA, there are __clz functions that take (signed) int and long long int + template ::type> + AMREX_GPU_DEVICE AMREX_FORCE_INLINE + int clz_wrapper (clz_tag, T x) noexcept + { + return __clz((int) x) - (sizeof(int) * CHAR_BIT - sizeof(T) * CHAR_BIT); + } + + template ::type> + AMREX_GPU_DEVICE AMREX_FORCE_INLINE + int clz_wrapper (clzll_tag, T x) noexcept + { + return __clzll((long long int) x) - (sizeof(long long int) * CHAR_BIT - sizeof(T) * CHAR_BIT); + } +} + +template ,std::uint8_t> || + std::is_same_v,std::uint16_t> || + std::is_same_v,std::uint32_t> || + std::is_same_v,std::uint64_t>, int> > +AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE +int clz (T x) noexcept +{ + AMREX_IF_ON_DEVICE((return detail::clz_wrapper(detail::clz_tag{}, x);)) +#if AMREX_HAS_BUILTIN_CLZ + AMREX_IF_ON_HOST((return detail::builtin_clz_wrapper(detail::clz_tag{}, x);)) +#else + AMREX_IF_ON_HOST((return clz_generic(x);)) #endif } +#else // !defined AMREX_USE_CUDA + +template ,std::uint8_t> || + std::is_same_v,std::uint16_t> || + std::is_same_v,std::uint32_t> || + std::is_same_v,std::uint64_t>, int> > +AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE +int clz (T x) noexcept +{ +#if (!AMREX_DEVICE_COMPILE && AMREX_HAS_BUILTIN_CLZ) + return detail::builtin_clz_wrapper(detail::clz_tag{}, x); +#else + return clz_generic(x); +#endif +} + +#endif // defined AMREX_USE_CUDA + } #endif 
diff --git a/Src/Base/AMReX_Array4.H b/Src/Base/AMReX_Array4.H index b2ff0fcb549..8d7a4a44f30 100644 --- a/Src/Base/AMReX_Array4.H +++ b/Src/Base/AMReX_Array4.H @@ -42,13 +42,14 @@ namespace amrex { U& operator[] (int n) const noexcept { #if defined(AMREX_DEBUG) || defined(AMREX_BOUND_CHECK) if (n < 0 || n >= ncomp) { -#if AMREX_DEVICE_COMPILE - AMREX_DEVICE_PRINTF(" %d is out of bound (0:%d)", n, ncomp-1); -#else - std::stringstream ss; - ss << " " << n << " is out of bound: (0:" << ncomp-1 << ")"; - amrex::Abort(ss.str()); -#endif + AMREX_IF_ON_DEVICE(( + AMREX_DEVICE_PRINTF(" %d is out of bound (0:%d)", n, ncomp-1); + )) + AMREX_IF_ON_HOST(( + std::stringstream ss; + ss << " " << n << " is out of bound: (0:" << ncomp-1 << ")"; + amrex::Abort(ss.str()); + )) } #endif return p[n*stride]; @@ -233,21 +234,22 @@ namespace amrex { { if (i=end.x || j=end.y || k=end.z || n < 0 || n >= ncomp) { -#if AMREX_DEVICE_COMPILE - AMREX_DEVICE_PRINTF(" (%d,%d,%d,%d) is out of bound (%d:%d,%d:%d,%d:%d,0:%d)\n", - i, j, k, n, begin.x, end.x-1, begin.y, end.y-1, - begin.z, end.z-1, ncomp-1); - amrex::Abort(); -#else - std::stringstream ss; - ss << " (" << i << "," << j << "," << k << "," << n - << ") is out of bound (" - << begin.x << ":" << end.x-1 << "," - << begin.y << ":" << end.y-1 << "," - << begin.z << ":" << end.z-1 << "," - << "0:" << ncomp-1 << ")"; - amrex::Abort(ss.str()); -#endif + AMREX_IF_ON_DEVICE(( + AMREX_DEVICE_PRINTF(" (%d,%d,%d,%d) is out of bound (%d:%d,%d:%d,%d:%d,0:%d)\n", + i, j, k, n, begin.x, end.x-1, begin.y, end.y-1, + begin.z, end.z-1, ncomp-1); + amrex::Abort(); + )) + AMREX_IF_ON_HOST(( + std::stringstream ss; + ss << " (" << i << "," << j << "," << k << "," << n + << ") is out of bound (" + << begin.x << ":" << end.x-1 << "," + << begin.y << ":" << end.y-1 << "," + << begin.z << ":" << end.z-1 << "," + << "0:" << ncomp-1 << ")"; + amrex::Abort(ss.str()); + )) } } #endif diff --git a/Src/Base/AMReX_FabArray.H b/Src/Base/AMReX_FabArray.H index 
a8839a4bcc0..e507dab153b 100644 --- a/Src/Base/AMReX_FabArray.H +++ b/Src/Base/AMReX_FabArray.H @@ -198,11 +198,8 @@ struct MultiArray4 { AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE Array4 const& operator[] (int li) const noexcept { -#if AMREX_DEVICE_COMPILE - return dp[li]; -#else - return hp[li]; -#endif + AMREX_IF_ON_DEVICE((return dp[li];)) + AMREX_IF_ON_HOST((return hp[li];)) } #ifdef AMREX_USE_GPU diff --git a/Src/Base/AMReX_GpuAtomic.H b/Src/Base/AMReX_GpuAtomic.H index 55fc351156a..deea6ae932e 100644 --- a/Src/Base/AMReX_GpuAtomic.H +++ b/Src/Base/AMReX_GpuAtomic.H @@ -132,17 +132,17 @@ namespace detail { AMREX_GPU_DEVICE AMREX_FORCE_INLINE T Add_device (T* const sum, T const value) noexcept { -#if defined(__CUDA_ARCH__) && defined(AMREX_USE_CUDA) || \ - defined(__HIP_DEVICE_COMPILE__) && defined(AMREX_USE_HIP) - return atomicAdd(sum, value); -#elif defined(__SYCL_DEVICE_ONLY__) +#if defined(__SYCL_DEVICE_ONLY__) constexpr auto mo = sycl::memory_order::relaxed; constexpr auto ms = sycl::memory_scope::device; sycl::atomic_ref a{*sum}; return a.fetch_add(value); #else - amrex::ignore_unused(sum, value); - return T(); // should never get here, but have to return something + AMREX_IF_ON_DEVICE(( return atomicAdd(sum, value); )) + AMREX_IF_ON_HOST(( + amrex::ignore_unused(sum, value); + return T(); // should never get here, but have to return something + )) #endif } @@ -175,7 +175,7 @@ namespace detail { #endif -#if defined(AMREX_USE_CUDA) && (__CUDA_ARCH__ < 600) +#if defined(AMREX_USE_CUDA) && defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 600) AMREX_GPU_DEVICE AMREX_FORCE_INLINE double Add_device (double* const sum, double const value) noexcept @@ -195,17 +195,16 @@ namespace detail { AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE T Add (T* sum, T value) noexcept { -#if AMREX_DEVICE_COMPILE #ifdef AMREX_USE_SYCL - return Add_device(sum, value); -#else - return Add_device(sum, value); -#endif + AMREX_IF_ON_DEVICE((return Add_device(sum, value);)) #else - auto old = *sum; - 
*sum += value; - return old; + AMREX_IF_ON_DEVICE((return Add_device(sum, value);)) #endif + AMREX_IF_ON_HOST(( + auto old = *sum; + *sum += value; + return old; + )) } //////////////////////////////////////////////////////////////////////// @@ -252,18 +251,19 @@ namespace detail { AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE bool If (T* const add, T const value, Op&& op, Cond&& cond) noexcept { -#if AMREX_DEVICE_COMPILE - return If_device(add, value, std::forward(op), std::forward(cond)); -#else - T old = *add; - T const tmp = op(old, value); - if (cond(tmp)) { - *add = tmp; - return true; - } else { - return false; - } -#endif + AMREX_IF_ON_DEVICE(( + return If_device(add, value, std::forward(op), std::forward(cond)); + )) + AMREX_IF_ON_HOST(( + T old = *add; + T const tmp = op(old, value); + if (cond(tmp)) { + *add = tmp; + return true; + } else { + return false; + } + )) } //////////////////////////////////////////////////////////////////////// @@ -278,14 +278,11 @@ namespace detail { AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE void AddNoRet (T* sum, T value) noexcept { -#if AMREX_DEVICE_COMPILE -#ifdef AMREX_USE_SYCL +#if defined(__SYCL_DEVICE_ONLY__) Add_device(sum, value); #else - Add_device(sum, value); -#endif -#else - *sum += value; + AMREX_IF_ON_DEVICE((Add_device(sum, value);)) + AMREX_IF_ON_HOST((*sum += value;)) #endif } @@ -293,14 +290,11 @@ namespace detail { AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE void AddNoRet (float* const sum, float const value) noexcept { -#if AMREX_DEVICE_COMPILE #pragma clang diagnostic push #pragma clang diagnostic ignored "-Wdeprecated-declarations" - atomicAddNoRet(sum, value); + AMREX_IF_ON_DEVICE((atomicAddNoRet(sum, value);)) #pragma clang diagnostic pop -#else - *sum += value; -#endif + AMREX_IF_ON_HOST((*sum += value;)) } #endif @@ -314,18 +308,18 @@ namespace detail { AMREX_GPU_DEVICE AMREX_FORCE_INLINE T Min_device (T* const m, T const value) noexcept { -#if defined(__CUDA_ARCH__) && defined(AMREX_USE_CUDA) || \ - 
defined(__HIP_DEVICE_COMPILE__) && defined(AMREX_USE_HIP) - return atomicMin(m, value); -#elif defined(__SYCL_DEVICE_ONLY__) +#if defined(__SYCL_DEVICE_ONLY__) constexpr auto mo = sycl::memory_order::relaxed; constexpr auto ms = sycl::memory_scope::device; constexpr auto as = sycl::access::address_space::global_space; sycl::atomic_ref a{*m}; return a.fetch_min(value); #else - amrex::ignore_unused(m,value); - return T(); // should never get here, but have to return something + AMREX_IF_ON_DEVICE(( return atomicMin(m, value); )) + AMREX_IF_ON_HOST(( + amrex::ignore_unused(m,value); + return T(); // should never get here, but have to return something + )) #endif } @@ -357,13 +351,14 @@ namespace detail { AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE T Min (T* const m, T const value) noexcept { -#if AMREX_DEVICE_COMPILE - return Min_device(m, value); -#else - auto const old = *m; - *m = (*m) < value ? (*m) : value; - return old; -#endif + AMREX_IF_ON_DEVICE(( + return Min_device(m, value); + )) + AMREX_IF_ON_HOST(( + auto const old = *m; + *m = (*m) < value ? 
(*m) : value; + return old; + )) } //////////////////////////////////////////////////////////////////////// @@ -376,18 +371,18 @@ namespace detail { AMREX_GPU_DEVICE AMREX_FORCE_INLINE T Max_device (T* const m, T const value) noexcept { -#if defined(__CUDA_ARCH__) && defined(AMREX_USE_CUDA) || \ - defined(__HIP_DEVICE_COMPILE__) && defined(AMREX_USE_HIP) - return atomicMax(m, value); -#elif defined(__SYCL_DEVICE_ONLY__) +#if defined(__SYCL_DEVICE_ONLY__) constexpr auto mo = sycl::memory_order::relaxed; constexpr auto ms = sycl::memory_scope::device; constexpr auto as = sycl::access::address_space::global_space; sycl::atomic_ref a{*m}; return a.fetch_max(value); #else - amrex::ignore_unused(m,value); - return T(); // should never get here, but have to return something + AMREX_IF_ON_DEVICE(( return atomicMax(m, value); )) + AMREX_IF_ON_HOST(( + amrex::ignore_unused(m,value); + return T(); // should never get here, but have to return something + )) #endif } @@ -419,13 +414,14 @@ namespace detail { AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE T Max (T* const m, T const value) noexcept { -#if AMREX_DEVICE_COMPILE - return Max_device(m, value); -#else - auto const old = *m; - *m = (*m) > value ? (*m) : value; - return old; -#endif + AMREX_IF_ON_DEVICE(( + return Max_device(m, value); + )) + AMREX_IF_ON_HOST(( + auto const old = *m; + *m = (*m) > value ? 
(*m) : value; + return old; + )) } //////////////////////////////////////////////////////////////////////// @@ -435,19 +431,21 @@ namespace detail { AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE int LogicalOr (int* const m, int const value) noexcept { -#if defined(__CUDA_ARCH__) && defined(AMREX_USE_CUDA) || \ - defined(__HIP_DEVICE_COMPILE__) && defined(AMREX_USE_HIP) - return atomicOr(m, value); -#elif defined(__SYCL_DEVICE_ONLY__) +#if defined(__SYCL_DEVICE_ONLY__) constexpr auto mo = sycl::memory_order::relaxed; constexpr auto ms = sycl::memory_scope::device; constexpr auto as = sycl::access::address_space::global_space; sycl::atomic_ref a{*m}; return a.fetch_or(value); #else - int const old = *m; - *m = (*m) || value; - return old; + AMREX_IF_ON_DEVICE(( + return atomicOr(m, value); + )) + AMREX_IF_ON_HOST(( + int const old = *m; + *m = (*m) || value; + return old; + )) #endif } @@ -458,19 +456,21 @@ namespace detail { AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE int LogicalAnd (int* const m, int const value) noexcept { -#if defined(__CUDA_ARCH__) && defined(AMREX_USE_CUDA) || \ - defined(__HIP_DEVICE_COMPILE__) && defined(AMREX_USE_HIP) - return atomicAnd(m, value ? ~0x0 : 0); -#elif defined(__SYCL_DEVICE_ONLY__) +#if defined(__SYCL_DEVICE_ONLY__) constexpr auto mo = sycl::memory_order::relaxed; constexpr auto ms = sycl::memory_scope::device; constexpr auto as = sycl::access::address_space::global_space; sycl::atomic_ref a{*m}; return a.fetch_and(value ? ~0x0 : 0); #else - int const old = *m; - *m = (*m) && value; - return old; + AMREX_IF_ON_DEVICE(( + return atomicAnd(m, value ? 
~0x0 : 0); + )) + AMREX_IF_ON_HOST(( + int const old = *m; + *m = (*m) && value; + return old; + )) #endif } @@ -482,19 +482,21 @@ namespace detail { AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE T Exch (T* address, T val) noexcept { -#if defined(__CUDA_ARCH__) && defined(AMREX_USE_CUDA) || \ - defined(__HIP_DEVICE_COMPILE__) && defined(AMREX_USE_HIP) - return atomicExch(address, val); -#elif defined(__SYCL_DEVICE_ONLY__) +#if defined(__SYCL_DEVICE_ONLY__) constexpr auto mo = sycl::memory_order::relaxed; constexpr auto ms = sycl::memory_scope::device; constexpr auto as = sycl::access::address_space::global_space; sycl::atomic_ref a{*address}; return a.exchange(val); #else - auto const old = *address; - *address = val; - return old; + AMREX_IF_ON_DEVICE(( + return atomicExch(address, val); + )) + AMREX_IF_ON_HOST(( + auto const old = *address; + *address = val; + return old; + )) #endif } @@ -506,10 +508,7 @@ namespace detail { AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE T CAS (T* const address, T compare, T const val) noexcept { // cannot be T const compare because of compare_exchange_strong -#if defined(__CUDA_ARCH__) && defined(AMREX_USE_CUDA) || \ - defined(__HIP_DEVICE_COMPILE__) && defined(AMREX_USE_HIP) - return atomicCAS(address, compare, val); -#elif defined(__SYCL_DEVICE_ONLY__) +#if defined(__SYCL_DEVICE_ONLY__) constexpr auto mo = sycl::memory_order::relaxed; constexpr auto ms = sycl::memory_scope::device; constexpr auto as = sycl::access::address_space::global_space; @@ -517,9 +516,14 @@ namespace detail { a.compare_exchange_strong(compare, val); return compare; #else - auto const old = *address; - *address = (old == compare ? val : old); - return old; + AMREX_IF_ON_DEVICE(( + return atomicCAS(address, compare, val); + )) + AMREX_IF_ON_HOST(( + auto const old = *address; + *address = (old == compare ? 
val : old); + return old; + )) #endif } } @@ -527,17 +531,21 @@ namespace detail { namespace HostDevice::Atomic { template - AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE - void Add (T* const sum, T const value) noexcept + AMREX_FORCE_INLINE + void Add_Host (T* const sum, T const value) noexcept { -#if AMREX_DEVICE_COMPILE - Gpu::Atomic::AddNoRet(sum,value); -#else #ifdef AMREX_USE_OMP #pragma omp atomic update #endif *sum += value; -#endif + } + + template + AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE + void Add (T* const sum, T const value) noexcept + { + AMREX_IF_ON_DEVICE((Gpu::Atomic::AddNoRet(sum,value);)) + AMREX_IF_ON_HOST((Add_Host(sum,value);)) } } diff --git a/Src/Base/AMReX_GpuLaunch.H b/Src/Base/AMReX_GpuLaunch.H index c1870d2ef58..c4ba7dd86bc 100644 --- a/Src/Base/AMReX_GpuLaunch.H +++ b/Src/Base/AMReX_GpuLaunch.H @@ -103,20 +103,21 @@ namespace Gpu { inline Box getThreadBox (const Box& bx, Long offset) noexcept { -#if AMREX_DEVICE_COMPILE - const auto len = bx.length3d(); - Long k = offset / (len[0]*len[1]); - Long j = (offset - k*(len[0]*len[1])) / len[0]; - Long i = (offset - k*(len[0]*len[1])) - j*len[0]; - IntVect iv{AMREX_D_DECL(static_cast(i), - static_cast(j), - static_cast(k))}; - iv += bx.smallEnd(); - return (bx & Box(iv,iv,bx.type())); -#else - amrex::ignore_unused(offset); - return bx; -#endif + AMREX_IF_ON_DEVICE(( + const auto len = bx.length3d(); + Long k = offset / (len[0]*len[1]); + Long j = (offset - k*(len[0]*len[1])) / len[0]; + Long i = (offset - k*(len[0]*len[1])) - j*len[0]; + IntVect iv{AMREX_D_DECL(static_cast(i), + static_cast(j), + static_cast(k))}; + iv += bx.smallEnd(); + return (bx & Box(iv,iv,bx.type())); + )) + AMREX_IF_ON_HOST(( + amrex::ignore_unused(offset); + return bx; + )) } // ************************************************ diff --git a/Src/Base/AMReX_GpuQualifiers.H b/Src/Base/AMReX_GpuQualifiers.H index 1c0b5731762..4fba23a849a 100644 --- a/Src/Base/AMReX_GpuQualifiers.H +++ b/Src/Base/AMReX_GpuQualifiers.H @@ -8,6 
+8,12 @@ #include #endif +#if defined(AMREX_USE_CUDA) && (defined(AMREX_CXX_PGI) || defined(AMREX_CXX_NVHPC)) +#include +#define AMREX_IF_ON_DEVICE(CODE) NV_IF_TARGET(NV_IS_DEVICE, CODE) +#define AMREX_IF_ON_HOST(CODE) NV_IF_TARGET(NV_IS_HOST, CODE) +#endif + #define AMREX_GPU_HOST __host__ #define AMREX_GPU_DEVICE __device__ #define AMREX_GPU_GLOBAL __global__ @@ -31,6 +37,29 @@ #define AMREX_DEVICE_COMPILE (__CUDA_ARCH__ || __HIP_DEVICE_COMPILE__ || __SYCL_DEVICE_ONLY__) +// Remove surrounding parentheses if present +#define AMREX_IMPL_STRIP_PARENS(X) AMREX_IMPL_ESC(AMREX_IMPL_ISH X) +#define AMREX_IMPL_ISH(...) AMREX_IMPL_ISH __VA_ARGS__ +#define AMREX_IMPL_ESC(...) AMREX_IMPL_ESC_(__VA_ARGS__) +#define AMREX_IMPL_ESC_(...) AMREX_IMPL_VAN_##__VA_ARGS__ +#define AMREX_IMPL_VAN_AMREX_IMPL_ISH + +#if !defined(AMREX_IF_ON_DEVICE) && !defined(AMREX_IF_ON_HOST) +#if (defined(AMREX_USE_CUDA) && defined(__CUDA_ARCH__)) || \ + (defined(AMREX_USE_HIP) && defined(__HIP_DEVICE_COMPILE__)) || \ + (defined(AMREX_USE_SYCL) && defined(__SYCL_DEVICE_ONLY__)) +#define AMREX_IF_ON_DEVICE(CODE) \ + { AMREX_IMPL_STRIP_PARENS(CODE) } +#define AMREX_IF_ON_HOST(CODE) \ + {} +#else +#define AMREX_IF_ON_DEVICE(CODE) \ + {} +#define AMREX_IF_ON_HOST(CODE) \ + { AMREX_IMPL_STRIP_PARENS(CODE) } +#endif +#endif + #ifdef AMREX_USE_SYCL # include #endif diff --git a/Src/Base/AMReX_GpuRange.H b/Src/Base/AMReX_GpuRange.H index b8d2ab89d08..be5071dbf8a 100644 --- a/Src/Base/AMReX_GpuRange.H +++ b/Src/Base/AMReX_GpuRange.H @@ -32,31 +32,31 @@ Long at (T const& /*b*/, Long offset) noexcept { return offset; } AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE Long size (Box const& b) noexcept { -#if AMREX_DEVICE_COMPILE - return b.numPts(); -#else - amrex::ignore_unused(b); - return 1; -#endif + AMREX_IF_ON_DEVICE((return b.numPts();)) + AMREX_IF_ON_HOST(( + amrex::ignore_unused(b); + return 1; + )) } AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE Box at (Box const& b, Long offset) noexcept { -#if 
AMREX_DEVICE_COMPILE - auto len = b.length3d(); - Long k = offset / (len[0]*len[1]); - Long j = (offset - k*(len[0]*len[1])) / len[0]; - Long i = (offset - k*(len[0]*len[1])) - j*len[0]; - IntVect iv{AMREX_D_DECL(static_cast(i), - static_cast(j), - static_cast(k))}; - iv += b.smallEnd(); - return Box(iv,iv,b.type()); -#else - amrex::ignore_unused(offset); - return b; -#endif + AMREX_IF_ON_DEVICE(( + auto len = b.length3d(); + Long k = offset / (len[0]*len[1]); + Long j = (offset - k*(len[0]*len[1])) / len[0]; + Long i = (offset - k*(len[0]*len[1])) - j*len[0]; + IntVect iv{AMREX_D_DECL(static_cast(i), + static_cast(j), + static_cast(k))}; + iv += b.smallEnd(); + return Box(iv,iv,b.type()); + )) + AMREX_IF_ON_HOST(( + amrex::ignore_unused(offset); + return b; + )) } template @@ -92,13 +92,15 @@ struct range_impl [[nodiscard]] AMREX_GPU_HOST_DEVICE iterator begin () const noexcept { -#if defined(__CUDA_ARCH__) && defined(AMREX_USE_CUDA) || \ - defined(__HIP_DEVICE_COMPILE__) && defined(AMREX_USE_HIP) - return iterator(m_b, blockDim.x*blockIdx.x+threadIdx.x, blockDim.x*gridDim.x); -#elif defined (__SYCL_DEVICE_ONLY__) +#if defined (__SYCL_DEVICE_ONLY__) return iterator(m_b, m_gid, m_grange); #else - return iterator(m_b,0,1); + AMREX_IF_ON_DEVICE(( + return iterator(m_b, blockDim.x*blockIdx.x+threadIdx.x, blockDim.x*gridDim.x); + )) + AMREX_IF_ON_HOST(( + return iterator(m_b,0,1); + )) #endif } diff --git a/Src/Base/AMReX_GpuUtility.H b/Src/Base/AMReX_GpuUtility.H index ce98556fc5c..4adc111f5e2 100644 --- a/Src/Base/AMReX_GpuUtility.H +++ b/Src/Base/AMReX_GpuUtility.H @@ -26,8 +26,9 @@ namespace Gpu { template AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE T LDG (Array4 const& a, int i, int j, int k) noexcept { -#if defined(__CUDA_ARCH__) && defined(AMREX_USE_CUDA) - return __ldg(a.ptr(i,j,k)); +#if defined(AMREX_USE_CUDA) + AMREX_IF_ON_DEVICE((return __ldg(a.ptr(i,j,k));)) + AMREX_IF_ON_HOST((return a(i,j,k);)) #else return a(i,j,k); #endif @@ -36,8 +37,9 @@ namespace Gpu { 
template AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE T LDG (Array4 const& a, int i, int j, int k, int n) noexcept { -#if defined(__CUDA_ARCH__) && defined(AMREX_USE_CUDA) - return __ldg(a.ptr(i,j,k,n)); +#if defined(AMREX_USE_CUDA) + AMREX_IF_ON_DEVICE((return __ldg(a.ptr(i,j,k,n));)) + AMREX_IF_ON_HOST((return a(i,j,k,n);)) #else return a(i,j,k,n); #endif diff --git a/Src/Base/AMReX_Math.H b/Src/Base/AMReX_Math.H index 769b9bf50f4..506289d03d5 100644 --- a/Src/Base/AMReX_Math.H +++ b/Src/Base/AMReX_Math.H @@ -68,11 +68,9 @@ double cospi (double x) { #if defined(AMREX_USE_SYCL) return sycl::cospi(x); -#elif defined(__CUDA_ARCH__) && defined(AMREX_USE_CUDA) || \ - defined(__HIP_DEVICE_COMPILE__) && defined(AMREX_USE_HIP) - return ::cospi(x); #else - return std::cos(pi()*x); + AMREX_IF_ON_DEVICE(( return ::cospi(x); )) + AMREX_IF_ON_HOST(( return std::cos(pi()*x); )) #endif } @@ -82,11 +80,9 @@ float cospi (float x) { #if defined(AMREX_USE_SYCL) return sycl::cospi(x); -#elif defined(__CUDA_ARCH__) && defined(AMREX_USE_CUDA) || \ - defined(__HIP_DEVICE_COMPILE__) && defined(AMREX_USE_HIP) - return ::cospif(x); #else - return std::cos(pi()*x); + AMREX_IF_ON_DEVICE(( return ::cospif(x); )) + AMREX_IF_ON_HOST(( return std::cos(pi()*x); )) #endif } @@ -96,11 +92,9 @@ double sinpi (double x) { #if defined(AMREX_USE_SYCL) return sycl::sinpi(x); -#elif defined(__CUDA_ARCH__) && defined(AMREX_USE_CUDA) || \ - defined(__HIP_DEVICE_COMPILE__) && defined(AMREX_USE_HIP) - return ::sinpi(x); #else - return std::sin(pi()*x); + AMREX_IF_ON_DEVICE(( return ::sinpi(x); )) + AMREX_IF_ON_HOST(( return std::sin(pi()*x); )) #endif } @@ -110,14 +104,32 @@ float sinpi (float x) { #if defined(AMREX_USE_SYCL) return sycl::sinpi(x); -#elif defined(__CUDA_ARCH__) && defined(AMREX_USE_CUDA) || \ - defined(__HIP_DEVICE_COMPILE__) && defined(AMREX_USE_HIP) - return ::sinpif(x); #else - return std::sin(pi()*x); + AMREX_IF_ON_DEVICE(( return ::sinpif(x); )) + AMREX_IF_ON_HOST(( return std::sin(pi()*x); 
)) #endif } +namespace detail { + AMREX_FORCE_INLINE void sincos (double x, double* sinx, double* cosx) { +#if defined(_GNU_SOURCE) && !defined(__APPLE__) + ::sincos(x, sinx, cosx); +#else + *sinx = std::sin(x); + *cosx = std::cos(x); +#endif + } + + AMREX_FORCE_INLINE void sincosf (float x, float* sinx, float* cosx) { +#if defined(_GNU_SOURCE) && !defined(__APPLE__) + ::sincosf(x, sinx, cosx); +#else + *sinx = std::sin(x); + *cosx = std::cos(x); +#endif + } +} + //! Return sine and cosine of given number AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE std::pair sincos (double x) @@ -125,13 +137,9 @@ std::pair sincos (double x) std::pair r; #if defined(AMREX_USE_SYCL) r.first = sycl::sincos(x, sycl::private_ptr(&r.second)); -#elif defined(__CUDA_ARCH__) && defined(AMREX_USE_CUDA) || \ - defined(__HIP_DEVICE_COMPILE__) && defined(AMREX_USE_HIP) || \ - (defined(_GNU_SOURCE) && !defined(__APPLE__)) - ::sincos(x, &r.first, &r.second); #else - r.first = std::sin(x); - r.second = std::cos(x); + AMREX_IF_ON_DEVICE(( ::sincos(x, &r.first, &r.second); )) + AMREX_IF_ON_HOST(( detail::sincos(x, &r.first, &r.second); )) #endif return r; } @@ -143,13 +151,9 @@ std::pair sincos (float x) std::pair r; #if defined(AMREX_USE_SYCL) r.first = sycl::sincos(x, sycl::private_ptr(&r.second)); -#elif defined(__CUDA_ARCH__) && defined(AMREX_USE_CUDA) || \ - defined(__HIP_DEVICE_COMPILE__) && defined(AMREX_USE_HIP) || \ - (defined(_GNU_SOURCE) && !defined(__APPLE__)) - ::sincosf(x, &r.first, &r.second); #else - r.first = std::sin(x); - r.second = std::cos(x); + AMREX_IF_ON_DEVICE(( ::sincosf(x, &r.first, &r.second); )) + AMREX_IF_ON_HOST(( detail::sincosf(x, &r.first, &r.second); )) #endif return r; } @@ -159,11 +163,11 @@ AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE std::pair sincospi (double x) { std::pair r; -#if defined(__CUDA_ARCH__) && defined(AMREX_USE_CUDA) || \ - defined(__HIP_DEVICE_COMPILE__) && defined(AMREX_USE_HIP) - ::sincospi(x, &r.first, &r.second); -#else +#if defined(AMREX_USE_SYCL) 
r = sincos(pi()*x); +#else + AMREX_IF_ON_DEVICE(( ::sincospi(x, &r.first, &r.second); )) + AMREX_IF_ON_HOST(( r = sincos(pi()*x); )) #endif return r; } @@ -173,11 +177,11 @@ AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE std::pair sincospi (float x) { std::pair r; -#if defined(__CUDA_ARCH__) && defined(AMREX_USE_CUDA) || \ - defined(__HIP_DEVICE_COMPILE__) && defined(AMREX_USE_HIP) - ::sincospif(x, &r.first, &r.second); -#else +#if defined(AMREX_USE_SYCL) r = sincos(pi()*x); +#else + AMREX_IF_ON_DEVICE(( ::sincospif(x, &r.first, &r.second); )) + AMREX_IF_ON_HOST(( r = sincos(pi()*x); )) #endif return r; } diff --git a/Src/Base/AMReX_Random.H b/Src/Base/AMReX_Random.H index 675c12082d5..50b2c2693b0 100644 --- a/Src/Base/AMReX_Random.H +++ b/Src/Base/AMReX_Random.H @@ -23,24 +23,29 @@ namespace amrex AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE Real Random (RandomEngine const& random_engine) { -#if defined(__CUDA_ARCH__) && defined(AMREX_USE_CUDA) -#ifdef BL_USE_FLOAT - return 1.0f - curand_uniform(random_engine.rand_state); +#if defined (__SYCL_DEVICE_ONLY__) + mkl::rng::device::uniform distr; + return mkl::rng::device::generate(distr, *random_engine.engine); #else - return 1.0 - curand_uniform_double(random_engine.rand_state); -#endif -#elif defined(__HIP_DEVICE_COMPILE__) && defined(AMREX_USE_HIP) #ifdef BL_USE_FLOAT - return 1.0f - hiprand_uniform(random_engine.rand_state); + AMREX_IF_ON_DEVICE(( + AMREX_HIP_OR_CUDA( + return 1.0f - hiprand_uniform(random_engine.rand_state); , + return 1.0f - curand_uniform(random_engine.rand_state); + ) + )) #else - return 1.0 - hiprand_uniform_double(random_engine.rand_state); + AMREX_IF_ON_DEVICE(( + AMREX_HIP_OR_CUDA( + return 1.0 - hiprand_uniform_double(random_engine.rand_state); , + return 1.0 - curand_uniform_double(random_engine.rand_state); + ) + )) #endif -#elif defined (__SYCL_DEVICE_ONLY__) - mkl::rng::device::uniform distr; - return mkl::rng::device::generate(distr, *random_engine.engine); -#else - 
amrex::ignore_unused(random_engine); - return Random(); + AMREX_IF_ON_HOST(( + amrex::ignore_unused(random_engine); + return Random(); + )) #endif } @@ -56,24 +61,29 @@ namespace amrex AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE Real RandomNormal (Real mean, Real stddev, RandomEngine const& random_engine) { -#if defined(__CUDA_ARCH__) && defined(AMREX_USE_CUDA) -#ifdef BL_USE_FLOAT - return stddev * curand_normal(random_engine.rand_state) + mean; +#if defined (__SYCL_DEVICE_ONLY__) + mkl::rng::device::gaussian distr(mean, stddev); + return mkl::rng::device::generate(distr, *random_engine.engine); #else - return stddev * curand_normal_double(random_engine.rand_state) + mean; -#endif -#elif defined(__HIP_DEVICE_COMPILE__) && defined(AMREX_USE_HIP) #ifdef BL_USE_FLOAT - return stddev * hiprand_normal(random_engine.rand_state) + mean; + AMREX_IF_ON_DEVICE(( + AMREX_HIP_OR_CUDA( + return stddev * hiprand_normal(random_engine.rand_state) + mean; , + return stddev * curand_normal(random_engine.rand_state) + mean; + ) + )) #else - return stddev * hiprand_normal_double(random_engine.rand_state) + mean; + AMREX_IF_ON_DEVICE(( + AMREX_HIP_OR_CUDA( + return stddev * hiprand_normal_double(random_engine.rand_state) + mean; , + return stddev * curand_normal_double(random_engine.rand_state) + mean; + ) + )) #endif -#elif defined (__SYCL_DEVICE_ONLY__) - mkl::rng::device::gaussian distr(mean, stddev); - return mkl::rng::device::generate(distr, *random_engine.engine); -#else - amrex::ignore_unused(random_engine); - return RandomNormal(mean, stddev); + AMREX_IF_ON_HOST(( + amrex::ignore_unused(random_engine); + return RandomNormal(mean, stddev); + )) #endif } @@ -91,16 +101,20 @@ namespace amrex AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE unsigned int RandomPoisson (Real lambda, RandomEngine const& random_engine) { -#if defined(__CUDA_ARCH__) && defined(AMREX_USE_CUDA) - return curand_poisson(random_engine.rand_state, lambda); -#elif defined(__HIP_DEVICE_COMPILE__) && 
defined(AMREX_USE_HIP) - return hiprand_poisson(random_engine.rand_state, lambda); -#elif defined (__SYCL_DEVICE_ONLY__) +#if defined (__SYCL_DEVICE_ONLY__) mkl::rng::device::poisson distr(lambda); return mkl::rng::device::generate(distr, *random_engine.engine); #else - amrex::ignore_unused(random_engine); - return RandomPoisson(lambda); + AMREX_IF_ON_DEVICE(( + AMREX_HIP_OR_CUDA( + return hiprand_poisson(random_engine.rand_state, lambda); , + return curand_poisson(random_engine.rand_state, lambda); + ) + )) + AMREX_IF_ON_HOST(( + amrex::ignore_unused(random_engine); + return RandomPoisson(lambda); + )) #endif } @@ -116,22 +130,23 @@ namespace amrex AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE unsigned int Random_int (unsigned int n, RandomEngine const& random_engine) { -#if AMREX_DEVICE_COMPILE #if defined(__SYCL_DEVICE_ONLY__) mkl::rng::device::uniform distr(0,n); return mkl::rng::device::generate(distr, *random_engine.engine); #else - unsigned int rand; - constexpr unsigned int RAND_M = 4294967295; // 2**32-1 - do { - AMREX_HIP_OR_CUDA( rand = hiprand(random_engine.rand_state);, - rand = curand(random_engine.rand_state) ); - } while (rand > (RAND_M - RAND_M % n)); - return rand % n; -#endif -#else - amrex::ignore_unused(random_engine); - return Random_int(n); + AMREX_IF_ON_DEVICE(( + unsigned int rand; + constexpr unsigned int RAND_M = 4294967295; // 2**32-1 + do { + AMREX_HIP_OR_CUDA( rand = hiprand(random_engine.rand_state);, + rand = curand(random_engine.rand_state) ); + } while (rand > (RAND_M - RAND_M % n)); + return rand % n; + )) + AMREX_IF_ON_HOST(( + amrex::ignore_unused(random_engine); + return Random_int(n); + )) #endif } diff --git a/Src/Base/AMReX_TableData.H b/Src/Base/AMReX_TableData.H index b7572e2a1cf..842225e53f4 100644 --- a/Src/Base/AMReX_TableData.H +++ b/Src/Base/AMReX_TableData.H @@ -57,16 +57,17 @@ struct Table1D void index_assert (int i) const { if (i < begin || i >= end) { -#if AMREX_DEVICE_COMPILE - AMREX_DEVICE_PRINTF(" (%d) is out of 
bound (%d:%d)\n", - i, begin, end-1); - amrex::Abort(); -#else - std::stringstream ss; - ss << " (" << i << ") is out of bound (" - << begin << ":" << end-1 << ")"; - amrex::Abort(ss.str()); -#endif + AMREX_IF_ON_DEVICE(( + AMREX_DEVICE_PRINTF(" (%d) is out of bound (%d:%d)\n", + i, begin, end-1); + amrex::Abort(); + )) + AMREX_IF_ON_HOST(( + std::stringstream ss; + ss << " (" << i << ") is out of bound (" + << begin << ":" << end-1 << ")"; + amrex::Abort(ss.str()); + )) } } #endif @@ -120,17 +121,18 @@ struct Table2D { if (i < begin[0] || i >= end[0] || j < begin[1] || j >= end[1]) { -#if AMREX_DEVICE_COMPILE - AMREX_DEVICE_PRINTF(" (%d,%d) is out of bound (%d:%d,%d:%d)\n", - i, j, begin[0], end[0]-1, begin[1], end[1]-1); - amrex::Abort(); -#else - std::stringstream ss; - ss << " (" << i << "," << j << ") is out of bound (" - << begin[0] << ":" << end[0]-1 - << "," << begin[1] << ":" << end[1]-1 << ")"; - amrex::Abort(ss.str()); -#endif + AMREX_IF_ON_DEVICE(( + AMREX_DEVICE_PRINTF(" (%d,%d) is out of bound (%d:%d,%d:%d)\n", + i, j, begin[0], end[0]-1, begin[1], end[1]-1); + amrex::Abort(); + )) + AMREX_IF_ON_HOST(( + std::stringstream ss; + ss << " (" << i << "," << j << ") is out of bound (" + << begin[0] << ":" << end[0]-1 + << "," << begin[1] << ":" << end[1]-1 << ")"; + amrex::Abort(ss.str()); + )) } } #endif @@ -188,19 +190,20 @@ struct Table3D if (i < begin[0] || i >= end[0] || j < begin[1] || j >= end[1] || k < begin[2] || k >= end[2]) { -#if AMREX_DEVICE_COMPILE - AMREX_DEVICE_PRINTF(" (%d,%d,%d) is out of bound (%d:%d,%d:%d,%d:%d)\n", - i, j, k, begin[0], end[0]-1, begin[1], end[1]-1, - begin[2], end[2]-1); - amrex::Abort(); -#else - std::stringstream ss; - ss << " (" << i << "," << j << "," << k << ") is out of bound (" - << begin[0] << ":" << end[0]-1 - << "," << begin[1] << ":" << end[1]-1 - << "," << begin[2] << ":" << end[2]-1 << ")"; - amrex::Abort(ss.str()); -#endif + AMREX_IF_ON_DEVICE(( + AMREX_DEVICE_PRINTF(" (%d,%d,%d) is out of bound 
(%d:%d,%d:%d,%d:%d)\n", + i, j, k, begin[0], end[0]-1, begin[1], end[1]-1, + begin[2], end[2]-1); + amrex::Abort(); + )) + AMREX_IF_ON_HOST(( + std::stringstream ss; + ss << " (" << i << "," << j << "," << k << ") is out of bound (" + << begin[0] << ":" << end[0]-1 + << "," << begin[1] << ":" << end[1]-1 + << "," << begin[2] << ":" << end[2]-1 << ")"; + amrex::Abort(ss.str()); + )) } } #endif @@ -262,20 +265,21 @@ struct Table4D j < begin[1] || j >= end[1] || k < begin[2] || k >= end[2] || n < begin[3] || n >= end[3]) { -#if AMREX_DEVICE_COMPILE - AMREX_DEVICE_PRINTF(" (%d,%d,%d,%d) is out of bound (%d:%d,%d:%d,%d:%d,%d:%d)\n", - i, j, k, n, begin[0], end[0]-1, begin[1], end[1]-1, - begin[2], end[2]-1, begin[3], end[3]-1); - amrex::Abort(); -#else - std::stringstream ss; - ss << " (" << i << "," << j << "," << k << "," << n << ") is out of bound (" - << begin[0] << ":" << end[0]-1 - << "," << begin[1] << ":" << end[1]-1 - << "," << begin[2] << ":" << end[2]-1 - << "," << begin[3] << ":" << end[3]-1 << ")"; - amrex::Abort(ss.str()); -#endif + AMREX_IF_ON_DEVICE(( + AMREX_DEVICE_PRINTF(" (%d,%d,%d,%d) is out of bound (%d:%d,%d:%d,%d:%d,%d:%d)\n", + i, j, k, n, begin[0], end[0]-1, begin[1], end[1]-1, + begin[2], end[2]-1, begin[3], end[3]-1); + amrex::Abort(); + )) + AMREX_IF_ON_HOST(( + std::stringstream ss; + ss << " (" << i << "," << j << "," << k << "," << n << ") is out of bound (" + << begin[0] << ":" << end[0]-1 + << "," << begin[1] << ":" << end[1]-1 + << "," << begin[2] << ":" << end[2]-1 + << "," << begin[3] << ":" << end[3]-1 << ")"; + amrex::Abort(ss.str()); + )) } } #endif diff --git a/Src/Base/Parser/AMReX_IParser.H b/Src/Base/Parser/AMReX_IParser.H index 69f40252b0c..025da853c28 100644 --- a/Src/Base/Parser/AMReX_IParser.H +++ b/Src/Base/Parser/AMReX_IParser.H @@ -20,11 +20,8 @@ struct IParserExecutor [[nodiscard]] AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE int operator() () const noexcept { -#if AMREX_DEVICE_COMPILE - return 
iparser_exe_eval(m_device_executor, nullptr); -#else - return iparser_exe_eval(m_host_executor, nullptr); -#endif + AMREX_IF_ON_DEVICE((return iparser_exe_eval(m_device_executor, nullptr);)) + AMREX_IF_ON_HOST((return iparser_exe_eval(m_host_executor, nullptr);)) } template @@ -33,30 +30,21 @@ struct IParserExecutor operator() (Ts... var) const noexcept { amrex::GpuArray l_var{var...}; -#if AMREX_DEVICE_COMPILE - return iparser_exe_eval(m_device_executor, l_var.data()); -#else - return iparser_exe_eval(m_host_executor, l_var.data()); -#endif + AMREX_IF_ON_DEVICE((return iparser_exe_eval(m_device_executor, l_var.data());)) + AMREX_IF_ON_HOST((return iparser_exe_eval(m_host_executor, l_var.data());)) } [[nodiscard]] AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE int operator() (GpuArray const& var) const noexcept { -#if AMREX_DEVICE_COMPILE - return iparser_exe_eval(m_device_executor, var.data()); -#else - return iparser_exe_eval(m_host_executor, var.data()); -#endif + AMREX_IF_ON_DEVICE((return iparser_exe_eval(m_device_executor, var.data());)) + AMREX_IF_ON_HOST((return iparser_exe_eval(m_host_executor, var.data());)) } [[nodiscard]] AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE explicit operator bool () const { -#if AMREX_DEVICE_COMPILE - return m_device_executor != nullptr; -#else - return m_host_executor != nullptr; -#endif + AMREX_IF_ON_DEVICE((return m_device_executor != nullptr;)) + AMREX_IF_ON_HOST((return m_host_executor != nullptr;)) } char* m_host_executor = nullptr; diff --git a/Src/Base/Parser/AMReX_Parser.H b/Src/Base/Parser/AMReX_Parser.H index b74de941950..456910f8736 100644 --- a/Src/Base/Parser/AMReX_Parser.H +++ b/Src/Base/Parser/AMReX_Parser.H @@ -21,11 +21,8 @@ struct ParserExecutor [[nodiscard]] AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE double operator() () const noexcept { -#if AMREX_DEVICE_COMPILE - return parser_exe_eval(m_device_executor, nullptr); -#else - return parser_exe_eval(m_host_executor, nullptr); -#endif + AMREX_IF_ON_DEVICE((return 
parser_exe_eval(m_device_executor, nullptr);)) + AMREX_IF_ON_HOST((return parser_exe_eval(m_host_executor, nullptr);)) } template @@ -34,11 +31,8 @@ struct ParserExecutor operator() (Ts... var) const noexcept { amrex::GpuArray l_var{var...}; -#if AMREX_DEVICE_COMPILE - return parser_exe_eval(m_device_executor, l_var.data()); -#else - return parser_exe_eval(m_host_executor, l_var.data()); -#endif + AMREX_IF_ON_DEVICE((return parser_exe_eval(m_device_executor, l_var.data());)) + AMREX_IF_ON_HOST((return parser_exe_eval(m_host_executor, l_var.data());)) } template @@ -47,30 +41,21 @@ struct ParserExecutor operator() (Ts... var) const noexcept { amrex::GpuArray l_var{var...}; -#if AMREX_DEVICE_COMPILE - return static_cast(parser_exe_eval(m_device_executor, l_var.data())); -#else - return static_cast(parser_exe_eval(m_host_executor, l_var.data())); -#endif + AMREX_IF_ON_DEVICE((return static_cast(parser_exe_eval(m_device_executor, l_var.data()));)) + AMREX_IF_ON_HOST((return static_cast(parser_exe_eval(m_host_executor, l_var.data()));)) } [[nodiscard]] AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE double operator() (GpuArray const& var) const noexcept { -#if AMREX_DEVICE_COMPILE - return parser_exe_eval(m_device_executor, var.data()); -#else - return parser_exe_eval(m_host_executor, var.data()); -#endif + AMREX_IF_ON_DEVICE((return parser_exe_eval(m_device_executor, var.data());)) + AMREX_IF_ON_HOST((return parser_exe_eval(m_host_executor, var.data());)) } AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE explicit operator bool () const { -#if AMREX_DEVICE_COMPILE - return m_device_executor != nullptr; -#else - return m_host_executor != nullptr; -#endif + AMREX_IF_ON_DEVICE((return m_device_executor != nullptr;)) + AMREX_IF_ON_HOST((return m_host_executor != nullptr;)) } char* m_host_executor = nullptr; diff --git a/Src/Base/Parser/AMReX_Parser_Y.H b/Src/Base/Parser/AMReX_Parser_Y.H index 792f796554e..e84cf9e0d59 100644 --- a/Src/Base/Parser/AMReX_Parser_Y.H +++ 
b/Src/Base/Parser/AMReX_Parser_Y.H @@ -350,7 +350,7 @@ template AMREX_GPU_HOST_DEVICE AMREX_NO_INLINE T parser_math_comp_ellint_1 (T a) { -#if defined(__GNUC__) && !defined(__clang__) && !defined(__CUDA_ARCH__) +#if defined(__GNUC__) && !defined(__clang__) && !defined(__CUDA_ARCH__) && !defined(__NVCOMPILER) return std::comp_ellint_1(a); #else amrex::ignore_unused(a); @@ -363,7 +363,7 @@ template AMREX_GPU_HOST_DEVICE AMREX_NO_INLINE T parser_math_comp_ellint_2 (T a) { -#if defined(__GNUC__) && !defined(__clang__) && !defined(__CUDA_ARCH__) +#if defined(__GNUC__) && !defined(__clang__) && !defined(__CUDA_ARCH__) && !defined(__NVCOMPILER) return std::comp_ellint_2(a); #else amrex::ignore_unused(a); diff --git a/Src/EB/AMReX_EB2_GeometryShop.H b/Src/EB/AMReX_EB2_GeometryShop.H index ee353c13952..33931b28c5f 100644 --- a/Src/EB/AMReX_EB2_GeometryShop.H +++ b/Src/EB/AMReX_EB2_GeometryShop.H @@ -28,13 +28,12 @@ AMREX_GPU_HOST_DEVICE Real IF_f (F const& f, GpuArray const& p) noexcept { -#if AMREX_DEVICE_COMPILE - amrex::ignore_unused(f,p); - amrex::Error("EB2::GeometryShop: how did this happen?"); - return 0.0; -#else - return f({AMREX_D_DECL(p[0],p[1],p[2])}); -#endif + AMREX_IF_ON_DEVICE(( + amrex::ignore_unused(f,p); + amrex::Error("EB2::GeometryShop: how did this happen?"); + return 0.0; + )) + AMREX_IF_ON_HOST((return f({AMREX_D_DECL(p[0],p[1],p[2])});)) } template From 6e2b831245f6fdac9a714c64417bd5ab21fb613d Mon Sep 17 00:00:00 2001 From: Edward Basso Date: Mon, 13 Nov 2023 15:00:36 -0800 Subject: [PATCH 09/14] solve_bicgstab: use linop.make instead of MF constructor (#3619) ## Summary This PR replaces the explicit use of MF constructors in ```MLCGSolverT::solve_bicgstab``` with calls to the `make` method of the linear operator associated with the MLCGSolverT object. 
## Additional background The use of `MLLinOpT::make` allows for inheritance of MLCGSolverT without an override of `solve_bicgstab` even if the MF class lacks a constructor with the same arguments as those MultiFab. For the MLMG template classes, `make` should generally be used instead of explicit MF constructors. Another PR to change this in `solve_cg` will follow once this is fully vetted and approved. --- Src/LinearSolvers/MLMG/AMReX_MLCGSolver.H | 22 +++++++++------------- 1 file changed, 9 insertions(+), 13 deletions(-) diff --git a/Src/LinearSolvers/MLMG/AMReX_MLCGSolver.H b/Src/LinearSolvers/MLMG/AMReX_MLCGSolver.H index 3764fa38f8a..fce7b1d5005 100644 --- a/Src/LinearSolvers/MLMG/AMReX_MLCGSolver.H +++ b/Src/LinearSolvers/MLMG/AMReX_MLCGSolver.H @@ -90,22 +90,18 @@ MLCGSolverT::solve_bicgstab (MF& sol, const MF& rhs, RT eps_rel, RT eps_abs) const int ncomp = sol.nComp(); - const BoxArray& ba = sol.boxArray(); - const DistributionMapping& dm = sol.DistributionMap(); - const auto& factory = sol.Factory(); - - MF ph(ba, dm, ncomp, sol.nGrowVect(), MFInfo(), factory); - MF sh(ba, dm, ncomp, sol.nGrowVect(), MFInfo(), factory); + MF ph = Lp.make(amrlev, mglev, sol.nGrowVect()); + MF sh = Lp.make(amrlev, mglev, sol.nGrowVect()); ph.setVal(RT(0.0)); sh.setVal(RT(0.0)); - MF sorig(ba, dm, ncomp, nghost, MFInfo(), factory); - MF p (ba, dm, ncomp, nghost, MFInfo(), factory); - MF r (ba, dm, ncomp, nghost, MFInfo(), factory); - MF s (ba, dm, ncomp, nghost, MFInfo(), factory); - MF rh (ba, dm, ncomp, nghost, MFInfo(), factory); - MF v (ba, dm, ncomp, nghost, MFInfo(), factory); - MF t (ba, dm, ncomp, nghost, MFInfo(), factory); + MF sorig = Lp.make(amrlev, mglev, nghost); + MF p = Lp.make(amrlev, mglev, nghost); + MF r = Lp.make(amrlev, mglev, nghost); + MF s = Lp.make(amrlev, mglev, nghost); + MF rh = Lp.make(amrlev, mglev, nghost); + MF v = Lp.make(amrlev, mglev, nghost); + MF t = Lp.make(amrlev, mglev, nghost); Lp.correctionResidual(amrlev, mglev, r, sol, rhs, 
MLLinOpT::BCMode::Homogeneous); From fa3743fd1fdd5e3f1b12d431b0f6bb4b15bb7b95 Mon Sep 17 00:00:00 2001 From: Weiqun Zhang Date: Mon, 13 Nov 2023 15:00:56 -0800 Subject: [PATCH 10/14] CArena: shrink_in_place and operator<< (#3621) ## Summary Implement CArena::shrink_in_place, which is used by PODVector::shrink_to_fit. It avoids a new memory allocation and data movement. Add operator<< to CArena. This helps debugging. ## Additional background Follow-up on #3426. ## Checklist The proposed changes: - [ ] fix a bug or incorrect behavior in AMReX - [x] add new capabilities to AMReX - [ ] changes answers in the test suite to more than roundoff level - [ ] are likely to significantly affect the results of downstream AMReX users - [ ] include documentation in the code and/or rst files, if appropriate --- Src/Base/AMReX_CArena.H | 19 ++++---- Src/Base/AMReX_CArena.cpp | 96 ++++++++++++++++++++++++++++++++++++++- 2 files changed, 105 insertions(+), 10 deletions(-) diff --git a/Src/Base/AMReX_CArena.H b/Src/Base/AMReX_CArena.H index d68285bc878..163039df2ef 100644 --- a/Src/Base/AMReX_CArena.H +++ b/Src/Base/AMReX_CArena.H @@ -5,13 +5,14 @@ #include #include -#include -#include +#include +#include #include #include -#include -#include +#include #include +#include +#include namespace amrex { @@ -57,7 +58,7 @@ public: * Try to shrink in-place */ [[nodiscard]] void* - shrink_in_place (void* pt, std::size_t sz) final; + shrink_in_place (void* pt, std::size_t new_size) final; /** * \brief Free up allocated memory. Merge neighboring free memory chunks @@ -164,15 +165,15 @@ protected: MemStat* m_stat; }; + //! The list of blocks allocated via ::operator new(). + std::vector > m_alloc; + /** * \brief The type of our freelist and blocklist. * We use a set sorted from lo to hi memory addresses. */ using NL = std::set; - //! The list of blocks allocated via ::operator new(). - std::vector > m_alloc; - /** * \brief The free list of allocated but not currently used blocks. 
* Maintained in lo to hi memory sorted order. @@ -198,6 +199,8 @@ protected: std::mutex carena_mutex; + + friend std::ostream& operator<< (std::ostream& os, const CArena& arena); }; } diff --git a/Src/Base/AMReX_CArena.cpp b/Src/Base/AMReX_CArena.cpp index 6f7979d4750..c47f8f5ed26 100644 --- a/Src/Base/AMReX_CArena.cpp +++ b/Src/Base/AMReX_CArena.cpp @@ -14,6 +14,7 @@ namespace amrex { #include #include +#include namespace amrex { @@ -203,9 +204,61 @@ CArena::alloc_in_place (void* pt, std::size_t szmin, std::size_t szmax) } void* -CArena::shrink_in_place (void* /*pt*/, std::size_t sz) +CArena::shrink_in_place (void* pt, std::size_t new_size) { - return alloc(sz); // xxxxx TODO + if ((pt == nullptr) || (new_size == 0)) { return nullptr; } + + new_size = Arena::align(new_size); + + std::lock_guard lock(carena_mutex); + + auto busy_it = m_busylist.find(Node(pt,nullptr,0)); + if (busy_it == m_busylist.end()) { + amrex::Abort("CArena::shrink_in_place: unknown pointer"); + return nullptr; + } + AMREX_ASSERT(m_freelist.find(*busy_it) == m_freelist.end()); + + auto const old_size = busy_it->size(); + + if (new_size > old_size) { + amrex::Abort("CArena::shrink_in_place: wrong size. Cannot shrink to a larger size."); + return nullptr; + } else if (new_size == old_size) { + return pt; + } else { + auto const leftover_size = old_size - new_size; + + void* pt2 = static_cast(pt) + new_size; + Node new_free_node(pt2, busy_it->owner(), leftover_size); + + void* pt_end = static_cast(pt) + old_size; + auto free_it = m_freelist.find(Node(pt_end,nullptr,0)); + if ((free_it == m_freelist.end()) || ! new_free_node.coalescable(*free_it)) { + m_freelist.insert(free_it, new_free_node); + } else { + auto& node = const_cast(*free_it); + // This is safe because the free list is std::set and the + // modification of `block` does not change the order of elements + // in the container, even though Node's operator< uses block. 
+ node.block(pt2); + node.size(leftover_size + node.size()); + } + + const_cast(*busy_it).size(new_size); + + m_actually_used -= leftover_size; + +#ifdef AMREX_TINY_PROFILING + if (m_do_profiling) { + TinyProfiler::memory_free(old_size, busy_it->mem_stat()); + auto* stat = TinyProfiler::memory_alloc(new_size, m_profiling_stats); + const_cast(*busy_it).mem_stat(stat); + } +#endif + + return pt; + } } void @@ -439,4 +492,43 @@ CArena::PrintUsage (std::ostream& os, std::string const& name, std::string const << m_busylist.size() << " busy blocks, " << m_freelist.size() << " free blocks\n"; } +std::ostream& operator<< (std::ostream& os, const CArena& arena) +{ + os << "CArea:\n" + << " Hunk size: " << arena.m_hunk << "\n" + << " Memory allocated: " << arena.m_used << "\n" + << " Memory actually used: " << arena.m_actually_used << "\n"; + + if (arena.m_alloc.empty()) { + os << " No memory allocations\n"; + } else { + os << " List of memory alloations: (address, size)\n"; + for (auto const& a : arena.m_alloc) { + os << " " << a.first << ", " << a.second << "\n"; + } + } + + if (arena.m_freelist.empty()) { + os << " No free nodes\n"; + } else { + os << " List of free nodes: (address, owner, size)\n"; + for (auto const& a : arena.m_freelist) { + os << " " << a.block() << ", " << a.owner() << ", " + << a.size() << "\n"; + } + } + + if (arena.m_busylist.empty()) { + os << " No busy nodes\n"; + } else { + os << " List of busy nodes: (address, owner, size)\n"; + for (auto const& a : arena.m_busylist) { + os << " " << a.block() << ", " << a.owner() << ", " + << a.size() << "\n"; + } + } + + return os; +} + } From b7408ea6e8feca7eab2f7cf30606d066b6699814 Mon Sep 17 00:00:00 2001 From: Edward Basso Date: Mon, 13 Nov 2023 18:56:02 -0800 Subject: [PATCH 11/14] solve_cg: use linop.make instead of MF constructor (#3627) ## Summary This PR replaces the explicit use of `MF` constructors in `MLCGSolverT::solve_cg` with calls to the make method of the linear operator associated with the 
`MLCGSolverT` object. ## Additional background This is a similar to the PR on `solve_bicgstab`. --- Src/LinearSolvers/MLMG/AMReX_MLCGSolver.H | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) diff --git a/Src/LinearSolvers/MLMG/AMReX_MLCGSolver.H b/Src/LinearSolvers/MLMG/AMReX_MLCGSolver.H index fce7b1d5005..c99d7b319bd 100644 --- a/Src/LinearSolvers/MLMG/AMReX_MLCGSolver.H +++ b/Src/LinearSolvers/MLMG/AMReX_MLCGSolver.H @@ -261,17 +261,13 @@ MLCGSolverT::solve_cg (MF& sol, const MF& rhs, RT eps_rel, RT eps_abs) const int ncomp = sol.nComp(); - const BoxArray& ba = sol.boxArray(); - const DistributionMapping& dm = sol.DistributionMap(); - const auto& factory = sol.Factory(); - - MF p(ba, dm, ncomp, sol.nGrowVect(), MFInfo(), factory); + MF p = Lp.make(amrlev, mglev, sol.nGrowVect()); p.setVal(RT(0.0)); - MF sorig(ba, dm, ncomp, nghost, MFInfo(), factory); - MF r (ba, dm, ncomp, nghost, MFInfo(), factory); - MF z (ba, dm, ncomp, nghost, MFInfo(), factory); - MF q (ba, dm, ncomp, nghost, MFInfo(), factory); + MF sorig = Lp.make(amrlev, mglev, nghost); + MF r = Lp.make(amrlev, mglev, nghost); + MF z = Lp.make(amrlev, mglev, nghost); + MF q = Lp.make(amrlev, mglev, nghost); sorig.LocalCopy(sol,0,0,ncomp,nghost); From af1e1be8d7d41de9b999b4973b2024281a28f23e Mon Sep 17 00:00:00 2001 From: Weiqun Zhang Date: Tue, 14 Nov 2023 11:13:47 -0800 Subject: [PATCH 12/14] Plotfile Tools: GPU support (#3626) ## Summary Add GPU support for fcompare, fextrema, fsnapshot and fvolumesum. 
## Checklist The proposed changes: - [x] fix a bug or incorrect behavior in AMReX - [x] add new capabilities to AMReX - [ ] changes answers in the test suite to more than roundoff level - [ ] are likely to significantly affect the results of downstream AMReX users - [ ] include documentation in the code and/or rst files, if appropriate --- Src/Base/AMReX_PlotFileDataImpl.cpp | 2 +- Tools/Plotfile/fextrema.cpp | 35 ++++---- Tools/Plotfile/fsnapshot.cpp | 4 +- Tools/Plotfile/fvolumesum.cpp | 130 ++++++++++------------------ 4 files changed, 66 insertions(+), 105 deletions(-) diff --git a/Src/Base/AMReX_PlotFileDataImpl.cpp b/Src/Base/AMReX_PlotFileDataImpl.cpp index 1fbf5044a50..b85c17ad93c 100644 --- a/Src/Base/AMReX_PlotFileDataImpl.cpp +++ b/Src/Base/AMReX_PlotFileDataImpl.cpp @@ -141,7 +141,7 @@ PlotFileDataImpl::get (int level, std::string const& varname) noexcept int gid = mfi.index(); FArrayBox& dstfab = mf[mfi]; std::unique_ptr srcfab(m_vismf[level]->readFAB(gid, icomp)); - dstfab.copy(*srcfab); + dstfab.copy(*srcfab); } } return mf; diff --git a/Tools/Plotfile/fextrema.cpp b/Tools/Plotfile/fextrema.cpp index 44cfaf161d4..55596bf952d 100644 --- a/Tools/Plotfile/fextrema.cpp +++ b/Tools/Plotfile/fextrema.cpp @@ -1,5 +1,6 @@ #include #include +#include #include #include #include @@ -80,23 +81,23 @@ void main_main() pf.boxArray(ilev+1), ratio); for (int ivar = 0; ivar < var_names.size(); ++ivar) { const MultiFab& mf = pf.get(ilev, var_names[ivar]); - for (MFIter mfi(mf); mfi.isValid(); ++mfi) { - const Box& bx = mfi.validbox(); - const auto lo = amrex::lbound(bx); - const auto hi = amrex::ubound(bx); - const auto& ifab = mask.array(mfi); - const auto& fab = mf.array(mfi); - for (int k = lo.z; k <= hi.z; ++k) { - for (int j = lo.y; j <= hi.y; ++j) { - for (int i = lo.x; i <= hi.x; ++i) { - if (ifab(i,j,k) == 0) { - vvmin[ivar] = std::min(fab(i,j,k),vvmin[ivar]); - vvmax[ivar] = std::max(fab(i,j,k),vvmax[ivar]); - } - } - } - } - } + auto const& ma = 
mf.const_arrays(); + auto const& ima = mask.const_arrays(); + auto rr = ParReduce(TypeList{}, + TypeList{}, mf, + [=] AMREX_GPU_DEVICE (int bno, int i, int j, int k) + -> GpuTuple + { + if (ima[bno](i,j,k) == 0) { + auto x = ma[bno](i,j,k); + return {x,x}; + } else { + return {std::numeric_limits::max(), + std::numeric_limits::lowest()}; + } + }); + vvmin[ivar] = std::min(amrex::get<0>(rr), vvmin[ivar]); + vvmax[ivar] = std::max(amrex::get<1>(rr), vvmax[ivar]); } } } diff --git a/Tools/Plotfile/fsnapshot.cpp b/Tools/Plotfile/fsnapshot.cpp index e68f8a33b6d..c4b9fd3f361 100644 --- a/Tools/Plotfile/fsnapshot.cpp +++ b/Tools/Plotfile/fsnapshot.cpp @@ -278,7 +278,7 @@ void main_main() gmx = std::log10(gmx); } - BaseFab intdat; + BaseFab intdat(The_Pinned_Arena()); for (int idir = ndir_begin; idir < ndir_end; ++idir) { intdat.resize(finebox[idir],1); const int width = (idir == 0) ? finebox[idir].length(1) : finebox[idir].length(0); @@ -286,7 +286,7 @@ void main_main() const auto& intarr = intdat.array(); const auto& realarr = datamf[idir].array(0); Real fac = Real(253.999) / (gmx-gmn); - amrex::LoopOnCpu(finebox[idir], [=] (int i, int j, int k) + amrex::ParallelFor(finebox[idir], [=] AMREX_GPU_DEVICE (int i, int j, int k) { int jj = (idir == 2) ? height - 1 - j : j; // flip the data in second image direction int kk = (idir == 2) ? 
k : height - 1 - k; diff --git a/Tools/Plotfile/fvolumesum.cpp b/Tools/Plotfile/fvolumesum.cpp index 2f9f03cc522..ec6e461dcc7 100644 --- a/Tools/Plotfile/fvolumesum.cpp +++ b/Tools/Plotfile/fvolumesum.cpp @@ -2,6 +2,7 @@ #include #include #include +#include #include #include #include @@ -80,7 +81,6 @@ void main_main() const int dim = pf.spaceDim(); - int fine_level = pf.finestLevel(); Vector pos; @@ -95,6 +95,35 @@ void main_main() Array dx = pf.cellSize(ilev); + Real volfac = AMREX_D_TERM(dx[0], *dx[1], *dx[2]); + + if (coord == 1) { + AMREX_ALWAYS_ASSERT(AMREX_SPACEDIM == 2); + // axisymmetric V = pi (r_r**2 - r_l**2) * dz + // = pi dr * dz * (r_r + r_l) + // = 2 pi r dr dz + volfac *= 2 * pi; // 2 * pi * dr * dz part here + } else if (coord == 2) { + AMREX_ALWAYS_ASSERT(AMREX_SPACEDIM == 1); + // 1-d spherical V = 4/3 pi (r_r**3 - r_l**3) + volfac *= (4.0_rt/3.0_rt) * pi; // 4/3 * pi * dr part here + } + + auto xlo = problo[0]; + auto dx0 = dx[0]; + AMREX_ASSERT(coord == 0 || coord == 1 || coord == 2); + auto f_vol = [=] AMREX_GPU_DEVICE (int i) { + if (coord == 0) { + return volfac; + } else if (coord == 1) { + return volfac * (xlo + (Real(i)+0.5_rt)*dx0); + } else { + Real r_r = xlo + Real(i+1)*dx0; + Real r_l = xlo + Real(i )*dx0; + return volfac * (r_r*r_r + r_l*r_r + r_l*r_l); + } + }; + if (ilev < fine_level) { IntVect ratio{pf.refRatio(ilev)}; for (int idim = dim; idim < AMREX_SPACEDIM; ++idim) { @@ -103,97 +132,28 @@ void main_main() const iMultiFab mask = makeFineMask(pf.boxArray(ilev), pf.DistributionMap(ilev), pf.boxArray(ilev+1), ratio); const MultiFab& mf = pf.get(ilev, var_name); - for (MFIter mfi(mf); mfi.isValid(); ++mfi) { - const Box& bx = mfi.validbox(); - if (bx.ok()) { - const auto& m = mask.array(mfi); - const auto& fab = mf.array(mfi); - const auto lo = amrex::lbound(bx); - const auto hi = amrex::ubound(bx); - for (int k = lo.z; k <= hi.z; ++k) { - for (int j = lo.y; j <= hi.y; ++j) { - for (int i = lo.x; i <= hi.x; ++i) { - if (m(i,j,k) 
== 0) { // not covered by fine - Array p - = {AMREX_D_DECL(problo[0]+static_cast(i+0.5)*dx[0], - problo[1]+static_cast(j+0.5)*dx[1], - problo[2]+static_cast(k+0.5)*dx[2])}; - - // compute the volume - Real vol = std::numeric_limits::quiet_NaN(); - if (coord == 0) { - // Cartesian - vol = 1.0_rt; - for (int idim = 0; idim < AMREX_SPACEDIM; ++idim) { - vol *= dx[idim]; - } - } else if (coord == 1) { - // axisymmetric V = pi (r_r**2 - r_l**2) * dz - // = pi dr * dz * (r_r + r_l) - // = 2 pi r dr dz - vol = 2 * pi * p[0] * dx[0] * dx[1]; - } else if (coord == 2) { - // 1-d spherical V = 4/3 pi (r_r**3 - r_l**3) - Real r_r = problo[0]+static_cast(i+1)*dx[0]; - Real r_l = problo[0]+static_cast(i)*dx[0]; - vol = (4.0_rt/3.0_rt) * pi * dx[0] * (r_r*r_r + r_l*r_r + r_l*r_l); - } - - lsum += fab(i,j,k) * vol; - } - } - } - } - } - } + auto const& ima = mask.const_arrays(); + auto const& ma = mf.const_arrays(); + lsum += ParReduce(TypeList{}, TypeList{}, mf, + [=] AMREX_GPU_DEVICE (int bno, int i, int j, int k) + -> GpuTuple + { + return { (ima[bno](i,j,k) == 0) ? 
ma[bno](i,j,k)*f_vol(i) : 0._rt }; + }); } else { const MultiFab& mf = pf.get(ilev, var_name); - for (MFIter mfi(mf); mfi.isValid(); ++mfi) { - const Box& bx = mfi.validbox(); - if (bx.ok()) { - const auto& fab = mf.array(mfi); - const auto lo = amrex::lbound(bx); - const auto hi = amrex::ubound(bx); - for (int k = lo.z; k <= hi.z; ++k) { - for (int j = lo.y; j <= hi.y; ++j) { - for (int i = lo.x; i <= hi.x; ++i) { - Array p - = {AMREX_D_DECL(problo[0]+static_cast(i+0.5)*dx[0], - problo[1]+static_cast(j+0.5)*dx[1], - problo[2]+static_cast(k+0.5)*dx[2])}; - - // compute the volume - Real vol = std::numeric_limits::quiet_NaN(); - if (coord == 0) { - // Cartesian - vol = 1.0_rt; - for (int idim = 0; idim < AMREX_SPACEDIM; ++idim) { - vol *= dx[idim]; - } - } else if (coord == 1) { - // axisymmetric V = pi (r_r**2 - r_l**2) * dz - // = pi dr * dz * (r_r + r_l) - // = 2 pi r dr dz - vol = 2 * pi * p[0] * dx[0] * dx[1]; - } else if (coord == 2) { - // 1-d spherical V = 4/3 pi (r_r**3 - r_l**3) - Real r_r = problo[0]+static_cast(i+1)*dx[0]; - Real r_l = problo[0]+static_cast(i)*dx[0]; - vol = (4.0_rt/3.0_rt) * pi * dx[0] * (r_r*r_r + r_l*r_r + r_l*r_l); - } - - lsum += fab(i,j,k) * vol; - } - } - } - } - } + auto const& ma = mf.const_arrays(); + lsum += ParReduce(TypeList{}, TypeList{}, mf, + [=] AMREX_GPU_DEVICE (int bno, int i, int j, int k) + -> GpuTuple + { + return { ma[bno](i,j,k)*f_vol(i) }; + }); } } ParallelDescriptor::ReduceRealSum(lsum); - if (ParallelDescriptor::IOProcessor()) { std::cout << "integral of " << var_name << " = " << lsum << std::endl; From 15a0bb9a8c1b34136632b16c5511375e9d56b184 Mon Sep 17 00:00:00 2001 From: Weiqun Zhang Date: Tue, 14 Nov 2023 19:36:40 -0800 Subject: [PATCH 13/14] SYCL: Use get_multi_ptr instead of get_pointer (#3630) The latter has been deprecated in SYCL 2020. 
--- .github/workflows/intel.yml | 3 +-- Src/Base/AMReX_GpuLaunchFunctsG.H | 10 +++++----- 2 files changed, 6 insertions(+), 7 deletions(-) diff --git a/.github/workflows/intel.yml b/.github/workflows/intel.yml index 6133e666fad..d86035d916e 100644 --- a/.github/workflows/intel.yml +++ b/.github/workflows/intel.yml @@ -226,10 +226,9 @@ jobs: -DCMAKE_VERBOSE_MAKEFILE=ON \ -DAMReX_EB=ON \ -DAMReX_ENABLE_TESTS=ON \ - -DAMReX_FORTRAN=ON \ + -DAMReX_FORTRAN=OFF \ -DCMAKE_C_COMPILER=$(which icc) \ -DCMAKE_CXX_COMPILER=$(which icpc) \ - -DCMAKE_Fortran_COMPILER=$(which ifort) \ -DCMAKE_CXX_COMPILER_LAUNCHER=ccache cmake --build build --parallel 2 cmake --build build --target install diff --git a/Src/Base/AMReX_GpuLaunchFunctsG.H b/Src/Base/AMReX_GpuLaunchFunctsG.H index aea0c030152..78e9e856535 100644 --- a/Src/Base/AMReX_GpuLaunchFunctsG.H +++ b/Src/Base/AMReX_GpuLaunchFunctsG.H @@ -36,7 +36,7 @@ void launch (int nblocks, int nthreads_per_block, std::size_t shared_mem_bytes, [=] (sycl::nd_item<1> item) [[sycl::reqd_sub_group_size(Gpu::Device::warp_size)]] { - f(Gpu::Handler{&item,shared_data.get_pointer()}); + f(Gpu::Handler{&item,shared_data.get_multi_ptr().get()}); }); }); } catch (sycl::exception const& ex) { @@ -82,7 +82,7 @@ void launch (int nblocks, std::size_t shared_mem_bytes, gpuStream_t stream, [[sycl::reqd_work_group_size(1,1,MT)]] [[sycl::reqd_sub_group_size(Gpu::Device::warp_size)]] { - f(Gpu::Handler{&item,shared_data.get_pointer()}); + f(Gpu::Handler{&item,shared_data.get_multi_ptr().get()}); }); }); } catch (sycl::exception const& ex) { @@ -210,7 +210,7 @@ void ParallelFor (Gpu::KernelInfo const& info, T n, L&& f) noexcept i < n; i += stride) { int n_active_threads = amrex::min(n-i+(T)item.get_local_id(0), (T)item.get_local_range(0)); - detail::call_f(f, i, Gpu::Handler{&item, shared_data.get_pointer(), + detail::call_f(f, i, Gpu::Handler{&item, shared_data.get_multi_ptr().get(), n_active_threads}); } }); @@ -269,7 +269,7 @@ void ParallelFor 
(Gpu::KernelInfo const& info, Box const& box, L&& f) noexcept k += lo.z; int n_active_threads = amrex::min(ncells-icell+(int)item.get_local_id(0), (int)item.get_local_range(0)); - detail::call_f(f, i, j, k, Gpu::Handler{&item, shared_data.get_pointer(), + detail::call_f(f, i, j, k, Gpu::Handler{&item, shared_data.get_multi_ptr().get(), n_active_threads}); } }); @@ -335,7 +335,7 @@ void ParallelFor (Gpu::KernelInfo const& info, Box const& box, T ncomp, L&& f) n int n_active_threads = amrex::min(ncells-icell+(int)item.get_local_id(0), (int)item.get_local_range(0)); detail::call_f(f, i, j, k, ncomp, - Gpu::Handler{&item, shared_data.get_pointer(), + Gpu::Handler{&item, shared_data.get_multi_ptr().get(), n_active_threads}); } }); From a9da2a54de5d704dda746c8b036de469ea19f4e9 Mon Sep 17 00:00:00 2001 From: Weiqun Zhang Date: Wed, 15 Nov 2023 12:56:44 -0800 Subject: [PATCH 14/14] SUNDIALS: Use sunrealtype instead of realtype (#3632) The latter has been deprecated. --- Src/Extern/SUNDIALS/AMReX_Sundials.H | 4 +- .../SUNDIALS/AMReX_SundialsIntegrator.H | 44 +++++++++---------- 2 files changed, 24 insertions(+), 24 deletions(-) diff --git a/Src/Extern/SUNDIALS/AMReX_Sundials.H b/Src/Extern/SUNDIALS/AMReX_Sundials.H index 7bd09795d50..141291e4a21 100644 --- a/Src/Extern/SUNDIALS/AMReX_Sundials.H +++ b/Src/Extern/SUNDIALS/AMReX_Sundials.H @@ -6,7 +6,7 @@ #include #include -static_assert(std::is_same::value, - "amrex::Real must be the same as SUNDIALS realtype"); +static_assert(std::is_same::value, + "amrex::Real must be the same as SUNDIALS sunrealtype"); #endif diff --git a/Src/Extern/SUNDIALS/AMReX_SundialsIntegrator.H b/Src/Extern/SUNDIALS/AMReX_SundialsIntegrator.H index 2623f832590..8066e9e8840 100644 --- a/Src/Extern/SUNDIALS/AMReX_SundialsIntegrator.H +++ b/Src/Extern/SUNDIALS/AMReX_SundialsIntegrator.H @@ -15,48 +15,48 @@ #include /* access to SPGMR SUNLinearSolver */ #include /* access to SPGMR SUNLinearSolver */ #include /* access to FixedPoint SUNNonlinearSolver */ 
-#include /* defs. of realtype, sunindextype, etc */ +#include /* defs. of sunrealtype, sunindextype, etc */ namespace amrex { struct SundialsUserData { - std::function f0; - std::function f_fast; - std::function f; - /* std::function StoreStage; */ - std::function ProcessStage; - std::function PostStoreStage; + std::function f0; + std::function f_fast; + std::function f; + /* std::function StoreStage; */ + std::function ProcessStage; + std::function PostStoreStage; }; namespace SundialsUserFun { - static int f0 (realtype t, N_Vector y, N_Vector ydot, void *user_data) { + static int f0 (sunrealtype t, N_Vector y, N_Vector ydot, void *user_data) { SundialsUserData* udata = static_cast(user_data); return udata->f0(t, y, ydot, user_data); } - static int f_fast (realtype t, N_Vector y_data, N_Vector y_rhs, void *user_data) { + static int f_fast (sunrealtype t, N_Vector y_data, N_Vector y_rhs, void *user_data) { SundialsUserData* udata = static_cast(user_data); return udata->f_fast(t, y_data, y_rhs, user_data); } - static int f (realtype t, N_Vector y_data, N_Vector y_rhs, void *user_data) { + static int f (sunrealtype t, N_Vector y_data, N_Vector y_rhs, void *user_data) { SundialsUserData* udata = static_cast(user_data); return udata->f(t, y_data, y_rhs, user_data); } /* - static int StoreStage (realtype t, N_Vector* f_data, int nvecs, void *user_data) { + static int StoreStage (sunrealtype t, N_Vector* f_data, int nvecs, void *user_data) { SundialsUserData* udata = static_cast(user_data); return udata->StoreStage(t, f_data, nvecs, user_data); } */ - static int ProcessStage (realtype t, N_Vector y_data, void *user_data) { + static int ProcessStage (sunrealtype t, N_Vector y_data, void *user_data) { SundialsUserData* udata = static_cast(user_data); return udata->ProcessStage(t, y_data, user_data); } - static int PostStoreStage(realtype t, N_Vector y_data, void *user_data) { + static int PostStoreStage(sunrealtype t, N_Vector y_data, void *user_data) { SundialsUserData* 
udata = static_cast(user_data); return udata->PostStoreStage(t, y_data, user_data); } @@ -245,7 +245,7 @@ public: /* Begin Section: SUNDIALS FUNCTION HOOKS */ /* f routine to compute the ODE RHS function f(t,y). */ - udata.f = [&](realtype rhs_time, N_Vector y_data, N_Vector y_rhs, void * /* user_data */) -> int { + udata.f = [&](sunrealtype rhs_time, N_Vector y_data, N_Vector y_rhs, void * /* user_data */) -> int { amrex::Vector S_data; amrex::Vector S_rhs; @@ -265,7 +265,7 @@ public: return 0; }; - udata.ProcessStage = [&](realtype rhs_time, N_Vector y_data, void * /* user_data */) -> int { + udata.ProcessStage = [&](sunrealtype rhs_time, N_Vector y_data, void * /* user_data */) -> int { amrex::Vector S_data; const int num_vecs = N_VGetNumSubvectors_ManyVector(y_data); @@ -421,14 +421,14 @@ public: /* Begin Section: SUNDIALS FUNCTION HOOKS */ /* f0 routine to compute a zero-valued ODE RHS function f(t,y). */ - udata.f0 = [&](realtype /* rhs_time */, N_Vector /* y */, N_Vector ydot, void * /* user_data */) -> int { + udata.f0 = [&](sunrealtype /* rhs_time */, N_Vector /* y */, N_Vector ydot, void * /* user_data */) -> int { // Initialize ydot to zero and return N_VConst(0.0, ydot); return 0; }; /* f routine to compute the ODE RHS function f(t,y). */ - udata.f_fast = [&](realtype rhs_time, N_Vector y_data, N_Vector y_rhs, void * /* user_data */) -> int { + udata.f_fast = [&](sunrealtype rhs_time, N_Vector y_data, N_Vector y_rhs, void * /* user_data */) -> int { amrex::Vector S_data; amrex::Vector S_rhs; amrex::Vector S_stage_data; @@ -456,7 +456,7 @@ public: }; /* f routine to compute the ODE RHS function f(t,y). 
*/ - udata.f = [&](realtype rhs_time, N_Vector y_data, N_Vector y_rhs, void * /* user_data */) -> int { + udata.f = [&](sunrealtype rhs_time, N_Vector y_data, N_Vector y_rhs, void * /* user_data */) -> int { amrex::Vector S_data; amrex::Vector S_rhs; @@ -476,7 +476,7 @@ public: return 0; }; - udata.ProcessStage = [&](realtype rhs_time, N_Vector y_data, void * /* user_data */) -> int { + udata.ProcessStage = [&](sunrealtype rhs_time, N_Vector y_data, void * /* user_data */) -> int { amrex::Vector S_data; const int num_vecs = N_VGetNumSubvectors_ManyVector(y_data); @@ -492,7 +492,7 @@ public: return 0; }; - udata.PostStoreStage = [&](realtype rhs_time, N_Vector y_data, void *user_data) -> int { + udata.PostStoreStage = [&](sunrealtype rhs_time, N_Vector y_data, void *user_data) -> int { udata.ProcessStage(rhs_time, y_data, user_data); for(int i=0; i int { + udata.f = [&](sunrealtype rhs_time, N_Vector y_data, N_Vector y_rhs, void * /* user_data */) -> int { amrex::Vector S_data; amrex::Vector S_rhs; @@ -713,7 +713,7 @@ public: return 0; }; - udata.ProcessStage = [&](realtype rhs_time, N_Vector y_data, void * /* user_data */) -> int { + udata.ProcessStage = [&](sunrealtype rhs_time, N_Vector y_data, void * /* user_data */) -> int { amrex::Vector S_data; const int num_vecs = N_VGetNumSubvectors_ManyVector(y_data);