replace AMREX_DEVICE_COMPILE with AMREX_IF_ON_DEVICE and AMREX_IF_ON_…

…HOST (AMReX-Codes#3591) ## Summary This adds the macros `AMREX_IF_ON_DEVICE((code_for_device))` and `AMREX_IF_ON_HOST((code_for_host))` that are compatible with single-pass host/device compilation (as used by `nvc++ -cuda`), as well as backward compatible with all other compilers. This also replaces all uses of `AMREX_DEVICE_COMPILE` with these macros. Fixes AMReX-Codes#3586. ## Additional background Single-pass compilation evaluates the preprocessor macros once for each source file. This means that preprocessor conditionals cannot be used to choose between host and device code. In particular, NVHPC with `-cuda` does not support `__CUDA_ARCH__`, instead requiring the use of the `if target` construct. This creates portable macros that work for either single-pass or two-pass compilation, but requires restructuring of any code that uses AMREX_DEVICE_COMPILE so that the code appears as a macro argument. This PR will allow using NVHPC with `-cuda` as the unified host/device compiler for AMReX. In the future, single-pass compilers for other backends may be available, e.g., SYCL (https://dl.acm.org/doi/abs/10.1145/3585341.3585351). AMReX can be configured to build with `nvc++ -cuda` using CMake: ``` cmake .. -DAMReX_GPU_BACKEND=CUDA -DCMAKE_C_COMPILER=nvc -DCMAKE_CXX_COMPILER=nvc++ -DCMAKE_CUDA_COMPILER=nvc++ -DCMAKE_CUDA_COMPILER_ID=NVCXX -DCMAKE_CUDA_ARCHITECTURES=80 -DCMAKE_CUDA_COMPILER_FORCED=ON -DCMAKE_CUDA_COMPILE_FEATURES=cuda_std_17 -DAMReX_GPU_RDC=OFF -DCMAKE_CXX_FLAGS="-cuda --gcc-toolchain=$(which gcc)" -DCMAKE_CUDA_FLAGS="-cuda --gcc-toolchain=$(which gcc)" -DAMReX_ENABLE_TESTS=ON -DCMAKE_CUDA_HOST_LINK_LAUNCHER=nvc++ -DCMAKE_CUDA_LINK_EXECUTABLE="<CMAKE_CUDA_HOST_LINK_LAUNCHER> <FLAGS> <LINK_FLAGS> <OBJECTS> -o <TARGET> <LINK_LIBRARIES>" ``` CMake hacks (https://github.com/NVIDIA/cub/blob/0fc3c3701632a4be906765b73be20a9ad0da603d/cmake/CubCompilerHacks.cmake) are tested with CMake 3.22.1 and NVHPC 23.5, 23.7, and 23.9 (earlier versions do not work). 
However, it currently fails to link the executables for the tests due to a [compiler/linker bug](https://forums.developer.nvidia.com/t/nvc-cuda-fails-to-link-code-when-using-device-curand-functions/270401/5). (Note that by default, `nvcc` preserves denormals, whereas `nvc++` does not. Also, `nvc++` generates relocatable device code by default, whereas `nvcc` does not.) ## Checklist The proposed changes: - [ ] fix a bug or incorrect behavior in AMReX - [ ] add new capabilities to AMReX - [ ] changes answers in the test suite to more than roundoff level - [ ] are likely to significantly affect the results of downstream AMReX users - [ ] include documentation in the code and/or rst files, if appropriate --------- Co-authored-by: Weiqun Zhang <[email protected]>
ajnonaka · Nov 7, 2023 · d364631 · d364631
1 parent a7afcba
commit d364631
Show file tree

Hide file tree

Showing 17 changed files with 518 additions and 471 deletions.
diff --git a/Docs/sphinx_documentation/source/GPU.rst b/Docs/sphinx_documentation/source/GPU.rst
@@ -489,11 +489,10 @@ GPU support.
 When AMReX is compiled with ``USE_OMP_OFFLOAD=TRUE``,
 ``AMREX_USE_OMP_OFFLOAD`` is defined.
 
-In addition to AMReX's preprocessor macros, CUDA provides the
-``__CUDA_ARCH__`` macro which is only defined when in device code.
-HIP and Sycl provide similar macros.
-``AMREX_DEVICE_COMPILE`` should be used when a ``__host__ __device__``
-function requires separate code for the CPU and GPU implementations.
+The macros ``AMREX_IF_ON_DEVICE((code_for_device))`` and
+``AMREX_IF_ON_HOST((code_for_host))`` should be used when a
+``__host__ __device__`` function requires separate code for the
+CPU and GPU implementations.
 
 .. ===================================================================
 

diff --git a/Src/Base/AMReX.H b/Src/Base/AMReX.H
@@ -113,16 +113,15 @@ namespace amrex
 
     AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
     void Error (const char* msg = nullptr) {
-#if AMREX_DEVICE_COMPILE
 #if defined(NDEBUG)
-        amrex::ignore_unused(msg);
+        AMREX_IF_ON_DEVICE((amrex::ignore_unused(msg);))
 #else
-        if (msg) { AMREX_DEVICE_PRINTF("Error %s\n", msg); }
-        AMREX_DEVICE_ASSERT(0);
-#endif
-#else
-        Error_host("Error", msg);
+        AMREX_IF_ON_DEVICE((
+                if (msg) { AMREX_DEVICE_PRINTF("Error %s\n", msg); }
+                AMREX_DEVICE_ASSERT(0);
+        ))
 #endif
+        AMREX_IF_ON_HOST((Error_host("Error", msg);))
     }
 
     //! Print out warning message to cerr.
@@ -132,32 +131,28 @@ namespace amrex
 
     AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
     void Warning (const char * msg) {
-#if AMREX_DEVICE_COMPILE
 #if defined(NDEBUG)
-        amrex::ignore_unused(msg);
-#else
-        if (msg) { AMREX_DEVICE_PRINTF("Warning %s\n", msg); }
-#endif
+        AMREX_IF_ON_DEVICE((amrex::ignore_unused(msg);))
 #else
-        Warning_host(msg);
+        AMREX_IF_ON_DEVICE((if (msg) { AMREX_DEVICE_PRINTF("Warning %s\n", msg); }))
 #endif
+        AMREX_IF_ON_HOST((Warning_host(msg);))
     }
 
     //! Print out message to cerr and exit via abort().
     void Abort (const std::string& msg);
 
     AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
     void Abort (const char * msg = nullptr) {
-#if AMREX_DEVICE_COMPILE
 #if defined(NDEBUG)
-        amrex::ignore_unused(msg);
+        AMREX_IF_ON_DEVICE((amrex::ignore_unused(msg);))
 #else
-        if (msg) { AMREX_DEVICE_PRINTF("Abort %s\n", msg); }
-        AMREX_DEVICE_ASSERT(0);
-#endif
-#else
-        Error_host("Abort", msg);
+        AMREX_IF_ON_DEVICE((
+                if (msg) { AMREX_DEVICE_PRINTF("Abort %s\n", msg); }
+                AMREX_DEVICE_ASSERT(0);
+        ))
 #endif
+        AMREX_IF_ON_HOST((Error_host("Abort", msg);))
     }
 
     /**
@@ -170,22 +165,21 @@ namespace amrex
 
     AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
     void Assert (const char* EX, const char* file, int line, const char* msg = nullptr) {
-#if AMREX_DEVICE_COMPILE
 #if defined(NDEBUG)
-        amrex::ignore_unused(EX,file,line,msg);
-#else
-        if (msg) {
-            AMREX_DEVICE_PRINTF("Assertion `%s' failed, file \"%s\", line %d, Msg: %s",
-                                EX, file, line, msg);
-        } else {
-            AMREX_DEVICE_PRINTF("Assertion `%s' failed, file \"%s\", line %d",
-                                EX, file, line);
-        }
-        AMREX_DEVICE_ASSERT(0);
-#endif
+        AMREX_IF_ON_DEVICE((amrex::ignore_unused(EX,file,line,msg);))
 #else
-        Assert_host(EX,file,line,msg);
+        AMREX_IF_ON_DEVICE((
+                if (msg) {
+                AMREX_DEVICE_PRINTF("Assertion `%s' failed, file \"%s\", line %d, Msg: %s",
+                                        EX, file, line, msg);
+                } else {
+                AMREX_DEVICE_PRINTF("Assertion `%s' failed, file \"%s\", line %d",
+                                        EX, file, line);
+                }
+                AMREX_DEVICE_ASSERT(0);
+        ))
 #endif
+        AMREX_IF_ON_HOST((Assert_host(EX,file,line,msg);))
     }
 
     /**

diff --git a/Src/Base/AMReX_Algorithm.H b/Src/Base/AMReX_Algorithm.H
@@ -161,51 +161,52 @@ namespace amrex
     AMREX_GPU_HOST_DEVICE
     ItType upper_bound (ItType first, ItType last, const ValType& val)
     {
-#if AMREX_DEVICE_COMPILE
-        std::ptrdiff_t count = last-first;
-        while(count>0){
-            auto it = first;
-            const auto step = count/2;
-            it += step;
-            if (!(val < *it)){
-                first = ++it;
-                count -= step + 1;
+        AMREX_IF_ON_DEVICE((
+            std::ptrdiff_t count = last-first;
+            while(count>0){
+                auto it = first;
+                const auto step = count/2;
+                it += step;
+                if (!(val < *it)){
+                    first = ++it;
+                    count -= step + 1;
+                }
+                else{
+                    count = step;
+                }
             }
-            else{
-                count = step;
-            }
-        }
-
-        return first;
-#else
-        return std::upper_bound(first, last, val);
-#endif
+            return first;
+        ))
+        AMREX_IF_ON_HOST((
+            return std::upper_bound(first, last, val);
+        ))
     }
 
     template<typename ItType, typename ValType>
     AMREX_GPU_HOST_DEVICE
     ItType lower_bound (ItType first, ItType last, const ValType& val)
     {
-#ifdef AMREX_DEVICE_COMPILE
-        std::ptrdiff_t count = last-first;
-        while(count>0)
-        {
-            auto it = first;
-            const auto step = count/2;
-            it += step;
-            if (*it < val){
-                first = ++it;
-                count -= step + 1;
-            }
-            else{
-                count = step;
+        AMREX_IF_ON_DEVICE((
+            std::ptrdiff_t count = last-first;
+            while(count>0)
+            {
+                auto it = first;
+                const auto step = count/2;
+                it += step;
+                if (*it < val){
+                    first = ++it;
+                    count -= step + 1;
+                }
+                else{
+                    count = step;
+                }
             }
-        }
 
-        return first;
-#else
-        return std::lower_bound(first, last, val);
-#endif
+            return first;
+        ))
+        AMREX_IF_ON_HOST((
+            return std::lower_bound(first, last, val);
+        ))
     }
 
 namespace detail {
@@ -239,83 +240,100 @@ int builtin_clz_wrapper (clzll_tag, T x) noexcept
     return static_cast<int>(__builtin_clzll(x) - (sizeof(unsigned long long) * CHAR_BIT - sizeof(T) * CHAR_BIT));
 }
 
-#ifdef AMREX_USE_CUDA
-
-// likewise with CUDA, there are __clz functions that take (signed) int and long long int
-template <typename T, typename = typename std::enable_if<sizeof(T) <= sizeof(int)>::type>
-AMREX_GPU_DEVICE AMREX_FORCE_INLINE
-int clz_wrapper (clz_tag, T x) noexcept
-{
-    return __clz((int) x) - (sizeof(int) * CHAR_BIT - sizeof(T) * CHAR_BIT);
-}
-
-template <typename T, typename = typename std::enable_if<sizeof(T) <= sizeof(long long int)>::type>
-AMREX_GPU_DEVICE AMREX_FORCE_INLINE
-int clz_wrapper (clzll_tag, T x) noexcept
-{
-    return __clzll((long long int) x) - (sizeof(long long int) * CHAR_BIT - sizeof(T) * CHAR_BIT);
 }
-#endif
 
-}
+template <class T, typename std::enable_if_t<std::is_same_v<std::decay_t<T>,std::uint8_t>  ||
+                                             std::is_same_v<std::decay_t<T>,std::uint16_t> ||
+                                             std::is_same_v<std::decay_t<T>,std::uint32_t> ||
+                                             std::is_same_v<std::decay_t<T>,std::uint64_t>, int> = 0>
+AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
+int clz (T x) noexcept;
 
 AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
-int clz (std::uint8_t x) noexcept
+int clz_generic (std::uint8_t x) noexcept
 {
-#if (AMREX_DEVICE_COMPILE && defined(AMREX_USE_CUDA)) // all supported cuda versions have __clz
-    return detail::clz_wrapper(detail::clz_tag{}, x);
-#elif (!AMREX_DEVICE_COMPILE && AMREX_HAS_BUILTIN_CLZ)
-    return detail::builtin_clz_wrapper(detail::clz_tag{}, x);
-#else
     static constexpr int clz_lookup[16] = { 4, 3, 2, 2, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0 };
     auto upper = x >> 4;
     auto lower = x & 0xF;
     return upper ? clz_lookup[upper] : 4 + clz_lookup[lower];
-#endif
 }
 
 AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
-int clz (std::uint16_t x) noexcept
+int clz_generic (std::uint16_t x) noexcept
 {
-#if (AMREX_DEVICE_COMPILE && defined(AMREX_USE_CUDA)) // all supported cuda versions have __clz
-    return detail::clz_wrapper(detail::clz_tag{}, x);
-#elif (!AMREX_DEVICE_COMPILE && AMREX_HAS_BUILTIN_CLZ)
-    return detail::builtin_clz_wrapper(detail::clz_tag{}, x);
-#else
     auto upper = std::uint8_t(x >> 8);
     auto lower = std::uint8_t(x & 0xFF);
     return upper ? clz(upper) : 8 + clz(lower);
-#endif
 }
 
 AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
-int clz (std::uint32_t x) noexcept
+int clz_generic (std::uint32_t x) noexcept
 {
-#if (AMREX_DEVICE_COMPILE && defined(AMREX_USE_CUDA)) // all supported cuda versions have __clz
-    return detail::clz_wrapper(detail::clz_tag{}, x);
-#elif (!AMREX_DEVICE_COMPILE && AMREX_HAS_BUILTIN_CLZ)
-    return detail::builtin_clz_wrapper(detail::clz_tag{}, x);
-#else
     auto upper = std::uint16_t(x >> 16);
     auto lower = std::uint16_t(x & 0xFFFF);
     return upper ? clz(upper) : 16 + clz(lower);
-#endif
 }
 
 AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
-int clz (std::uint64_t x) noexcept
+int clz_generic (std::uint64_t x) noexcept
 {
-#if (AMREX_DEVICE_COMPILE && defined(AMREX_USE_CUDA)) // all supported cuda versions have __clz
-    return detail::clz_wrapper(detail::clz_tag{}, x);
-#elif (!AMREX_DEVICE_COMPILE && AMREX_HAS_BUILTIN_CLZ)
-    return detail::builtin_clz_wrapper(detail::clz_tag{}, x);
-#else
     auto upper = std::uint32_t(x >> 32);
     auto lower = std::uint32_t(x & 0xFFFFFFFF);
     return upper ? clz(upper) : 32 + clz(lower);
+}
+
+#if defined AMREX_USE_CUDA
+
+namespace detail {
+    // likewise with CUDA, there are __clz functions that take (signed) int and long long int
+    template <typename T, typename = typename std::enable_if<sizeof(T) <= sizeof(int)>::type>
+    AMREX_GPU_DEVICE AMREX_FORCE_INLINE
+    int clz_wrapper (clz_tag, T x) noexcept
+    {
+        return __clz((int) x) - (sizeof(int) * CHAR_BIT - sizeof(T) * CHAR_BIT);
+    }
+
+    template <typename T, typename = typename std::enable_if<sizeof(T) <= sizeof(long long int)>::type>
+    AMREX_GPU_DEVICE AMREX_FORCE_INLINE
+    int clz_wrapper (clzll_tag, T x) noexcept
+    {
+        return __clzll((long long int) x) - (sizeof(long long int) * CHAR_BIT - sizeof(T) * CHAR_BIT);
+    }
+}
+
+template <class T, typename std::enable_if_t<std::is_same_v<std::decay_t<T>,std::uint8_t>  ||
+                                             std::is_same_v<std::decay_t<T>,std::uint16_t> ||
+                                             std::is_same_v<std::decay_t<T>,std::uint32_t> ||
+                                             std::is_same_v<std::decay_t<T>,std::uint64_t>, int> >
+AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
+int clz (T x) noexcept
+{
+    AMREX_IF_ON_DEVICE((return detail::clz_wrapper(detail::clz_tag{}, x);))
+#if AMREX_HAS_BUILTIN_CLZ
+    AMREX_IF_ON_HOST((return detail::builtin_clz_wrapper(detail::clz_tag{}, x);))
+#else
+    AMREX_IF_ON_HOST((return clz_generic(x);))
 #endif
 }
 
+#else // !defined AMREX_USE_CUDA
+
+template <class T, typename std::enable_if_t<std::is_same_v<std::decay_t<T>,std::uint8_t>  ||
+                                             std::is_same_v<std::decay_t<T>,std::uint16_t> ||
+                                             std::is_same_v<std::decay_t<T>,std::uint32_t> ||
+                                             std::is_same_v<std::decay_t<T>,std::uint64_t>, int> >
+AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
+int clz (T x) noexcept
+{
+#if (!AMREX_DEVICE_COMPILE && AMREX_HAS_BUILTIN_CLZ)
+    return detail::builtin_clz_wrapper(detail::clz_tag{}, x);
+#else
+    return clz_generic(x);
+#endif
+}
+
+#endif // defined AMREX_USE_CUDA
+
 }
 
 #endif