From cd58dea4f9471cdf50b1346da8416bea719f2300 Mon Sep 17 00:00:00 2001
From: Yanfei Guo <yguo@anl.gov>
Date: Sat, 16 Sep 2023 00:01:35 -0500
Subject: [PATCH 1/3] backend/cuda: add Ada and Hopper SM support

Signed-off-by: Yanfei Guo <yguo@anl.gov>
---
 src/backend/cuda/subconfigure.m4 | 29 +++++++++++++++++++++++++++--
 1 file changed, 27 insertions(+), 2 deletions(-)
diff --git a/src/backend/cuda/subconfigure.m4 b/src/backend/cuda/subconfigure.m4
index 3b2a7b64..31d8e8fc 100644
--- a/src/backend/cuda/subconfigure.m4
+++ b/src/backend/cuda/subconfigure.m4
@@ -49,6 +49,16 @@ AC_ARG_WITH([cuda-sm],
                 ampere - build compatibility for all Ampere GPUs
                 80     - A100, A30
                 86     - RTX Ampere, MX570, A40, A16, A10, A2
+                87     - Jetson AGX Orin and Drive AGX Orin
+
+                # Ada architecture
+                ada    - build compatibility for all Ada GPUs
+                89     - GeForce RTX 4090, RTX 4080, RTX 6000, Tesla L40
+
+                # Hopper architecture
+                hopper - build compatibility for all Hopper GPUs
+                90     - NVIDIA H100 (GH100)
+                90a    - add acceleration for features like wgmma and setmaxnreg. Required for NVIDIA CUTLASS
 
                 # Other
                 <numeric> - specific SM numeric to use
@@ -150,7 +160,7 @@ fi
 ##########################################################################
 
 if test "${have_cuda}" = "yes" ; then
-    for version in 11010 11000 10000 9000 8000 7000 6000 5000 ; do
+    for version in 12000 11080 11010 11000 10000 9000 8000 7000 6000 5000 ; do
         AC_COMPILE_IFELSE([AC_LANG_PROGRAM([
                               #include <cuda.h>
                               int x[[CUDA_VERSION - $version]];
@@ -163,7 +173,13 @@ if test "${have_cuda}" = "yes" ; then
     for sm in ${with_cuda_sm} ; do
         case "$sm" in
             all)
-                if test ${cuda_version} -ge 11010 ; then
+                if test ${cuda_version} -ge 12000 ; then
+                    # maxwell (52) to hopper (90a)
+                    supported_cuda_sms="52 53 60 61 62 70 72 75 80 86 87 89 90 90a"
+                elif test ${cuda_version} -ge 11080 ; then
+                    # maxwell (52) to ada (89) and hopper (90)
+                    supported_cuda_sms="52 53 60 61 62 70 72 75 80 86 87 89 90"
+                elif test ${cuda_version} -ge 11010 ; then
                     # maxwell (52) to ampere (86)
                     supported_cuda_sms="52 53 60 61 62 70 72 75 80 86"
                 elif test ${cuda_version} -ge 11000 ; then
@@ -223,6 +239,15 @@ if test "${have_cuda}" = "yes" ; then
                 PAC_APPEND_FLAG([86],[CUDA_SM])
                 ;;
 
+            ada)
+                PAC_APPEND_FLAG([89],[CUDA_SM])
+                ;;
+
+            hopper)
+                PAC_APPEND_FLAG([90],[CUDA_SM])
+                PAC_APPEND_FLAG([90a],[CUDA_SM])
+                ;;
+
             none)
                 ;;
 

From 075f2a3516808a0ceade6d768d948a2b418d1ef3 Mon Sep 17 00:00:00 2001
From: Yanfei Guo <yguo@anl.gov>
Date: Fri, 15 Sep 2023 16:19:39 -0500
Subject: [PATCH 2/3] backend/cuda: add config to detect CUDA compute
 capability

Default to --with-cuda-sm=auto to only build for the visible GPUs

Signed-off-by: Yanfei Guo <yguo@anl.gov>
---
 src/backend/cuda/subconfigure.m4 | 55 ++++++++++++++++++++++++++++++--
 1 file changed, 52 insertions(+), 3 deletions(-)

diff --git a/src/backend/cuda/subconfigure.m4 b/src/backend/cuda/subconfigure.m4
index 31d8e8fc..0d374cc1 100644
--- a/src/backend/cuda/subconfigure.m4
+++ b/src/backend/cuda/subconfigure.m4
@@ -13,6 +13,7 @@ AC_ARG_WITH([cuda-sm],
             [
   --with-cuda-sm=<options> (https://arnon.dk/matching-sm-architectures-arch-and-gencode-for-various-nvidia-cards/)
           Comma-separated list of below options:
+                auto - automatically build compatibility for all GPUs visible, any other specified compatibilities are ignored
                 all - build compatibility for all GPUs supported by the CUDA version (can increase compilation time)
 
                 # Kepler architecture
@@ -63,7 +64,7 @@ AC_ARG_WITH([cuda-sm],
                 # Other
                 <numeric> - specific SM numeric to use
             ],,
-            [with_cuda_sm=all])
+            [with_cuda_sm=auto])
 
 
 # --with-cuda
@@ -160,16 +161,64 @@ fi
 ##########################################################################
 
 if test "${have_cuda}" = "yes" ; then
-    for version in 12000 11080 11010 11000 10000 9000 8000 7000 6000 5000 ; do
+    for version in 12000 11080 11050 11010 11000 10000 9000 8000 7000 6000 5000 ; do
         AC_COMPILE_IFELSE([AC_LANG_PROGRAM([
                               #include <cuda.h>
                               int x[[CUDA_VERSION - $version]];
                           ],)],[cuda_version=${version}],[])
         if test ! -z ${cuda_version} ; then break ; fi
     done
+
+    CUDA_SM=
+    case "$with_cuda_sm" in
+        *auto*)
+            dnl process auto detection
+            PAC_PUSH_FLAG([IFS])
+            IFS=" "
+            AC_MSG_CHECKING([for CUDA compute capability auto detection])
+            AC_LANG_PUSH([C])
+            AC_RUN_IFELSE(
+                [AC_LANG_PROGRAM(
+                  [
+                      #include <cuda_runtime.h>
+                      #include <stdio.h>
+                  ],
+                  [
+                      int count = 0;
+                      if (cudaSuccess != cudaGetDeviceCount(&count)) return -1;
+                      if (count == 0) return -1;
+                      for (int device = 0; device < count; ++device)
+                      {
+                          struct cudaDeviceProp prop;
+                          if (cudaSuccess == cudaGetDeviceProperties(&prop, device))
+                          printf("%d.%d ", prop.major, prop.minor);
+                      }
+                      return 0;
+                  ]
+                )],
+                [
+                    cuda_output=$(./conftest$EXEEXT | xargs -n1 | sort -u | xargs)
+                    for sm in $cuda_output; do
+                        sm_no_decimal=`echo $sm | tr -d '.'`
+                        PAC_APPEND_FLAG([$sm_no_decimal],[CUDA_SM])
+                    done
+                    with_cuda_sm=
+                    AC_MSG_RESULT([yes])
+                ],
+                [
+                    with_cuda_sm=all
+                    AC_MSG_RESULT([no])
+                ]
+              )
+            AC_LANG_POP([C])
+            PAC_POP_FLAG([IFS])
+            ;;
+        *)
+            ;;
+    esac
+
     PAC_PUSH_FLAG([IFS])
     IFS=","
-    CUDA_SM=
     for sm in ${with_cuda_sm} ; do
         case "$sm" in
             all)

From 49a732ba08ebd8a7c7769bac99b2b8580bf17b52 Mon Sep 17 00:00:00 2001
From: Yanfei Guo <yguo@anl.gov>
Date: Thu, 21 Sep 2023 10:51:15 -0500
Subject: [PATCH 3/3] backend/cuda: adding all-major option for CUDA capabilies

We set it as the fallback for CUDA capabilities detection.
Also removing all option.

Co-authored-by: Jeff Hammond <jeff.science@gmail.com>
Signed-off-by: Yanfei Guo <yguo@anl.gov>
---
 src/backend/cuda/subconfigure.m4 | 41 +++++++++++++++-----------------
 1 file changed, 19 insertions(+), 22 deletions(-)

diff --git a/src/backend/cuda/subconfigure.m4 b/src/backend/cuda/subconfigure.m4
index 0d374cc1..522067d8 100644
--- a/src/backend/cuda/subconfigure.m4
+++ b/src/backend/cuda/subconfigure.m4
@@ -14,7 +14,7 @@ AC_ARG_WITH([cuda-sm],
   --with-cuda-sm=<options> (https://arnon.dk/matching-sm-architectures-arch-and-gencode-for-various-nvidia-cards/)
           Comma-separated list of below options:
                 auto - automatically build compatibility for all GPUs visible, any other specified compatibilities are ignored
-                all - build compatibility for all GPUs supported by the CUDA version (can increase compilation time)
+                all-major - build compatibility for all major GPU versions (sm_*0) supported by the CUDA version
 
                 # Kepler architecture
                 kepler - build compatibility for all Kepler GPUs
@@ -206,7 +206,7 @@ if test "${have_cuda}" = "yes" ; then
                     AC_MSG_RESULT([yes])
                 ],
                 [
-                    with_cuda_sm=all
+                    with_cuda_sm=all-major
                     AC_MSG_RESULT([no])
                 ]
               )
@@ -221,34 +221,31 @@ if test "${have_cuda}" = "yes" ; then
     IFS=","
     for sm in ${with_cuda_sm} ; do
         case "$sm" in
-            all)
-                if test ${cuda_version} -ge 12000 ; then
-                    # maxwell (52) to hopper (90a)
-                    supported_cuda_sms="52 53 60 61 62 70 72 75 80 86 87 89 90 90a"
-                elif test ${cuda_version} -ge 11080 ; then
-                    # maxwell (52) to ada (89) and hopper (90)
-                    supported_cuda_sms="52 53 60 61 62 70 72 75 80 86 87 89 90"
+            all-major)
+                if test ${cuda_version} -ge 11080 ; then
+                    # maxwell (52) to hopper (90)
+                    supported_cuda_sms="52 60 70 80 90"
                 elif test ${cuda_version} -ge 11010 ; then
-                    # maxwell (52) to ampere (86)
-                    supported_cuda_sms="52 53 60 61 62 70 72 75 80 86"
+                    # maxwell (52) to ampere (80)
+                    supported_cuda_sms="52 60 70 80"
                 elif test ${cuda_version} -ge 11000 ; then
                     # maxwell (52) to ampere (80)
-                    supported_cuda_sms="52 53 60 61 62 70 72 75 80"
+                    supported_cuda_sms="52 60 70 80"
                 elif test ${cuda_version} -ge 10000 ; then
-                    # kepler (30) to turing (75)
-                    supported_cuda_sms="30 35 37 50 52 53 60 61 62 70 72 75"
+                    # kepler (30) to volta (70)
+                    supported_cuda_sms="30 50 60 70"
                 elif test ${cuda_version} -ge 9000 ; then
-                    # kepler (30) to volta (72)
-                    supported_cuda_sms="30 35 37 50 52 53 60 61 62 70 72"
+                    # kepler (30) to volta (70)
+                    supported_cuda_sms="30 50 60 70"
                 elif test ${cuda_version} -ge 8000 ; then
-                    # kepler (30) to pascal (62)
-                    supported_cuda_sms="30 35 37 50 52 53 60 61 62"
+                    # kepler (30) to pascal (60)
+                    supported_cuda_sms="30 50 60"
                 elif test ${cuda_version} -ge 6000 ; then
-                    # kepler (30) to maxwell (53)
-                    supported_cuda_sms="30 35 37 50 52 53"
+                    # kepler (30) to maxwell (50)
+                    supported_cuda_sms="30 50"
                 elif test ${cuda_version} -ge 5000 ; then
-                    # kepler (30) to kepler (37)
-                    supported_cuda_sms="30 35 37"
+                    # kepler (30)
+                    supported_cuda_sms="30"
                 fi
 
                 for supported_cuda_sm in $supported_cuda_sms ; do