Merge pull request #246 from yfguo/cuda-arch-auto

Adding auto CUDA compute capability detection
pmodels · Sep 28, 2023 · 911849b · 911849b
2 parents b6b38c8 + 49a732b
commit 911849b
Showing 1 changed file with 90 additions and 19 deletions.
diff --git a/src/backend/cuda/subconfigure.m4 b/src/backend/cuda/subconfigure.m4
@@ -13,7 +13,8 @@ AC_ARG_WITH([cuda-sm],
             [
   --with-cuda-sm=<options> (https://arnon.dk/matching-sm-architectures-arch-and-gencode-for-various-nvidia-cards/)
           Comma-separated list of below options:
-                all - build compatibility for all GPUs supported by the CUDA version (can increase compilation time)
+                auto - automatically build compatibility for all GPUs visible, any other specified compatibilities are ignored
+                all-major - build compatibility for all major GPU versions (sm_*0) supported by the CUDA version
 
                 # Kepler architecture
                 kepler - build compatibility for all Kepler GPUs
@@ -49,11 +50,21 @@ AC_ARG_WITH([cuda-sm],
                 ampere - build compatibility for all Ampere GPUs
                 80     - A100, A30
                 86     - RTX Ampere, MX570, A40, A16, A10, A2
+                87     - Jetson AGX Orin and Drive AGX Orin
+
+                # Ada architecture
+                ada    - build compatibility for all Ada GPUs
+                89     - GeForce RTX 4090, RTX 4080, RTX 6000, Tesla L40
+
+                # Hopper architecture
+                hopper - build compatibility for all Hopper GPUs
+                90     - NVIDIA H100 (GH100)
+                90a    - add acceleration for features like wgmma and setmaxnreg. Required for NVIDIA CUTLASS
 
                 # Other
                 <numeric> - specific SM numeric to use
             ],,
-            [with_cuda_sm=all])
+            [with_cuda_sm=auto])
 
 
 # --with-cuda
@@ -150,40 +161,91 @@ fi
 ##########################################################################
 
 if test "${have_cuda}" = "yes" ; then
-    for version in 11010 11000 10000 9000 8000 7000 6000 5000 ; do
+    for version in 12000 11080 11050 11010 11000 10000 9000 8000 7000 6000 5000 ; do
         AC_COMPILE_IFELSE([AC_LANG_PROGRAM([
                               #include <cuda.h>
                               int x[[CUDA_VERSION - $version]];
                           ],)],[cuda_version=${version}],[])
         if test ! -z ${cuda_version} ; then break ; fi
     done
+
+    CUDA_SM=
+    case "$with_cuda_sm" in
+        *auto*)
+            dnl Auto detection: compile and run a small CUDA runtime program that
+            dnl prints the major.minor compute capability of every visible device.
+            dnl On success CUDA_SM receives the detected values and with_cuda_sm is
+            dnl cleared so that no other requested option is processed afterwards.
+            dnl On any failure with_cuda_sm is replaced by all-major.
+            PAC_PUSH_FLAG([IFS])
+            IFS=" "
+            AC_MSG_CHECKING([for CUDA compute capability auto detection])
+            AC_LANG_PUSH([C])
+            AC_RUN_IFELSE(
+                [AC_LANG_PROGRAM(
+                  [
+                      #include <cuda_runtime.h>
+                      #include <stdio.h>
+                  ],
+                  [
+                      int count = 0;
+                      int device;
+                      if (cudaSuccess != cudaGetDeviceCount(&count)) return -1;
+                      if (count == 0) return -1;
+                      for (device = 0; device < count; ++device)
+                      {
+                          struct cudaDeviceProp prop;
+                          if (cudaSuccess == cudaGetDeviceProperties(&prop, device))
+                          {
+                              printf("%d.%d ", prop.major, prop.minor);
+                          }
+                      }
+                      return 0;
+                  ]
+                )],
+                [
+                    # Re-run the probe to capture its output, one unique value per device type
+                    cuda_output=$(./conftest$EXEEXT | xargs -n1 | sort -u | xargs)
+                    if test -n "$cuda_output" ; then
+                        for sm in $cuda_output; do
+                            # Turn major.minor into the numeric SM form, e.g. 8.0 becomes 80
+                            sm_no_decimal=`echo $sm | tr -d '.'`
+                            PAC_APPEND_FLAG([$sm_no_decimal],[CUDA_SM])
+                        done
+                        with_cuda_sm=
+                        AC_MSG_RESULT([yes])
+                    else
+                        # The probe ran but no device reported its properties
+                        with_cuda_sm=all-major
+                        AC_MSG_RESULT([no])
+                    fi
+                ],
+                [
+                    # The probe failed to build or run, or no device is visible
+                    with_cuda_sm=all-major
+                    AC_MSG_RESULT([no])
+                ],
+                [
+                    # Cross compiling: the probe cannot be executed on the build host
+                    with_cuda_sm=all-major
+                    AC_MSG_RESULT([no (cross compiling)])
+                ]
+              )
+            AC_LANG_POP([C])
+            PAC_POP_FLAG([IFS])
+            ;;
+        *)
+            ;;
+    esac
+
     PAC_PUSH_FLAG([IFS])
     IFS=","
-    CUDA_SM=
     for sm in ${with_cuda_sm} ; do
         case "$sm" in
-            all)
-                if test ${cuda_version} -ge 11010 ; then
-                    # maxwell (52) to ampere (86)
-                    supported_cuda_sms="52 53 60 61 62 70 72 75 80 86"
+            all-major)
+                if test ${cuda_version} -ge 11080 ; then
+                    # maxwell (52) to hopper (90)
+                    supported_cuda_sms="52 60 70 80 90"
+                elif test ${cuda_version} -ge 11010 ; then
+                    # maxwell (52) to ampere (80)
+                    supported_cuda_sms="52 60 70 80"
                 elif test ${cuda_version} -ge 11000 ; then
                     # maxwell (52) to ampere (80)
-                    supported_cuda_sms="52 53 60 61 62 70 72 75 80"
+                    supported_cuda_sms="52 60 70 80"
                 elif test ${cuda_version} -ge 10000 ; then
-                    # kepler (30) to turing (75)
-                    supported_cuda_sms="30 35 37 50 52 53 60 61 62 70 72 75"
+                    # kepler (30) to volta (70)
+                    supported_cuda_sms="30 50 60 70"
                 elif test ${cuda_version} -ge 9000 ; then
-                    # kepler (30) to volta (72)
-                    supported_cuda_sms="30 35 37 50 52 53 60 61 62 70 72"
+                    # kepler (30) to volta (70)
+                    supported_cuda_sms="30 50 60 70"
                 elif test ${cuda_version} -ge 8000 ; then
-                    # kepler (30) to pascal (62)
-                    supported_cuda_sms="30 35 37 50 52 53 60 61 62"
+                    # kepler (30) to pascal (60)
+                    supported_cuda_sms="30 50 60"
                 elif test ${cuda_version} -ge 6000 ; then
-                    # kepler (30) to maxwell (53)
-                    supported_cuda_sms="30 35 37 50 52 53"
+                    # kepler (30) to maxwell (50)
+                    supported_cuda_sms="30 50"
                 elif test ${cuda_version} -ge 5000 ; then
-                    # kepler (30) to kepler (37)
-                    supported_cuda_sms="30 35 37"
+                    # kepler (30)
+                    supported_cuda_sms="30"
                 fi
 
                 for supported_cuda_sm in $supported_cuda_sms ; do
@@ -223,6 +285,15 @@ if test "${have_cuda}" = "yes" ; then
                 PAC_APPEND_FLAG([86],[CUDA_SM])
                 ;;
 
+            ada)
+                PAC_APPEND_FLAG([89],[CUDA_SM])
+                ;;
+
+            hopper)
+                PAC_APPEND_FLAG([90],[CUDA_SM])
+                PAC_APPEND_FLAG([90a],[CUDA_SM])
+                ;;
+
             none)
                 ;;