From 4cc8e294a8c2e9f34ac9c195578f94c9c5c12e34 Mon Sep 17 00:00:00 2001 From: Jade Abraham Date: Wed, 25 Sep 2024 15:14:34 -0500 Subject: [PATCH 01/21] expose CHPL_GPU_SDK_VERSION Signed-off-by: Jade Abraham --- modules/standard/ChplConfig.chpl | 6 ++++++ runtime/Makefile | 13 +++++++++++++ util/chplenv/chpl_gpu.py | 5 +++-- util/chplenv/printchplenv.py | 4 ++++ 4 files changed, 26 insertions(+), 2 deletions(-) diff --git a/modules/standard/ChplConfig.chpl b/modules/standard/ChplConfig.chpl index 4963bb5479b1..cda29ca67483 100644 --- a/modules/standard/ChplConfig.chpl +++ b/modules/standard/ChplConfig.chpl @@ -189,6 +189,12 @@ module ChplConfig { param CHPL_GPU:string; CHPL_GPU = __primitive("get compiler variable", "CHPL_GPU"); + @chpldoc.nodoc + @unstable("'ChplConfig.CHPL_GPU_SDK_VERSION' is unstable and may be replaced with a different way to access this information in the future") + param CHPL_GPU_SDK_VERSION:string; + CHPL_GPU_SDK_VERSION = __primitive("get compiler variable", "CHPL_GPU_SDK_VERSION"); + + @chpldoc.nodoc @unstable("'ChplConfig.CHPL_LIB_PIC' is unstable and may be replaced with a different way to access this information in the future") param CHPL_LIB_PIC: string; diff --git a/runtime/Makefile b/runtime/Makefile index 9d422e075ced..9987fd27d922 100644 --- a/runtime/Makefile +++ b/runtime/Makefile @@ -64,6 +64,19 @@ $(CHPL_ENV_HEADER): $(CHPL_MAKE_HOME)/util/printchplenv $(CHPL_MAKE_HOME)/util/c sed 's/^ *//;s/ *$$//' | \ sed 's/[^0-9A-Za-z]/_/g' | \ awk '{ print "#define " toupper($$1) }' >> $(CHPL_ENV_HEADER) + + @$(CHPL_MAKE_HOME)/util/printchplenv --only CHPL_GPU_SDK_VERSION --value | \ + grep -q none && \ + echo "#define CHPL_GPU_SDK_VERSION_MAJOR 0" >> $(CHPL_ENV_HEADER) && \ + echo "#define CHPL_GPU_SDK_VERSION_MINOR 0" >> $(CHPL_ENV_HEADER) && \ + echo "#define CHPL_GPU_SDK_VERSION_PATCH 0" >> $(CHPL_ENV_HEADER) \ + || \ + $(CHPL_MAKE_HOME)/util/printchplenv --only CHPL_GPU_SDK_VERSION --value | \ + sed 's/\./ /g' | \ + awk '{ print "#define CHPL_GPU_SDK_VERSION_MAJOR " $$1; \ + print "#define CHPL_GPU_SDK_VERSION_MINOR " $$2; \ + print "#define CHPL_GPU_SDK_VERSION_PATCH " $$3 }' >> $(CHPL_ENV_HEADER) + @echo "#endif /* _CHPL_ENV_GEN_H_ */" >> $(CHPL_ENV_HEADER) THIRD_PARTY_PKGS = $(shell $(CHPL_MAKE_PYTHON) $(CHPL_MAKE_HOME)/util/chplenv/third-party-pkgs) diff --git a/util/chplenv/chpl_gpu.py b/util/chplenv/chpl_gpu.py index f7e2a79dd96c..dd3f3286ef86 100644 --- a/util/chplenv/chpl_gpu.py +++ b/util/chplenv/chpl_gpu.py @@ -367,8 +367,7 @@ def get_sdk_version(): if match: rocm_version = match.group(1) return rocm_version - - if get() == 'nvidia': + elif get() == 'nvidia': chpl_cuda_path = get_sdk_path('nvidia') version_file_json = '%s/version.json' % chpl_cuda_path version_file_txt = '%s/version.txt' % chpl_cuda_path @@ -392,6 +391,8 @@ def get_sdk_version(): if match: cuda_version = match.group(1) return cuda_version + else: + return 'none' def _validate_rocm_version_impl(): diff --git a/util/chplenv/printchplenv.py b/util/chplenv/printchplenv.py index d78c840d1e86..1ba9cc2323ef 100755 --- a/util/chplenv/printchplenv.py +++ b/util/chplenv/printchplenv.py @@ -101,6 +101,7 @@ ChapelEnv('CHPL_TARGET_BACKEND_CPU', INTERNAL), ChapelEnv('CHPL_LOCALE_MODEL', RUNTIME | LAUNCHER | DEFAULT, 'loc'), ChapelEnv(' CHPL_GPU', RUNTIME | DEFAULT, 'gpu'), + ChapelEnv(' CHPL_GPU_SDK_VERSION', RUNTIME), ChapelEnv(' CHPL_GPU_ARCH', INTERNAL), ChapelEnv(' CHPL_GPU_MEM_STRATEGY', RUNTIME , 'gpu_mem' ), ChapelEnv(' CHPL_CUDA_PATH', INTERNAL), @@ -205,6 +206,7 @@ def 
compute_all_values(): ENV_VALS['CHPL_LOCALE_MODEL'] = chpl_locale_model.get() ENV_VALS[' CHPL_GPU'] = chpl_gpu.get() + ENV_VALS[' CHPL_GPU_SDK_VERSION'] = chpl_gpu.get_sdk_version() ENV_VALS[' CHPL_CUDA_LIBDEVICE_PATH'] = chpl_gpu.get_cuda_libdevice_path() ENV_VALS[' CHPL_GPU_MEM_STRATEGY'] = chpl_gpu.get_gpu_mem_strategy() ENV_VALS['CHPL_COMM'] = chpl_comm.get() @@ -379,6 +381,8 @@ def filter_tidy(chpl_env): return gpu == 'amd' elif chpl_env.name == ' CHPL_GPU_ARCH': return gpu == 'nvidia' or gpu == 'amd' + elif chpl_env.name == ' CHPL_GPU_SDK_VERSION': + return gpu != 'none' elif chpl_env.name == ' CHPL_HOST_JEMALLOC': return host_mem == 'jemalloc' elif chpl_env.name == ' CHPL_TARGET_JEMALLOC': From 9b5d8964e620eb47b56d4fa028956d220a80f718 Mon Sep 17 00:00:00 2001 From: Jade Abraham Date: Wed, 25 Sep 2024 15:14:47 -0500 Subject: [PATCH 02/21] use CHPL_GPU_SDK_VERSION_MAJOR Signed-off-by: Jade Abraham --- runtime/include/gpu/amd/rocm-version.h | 18 ++---------------- 1 file changed, 2 insertions(+), 16 deletions(-) diff --git a/runtime/include/gpu/amd/rocm-version.h b/runtime/include/gpu/amd/rocm-version.h index 39a25bc7c0c7..8da037d82da0 100644 --- a/runtime/include/gpu/amd/rocm-version.h +++ b/runtime/include/gpu/amd/rocm-version.h @@ -21,20 +21,6 @@ #ifndef __HIP_PLATFORM_AMD__ #define __HIP_PLATFORM_AMD__ #endif -#include -#include -#if __has_include() // 5.x wants this -#include -#elif __has_include() // 4.x wants this -#include -#elif __has_include() // Deprecated. 5.x used to want this -#include -#endif - -// we should have found the correct header by now. But if not, we set -// ROCM_VERSION_MAJOR to 4 as it is the lowest version we support and has fewer -// runtime features enabled. So, it is a safer choice. -#if !defined(ROCM_VERSION_MAJOR) -#define ROCM_VERSION_MAJOR 4 -#endif +// CHPL_GPU_SDK_VERSION_MAJOR is determined by printchplenv +#define ROCM_VERSION_MAJOR CHPL_GPU_SDK_VERSION_MAJOR From ad12c53753d5915e5b1e0793c836bc44540712fa Mon Sep 17 00:00:00 2001 From: Jade Abraham Date: Wed, 25 Sep 2024 16:42:43 -0500 Subject: [PATCH 03/21] cleanup makefile Signed-off-by: Jade Abraham --- runtime/Makefile | 2 -- 1 file changed, 2 deletions(-) diff --git a/runtime/Makefile b/runtime/Makefile index 9987fd27d922..4526b28b6520 100644 --- a/runtime/Makefile +++ b/runtime/Makefile @@ -64,7 +64,6 @@ $(CHPL_ENV_HEADER): $(CHPL_MAKE_HOME)/util/printchplenv $(CHPL_MAKE_HOME)/util/c sed 's/^ *//;s/ *$$//' | \ sed 's/[^0-9A-Za-z]/_/g' | \ awk '{ print "#define " toupper($$1) }' >> $(CHPL_ENV_HEADER) - @$(CHPL_MAKE_HOME)/util/printchplenv --only CHPL_GPU_SDK_VERSION --value | \ grep -q none && \ echo "#define CHPL_GPU_SDK_VERSION_MAJOR 0" >> $(CHPL_ENV_HEADER) && \ @@ -76,7 +75,6 @@ $(CHPL_ENV_HEADER): $(CHPL_MAKE_HOME)/util/printchplenv $(CHPL_MAKE_HOME)/util/c awk '{ print "#define CHPL_GPU_SDK_VERSION_MAJOR " $$1; \ print "#define CHPL_GPU_SDK_VERSION_MINOR " $$2; \ print "#define CHPL_GPU_SDK_VERSION_PATCH " $$3 }' >> $(CHPL_ENV_HEADER) - @echo "#endif /* _CHPL_ENV_GEN_H_ */" >> $(CHPL_ENV_HEADER) THIRD_PARTY_PKGS = $(shell $(CHPL_MAKE_PYTHON) $(CHPL_MAKE_HOME)/util/chplenv/third-party-pkgs) From 480f84fc7a7c3e648111841b32c34985802bd78a Mon Sep 17 00:00:00 2001 From: Jade Abraham Date: Wed, 25 Sep 2024 16:43:49 -0500 Subject: [PATCH 04/21] add strip to version checking Signed-off-by: Jade Abraham --- util/chplenv/chpl_gpu.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/util/chplenv/chpl_gpu.py b/util/chplenv/chpl_gpu.py index dd3f3286ef86..819b702517b2 100644 
--- a/util/chplenv/chpl_gpu.py +++ b/util/chplenv/chpl_gpu.py @@ -341,6 +341,7 @@ def _validate_cuda_version_impl(): return True def get_sdk_version(): + version = 'none' if get() == 'amd': chpl_rocm_path = get_sdk_path('amd', sdk_type='include') files_to_try = ['%s/.info/version-hiprt' % chpl_rocm_path, @@ -366,7 +367,7 @@ def get_sdk_version(): match = re.search(r"llvm-amdgpu-([\d\.]+)", my_stdout) if match: rocm_version = match.group(1) - return rocm_version + version = rocm_version elif get() == 'nvidia': chpl_cuda_path = get_sdk_path('nvidia') version_file_json = '%s/version.json' % chpl_cuda_path @@ -390,9 +391,9 @@ def get_sdk_version(): match = re.search(pattern, my_stdout) if match: cuda_version = match.group(1) - return cuda_version - else: - return 'none' + version = cuda_version + version = version.strip() if version is not None else 'none' + return version def _validate_rocm_version_impl(): From fe47d2bcd395d2a92f8620e3afde12e38ef85a12 Mon Sep 17 00:00:00 2001 From: Jade Abraham Date: Wed, 25 Sep 2024 16:48:26 -0500 Subject: [PATCH 05/21] remove hsa-runtime64 Signed-off-by: Jade Abraham --- util/chplenv/chpl_gpu.py | 6 ------ util/chplenv/compile_link_args_utils.py | 4 ---- util/chplenv/overrides.py | 1 - util/chplenv/printchplenv.py | 4 +--- 4 files changed, 1 insertion(+), 14 deletions(-) diff --git a/util/chplenv/chpl_gpu.py b/util/chplenv/chpl_gpu.py index 819b702517b2..1a937446c23d 100644 --- a/util/chplenv/chpl_gpu.py +++ b/util/chplenv/chpl_gpu.py @@ -19,7 +19,6 @@ class gpu_type: def __init__(self, sdk_path_env, sdk_path_env_bitcode, sdk_path_env_include, - sdk_path_env_runtime, compiler, default_arch, llvm_target, @@ -29,7 +28,6 @@ def __init__(self, sdk_path_env, self.sdk_path_env = sdk_path_env self.sdk_path_env_bitcode = sdk_path_env_bitcode self.sdk_path_env_include = sdk_path_env_include - self.sdk_path_env_runtime = sdk_path_env_runtime self.compiler = compiler self.default_arch = default_arch self.llvm_target = llvm_target @@ -54,7 +52,6 @@ def _validate_rocm_llvm_version(gpu: gpu_type): "nvidia": gpu_type(sdk_path_env="CHPL_CUDA_PATH", sdk_path_env_bitcode="CHPL_CUDA_PATH", sdk_path_env_include="CHPL_CUDA_PATH", - sdk_path_env_runtime="CHPL_CUDA_PATH", compiler="nvcc", default_arch="sm_60", llvm_target="NVPTX", @@ -64,7 +61,6 @@ def _validate_rocm_llvm_version(gpu: gpu_type): "amd": gpu_type(sdk_path_env="CHPL_ROCM_PATH", sdk_path_env_bitcode="CHPL_ROCM_BITCODE_PATH", sdk_path_env_include="CHPL_ROCM_INCLUDE_PATH", - sdk_path_env_runtime="CHPL_ROCM_RUNTIME_PATH", compiler="hipcc", default_arch="", llvm_target="AMDGPU", @@ -74,7 +70,6 @@ def _validate_rocm_llvm_version(gpu: gpu_type): "cpu": gpu_type(sdk_path_env="", sdk_path_env_bitcode="", sdk_path_env_include="", - sdk_path_env_runtime="", compiler="", default_arch="", llvm_target="", @@ -174,7 +169,6 @@ def get_sdk_path(for_gpu, sdk_type='bitcode'): sub_env_names = { "bitcode": gpu.sdk_path_env_bitcode, "include": gpu.sdk_path_env_include, - "runtime": gpu.sdk_path_env_runtime } assert sdk_type in sub_env_names diff --git a/util/chplenv/compile_link_args_utils.py b/util/chplenv/compile_link_args_utils.py index f7d4abff846f..2fd5238391f7 100644 --- a/util/chplenv/compile_link_args_utils.py +++ b/util/chplenv/compile_link_args_utils.py @@ -108,15 +108,11 @@ def get_runtime_link_args(runtime_subdir): system.append("-lcuda") elif gpu_type == "amd": paths = [sdk_path] - runtime_path = chpl_gpu.get_sdk_path(gpu_type, sdk_type='runtime') - if runtime_path not in paths: - paths.append(runtime_path) for p in paths: 
lib_path = os.path.join(p, "lib") system.append("-L" + lib_path) system.append("-Wl,-rpath," + lib_path) system.append("-lamdhip64") - system.append("-lhsa-runtime64") # always link with the math library system.append("-lm") diff --git a/util/chplenv/overrides.py b/util/chplenv/overrides.py index 14d58dc01730..352dc8508f43 100755 --- a/util/chplenv/overrides.py +++ b/util/chplenv/overrides.py @@ -39,7 +39,6 @@ 'CHPL_ROCM_PATH', 'CHPL_ROCM_BITCODE_PATH', 'CHPL_ROCM_INCLUDE_PATH', - 'CHPL_ROCM_RUNTIME_PATH' 'CHPL_GPU_ARCH', 'CHPL_COMM', diff --git a/util/chplenv/printchplenv.py b/util/chplenv/printchplenv.py index 1ba9cc2323ef..0ba0c312ddb7 100755 --- a/util/chplenv/printchplenv.py +++ b/util/chplenv/printchplenv.py @@ -108,7 +108,6 @@ ChapelEnv(' CHPL_ROCM_PATH', INTERNAL), ChapelEnv(' CHPL_ROCM_BITCODE_PATH', INTERNAL), ChapelEnv(' CHPL_ROCM_INCLUDE_PATH', INTERNAL), - ChapelEnv(' CHPL_ROCM_RUNTIME_PATH', INTERNAL), ChapelEnv(' CHPL_CUDA_LIBDEVICE_PATH', INTERNAL), ChapelEnv('CHPL_COMM', RUNTIME | LAUNCHER | DEFAULT, 'comm'), ChapelEnv(' CHPL_COMM_SUBSTRATE', RUNTIME | LAUNCHER | DEFAULT), @@ -321,7 +320,6 @@ def compute_internal_values(): ENV_VALS[' CHPL_ROCM_PATH'] = chpl_gpu.get_sdk_path("amd") ENV_VALS[' CHPL_ROCM_BITCODE_PATH'] = chpl_gpu.get_sdk_path("amd", sdk_type="bitcode") ENV_VALS[' CHPL_ROCM_INCLUDE_PATH'] = chpl_gpu.get_sdk_path("amd", sdk_type="include") - ENV_VALS[' CHPL_ROCM_RUNTIME_PATH'] = chpl_gpu.get_sdk_path("amd", sdk_type="runtime") @@ -377,7 +375,7 @@ def filter_tidy(chpl_env): return gpu == 'nvidia' elif chpl_env.name == ' CHPL_ROCM_PATH': return gpu == 'amd' - elif chpl_env.name in (' CHPL_ROCM_BITCODE_PATH', ' CHPL_ROCM_INCLUDE_PATH', ' CHPL_ROCM_RUNTIME_PATH'): + elif chpl_env.name in (' CHPL_ROCM_BITCODE_PATH', ' CHPL_ROCM_INCLUDE_PATH'): return gpu == 'amd' elif chpl_env.name == ' CHPL_GPU_ARCH': return gpu == 'nvidia' or gpu == 'amd' From f319c79f1f0332bd7fdc38bbb56261e1a6075df2 Mon Sep 17 00:00:00 2001 From: Jade Abraham Date: Mon, 7 Oct 2024 10:27:07 -0500 Subject: [PATCH 06/21] refactor compile args for gpus Signed-off-by: Jade Abraham --- util/chplenv/chpl_gpu.py | 60 +++++++++++++++++++++++++ util/chplenv/compile_link_args_utils.py | 49 +++----------------- util/chplenv/third_party_utils.py | 1 + 3 files changed, 67 insertions(+), 43 deletions(-) diff --git a/util/chplenv/chpl_gpu.py b/util/chplenv/chpl_gpu.py index 1a937446c23d..901b01d8fb7c 100644 --- a/util/chplenv/chpl_gpu.py +++ b/util/chplenv/chpl_gpu.py @@ -3,6 +3,7 @@ import glob import json import chpl_locale_model +import chpl_platform import chpl_llvm import chpl_compiler import re @@ -230,6 +231,65 @@ def get_gpu_mem_strategy(): return memtype return "array_on_device" +def get_runtime_compile_args(): + if chpl_locale_model.get() != 'gpu': + return [], [] + bundled = [] + system = [] + + gpu_type = get() + sdk_path = get_sdk_path(gpu_type, sdk_type='include') + incl = chpl_home_utils.get_chpl_runtime_incl() + + # this -D is needed since it affects code inside of headers + bundled.append("-DHAS_GPU_LOCALE") + if gpu_type == "cpu": + bundled.append("-DGPU_RUNTIME_CPU") + + # If compiling for GPU locales, add CUDA runtime headers to include path + bundled.append("-I" + os.path.join(incl, "gpu", gpu_type)) + if gpu_type == "nvidia": + system.append("-I" + os.path.join(sdk_path, "include")) + + # workaround an issue with __float128 not being supported by clang in device code + system.append("-D__STRICT_ANSI__=1") + + elif gpu_type == "amd": + # -isystem instead of -I silences warnings from inside 
these includes. + system.append("-isystem" + os.path.join(sdk_path, "include")) + system.append("-isystem" + os.path.join(sdk_path, "hip", "include")) + + return bundled, system + +def get_runtime_link_args(): + if chpl_locale_model.get() != 'gpu': + return [], [] + bundled = [] + system = [] + + gpu_type = get() + sdk_path = get_sdk_path(gpu_type, sdk_type='include') + + if gpu_type == "nvidia": + system.append("-L" + os.path.join(sdk_path, "lib64")) + system.append("-lcudart") + if chpl_platform.is_wsl(): + # WSL needs to link with libcuda that belongs to the driver hosted in Windows + system.append("-L" + os.path.join("/usr", "lib", "wsl", "lib")) + system.append("-lcuda") + elif gpu_type == "amd": + paths = [sdk_path] + for p in paths: + lib_path = os.path.join(p, "lib") + system.append("-L" + lib_path) + system.append("-Wl,-rpath," + lib_path) + system.append("-lamdhip64") + + return bundled, system + + + + def get_cuda_libdevice_path(): if get() == 'nvidia': diff --git a/util/chplenv/compile_link_args_utils.py b/util/chplenv/compile_link_args_utils.py index 2fd5238391f7..ff2353f4b2b6 100644 --- a/util/chplenv/compile_link_args_utils.py +++ b/util/chplenv/compile_link_args_utils.py @@ -50,29 +50,9 @@ def get_runtime_includes_and_defines(): # this is needed since it affects code inside of headers bundled.append("-DCHPL_COMM_DEBUG") - if locale_model == "gpu": - # this -D is needed since it affects code inside of headers - bundled.append("-DHAS_GPU_LOCALE") - if chpl_gpu.get() == "cpu": - bundled.append("-DGPU_RUNTIME_CPU") - memtype = chpl_gpu.get_gpu_mem_strategy() - - # If compiling for GPU locales, add CUDA runtime headers to include path - gpu_type = chpl_gpu.get() - sdk_path = chpl_gpu.get_sdk_path(gpu_type, sdk_type='include') - - bundled.append("-I" + os.path.join(incl, "gpu", chpl_gpu.get())) - if gpu_type == "nvidia": - system.append("-I" + os.path.join(sdk_path, "include")) - - # workaround an issue with __float128 not being supported by clang in device code - system.append("-D__STRICT_ANSI__=1") - - elif gpu_type == "amd": - # -isystem instead of -I silences warnings from inside these includes. 
- system.append("-isystem" + os.path.join(sdk_path, "include")) - system.append("-isystem" + os.path.join(sdk_path, "hip", "include")) - system.append("-isystem" + os.path.join(sdk_path, "hsa", "include")) + gpu_bundled, gpu_system = chpl_gpu.get_runtime_includes_and_defines() + bundled.extend(gpu_bundled) + system.extend(gpu_system) if mem == "jemalloc" and chpl_jemalloc.get('target') == "bundled": # set -DCHPL_JEMALLOC_PREFIX=chpl_je_ @@ -89,30 +69,13 @@ def get_runtime_link_args(runtime_subdir): system = [ ] lib = chpl_home_utils.get_chpl_runtime_lib() - locale_model = chpl_locale_model.get() bundled.append("-L" + os.path.join(lib, runtime_subdir)) bundled.append("-lchpl") - if locale_model == "gpu": - # If compiling for GPU locales, add CUDA to link path, - # and add cuda libraries - gpu_type = chpl_gpu.get() - sdk_path = chpl_gpu.get_sdk_path(gpu_type, sdk_type='include') - if gpu_type == "nvidia": - system.append("-L" + os.path.join(sdk_path, "lib64")) - system.append("-lcudart") - if chpl_platform.is_wsl(): - # WSL needs to link with libcuda that belongs to the driver hosted in Windows - system.append("-L" + os.path.join("/usr", "lib", "wsl", "lib")) - system.append("-lcuda") - elif gpu_type == "amd": - paths = [sdk_path] - for p in paths: - lib_path = os.path.join(p, "lib") - system.append("-L" + lib_path) - system.append("-Wl,-rpath," + lib_path) - system.append("-lamdhip64") + gpu_bundled, gpu_system = chpl_gpu.get_runtime_link_args() + bundled.extend(gpu_bundled) + system.extend(gpu_system) # always link with the math library system.append("-lm") diff --git a/util/chplenv/third_party_utils.py b/util/chplenv/third_party_utils.py index 0f0a2b332b60..05490e9e5809 100644 --- a/util/chplenv/third_party_utils.py +++ b/util/chplenv/third_party_utils.py @@ -471,3 +471,4 @@ def could_not_find_pkgconfig_pkg(pkg, envname): else: install_str = " with `brew install {0}`".format(pkg) if homebrew_utils.homebrew_exists() else "" error("Could not find a suitable {0} installation. 
Please install {0}{1} or set {2}=bundled.".format(pkg, install_str, envname)) + From 3f2477ccbe09c24a8e5ee1ee3c8383a387551111 Mon Sep 17 00:00:00 2001 From: Jade Abraham Date: Mon, 7 Oct 2024 10:27:33 -0500 Subject: [PATCH 07/21] remove CHPL_ROCM_PATH subvars Signed-off-by: Jade Abraham --- compiler/main/driver.cpp | 2 +- util/chplenv/overrides.py | 2 -- util/chplenv/printchplenv.py | 6 ------ 3 files changed, 1 insertion(+), 9 deletions(-) diff --git a/compiler/main/driver.cpp b/compiler/main/driver.cpp index b9224b2fa2f7..17a0d6faf7c4 100644 --- a/compiler/main/driver.cpp +++ b/compiler/main/driver.cpp @@ -1831,7 +1831,7 @@ static void setChapelEnvs() { gGpuSdkPath = envMap["CHPL_CUDA_PATH"]; break; case GpuCodegenType::GPU_CG_AMD_HIP: - gGpuSdkPath = envMap["CHPL_ROCM_BITCODE_PATH"]; + gGpuSdkPath = envMap["CHPL_ROCM_PATH"]; break; case GpuCodegenType::GPU_CG_CPU: gGpuSdkPath = ""; diff --git a/util/chplenv/overrides.py b/util/chplenv/overrides.py index 352dc8508f43..c8a218c10e0c 100755 --- a/util/chplenv/overrides.py +++ b/util/chplenv/overrides.py @@ -37,8 +37,6 @@ 'CHPL_GPU_MEM_STRATEGY', 'CHPL_CUDA_PATH', 'CHPL_ROCM_PATH', - 'CHPL_ROCM_BITCODE_PATH', - 'CHPL_ROCM_INCLUDE_PATH', 'CHPL_GPU_ARCH', 'CHPL_COMM', diff --git a/util/chplenv/printchplenv.py b/util/chplenv/printchplenv.py index 0ba0c312ddb7..d74bc77c3f0e 100755 --- a/util/chplenv/printchplenv.py +++ b/util/chplenv/printchplenv.py @@ -106,8 +106,6 @@ ChapelEnv(' CHPL_GPU_MEM_STRATEGY', RUNTIME , 'gpu_mem' ), ChapelEnv(' CHPL_CUDA_PATH', INTERNAL), ChapelEnv(' CHPL_ROCM_PATH', INTERNAL), - ChapelEnv(' CHPL_ROCM_BITCODE_PATH', INTERNAL), - ChapelEnv(' CHPL_ROCM_INCLUDE_PATH', INTERNAL), ChapelEnv(' CHPL_CUDA_LIBDEVICE_PATH', INTERNAL), ChapelEnv('CHPL_COMM', RUNTIME | LAUNCHER | DEFAULT, 'comm'), ChapelEnv(' CHPL_COMM_SUBSTRATE', RUNTIME | LAUNCHER | DEFAULT), @@ -318,8 +316,6 @@ def compute_internal_values(): ENV_VALS[' CHPL_GPU_ARCH'] = chpl_gpu.get_arch() ENV_VALS[' CHPL_CUDA_PATH'] = chpl_gpu.get_sdk_path("nvidia") ENV_VALS[' CHPL_ROCM_PATH'] = chpl_gpu.get_sdk_path("amd") - ENV_VALS[' CHPL_ROCM_BITCODE_PATH'] = chpl_gpu.get_sdk_path("amd", sdk_type="bitcode") - ENV_VALS[' CHPL_ROCM_INCLUDE_PATH'] = chpl_gpu.get_sdk_path("amd", sdk_type="include") @@ -375,8 +371,6 @@ def filter_tidy(chpl_env): return gpu == 'nvidia' elif chpl_env.name == ' CHPL_ROCM_PATH': return gpu == 'amd' - elif chpl_env.name in (' CHPL_ROCM_BITCODE_PATH', ' CHPL_ROCM_INCLUDE_PATH'): - return gpu == 'amd' elif chpl_env.name == ' CHPL_GPU_ARCH': return gpu == 'nvidia' or gpu == 'amd' elif chpl_env.name == ' CHPL_GPU_SDK_VERSION': From ae2f14b8288e5df4b40d6be26a787759a25d85b3 Mon Sep 17 00:00:00 2001 From: Jade Abraham Date: Mon, 7 Oct 2024 10:27:45 -0500 Subject: [PATCH 08/21] add combine_output Signed-off-by: Jade Abraham --- util/chplenv/utils.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/util/chplenv/utils.py b/util/chplenv/utils.py index a9175a4a6b84..d945be0c0125 100644 --- a/util/chplenv/utils.py +++ b/util/chplenv/utils.py @@ -53,16 +53,18 @@ class CommandError(Exception): pass -def try_run_command(command, cmd_input=None): +def try_run_command(command, cmd_input=None, combine_output=False): """Command subprocess wrapper tolerating failure to find or run the cmd. For normal usage the vanilla run_command() may be simpler to use. This should be the only invocation of subprocess in all chplenv scripts. 
This could be replaced by subprocess.check_output, but that is only available after Python 2.7, and we still support 2.6 :(""" + + stderr = subprocess.STDOUT if combine_output else subprocess.PIPE try: process = subprocess.Popen(command, stdout=subprocess.PIPE, - stderr=subprocess.PIPE, + stderr=stderr, stdin=subprocess.PIPE) except OSError: return (False, 0, None, None) From 4c2d6f804381128e1afbc6949ab26d0bbae68fe6 Mon Sep 17 00:00:00 2001 From: Jade Abraham Date: Mon, 7 Oct 2024 10:28:16 -0500 Subject: [PATCH 09/21] start refactor Signed-off-by: Jade Abraham --- util/chplenv/chpl_gpu.py | 99 +++++++++++++++++++++++++++++++--------- 1 file changed, 78 insertions(+), 21 deletions(-) diff --git a/util/chplenv/chpl_gpu.py b/util/chplenv/chpl_gpu.py index 901b01d8fb7c..2bb76b5a47c9 100644 --- a/util/chplenv/chpl_gpu.py +++ b/util/chplenv/chpl_gpu.py @@ -8,7 +8,8 @@ import chpl_compiler import re import chpl_tasks -from utils import error, warning, memoize, run_command, which, is_ver_in_range +import chpl_home_utils +from utils import error, warning, memoize, run_command, try_run_command, which, is_ver_in_range def _validate_cuda_version(): return _validate_cuda_version_impl() @@ -48,33 +49,73 @@ def _validate_cuda_llvm_version(gpu: gpu_type): def _validate_rocm_llvm_version(gpu: gpu_type): return _validate_rocm_llvm_version_impl(gpu) +@memoize +def _gpu_compiler_version_output(compiler: str, lang: str): + dummy_main = "int main() { return 0; }" + exists, returncode, stdout, _ = try_run_command([compiler, "-v", "-c", "-x", lang, "-", "-o", "/dev/null"], cmd_input=dummy_main, combine_output=True) + if exists and returncode == 0 and stdout: + return stdout + else: + return None + +def _find_cuda_sdk_path(compiler: str): + out = _gpu_compiler_version_output(compiler, "cu") + # #$ TOP= + + # find lib device: #$ NVVMIR_LIBRARY_DIR= + + +# LLVM AMD GPU +def _find_rocm_sdk_path(compiler: str): + out = _gpu_compiler_version_output(compiler, "hip") + # InstalledDir + +# HIP +def _find_rocm_include_path(compiler: str): + # Found HIP installation + pass + +def _find_cuda_version(compiler: str): + # cuda, we can run nvcc --version + # 'Cuda compilation tools, release' + pass + +def _find_rocm_version(compiler: str): + # hip/amd sucks, we have to guess + # one of the following regexs in the compiler output, look in this order + ' roc-VERSION ' + '/rocm-VERSION/' + '/hip-VERSION/' + '/llvm-amdgpu-VERSION/' + '/rocm/VERSION/' + pass GPU_TYPES = { "nvidia": gpu_type(sdk_path_env="CHPL_CUDA_PATH", - sdk_path_env_bitcode="CHPL_CUDA_PATH", - sdk_path_env_include="CHPL_CUDA_PATH", compiler="nvcc", default_arch="sm_60", llvm_target="NVPTX", runtime_impl="cuda", + find_sdk_path=_find_cuda_sdk_path, + find_version=_find_cuda_version, version_validator=_validate_cuda_version, llvm_validator=_validate_cuda_llvm_version), "amd": gpu_type(sdk_path_env="CHPL_ROCM_PATH", - sdk_path_env_bitcode="CHPL_ROCM_BITCODE_PATH", - sdk_path_env_include="CHPL_ROCM_INCLUDE_PATH", compiler="hipcc", default_arch="", llvm_target="AMDGPU", runtime_impl="rocm", + find_sdk_path=_find_rocm_sdk_path, + find_version=_find_rocm_version, version_validator=_validate_rocm_version, llvm_validator=_validate_rocm_llvm_version), "cpu": gpu_type(sdk_path_env="", - sdk_path_env_bitcode="", - sdk_path_env_include="", compiler="", default_arch="", llvm_target="", runtime_impl="cpu", + find_sdk_path=lambda compiler: None, + find_version=lambda compiler: None, version_validator=lambda: None, llvm_validator=lambda: None), } @@ -157,6 +198,21 @@ def 
get_arch(): "for more information.".format(gpu_type)) return 'error' +@memoize +def get_gpu_compiler(): + gpu_type = get() + sdk_path = get_sdk_path(gpu_type) + if sdk_path == 'none': + return 'none' + + name = GPU_TYPES[gpu_type].compiler + bin_dir = os.path.join(sdk_path, 'bin') + full_path = os.path.join(bin_dir, name) + if not os.path.exists(full_path): + _reportMissingGpuReq("Could not find {} in {}".format(name, bin_dir)) + return full_path + + @memoize def get_sdk_path(for_gpu, sdk_type='bitcode'): gpu_type = get() @@ -188,20 +244,21 @@ def get_sdk_path(for_gpu, sdk_type='bitcode'): gpu.compiler]) if exists and returncode == 0: - # Walk up from directories from the one containing the gpu compiler - # (e.g. `nvcc` or `hipcc`) until we find a directory that starts with - # `runtime_impl` (e.g `cuda` or `rocm`) - # TODO: this logic does not seem to work for spack - real_path = os.path.realpath(my_stdout.strip()).strip() - path_parts = real_path.split("/") - chpl_sdk_path = "/" - for part in path_parts: - if len(part) == 0: continue - chpl_sdk_path += part - if not part.startswith(gpu.runtime_impl): - chpl_sdk_path += "/" - else: - break + pass + # # Walk up from directories from the one containing the gpu compiler + # # (e.g. `nvcc` or `hipcc`) until we find a directory that starts with + # # `runtime_impl` (e.g `cuda` or `rocm`) + # # TODO: this logic does not seem to work for spack + # real_path = os.path.realpath(my_stdout.strip()).strip() + # path_parts = real_path.split("/") + # chpl_sdk_path = "/" + # for part in path_parts: + # if len(part) == 0: continue + # chpl_sdk_path += part + # if not part.startswith(gpu.runtime_impl): + # chpl_sdk_path += "/" + # else: + # break # validate the SDK path found if not (os.path.exists(chpl_sdk_path) and os.path.isdir(chpl_sdk_path)): From 639cc496d604341e65bb456391a970291703018e Mon Sep 17 00:00:00 2001 From: Jade Abraham Date: Thu, 10 Oct 2024 14:14:24 -0500 Subject: [PATCH 10/21] finish main part of refactor Signed-off-by: Jade Abraham --- compiler/include/driver.h | 2 + compiler/llvm/clangUtil.cpp | 11 +- compiler/main/driver.cpp | 4 + util/chplenv/chpl_gpu.py | 305 +++++++++++++----------- util/chplenv/compile_link_args_utils.py | 2 +- util/chplenv/printchplenv.py | 8 + util/chplenv/utils.py | 5 +- 7 files changed, 182 insertions(+), 155 deletions(-) diff --git a/compiler/include/driver.h b/compiler/include/driver.h index 525cbdb7587b..eb1aeafb225e 100644 --- a/compiler/include/driver.h +++ b/compiler/include/driver.h @@ -153,6 +153,8 @@ extern const char* CHPL_TARGET_BUNDLED_LINK_ARGS; extern const char* CHPL_TARGET_SYSTEM_LINK_ARGS; extern const char* CHPL_CUDA_LIBDEVICE_PATH; +extern const char* CHPL_ROCM_LLVM_PATH; +extern const char* CHPL_ROCM_LIBDEVICE_PATH; extern const char* CHPL_GPU; extern const char* CHPL_GPU_ARCH; diff --git a/compiler/llvm/clangUtil.cpp b/compiler/llvm/clangUtil.cpp index d2730cd13442..2a393021ff89 100644 --- a/compiler/llvm/clangUtil.cpp +++ b/compiler/llvm/clangUtil.cpp @@ -4450,7 +4450,7 @@ static void linkGpuDeviceLibraries() { } else { // See for details // on what these various libraries are. 
- auto libPath = gGpuSdkPath + std::string("/amdgcn/bitcode"); + auto libPath = CHPL_ROCM_AMDGCN_PATH + std::string("/bitcode"); linkBitCodeFile((libPath + "/hip.bc").c_str()); linkBitCodeFile((libPath + "/ocml.bc").c_str()); linkBitCodeFile((libPath + "/ockl.bc").c_str()); @@ -4821,11 +4821,7 @@ static void makeBinaryLLVMForHIP(const std::string& artifactFilename, std::string inputs = "-inputs=/dev/null"; std::string outputs = "-outputs=" + fatbinFilename; #endif - auto sdkString = std::string(gGpuSdkPath); - // check file exists, maybe use alternate path (say if spack installed) - std::string lldBin = pathExists((sdkString + "/llvm/bin/lld").c_str()) ? - sdkString + "/llvm/bin/lld" : - sdkString + "/bin/lld"; + std::string lldBin = CHPL_ROCM_LLVM_PATH + std::string("/bin/lld"); for (auto& gpuArch : gpuArches) { std::string gpuObject = gpuObjFilename + "_" + gpuArch + ".o"; std::string gpuOut = outFilenamePrefix + "_" + gpuArch + ".out"; @@ -5051,7 +5047,8 @@ void makeBinaryLLVM(void) { gpuArgs += " -Wno-unknown-cuda-version"; } else if (getGpuCodegenType() == GpuCodegenType::GPU_CG_AMD_HIP) { - curPath = addToPATH(gGpuSdkPath + std::string("/llvm/bin")); + // use the AMD LLVM path, use separate var + curPath = addToPATH(CHPL_ROCM_LLVM_PATH + std::string("/bin")); } } diff --git a/compiler/main/driver.cpp b/compiler/main/driver.cpp index 17a0d6faf7c4..d98126bfc94a 100644 --- a/compiler/main/driver.cpp +++ b/compiler/main/driver.cpp @@ -130,6 +130,8 @@ const char* CHPL_TARGET_BUNDLED_LINK_ARGS = NULL; const char* CHPL_TARGET_SYSTEM_LINK_ARGS = NULL; const char* CHPL_CUDA_LIBDEVICE_PATH = NULL; +const char* CHPL_ROCM_LLVM_PATH = NULL; +const char* CHPL_ROCM_AMDGCN_PATH = NULL; const char* CHPL_GPU = NULL; const char* CHPL_GPU_ARCH = NULL; @@ -1824,6 +1826,8 @@ static void setChapelEnvs() { if (usingGpuLocaleModel()) { CHPL_CUDA_LIBDEVICE_PATH = envMap["CHPL_CUDA_LIBDEVICE_PATH"]; + CHPL_ROCM_LLVM_PATH = envMap["CHPL_ROCM_LLVM_PATH"]; + CHPL_ROCM_AMDGCN_PATH = envMap["CHPL_ROCM_AMDGCN_PATH"]; CHPL_GPU= envMap["CHPL_GPU"]; CHPL_GPU_ARCH = envMap["CHPL_GPU_ARCH"]; switch (getGpuCodegenType()) { diff --git a/util/chplenv/chpl_gpu.py b/util/chplenv/chpl_gpu.py index 2bb76b5a47c9..73ab5e7dde7b 100644 --- a/util/chplenv/chpl_gpu.py +++ b/util/chplenv/chpl_gpu.py @@ -19,23 +19,25 @@ def _validate_rocm_version(): class gpu_type: def __init__(self, sdk_path_env, - sdk_path_env_bitcode, - sdk_path_env_include, compiler, default_arch, llvm_target, runtime_impl, + find_sdk_path, + find_version, version_validator, - llvm_validator): + llvm_validator, + real_gpu): self.sdk_path_env = sdk_path_env - self.sdk_path_env_bitcode = sdk_path_env_bitcode - self.sdk_path_env_include = sdk_path_env_include self.compiler = compiler self.default_arch = default_arch self.llvm_target = llvm_target self.runtime_impl = runtime_impl + self.find_sdk_path = find_sdk_path + self.find_version = find_version self.version_validator = version_validator self.llvm_validator = llvm_validator + self.real_gpu = real_gpu def validate_sdk_version(self): return self.version_validator() @@ -50,7 +52,7 @@ def _validate_rocm_llvm_version(gpu: gpu_type): return _validate_rocm_llvm_version_impl(gpu) @memoize -def _gpu_compiler_version_output(compiler: str, lang: str): +def gpu_compiler_basic_compile(compiler: str, lang: str): dummy_main = "int main() { return 0; }" exists, returncode, stdout, _ = try_run_command([compiler, "-v", "-c", "-x", lang, "-", "-o", "/dev/null"], cmd_input=dummy_main, combine_output=True) if exists and returncode == 
0 and stdout: @@ -59,36 +61,77 @@ def _gpu_compiler_version_output(compiler: str, lang: str): return None def _find_cuda_sdk_path(compiler: str): - out = _gpu_compiler_version_output(compiler, "cu") - # #$ TOP= + out = gpu_compiler_basic_compile(compiler, "cu") + if not out: + return None + regex = r"^#\$ TOP=(.+)$" + match = re.search(regex, out, re.MULTILINE) + return match.group(1) if match else None - # find lib device: #$ NVVMIR_LIBRARY_DIR= +def find_llvm_amd_bin_path(compiler: str): + out = gpu_compiler_basic_compile(compiler, "hip") + if not out: + return None + regex = r"^InstalledDir: (.+)$" + match = re.search(regex, out, re.MULTILINE) + return match.group(1) if match else None -# LLVM AMD GPU -def _find_rocm_sdk_path(compiler: str): - out = _gpu_compiler_version_output(compiler, "hip") - # InstalledDir +def find_amdgcn_path(compiler: str): + out = gpu_compiler_basic_compile(compiler, "hip") + if not out: + return None + # find the builtin bitcode path + # this will likely appear many times, we just take the first occurrence + regex = r"-mlink-builtin-bitcode\s*(/.+?amdgcn/bitcode)" + match = re.search(regex, out) + if not match: + return None + full_path = match.group(1) + # strip the 'bitcode' part (and trailing '/') + return os.path.dirname(full_path) -# HIP -def _find_rocm_include_path(compiler: str): - # Found HIP installation - pass +def _find_hip_sdk_path(compiler: str): + out = gpu_compiler_basic_compile(compiler, "hip") + if not out: + return None + regex = r"^Found HIP installation: (.+)," + match = re.search(regex, out, re.MULTILINE) + return match.group(1) if match else None def _find_cuda_version(compiler: str): - # cuda, we can run nvcc --version + # we can run 'nvcc --version' # 'Cuda compilation tools, release' - pass + regex = r"Cuda compilation tools, release ([\d\.]+)" + + exists, returncode, out, _ = utils.try_run_command([compiler, "--version"]) + if not (exists and returncode == 0): + return None + + match = re.search(regex, out) + return match.group(1) if match else None def _find_rocm_version(compiler: str): - # hip/amd sucks, we have to guess - # one of the following regexs in the compiler output, look in this order - ' roc-VERSION ' - '/rocm-VERSION/' - '/hip-VERSION/' - '/llvm-amdgpu-VERSION/' - '/rocm/VERSION/' - pass + # hip/amd has less uniform version info, we have to guess + # use one of the following regexes in the compiler output in this order + sep = os.path.sep + regexes = [ + r"\broc-([\d\.]+)\b", + sep+r"rocm-([\d\.]+)"+sep, + sep+r"hip-([\d\.]+)", + sep+r"llvm-amdgpu-([\d\.]+)", + sep+r"rocm"+sep+r"([\d\.]+)"+sep, + ] + + out = gpu_compiler_basic_compile(compiler, "hip") + if not out: + return None + + for regex in regexes: + match = re.search(regex, out) + if match: + return match.group(1) + return None GPU_TYPES = { "nvidia": gpu_type(sdk_path_env="CHPL_CUDA_PATH", @@ -99,16 +142,18 @@ def _find_rocm_version(compiler: str): find_sdk_path=_find_cuda_sdk_path, find_version=_find_cuda_version, version_validator=_validate_cuda_version, - llvm_validator=_validate_cuda_llvm_version), + llvm_validator=_validate_cuda_llvm_version, + real_gpu=True), "amd": gpu_type(sdk_path_env="CHPL_ROCM_PATH", compiler="hipcc", default_arch="", llvm_target="AMDGPU", runtime_impl="rocm", - find_sdk_path=_find_rocm_sdk_path, + find_sdk_path=_find_hip_sdk_path, find_version=_find_rocm_version, version_validator=_validate_rocm_version, - llvm_validator=_validate_rocm_llvm_version), + llvm_validator=_validate_rocm_llvm_version, + real_gpu=True), "cpu": 
gpu_type(sdk_path_env="", compiler="", default_arch="", @@ -117,7 +162,18 @@ def _find_rocm_version(compiler: str): find_sdk_path=lambda compiler: None, find_version=lambda compiler: None, version_validator=lambda: None, - llvm_validator=lambda: None), + llvm_validator=lambda: None, + real_gpu=False), + "none": gpu_type(sdk_path_env="", + compiler="", + default_arch="", + llvm_target="", + runtime_impl="", + find_sdk_path=lambda compiler: None, + find_version=lambda compiler: None, + version_validator=lambda: None, + llvm_validator=lambda: None, + real_gpu=False) } @@ -150,7 +206,8 @@ def get_llvm_override(): if get() == 'amd': major_version = get_sdk_version().split('.')[0] if major_version == '5': - return '{}/llvm/bin/llvm-config'.format(get_sdk_path('amd')) + llvm_path = find_llvm_amd_bin_path(get_gpu_compiler()) + return '{}/llvm-config'.format(llvm_path) pass return 'none' @@ -214,66 +271,38 @@ def get_gpu_compiler(): @memoize -def get_sdk_path(for_gpu, sdk_type='bitcode'): - gpu_type = get() - +def get_sdk_path(for_gpu): # No SDK path if GPU is not being used. - if gpu_type in ('cpu', 'none'): + if not GPU_TYPES[get()].real_gpu: return 'none' - - # Check vendor-specific environment variable for SDK path gpu = GPU_TYPES[for_gpu] - sub_env_names = { - "bitcode": gpu.sdk_path_env_bitcode, - "include": gpu.sdk_path_env_include, - } - assert sdk_type in sub_env_names - - # get the sub env if it exists - chpl_sdk_path = os.environ.get(sub_env_names[sdk_type]) - if chpl_sdk_path: - return chpl_sdk_path - - # otherwise use the basic one + + def validate_path(p): + # TODO: for now, just do a simple validation that checks if the path exists + return (os.path.exists(p) and os.path.isdir(p)) + + # use user specify if given chpl_sdk_path = os.environ.get(gpu.sdk_path_env) if chpl_sdk_path: + if for_gpu == get() and not validate_path(chpl_sdk_path): + _reportMissingGpuReq( + "CHPL_GPU={} specified but SDK path '{}' does not exist." + .format(for_gpu, chpl_sdk_path)) + return 'error' return chpl_sdk_path - # try to find the SDK by running `which` on a vendor-specific program. - exists, returncode, my_stdout, my_stderr = utils.try_run_command(["which", - gpu.compiler]) - - if exists and returncode == 0: - pass - # # Walk up from directories from the one containing the gpu compiler - # # (e.g. `nvcc` or `hipcc`) until we find a directory that starts with - # # `runtime_impl` (e.g `cuda` or `rocm`) - # # TODO: this logic does not seem to work for spack - # real_path = os.path.realpath(my_stdout.strip()).strip() - # path_parts = real_path.split("/") - # chpl_sdk_path = "/" - # for part in path_parts: - # if len(part) == 0: continue - # chpl_sdk_path += part - # if not part.startswith(gpu.runtime_impl): - # chpl_sdk_path += "/" - # else: - # break - - # validate the SDK path found - if not (os.path.exists(chpl_sdk_path) and os.path.isdir(chpl_sdk_path)): + # find the sdk path from the compiler based on the one in PATH + sdk_path = gpu.find_sdk_path(gpu.compiler) + if sdk_path: + if not validate_path(sdk_path): _reportMissingGpuReq( - "Can't infer {} toolkit from '{}'. Try setting {}." - .format(get(), real_path, gpu.sdk_path_env) - ) + "Can't find {} SDK path from '{}'. Try setting {}." + .format(for_gpu, gpu.compiler, gpu.sdk_path_env)) return 'error' - - return chpl_sdk_path - elif gpu_type == for_gpu: - _reportMissingGpuReq(("Can't find {} toolkit. 
Try setting {} to the " + - "{} installation path.").format(gpu.runtime_impl, - gpu.sdk_path_env, - gpu.runtime_impl)) + return sdk_path + elif for_gpu == get(): + _reportMissingGpuReq("Can't infer {} toolkit from '{}'. Try setting {}." + .format(gpu.runtime_impl, gpu.compiler, gpu.sdk_path_env)) return 'error' else: return '' @@ -295,7 +324,7 @@ def get_runtime_compile_args(): system = [] gpu_type = get() - sdk_path = get_sdk_path(gpu_type, sdk_type='include') + sdk_path = get_sdk_path(gpu_type) incl = chpl_home_utils.get_chpl_runtime_incl() # this -D is needed since it affects code inside of headers @@ -325,7 +354,7 @@ def get_runtime_link_args(): system = [] gpu_type = get() - sdk_path = get_sdk_path(gpu_type, sdk_type='include') + sdk_path = get_sdk_path(gpu_type) if gpu_type == "nvidia": system.append("-L" + os.path.join(sdk_path, "lib64")) @@ -344,30 +373,60 @@ def get_runtime_link_args(): return bundled, system - - - - def get_cuda_libdevice_path(): if get() == 'nvidia': + # TODO: # find lib device: #$ NVVMIR_LIBRARY_DIR= # TODO this only makes sense when we are generating for nvidia chpl_cuda_path = get_sdk_path('nvidia') - + print(chpl_cuda_path) + compiler = get_gpu_compiler() + out = gpu_compiler_basic_compile(compiler, "cu") + if not out: + _reportMissingGpuReq("Can't find libdevice. Please make sure your CHPL_CUDA_PATH is " "set such that CHPL_CUDA_PATH points to the CUDA installation.") + return "error" + regex = r"^#\$ NVVMIR_LIBRARY_DIR=(.+)$" + match = re.search(regex, out, re.MULTILINE) + if not match: + _reportMissingGpuReq("Can't find libdevice. Please make sure your CHPL_CUDA_PATH is " + "set such that CHPL_CUDA_PATH points to the CUDA installation.") + return 'error' + libdevice_path = match.group(1) # there can be multiple libdevices for multiple compute architectures. Not # sure how realistic that is, nor I see multiple instances in the systems I # have access to. They are always named `libdevice.10.bc`, but I just want # to be sure here. - path_part = "/nvvm/libdevice/libdevice*.bc" - libdevices = glob.glob(chpl_cuda_path+path_part) + libdevices = glob.glob(os.path.join(libdevice_path, "libdevice.*.bc")) if len(libdevices) == 0: _reportMissingGpuReq("Can't find libdevice. Please make sure your CHPL_CUDA_PATH is " - "set such that CHPL_CUDA_PATH{} exists.".format(path_part)) + "set such that CHPL_CUDA_PATH{} exists.") return 'error' else: return libdevices[0] return "none" +def get_rocm_llvm_path(): + if get() == 'amd': + compiler = get_gpu_compiler() + llvm_path = find_llvm_amd_bin_path(compiler) + if not llvm_path: + _reportMissingGpuReq("Could not find llvm-amd in {}".format(compiler)) + return 'error' + # strip bin path component (and trailing /) + return os.path.dirname(llvm_path) + return 'none' + +def get_rocm_amdgcn_path(): + if get() == 'amd': + compiler = get_gpu_compiler() + amdgcn_path = find_amdgcn_path(compiler) + if not amdgcn_path: + _reportMissingGpuReq("Could not find amdgcn in {}".format(compiler)) + return 'error' + return amdgcn_path + return 'none' + + def validateLlvmBuiltForTgt(expectedTgt): # If we're using the bundled LLVM, llvm-config may not have been built # before we call chplenv. 
It seems safe to assume the bundled LLVM has been @@ -452,57 +511,13 @@ def _validate_cuda_version_impl(): return True def get_sdk_version(): - version = 'none' - if get() == 'amd': - chpl_rocm_path = get_sdk_path('amd', sdk_type='include') - files_to_try = ['%s/.info/version-hiprt' % chpl_rocm_path, - '%s/.info/version-libs' % chpl_rocm_path] - - version_filename = None - for fname in files_to_try: - if os.path.exists(fname): - version_filename = fname - break - - rocm_version = None - if version_filename is not None: - rocm_version = open(version_filename).read() - else: - exists, returncode, my_stdout, my_stderr = utils.try_run_command( - ["hipcc", "--version"]) - if exists and returncode == 0: - match = re.search(r"rocm?-([\d\.]+)", my_stdout) - if match: - rocm_version = match.group(1) - else: - match = re.search(r"llvm-amdgpu-([\d\.]+)", my_stdout) - if match: - rocm_version = match.group(1) - version = rocm_version - elif get() == 'nvidia': - chpl_cuda_path = get_sdk_path('nvidia') - version_file_json = '%s/version.json' % chpl_cuda_path - version_file_txt = '%s/version.txt' % chpl_cuda_path - cuda_version = None - if os.path.exists(version_file_json): - f = open(version_file_json) - version_json = json.load(f) - f.close() - cuda_version = version_json["cuda"]["version"] - elif os.path.exists(version_file_txt): - txt = open(version_file_txt).read() - match = re.search(r'\d+\.\d+\.\d+', txt) - if match: - cuda_version = match.group() - if cuda_version is None: - exists, returncode, my_stdout, my_stderr = utils.try_run_command( - ["nvcc", "--version"]) - if exists and returncode == 0: - pattern = r"Cuda compilation tools, release ([\d\.]+)" - match = re.search(pattern, my_stdout) - if match: - cuda_version = match.group(1) - version = cuda_version + + gpu = GPU_TYPES[get()] + if not gpu.real_gpu: + return 'none' + version = gpu.find_version(get_gpu_compiler()) + # TODO add validation for if the compiler matches what the user set CHPL_GPU_SDK_VERSION? 
+
     version = version.strip() if version is not None else 'none'
     return version
@@ -517,7 +532,7 @@ def _validate_rocm_version_impl():
     rocm_version = get_sdk_version()
 
-    if rocm_version is None:
+    if rocm_version == 'none':
         _reportMissingGpuReq("Unable to determine ROCm version.")
         return False
 
diff --git a/util/chplenv/compile_link_args_utils.py b/util/chplenv/compile_link_args_utils.py
index ff2353f4b2b6..69794c07e186 100644
--- a/util/chplenv/compile_link_args_utils.py
+++ b/util/chplenv/compile_link_args_utils.py
@@ -50,7 +50,7 @@ def get_runtime_includes_and_defines():
         # this is needed since it affects code inside of headers
         bundled.append("-DCHPL_COMM_DEBUG")
 
-    gpu_bundled, gpu_system = chpl_gpu.get_runtime_includes_and_defines()
+    gpu_bundled, gpu_system = chpl_gpu.get_runtime_compile_args()
     bundled.extend(gpu_bundled)
     system.extend(gpu_system)
diff --git a/util/chplenv/printchplenv.py b/util/chplenv/printchplenv.py
index d74bc77c3f0e..737be8c5d9bb 100755
--- a/util/chplenv/printchplenv.py
+++ b/util/chplenv/printchplenv.py
@@ -107,6 +107,8 @@
     ChapelEnv(' CHPL_CUDA_PATH', INTERNAL),
     ChapelEnv(' CHPL_ROCM_PATH', INTERNAL),
     ChapelEnv(' CHPL_CUDA_LIBDEVICE_PATH', INTERNAL),
+    ChapelEnv(' CHPL_ROCM_LLVM_PATH', INTERNAL),
+    ChapelEnv(' CHPL_ROCM_AMDGCN_PATH', INTERNAL),
     ChapelEnv('CHPL_COMM', RUNTIME | LAUNCHER | DEFAULT, 'comm'),
     ChapelEnv(' CHPL_COMM_SUBSTRATE', RUNTIME | LAUNCHER | DEFAULT),
     ChapelEnv(' CHPL_GASNET_SEGMENT', RUNTIME | LAUNCHER | DEFAULT),
@@ -205,6 +207,8 @@ def compute_all_values():
     ENV_VALS[' CHPL_GPU'] = chpl_gpu.get()
     ENV_VALS[' CHPL_GPU_SDK_VERSION'] = chpl_gpu.get_sdk_version()
     ENV_VALS[' CHPL_CUDA_LIBDEVICE_PATH'] = chpl_gpu.get_cuda_libdevice_path()
+    ENV_VALS[' CHPL_ROCM_LLVM_PATH'] = chpl_gpu.get_rocm_llvm_path()
+    ENV_VALS[' CHPL_ROCM_AMDGCN_PATH'] = chpl_gpu.get_rocm_amdgcn_path()
     ENV_VALS[' CHPL_GPU_MEM_STRATEGY'] = chpl_gpu.get_gpu_mem_strategy()
     ENV_VALS['CHPL_COMM'] = chpl_comm.get()
     ENV_VALS[' CHPL_COMM_SUBSTRATE'] = chpl_comm_substrate.get()
@@ -369,6 +373,10 @@ def filter_tidy(chpl_env):
         return gpu == 'nvidia'
     elif chpl_env.name == ' CHPL_CUDA_LIBDEVICE_PATH':
         return gpu == 'nvidia'
+    elif chpl_env.name == ' CHPL_ROCM_LLVM_PATH':
+        return gpu == 'amd'
+    elif chpl_env.name == ' CHPL_ROCM_AMDGCN_PATH':
+        return gpu == 'amd'
     elif chpl_env.name == ' CHPL_ROCM_PATH':
         return gpu == 'amd'
     elif chpl_env.name == ' CHPL_GPU_ARCH':
diff --git a/util/chplenv/utils.py b/util/chplenv/utils.py
index d945be0c0125..2ade628227b9 100644
--- a/util/chplenv/utils.py
+++ b/util/chplenv/utils.py
@@ -70,8 +70,9 @@ def try_run_command(command, cmd_input=None, combine_output=False):
         return (False, 0, None, None)
     byte_cmd_input = str.encode(cmd_input, "utf-8") if cmd_input else None
     output = process.communicate(input=byte_cmd_input)
-    return (True, process.returncode, output[0].decode("utf-8"),
-            output[1].decode("utf-8"))
+    my_stdout = output[0].decode("utf-8")
+    my_stderr = output[1].decode("utf-8") if output[1] is not None else None
+    return (True, process.returncode, my_stdout, my_stderr)

From 99772abe3df245239812419df628841ae108ec85 Mon Sep 17 00:00:00 2001
From: Jade Abraham
Date: Thu, 10 Oct 2024 14:50:18 -0500
Subject: [PATCH 11/21] fix var name

Signed-off-by: Jade Abraham

---
 compiler/include/driver.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/compiler/include/driver.h b/compiler/include/driver.h
index eb1aeafb225e..31233d9b6bca 100644
--- a/compiler/include/driver.h
+++ 
b/compiler/include/driver.h @@ -154,7 +154,7 @@ extern const char* CHPL_TARGET_SYSTEM_LINK_ARGS; extern const char* CHPL_CUDA_LIBDEVICE_PATH; extern const char* CHPL_ROCM_LLVM_PATH; -extern const char* CHPL_ROCM_LIBDEVICE_PATH; +extern const char* CHPL_ROCM_AMDGCN_PATH; extern const char* CHPL_GPU; extern const char* CHPL_GPU_ARCH; From b9b1483b073b6929519aa91198dea2e23a314560 Mon Sep 17 00:00:00 2001 From: Jade Abraham Date: Thu, 10 Oct 2024 15:09:54 -0500 Subject: [PATCH 12/21] cleanups Signed-off-by: Jade Abraham --- util/chplenv/chpl_gpu.py | 21 +++++---------------- 1 file changed, 5 insertions(+), 16 deletions(-) diff --git a/util/chplenv/chpl_gpu.py b/util/chplenv/chpl_gpu.py index 73ab5e7dde7b..b48096a1908b 100644 --- a/util/chplenv/chpl_gpu.py +++ b/util/chplenv/chpl_gpu.py @@ -1,7 +1,5 @@ -import utils import os import glob -import json import chpl_locale_model import chpl_platform import chpl_llvm @@ -9,7 +7,7 @@ import re import chpl_tasks import chpl_home_utils -from utils import error, warning, memoize, run_command, try_run_command, which, is_ver_in_range +from utils import error, warning, memoize, try_run_command, which, is_ver_in_range def _validate_cuda_version(): return _validate_cuda_version_impl() @@ -104,7 +102,7 @@ def _find_cuda_version(compiler: str): # 'Cuda compilation tools, release' regex = r"Cuda compilation tools, release ([\d\.]+)" - exists, returncode, out, _ = utils.try_run_command([compiler, "--version"]) + exists, returncode, out, _ = try_run_command([compiler, "--version"]) if not (exists and returncode == 0): return None @@ -281,7 +279,7 @@ def validate_path(p): # TODO: for now, just do a simple validation that checks if the path exists return (os.path.exists(p) and os.path.isdir(p)) - # use user specify if given + # use user specified if given chpl_sdk_path = os.environ.get(gpu.sdk_path_env) if chpl_sdk_path: if for_gpu == get() and not validate_path(chpl_sdk_path): @@ -375,7 +373,6 @@ def get_runtime_link_args(): def get_cuda_libdevice_path(): if get() == 'nvidia': - # TODO: # find lib device: #$ NVVMIR_LIBRARY_DIR= # TODO this only makes sense when we are generating for nvidia chpl_cuda_path = get_sdk_path('nvidia') print(chpl_cuda_path) @@ -435,7 +432,7 @@ def validateLlvmBuiltForTgt(expectedTgt): if chpl_llvm.get() == 'bundled': return True - exists, returncode, my_stdout, my_stderr = utils.try_run_command( + exists, returncode, my_stdout, _ = try_run_command( [chpl_llvm.get_llvm_config(), "--targets-built"]) if not exists or returncode != 0: @@ -511,13 +508,11 @@ def _validate_cuda_version_impl(): return True def get_sdk_version(): - gpu = GPU_TYPES[get()] if not gpu.real_gpu: return 'none' version = gpu.find_version(get_gpu_compiler()) - # TODO add validation for if the compiler matches what the user set CHPL_GPU_SDK_VERSION? - + # TODO: add validation for if the compiler matches what the user set CHPL_GPU_SDK_VERSION? version = version.strip() if version is not None else 'none' return version @@ -568,10 +563,4 @@ def validate(chplLocaleModel): gpu.validate_llvm() - for depr_env in ("CHPL_GPU_CODEGEN", "CHPL_GPU_RUNTIME"): - if os.environ.get(depr_env): - warning(depr_env + " is deprecated and now ignored. 
Please use " + - "'CHPL_GPU=[nvidia|amd|cpu]' to choose a GPU target " + - "explicitly.") - return True From a999c0e46fb653c4992df61d919aa7a5eb7b900f Mon Sep 17 00:00:00 2001 From: Jade Abraham Date: Thu, 10 Oct 2024 16:11:15 -0500 Subject: [PATCH 13/21] fix chplenv test Signed-off-by: Jade Abraham --- modules/standard/ChplConfig.chpl | 1 - test/compflags/albrecht/chplenv/chplenv.chpl | 1 + 2 files changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/standard/ChplConfig.chpl b/modules/standard/ChplConfig.chpl index cda29ca67483..1e21654bc79e 100644 --- a/modules/standard/ChplConfig.chpl +++ b/modules/standard/ChplConfig.chpl @@ -194,7 +194,6 @@ module ChplConfig { param CHPL_GPU_SDK_VERSION:string; CHPL_GPU_SDK_VERSION = __primitive("get compiler variable", "CHPL_GPU_SDK_VERSION"); - @chpldoc.nodoc @unstable("'ChplConfig.CHPL_LIB_PIC' is unstable and may be replaced with a different way to access this information in the future") param CHPL_LIB_PIC: string; diff --git a/test/compflags/albrecht/chplenv/chplenv.chpl b/test/compflags/albrecht/chplenv/chplenv.chpl index 1af9ae1a0b2c..358bd8645122 100644 --- a/test/compflags/albrecht/chplenv/chplenv.chpl +++ b/test/compflags/albrecht/chplenv/chplenv.chpl @@ -8,6 +8,7 @@ writeln("CHPL_TARGET_ARCH=",CHPL_TARGET_ARCH); writeln("CHPL_TARGET_CPU=",CHPL_TARGET_CPU); writeln("CHPL_LOCALE_MODEL=",CHPL_LOCALE_MODEL); writeln("CHPL_GPU=", CHPL_GPU); +writeln("CHPL_GPU_SDK_VERSION=", CHPL_GPU_SDK_VERSION); writeln("CHPL_GPU_MEM_STRATEGY=", CHPL_GPU_MEM_STRATEGY); writeln("CHPL_COMM=",CHPL_COMM); writeln("CHPL_COMM_SUBSTRATE=",CHPL_COMM_SUBSTRATE); From 4af3cf8494db7dc039092e472f629f3b54ba0329 Mon Sep 17 00:00:00 2001 From: Jade Abraham Date: Mon, 14 Oct 2024 09:30:01 -0500 Subject: [PATCH 14/21] remove extra code Signed-off-by: Jade Abraham --- util/chplenv/chpl_gpu.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/util/chplenv/chpl_gpu.py b/util/chplenv/chpl_gpu.py index b48096a1908b..5da3f6bc7c0c 100644 --- a/util/chplenv/chpl_gpu.py +++ b/util/chplenv/chpl_gpu.py @@ -374,8 +374,6 @@ def get_runtime_link_args(): def get_cuda_libdevice_path(): if get() == 'nvidia': # TODO this only makes sense when we are generating for nvidia - chpl_cuda_path = get_sdk_path('nvidia') - print(chpl_cuda_path) compiler = get_gpu_compiler() out = gpu_compiler_basic_compile(compiler, "cu") if not out: From 50c283fa19acb701ed7a3d496555cbf2913e1aa4 Mon Sep 17 00:00:00 2001 From: Jade Abraham Date: Mon, 14 Oct 2024 09:30:16 -0500 Subject: [PATCH 15/21] fix finding gcc prefix when command fails Signed-off-by: Jade Abraham --- util/chplenv/chpl_llvm.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/util/chplenv/chpl_llvm.py b/util/chplenv/chpl_llvm.py index b2e6735b3b1b..2d710057244f 100755 --- a/util/chplenv/chpl_llvm.py +++ b/util/chplenv/chpl_llvm.py @@ -676,7 +676,7 @@ def get_gcc_prefix_dir(): else: # Try to figure out the GCC prefix by running gcc out, err = run_command(['gcc', '-v'], stdout=True, stderr=True) - out = out + err + out = (out or '') + (err or '') # look for the --prefix= specified when GCC was configured words = out.split() From 9b0973eca266456b30fb654cf422c753d7e32fbc Mon Sep 17 00:00:00 2001 From: Jade Abraham Date: Mon, 14 Oct 2024 09:34:50 -0500 Subject: [PATCH 16/21] cleanup finding libdevice Signed-off-by: Jade Abraham --- util/chplenv/chpl_gpu.py | 40 +++++++++++++++++++--------------------- 1 file changed, 19 insertions(+), 21 deletions(-) diff --git a/util/chplenv/chpl_gpu.py b/util/chplenv/chpl_gpu.py index 
5da3f6bc7c0c..a407beb6e30e 100644
--- a/util/chplenv/chpl_gpu.py
+++ b/util/chplenv/chpl_gpu.py
@@ -66,6 +66,22 @@ def _find_cuda_sdk_path(compiler: str):
     match = re.search(regex, out, re.MULTILINE)
     return match.group(1) if match else None
 
+def find_cuda_libdevice_path(compiler: str):
+    out = gpu_compiler_basic_compile(compiler, "cu")
+    if not out:
+        return None
+    regex = r"^#\$ NVVMIR_LIBRARY_DIR=(.+)$"
+    match = re.search(regex, out, re.MULTILINE)
+    if not match:
+        return None
+    libdevice_path = match.group(1)
+    # there can be multiple libdevices for multiple compute architectures. Not
+    # sure how realistic that is, nor I see multiple instances in the systems I
+    # have access to. They are always named `libdevice.10.bc`, but I just want
+    # to be sure here.
+    libdevices = glob.glob(os.path.join(libdevice_path, "libdevice.*.bc"))
+    return libdevices[0] if len(libdevices) > 0 else None
+
 def find_llvm_amd_bin_path(compiler: str):
     out = gpu_compiler_basic_compile(compiler, "hip")
 
@@ -375,29 +391,11 @@ def get_cuda_libdevice_path():
     if get() == 'nvidia':
         # TODO this only makes sense when we are generating for nvidia
         compiler = get_gpu_compiler()
-        out = gpu_compiler_basic_compile(compiler, "cu")
-        if not out:
+        libdevice = find_cuda_libdevice_path(compiler)
+        if not libdevice:
             _reportMissingGpuReq("Can't find libdevice. Please make sure your CHPL_CUDA_PATH is " "set such that CHPL_CUDA_PATH points to the CUDA installation.")
-            return "error"
-        regex = r"^#\$ NVVMIR_LIBRARY_DIR=(.+)$"
-        match = re.search(regex, out, re.MULTILINE)
-        if not match:
-            _reportMissingGpuReq("Can't find libdevice. Please make sure your CHPL_CUDA_PATH is "
-                                 "set such that CHPL_CUDA_PATH points to the CUDA installation.")
-            return 'error'
-        libdevice_path = match.group(1)
-        # there can be multiple libdevices for multiple compute architectures. Not
-        # sure how realistic that is, nor I see multiple instances in the systems I
-        # have access to. They are always named `libdevice.10.bc`, but I just want
-        # to be sure here.
-        libdevices = glob.glob(os.path.join(libdevice_path, "libdevice.*.bc"))
-        if len(libdevices) == 0:
-            _reportMissingGpuReq("Can't find libdevice. Please make sure your CHPL_CUDA_PATH is "
-                                 "set such that CHPL_CUDA_PATH{} exists.")
             return 'error'
-        else:
-            return libdevices[0]
-
+        return libdevice
     return "none"
 
 def get_rocm_llvm_path():

From b8299dea16e33cc7951fc1e8f34041a8805e2f76 Mon Sep 17 00:00:00 2001
From: Jade Abraham
Date: Mon, 14 Oct 2024 09:50:52 -0500
Subject: [PATCH 17/21] resolve TODO, as get() only returns nvidia if
 LOCALE_MODEL is flat

Signed-off-by: Jade Abraham
---
 util/chplenv/chpl_gpu.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/util/chplenv/chpl_gpu.py b/util/chplenv/chpl_gpu.py
index a407beb6e30e..a750b818b115 100644
--- a/util/chplenv/chpl_gpu.py
+++ b/util/chplenv/chpl_gpu.py
@@ -389,7 +389,6 @@ def get_runtime_link_args():
 
 def get_cuda_libdevice_path():
     if get() == 'nvidia':
-        # TODO this only makes sense when we are generating for nvidia
         compiler = get_gpu_compiler()
         libdevice = find_cuda_libdevice_path(compiler)
         if not libdevice:

From 99b2bb952a7db0524b21f64268f260f92f2dc40d Mon Sep 17 00:00:00 2001
From: Jade Abraham
Date: Mon, 14 Oct 2024 10:28:09 -0500
Subject: [PATCH 18/21] cleanup errors

Signed-off-by: Jade Abraham
---
 util/chplenv/chpl_gpu.py | 23 ++++++++++++++---------
 1 file changed, 14 insertions(+), 9 deletions(-)

diff --git a/util/chplenv/chpl_gpu.py b/util/chplenv/chpl_gpu.py
index a750b818b115..2730438a935c 100644
--- a/util/chplenv/chpl_gpu.py
+++ b/util/chplenv/chpl_gpu.py
@@ -280,7 +280,7 @@ def get_gpu_compiler():
     bin_dir = os.path.join(sdk_path, 'bin')
     full_path = os.path.join(bin_dir, name)
     if not os.path.exists(full_path):
-        _reportMissingGpuReq("Could not find {} in {}".format(name, bin_dir))
+        _reportMissingGpuReq("Can't find {} in '{}'".format(name, bin_dir))
     return full_path
 
 
@@ -300,8 +300,8 @@ def validate_path(p):
     if chpl_sdk_path:
         if for_gpu == get() and not validate_path(chpl_sdk_path):
             _reportMissingGpuReq(
-                "CHPL_GPU={} specified but SDK path '{}' does not exist."
-                .format(for_gpu, chpl_sdk_path))
+                "{}='{}' does not exist. Make sure the toolkit path is correct."
+                .format(gpu.sdk_path_env, chpl_sdk_path))
             return 'error'
         return chpl_sdk_path
 
@@ -310,8 +310,8 @@ def validate_path(p):
     if sdk_path:
         if not validate_path(sdk_path):
             _reportMissingGpuReq(
-                "Can't find {} SDK path from '{}'. Try setting {}."
-                .format(for_gpu, gpu.compiler, gpu.sdk_path_env))
+                "Inferred {} toolkit is not valid. Try setting {}."
+                .format(gpu.runtime_impl, gpu.sdk_path_env))
             return 'error'
         return sdk_path
     elif for_gpu == get():
@@ -319,7 +319,12 @@ def validate_path(p):
         .format(gpu.runtime_impl, gpu.compiler, gpu.sdk_path_env))
         return 'error'
     else:
-        return ''
+        return 'none'
+
+@memoize
+def is_sdk_path_user_specified(for_gpu):
+    return os.environ.get(GPU_TYPES[for_gpu].sdk_path_env) is not None
+
 
 def get_gpu_mem_strategy():
     memtype = os.environ.get("CHPL_GPU_MEM_STRATEGY")
@@ -392,7 +397,7 @@ def get_cuda_libdevice_path():
         compiler = get_gpu_compiler()
         libdevice = find_cuda_libdevice_path(compiler)
         if not libdevice:
-            _reportMissingGpuReq("Can't find libdevice. Please make sure your CHPL_CUDA_PATH is " "set such that CHPL_CUDA_PATH points to the CUDA installation.")
+            _reportMissingGpuReq("Can't determine libdevice path from {}".format(compiler))
             return 'error'
         return libdevice
     return "none"
@@ -402,7 +407,7 @@ def get_rocm_llvm_path():
         compiler = get_gpu_compiler()
         llvm_path = find_llvm_amd_bin_path(compiler)
         if not llvm_path:
-            _reportMissingGpuReq("Could not find llvm-amd in {}".format(compiler))
+            _reportMissingGpuReq("Can't determine AMD LLVM path from {}".format(compiler))
             return 'error'
         # strip bin path component (and trailing /)
         return os.path.dirname(llvm_path)
@@ -413,7 +418,7 @@ def get_rocm_amdgcn_path():
         compiler = get_gpu_compiler()
         amdgcn_path = find_amdgcn_path(compiler)
         if not amdgcn_path:
-            _reportMissingGpuReq("Could not find amdgcn in {}".format(compiler))
+            _reportMissingGpuReq("Can't determine amdgcn path from {}".format(compiler))
             return 'error'
         return amdgcn_path
     return 'none'

From 83ded41646da103d41347c8a1587c8e9b6df5a93 Mon Sep 17 00:00:00 2001
From: Jade Abraham
Date: Mon, 14 Oct 2024 10:54:38 -0500
Subject: [PATCH 19/21] fix printchplbuilds parsing

Signed-off-by: Jade Abraham
---
 util/chplenv/printchplbuilds.py | 1 +
 util/chplenv/printchplenv.py    | 4 ++--
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/util/chplenv/printchplbuilds.py b/util/chplenv/printchplbuilds.py
index c51f24e8d7f2..fb400bf0a4f0 100755
--- a/util/chplenv/printchplbuilds.py
+++ b/util/chplenv/printchplbuilds.py
@@ -45,6 +45,7 @@ class State(Enum):
     'cpu': 'CHPL_TARGET_CPU',
     'loc': 'CHPL_LOCALE_MODEL',
     'gpu': 'CHPL_GPU',
+    'gpu_vers': 'CHPL_GPU_SDK_VERSION',
     'gpu_mem': 'CHPL_GPU_MEM_STRATEGY',
     'comm': 'CHPL_COMM',
     'tasks': 'CHPL_TASKS',
diff --git a/util/chplenv/printchplenv.py b/util/chplenv/printchplenv.py
index 737be8c5d9bb..757a2b479518 100755
--- a/util/chplenv/printchplenv.py
+++ b/util/chplenv/printchplenv.py
@@ -101,9 +101,9 @@
     ChapelEnv('CHPL_TARGET_BACKEND_CPU', INTERNAL),
     ChapelEnv('CHPL_LOCALE_MODEL', RUNTIME | LAUNCHER | DEFAULT, 'loc'),
     ChapelEnv(' CHPL_GPU', RUNTIME | DEFAULT, 'gpu'),
-    ChapelEnv(' CHPL_GPU_SDK_VERSION', RUNTIME),
+    ChapelEnv(' CHPL_GPU_SDK_VERSION', RUNTIME, 'gpu_vers'),
     ChapelEnv(' CHPL_GPU_ARCH', INTERNAL),
-    ChapelEnv(' CHPL_GPU_MEM_STRATEGY', RUNTIME , 'gpu_mem' ),
+    ChapelEnv(' CHPL_GPU_MEM_STRATEGY', RUNTIME , 'gpu_mem'),
     ChapelEnv(' CHPL_CUDA_PATH', INTERNAL),
     ChapelEnv(' CHPL_ROCM_PATH', INTERNAL),
     ChapelEnv(' CHPL_CUDA_LIBDEVICE_PATH', INTERNAL),

From 86b7aa1a7485be7737ff495c513cb9820ded45b8 Mon Sep 17 00:00:00 2001
From: Jade Abraham
Date: Mon, 14 Oct 2024 12:05:54 -0500
Subject: [PATCH 20/21] make rocm llvm path more robust

Signed-off-by: Jade Abraham
---
 runtime/src/gpu/amd/Makefile.share | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/runtime/src/gpu/amd/Makefile.share b/runtime/src/gpu/amd/Makefile.share
index 035e7d135eab..0f7cf7b363ab 100644
--- a/runtime/src/gpu/amd/Makefile.share
+++ b/runtime/src/gpu/amd/Makefile.share
@@ -28,4 +28,4 @@ RUNTIME_CFLAGS += -Wno-strict-prototypes
 
 $(RUNTIME_OBJ_DIR)/gpu-amd-cub.o: gpu-amd-cub.cc \
 	$(RUNTIME_OBJ_DIR_STAMP)
-	PATH=$(PATH):$(CHPL_MAKE_ROCM_PATH)/llvm/bin $(CXX) -c -std=c++17 $(RUNTIME_CXXFLAGS) $(RUNTIME_INCLS) -o $@ $<
+	PATH=$(PATH):$(CHPL_MAKE_ROCM_LLVM_PATH)/bin $(CXX) -c -std=c++17 $(RUNTIME_CXXFLAGS) $(RUNTIME_INCLS) -o $@ $<

From 67f1c887687dcccd2802b82dea3e49b4a50fe16e Mon Sep 17 00:00:00 2001
From: Jade Abraham
Date: Mon, 14 Oct 2024 13:17:23 -0500
Subject: [PATCH 21/21] replace rocm-version.h with `-DROCM_VERSION_MAJOR=...`

Signed-off-by: Jade Abraham
---
 runtime/Makefile                              | 11 --------
 runtime/include/gpu/amd/chpl-gpu-dev-reduce.h |  1 -
 runtime/include/gpu/amd/rocm-version.h        | 26 -------------------
 runtime/src/gpu/amd/gpu-amd-cub.cc            |  1 -
 runtime/src/gpu/amd/gpu-amd.c                 |  1 -
 util/chplenv/chpl_gpu.py                      |  3 +++
 6 files changed, 3 insertions(+), 40 deletions(-)
 delete mode 100644 runtime/include/gpu/amd/rocm-version.h

diff --git a/runtime/Makefile b/runtime/Makefile
index 4526b28b6520..9d422e075ced 100644
--- a/runtime/Makefile
+++ b/runtime/Makefile
@@ -64,17 +64,6 @@ $(CHPL_ENV_HEADER): $(CHPL_MAKE_HOME)/util/printchplenv $(CHPL_MAKE_HOME)/util/c
 		sed 's/^ *//;s/ *$$//' | \
 		sed 's/[^0-9A-Za-z]/_/g' | \
 		awk '{ print "#define " toupper($$1) }' >> $(CHPL_ENV_HEADER)
-	@$(CHPL_MAKE_HOME)/util/printchplenv --only CHPL_GPU_SDK_VERSION --value | \
-		grep -q none && \
-		echo "#define CHPL_GPU_SDK_VERSION_MAJOR 0" >> $(CHPL_ENV_HEADER) && \
-		echo "#define CHPL_GPU_SDK_VERSION_MINOR 0" >> $(CHPL_ENV_HEADER) && \
-		echo "#define CHPL_GPU_SDK_VERSION_PATCH 0" >> $(CHPL_ENV_HEADER) \
-		|| \
-		$(CHPL_MAKE_HOME)/util/printchplenv --only CHPL_GPU_SDK_VERSION --value | \
-		sed 's/\./ /g' | \
-		awk '{ print "#define CHPL_GPU_SDK_VERSION_MAJOR " $$1; \
-		       print "#define CHPL_GPU_SDK_VERSION_MINOR " $$2; \
-		       print "#define CHPL_GPU_SDK_VERSION_PATCH " $$3 }' >> $(CHPL_ENV_HEADER)
 	@echo "#endif /* _CHPL_ENV_GEN_H_ */" >> $(CHPL_ENV_HEADER)
 
 THIRD_PARTY_PKGS = $(shell $(CHPL_MAKE_PYTHON) $(CHPL_MAKE_HOME)/util/chplenv/third-party-pkgs)
diff --git a/runtime/include/gpu/amd/chpl-gpu-dev-reduce.h b/runtime/include/gpu/amd/chpl-gpu-dev-reduce.h
index 83d0b166414a..ca1aef0759da 100644
--- a/runtime/include/gpu/amd/chpl-gpu-dev-reduce.h
+++ b/runtime/include/gpu/amd/chpl-gpu-dev-reduce.h
@@ -25,7 +25,6 @@
 #include "chpltypes.h"
 #include
 #include
-#include "gpu/amd/rocm-version.h"
 #include "gpu/amd/rocm-utils.h"
 
 #if ROCM_VERSION_MAJOR >= 5
diff --git a/runtime/include/gpu/amd/rocm-version.h b/runtime/include/gpu/amd/rocm-version.h
deleted file mode 100644
index 8da037d82da0..000000000000
--- a/runtime/include/gpu/amd/rocm-version.h
+++ /dev/null
@@ -1,26 +0,0 @@
-/*
- * Copyright 2020-2024 Hewlett Packard Enterprise Development LP
- * Copyright 2004-2019 Cray Inc.
- * Other additional copyright holders may be indicated within.
- *
- * The entirety of this work is licensed under the Apache License,
- * Version 2.0 (the "License"); you may not use this file except
- * in compliance with the License.
- *
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef __HIP_PLATFORM_AMD__
-#define __HIP_PLATFORM_AMD__
-#endif
-
-// CHPL_GPU_SDK_VERSION_MAJOR is determined by printchplenv
-#define ROCM_VERSION_MAJOR CHPL_GPU_SDK_VERSION_MAJOR
diff --git a/runtime/src/gpu/amd/gpu-amd-cub.cc b/runtime/src/gpu/amd/gpu-amd-cub.cc
index 35adda537201..9b0f437a9e57 100644
--- a/runtime/src/gpu/amd/gpu-amd-cub.cc
+++ b/runtime/src/gpu/amd/gpu-amd-cub.cc
@@ -21,7 +21,6 @@
 
 #include
 
-#include "gpu/amd/rocm-version.h"
 #if ROCM_VERSION_MAJOR >= 5
 
 // if we include this all the time, we get unused function errors
diff --git a/runtime/src/gpu/amd/gpu-amd.c b/runtime/src/gpu/amd/gpu-amd.c
index 25f48d109223..49d9cf5445a0 100644
--- a/runtime/src/gpu/amd/gpu-amd.c
+++ b/runtime/src/gpu/amd/gpu-amd.c
@@ -30,7 +30,6 @@
 #include "chpl-env-gen.h"
 #include "chpl-linefile-support.h"
 #include "gpu/amd/rocm-utils.h"
-#include "gpu/amd/rocm-version.h"
 #include "chpl-topo.h"
 
 #include
diff --git a/util/chplenv/chpl_gpu.py b/util/chplenv/chpl_gpu.py
index 2730438a935c..17c4ab2719fb 100644
--- a/util/chplenv/chpl_gpu.py
+++ b/util/chplenv/chpl_gpu.py
@@ -363,6 +363,9 @@ def get_runtime_compile_args():
         # -isystem instead of -I silences warnings from inside these includes.
         system.append("-isystem" + os.path.join(sdk_path, "include"))
         system.append("-isystem" + os.path.join(sdk_path, "hip", "include"))
+
+        major_version = get_sdk_version().split('.')[0]
+        bundled.append("-DROCM_VERSION_MAJOR=" + major_version)
 
     return bundled, system