From 79ab7a9cec365929875b2e6cf973d200b75956fe Mon Sep 17 00:00:00 2001 From: Cole Kendrick Date: Tue, 12 Jan 2021 16:46:38 -0500 Subject: [PATCH 01/36] Initial hipify from perl script --- src/build_info.hpp.in | 2 +- src/device/kronmult_cuda.cpp | 34 +++++++++---------- src/kronmult.cpp | 4 +-- src/kronmult.hpp | 2 +- src/lib_dispatch.cpp | 64 ++++++++++++++++++------------------ src/lib_dispatch_tests.cpp | 6 ++-- src/program_options.cpp | 2 +- src/tensors.hpp | 18 +++++----- 8 files changed, 65 insertions(+), 67 deletions(-) diff --git a/src/build_info.hpp.in b/src/build_info.hpp.in index 089a874cd..019abe0b7 100644 --- a/src/build_info.hpp.in +++ b/src/build_info.hpp.in @@ -6,7 +6,7 @@ #define BUILD_TIME "@BUILD_TIME@" #cmakedefine ASGARD_IO_HIGHFIVE -#cmakedefine ASGARD_USE_CUDA +#cmakedefine ASGARD_USE_HIP #cmakedefine ASGARD_USE_OPENMP #cmakedefine ASGARD_USE_MPI #cmakedefine ASGARD_USE_MATLAB diff --git a/src/device/kronmult_cuda.cpp b/src/device/kronmult_cuda.cpp index 9cc8a9d85..c543531a0 100644 --- a/src/device/kronmult_cuda.cpp +++ b/src/device/kronmult_cuda.cpp @@ -1,9 +1,8 @@ #include "kronmult_cuda.hpp" #include "build_info.hpp" -#ifdef ASGARD_USE_CUDA -#include -#include +#ifdef ASGARD_USE_HIP +#include #define USE_GPU #define GLOBAL_FUNCTION __global__ #define SYNCTHREADS __syncthreads() @@ -47,7 +46,7 @@ GLOBAL_FUNCTION void stage_inputs_kronmult_kernel(P const *const x, P *const workspace, int const num_elems, int const num_copies) { -#ifdef ASGARD_USE_CUDA +#ifdef ASGARD_USE_HIP expect(blockIdx.y == 0); expect(blockIdx.z == 0); @@ -90,7 +89,7 @@ void stage_inputs_kronmult(P const *const x, P *const workspace, expect(num_elems > 0); expect(num_copies > 0); -#ifdef ASGARD_USE_CUDA +#ifdef ASGARD_USE_HIP auto constexpr warp_size = 32; auto constexpr num_warps = 8; @@ -99,8 +98,7 @@ void stage_inputs_kronmult(P const *const x, P *const workspace, auto const total_copies = static_cast(num_elems) * num_copies; auto const num_blocks = (total_copies + num_threads - 1) / num_threads; - stage_inputs_kronmult_kernel
<P>
-      <<<num_blocks, num_threads>>>(x, workspace, num_elems, num_copies);
+  hipLaunchKernelGGL(HIP_KERNEL_NAME(stage_inputs_kronmult_kernel<P>
), dim3(num_blocks), dim3(num_threads), 0, 0, x, workspace, num_elems, num_copies); auto const stat = cudaDeviceSynchronize(); expect(stat == cudaSuccess); @@ -175,7 +173,7 @@ prepare_kronmult_kernel(int const *const flattened_table, auto const coord_size = num_dims * 2; auto const num_elems = static_cast(num_cols) * num_rows; -#ifdef ASGARD_USE_CUDA +#ifdef ASGARD_USE_HIP expect(blockIdx.y == 0); expect(blockIdx.z == 0); @@ -192,7 +190,7 @@ prepare_kronmult_kernel(int const *const flattened_table, auto const increment = 1; #endif -#ifndef ASGARD_USE_CUDA +#ifndef ASGARD_USE_HIP #ifdef ASGARD_USE_OPENMP #pragma omp parallel for #endif @@ -273,7 +271,7 @@ void prepare_kronmult(int const *const flattened_table, expect(input_ptrs); expect(output_ptrs); -#ifdef ASGARD_USE_CUDA +#ifdef ASGARD_USE_HIP auto constexpr warp_size = 32; auto constexpr num_warps = 8; auto constexpr num_threads = num_warps * warp_size; @@ -281,7 +279,7 @@ void prepare_kronmult(int const *const flattened_table, static_cast(elem_col_stop - elem_col_start + 1) * (elem_row_stop - elem_row_start + 1); auto const num_blocks = (num_krons / num_threads) + 1; - prepare_kronmult_kernel
<P>
-      <<<num_blocks, num_threads>>>(
+  hipLaunchKernelGGL(HIP_KERNEL_NAME(prepare_kronmult_kernel<P>
), dim3(num_blocks), dim3(num_threads), 0, 0, flattened_table, operators, operator_lda, element_x, element_work, fx, operator_ptrs, work_ptrs, input_ptrs, output_ptrs, degree, num_terms, num_dims, elem_row_start, elem_row_stop, elem_col_start, elem_col_stop); @@ -304,7 +302,7 @@ void call_kronmult(int const n, P *x_ptrs[], P *output_ptrs[], P *work_ptrs[], P const *const operator_ptrs[], int const lda, int const num_krons, int const num_dims) { -#ifdef ASGARD_USE_CUDA +#ifdef ASGARD_USE_HIP { int constexpr warpsize = 32; int constexpr nwarps = 1; @@ -313,27 +311,27 @@ void call_kronmult(int const n, P *x_ptrs[], P *output_ptrs[], P *work_ptrs[], switch (num_dims) { case 1: - kronmult1_xbatched
<P>
-        <<<num_krons, num_threads>>>(
+    hipLaunchKernelGGL(HIP_KERNEL_NAME(kronmult1_xbatched<P>
), dim3(num_krons), dim3(num_threads), 0, 0, n, operator_ptrs, lda, x_ptrs, output_ptrs, work_ptrs, num_krons); break; case 2: - kronmult2_xbatched
<P>
-        <<<num_krons, num_threads>>>(
+    hipLaunchKernelGGL(HIP_KERNEL_NAME(kronmult2_xbatched<P>
), dim3(num_krons), dim3(num_threads), 0, 0, n, operator_ptrs, lda, x_ptrs, output_ptrs, work_ptrs, num_krons); break; case 3: - kronmult3_xbatched
<P>
-        <<<num_krons, num_threads>>>(
+    hipLaunchKernelGGL(HIP_KERNEL_NAME(kronmult3_xbatched<P>
), dim3(num_krons), dim3(num_threads), 0, 0, n, operator_ptrs, lda, x_ptrs, output_ptrs, work_ptrs, num_krons); break; case 4: - kronmult4_xbatched
<P>
-        <<<num_krons, num_threads>>>(
+    hipLaunchKernelGGL(HIP_KERNEL_NAME(kronmult4_xbatched<P>
), dim3(num_krons), dim3(num_threads), 0, 0, n, operator_ptrs, lda, x_ptrs, output_ptrs, work_ptrs, num_krons); break; case 5: - kronmult5_xbatched
<P>
-        <<<num_krons, num_threads>>>(
+    hipLaunchKernelGGL(HIP_KERNEL_NAME(kronmult5_xbatched<P>
), dim3(num_krons), dim3(num_threads), 0, 0, n, operator_ptrs, lda, x_ptrs, output_ptrs, work_ptrs, num_krons); break; case 6: - kronmult6_xbatched
<P>
-        <<<num_krons, num_threads>>>(
+    hipLaunchKernelGGL(HIP_KERNEL_NAME(kronmult6_xbatched<P>
), dim3(num_krons), dim3(num_threads), 0, 0, n, operator_ptrs, lda, x_ptrs, output_ptrs, work_ptrs, num_krons); break; default: diff --git a/src/kronmult.cpp b/src/kronmult.cpp index 44d8f0098..a040c28f4 100644 --- a/src/kronmult.cpp +++ b/src/kronmult.cpp @@ -3,8 +3,8 @@ #include "lib_dispatch.hpp" #include "tools.hpp" -#ifdef ASGARD_USE_CUDA -#include +#ifdef ASGARD_USE_HIP +#include #endif #ifdef ASGARD_USE_OPENMP diff --git a/src/kronmult.hpp b/src/kronmult.hpp index 47e2f46f3..ad6cdbbf7 100644 --- a/src/kronmult.hpp +++ b/src/kronmult.hpp @@ -1,5 +1,5 @@ #pragma once -#ifdef ASGARD_USE_CUDA +#ifdef ASGARD_USE_HIP #define USE_GPU #endif #include "distribution.hpp" diff --git a/src/lib_dispatch.cpp b/src/lib_dispatch.cpp index 15277a3b0..e39e952ba 100644 --- a/src/lib_dispatch.cpp +++ b/src/lib_dispatch.cpp @@ -60,9 +60,9 @@ extern "C" #pragma GCC diagnostic pop #endif -#ifdef ASGARD_USE_CUDA -#include -#include +#ifdef ASGARD_USE_HIP +#include +#include #endif #ifdef ASGARD_USE_SCALAPACK @@ -104,7 +104,7 @@ struct device_handler void set_device(int const local_rank) { -#ifdef ASGARD_USE_CUDA +#ifdef ASGARD_USE_HIP int num_devices; auto success = cudaGetDeviceCount(&num_devices); @@ -133,14 +133,14 @@ struct device_handler #endif ~device_handler() { -#ifdef ASGARD_USE_CUDA - cublasDestroy(handle); +#ifdef ASGARD_USE_HIP + hipblasDestroy(handle); #endif } private: -#ifdef ASGARD_USE_CUDA - cublasHandle_t handle; +#ifdef ASGARD_USE_HIP + hipblasHandle_t handle; #endif }; static device_handler device; @@ -155,16 +155,16 @@ void initialize_libraries(int const local_rank) #endif } -#ifdef ASGARD_USE_CUDA -inline cublasOperation_t cublas_trans(char trans) +#ifdef ASGARD_USE_HIP +inline hipblasOperation_t cublas_trans(char trans) { if (trans == 'N' || trans == 'n') { - return CUBLAS_OP_N; + return HIPBLAS_OP_N; } else { - return CUBLAS_OP_T; + return HIPBLAS_OP_T; } } #endif @@ -182,7 +182,7 @@ void rotg(P *a, P *b, P *c, P *s, resource const resrc) if (resrc == resource::device) { // device-specific specialization if needed -#ifdef ASGARD_USE_CUDA +#ifdef ASGARD_USE_HIP // function instantiated for these two fp types if constexpr (std::is_same::value) { @@ -219,7 +219,7 @@ P nrm2(int *n, P *x, int *incx, resource const resrc) if (resrc == resource::device) { // device-specific specialization if needed -#ifdef ASGARD_USE_CUDA +#ifdef ASGARD_USE_HIP // no non-fp blas on device expect(std::is_floating_point_v
<P>
); P norm; @@ -272,7 +272,7 @@ void copy(int *n, P *x, int *incx, P *y, int *incy, resource const resrc) if (resrc == resource::device) { // device-specific specialization if needed -#ifdef ASGARD_USE_CUDA +#ifdef ASGARD_USE_HIP // no non-fp blas on device expect(std::is_floating_point_v
<P>
); @@ -323,7 +323,7 @@ P dot(int *n, P *x, int *incx, P *y, int *incy, resource const resrc) if (resrc == resource::device) { // device-specific specialization if needed -#ifdef ASGARD_USE_CUDA +#ifdef ASGARD_USE_HIP // no non-fp blas on device expect(std::is_floating_point_v
<P>
); @@ -379,7 +379,7 @@ void axpy(int *n, P *alpha, P *x, int *incx, P *y, int *incy, if (resrc == resource::device) { // device-specific specialization if needed -#ifdef ASGARD_USE_CUDA +#ifdef ASGARD_USE_HIP // no non-fp blas on device expect(std::is_floating_point_v
<P>
); @@ -429,7 +429,7 @@ void scal(int *n, P *alpha, P *x, int *incx, resource const resrc) if (resrc == resource::device) { // device-specific specialization if needed -#ifdef ASGARD_USE_CUDA +#ifdef ASGARD_USE_HIP // no non-fp blas on device expect(std::is_floating_point_v
<P>
); @@ -561,7 +561,7 @@ void gemv(char const *trans, int *m, int *n, P *alpha, P *A, int *lda, P *x, if (resrc == resource::device) { // device-specific specialization if needed -#ifdef ASGARD_USE_CUDA +#ifdef ASGARD_USE_HIP // no non-fp blas on device expect(std::is_floating_point_v
<P>
); @@ -569,14 +569,14 @@ void gemv(char const *trans, int *m, int *n, P *alpha, P *A, int *lda, P *x, if constexpr (std::is_same::value) { auto const success = - cublasDgemv(device.get_handle(), cublas_trans(*trans), *m, *n, alpha, + hipblasDgemv(device.get_handle(), cublas_trans(*trans), *m, *n, alpha, A, *lda, x, *incx, beta, y, *incy); expect(success == 0); } else if constexpr (std::is_same::value) { auto const success = - cublasSgemv(device.get_handle(), cublas_trans(*trans), *m, *n, alpha, + hipblasSgemv(device.get_handle(), cublas_trans(*trans), *m, *n, alpha, A, *lda, x, *incx, beta, y, *incy); expect(success == 0); } @@ -627,21 +627,21 @@ void gemm(char const *transa, char const *transb, int *m, int *n, int *k, if (resrc == resource::device) { // device-specific specialization if needed -#ifdef ASGARD_USE_CUDA +#ifdef ASGARD_USE_HIP // no non-fp blas on device expect(std::is_floating_point_v
<P>
); // instantiated for these two fp types if constexpr (std::is_same::value) { - auto const success = cublasDgemm( + auto const success = hipblasDgemm( device.get_handle(), cublas_trans(*transa), cublas_trans(*transb), *m, *n, *k, alpha, A, *lda, B, *ldb, beta, C, *ldc); expect(success == 0); } else if constexpr (std::is_same::value) { - auto const success = cublasSgemm( + auto const success = hipblasSgemm( device.get_handle(), cublas_trans(*transa), cublas_trans(*transb), *m, *n, *k, alpha, A, *lda, B, *ldb, beta, C, *ldc); expect(success == 0); @@ -686,7 +686,7 @@ void getrf(int *m, int *n, P *A, int *lda, int *ipiv, int *info, if (resrc == resource::device) { // device-specific specialization if needed -#ifdef ASGARD_USE_CUDA +#ifdef ASGARD_USE_HIP // no non-fp blas on device expect(std::is_floating_point_v
<P>
); @@ -747,7 +747,7 @@ void getri(int *n, P *A, int *lda, int *ipiv, P *work, int *lwork, int *info, if (resrc == resource::device) { // device-specific specialization if needed -#ifdef ASGARD_USE_CUDA +#ifdef ASGARD_USE_HIP // no non-fp blas on device expect(std::is_floating_point_v
<P>
); @@ -824,7 +824,7 @@ void batched_gemm(P **const &a, int *lda, char const *transa, P **const &b, if (resrc == resource::device) { // device-specific specialization if needed -#ifdef ASGARD_USE_CUDA +#ifdef ASGARD_USE_HIP // no non-fp blas on device expect(std::is_floating_point_v
<P>
); @@ -849,7 +849,7 @@ void batched_gemm(P **const &a, int *lda, char const *transa, P **const &b, // instantiated for these two fp types if constexpr (std::is_same::value) { - auto const success = cublasDgemmBatched( + auto const success = hipblasDgemmBatched( device.get_handle(), cublas_trans(*transa), cublas_trans(*transb), *m, *n, *k, alpha, a_d, *lda, b_d, *ldb, beta, c_d, *ldc, *num_batch); auto const cuda_stat = cudaDeviceSynchronize(); @@ -858,7 +858,7 @@ void batched_gemm(P **const &a, int *lda, char const *transa, P **const &b, } else if constexpr (std::is_same::value) { - auto const success = cublasSgemmBatched( + auto const success = hipblasSgemmBatched( device.get_handle(), cublas_trans(*transa), cublas_trans(*transb), *m, *n, *k, alpha, a_d, *lda, b_d, *ldb, beta, c_d, *ldc, *num_batch); auto const cuda_stat = cudaDeviceSynchronize(); @@ -912,7 +912,7 @@ void batched_gemv(P **const &a, int *lda, char const *trans, P **const &x, if (resrc == resource::device) { // device-specific specialization if needed -#ifdef ASGARD_USE_CUDA +#ifdef ASGARD_USE_HIP // no non-fp blas on device expect(std::is_floating_point_v
<P>
); char const transb = 'n'; @@ -945,7 +945,7 @@ void batched_gemv(P **const &a, int *lda, char const *trans, P **const &x, // instantiated for these two fp types if constexpr (std::is_same::value) { - auto const success = cublasDgemmBatched( + auto const success = hipblasDgemmBatched( device.get_handle(), cublas_trans(*trans), cublas_trans(transb), gemm_m, gemm_n, gemm_k, alpha, a_d, *lda, x_d, ldb, beta, y_d, ldc, *num_batch); @@ -955,7 +955,7 @@ void batched_gemv(P **const &a, int *lda, char const *trans, P **const &x, } else if constexpr (std::is_same::value) { - auto const success = cublasSgemmBatched( + auto const success = hipblasSgemmBatched( device.get_handle(), cublas_trans(*trans), cublas_trans(transb), gemm_m, gemm_n, gemm_k, alpha, a_d, *lda, x_d, ldb, beta, y_d, ldc, *num_batch); diff --git a/src/lib_dispatch_tests.cpp b/src/lib_dispatch_tests.cpp index 87ea0c9ea..a6095cec0 100644 --- a/src/lib_dispatch_tests.cpp +++ b/src/lib_dispatch_tests.cpp @@ -793,7 +793,7 @@ TEMPLATE_TEST_CASE("dot product (lib_dispatch::dot)", "[lib_dispatch]", float, TEMPLATE_TEST_CASE("device inversion test (lib_dispatch::getrf/getri)", "[lib_dispatch]", float, double) { -#ifdef ASGARD_USE_CUDA +#ifdef ASGARD_USE_HIP fk::matrix const test{{0.767135868133925, -0.641484652834663}, {0.641484652834663, 0.767135868133926}}; @@ -810,7 +810,7 @@ TEMPLATE_TEST_CASE("device inversion test (lib_dispatch::getrf/getri)", lib_dispatch::getrf(&m, &n, test_d.data(), &lda, ipiv_d.data(), info_d.data(), resource::device); - auto stat = cudaDeviceSynchronize(); + auto stat = hipDeviceSynchronize(); REQUIRE(stat == 0); fk::vector const info_check(info_d.clone_onto_host()); REQUIRE(info_check(0) == 0); @@ -822,7 +822,7 @@ TEMPLATE_TEST_CASE("device inversion test (lib_dispatch::getrf/getri)", lib_dispatch::getri(&n, test_d.data(), &lda, ipiv_d.data(), work.data(), &size, info_d.data(), resource::device); - stat = cudaDeviceSynchronize(); + stat = hipDeviceSynchronize(); REQUIRE(stat == 0); fk::vector const info_check_2(info_d.clone_onto_host()); REQUIRE(info_check_2(0) == 0); diff --git a/src/program_options.cpp b/src/program_options.cpp index 548c17f3d..6663686ef 100644 --- a/src/program_options.cpp +++ b/src/program_options.cpp @@ -211,7 +211,7 @@ parser::parser(int argc, char **argv) } } -#ifdef ASGARD_USE_CUDA +#ifdef ASGARD_USE_HIP if (use_implicit_stepping) { std::cerr << "GPU acceleration not implemented for implicit stepping\n"; diff --git a/src/tensors.hpp b/src/tensors.hpp index 90eecd8c7..7b3560250 100644 --- a/src/tensors.hpp +++ b/src/tensors.hpp @@ -1,8 +1,8 @@ #pragma once #include "build_info.hpp" -#ifdef ASGARD_USE_CUDA -#include +#ifdef ASGARD_USE_HIP +#include #endif #include "lib_dispatch.hpp" @@ -683,7 +683,7 @@ template inline void copy_on_device(P *const dest, P const *const source, int const num_elems) { -#ifdef ASGARD_USE_CUDA +#ifdef ASGARD_USE_HIP auto const success = cudaMemcpy(dest, source, num_elems * sizeof(P), cudaMemcpyDeviceToDevice); expect(success == cudaSuccess); @@ -696,7 +696,7 @@ template inline void copy_to_device(P *const dest, P const *const source, int const num_elems) { -#ifdef ASGARD_USE_CUDA +#ifdef ASGARD_USE_HIP auto const success = cudaMemcpy(dest, source, num_elems * sizeof(P), cudaMemcpyHostToDevice); expect(success == cudaSuccess); @@ -709,7 +709,7 @@ template inline void copy_to_host(P *const dest, P const *const source, int const num_elems) { -#ifdef ASGARD_USE_CUDA +#ifdef ASGARD_USE_HIP auto const success = cudaMemcpy(dest, source, num_elems * sizeof(P), 
cudaMemcpyDeviceToHost); expect(success == cudaSuccess); @@ -726,9 +726,9 @@ copy_matrix_on_device(fk::matrix &dest, expect(source.nrows() == dest.nrows()); expect(source.ncols() == dest.ncols()); -#ifdef ASGARD_USE_CUDA +#ifdef ASGARD_USE_HIP auto const success = - cudaMemcpy2D(dest.data(), dest.stride() * sizeof(P), source.data(), + hipMemcpy2D(dest.data(), dest.stride() * sizeof(P), source.data(), source.stride() * sizeof(P), source.nrows() * sizeof(P), source.ncols(), cudaMemcpyDeviceToDevice); expect(success == 0); @@ -747,7 +747,7 @@ copy_matrix_to_device(fk::matrix &dest, expect(source.ncols() == dest.ncols()); #ifdef ASGARD_USE_CUDA auto const success = - cudaMemcpy2D(dest.data(), dest.stride() * sizeof(P), source.data(), + hipMemcpy2D(dest.data(), dest.stride() * sizeof(P), source.data(), source.stride() * sizeof(P), source.nrows() * sizeof(P), source.ncols(), cudaMemcpyHostToDevice); expect(success == 0); @@ -766,7 +766,7 @@ copy_matrix_to_host(fk::matrix &dest, expect(source.ncols() == dest.ncols()); #ifdef ASGARD_USE_CUDA auto const success = - cudaMemcpy2D(dest.data(), dest.stride() * sizeof(P), source.data(), + hipMemcpy2D(dest.data(), dest.stride() * sizeof(P), source.data(), source.stride() * sizeof(P), source.nrows() * sizeof(P), source.ncols(), cudaMemcpyDeviceToHost); expect(success == 0); From f70d02bc34920550bf7b0da63094383f92be4fd5 Mon Sep 17 00:00:00 2001 From: Cole Kendrick Date: Tue, 26 Jan 2021 11:20:49 -0500 Subject: [PATCH 02/36] WIP HIP CMake configuration --- CMakeLists.txt | 38 ++++++++++++++++++++++++++++---------- 1 file changed, 28 insertions(+), 10 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index d6ea3fee9..5fc2ccdc5 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -78,7 +78,7 @@ option (ASGARD_PROFILE_PERF "enable profiling support for using linux perf" "") option (ASGARD_PROFILE_VALGRIND "enable profiling support for using valgrind" "") option (ASGARD_GRAPHVIZ_PATH "optional location of bin/ containing dot executable" "") option (ASGARD_IO_HIGHFIVE "Use the HighFive HDF5 header library for I/O" OFF) -option (ASGARD_USE_CUDA "Optional CUDA support for asgard" OFF) +option (ASGARD_USE_HIP "Optional HIP support for asgard" ON) option (ASGARD_USE_OPENMP "Optional openMP support for asgard" ON) option (ASGARD_USE_MPI "Optional distributed computing support for asgard" OFF) include(CMakeDependentOption) @@ -136,12 +136,30 @@ if(ASGARD_USE_OPENMP) endif() endif() -if(ASGARD_USE_CUDA) - find_package(CUDA 9.0 REQUIRED) # eventually want to remove this - how to set min version with enable_language? 
- include_directories(${CUDA_INCLUDE_DIRS}) - enable_language(CUDA) - set (CMAKE_CUDA_STANDARD 14) - set (CMAKE_CUDA_STANDARD_REQUIRED ON) +if(ASGARD_USE_HIP) + # find the HIP directory for the cmake files, taken from HIP samples + if(NOT DEFINED HIP_PATH) + if(NOT DEFINED ENV{HIP_PATH}) + set(HIP_PATH "/opt/rocm/hip" CACHE PATH "Path to which HIP has been installed") + else() + set(HIP_PATH $ENV{HIP_PATH} CACHE PATH "Path to which HIP has been installed") + endif() + endif() + set(CMAKE_MODULE_PATH "${HIP_PATH}/cmake" ${CMAKE_MODULE_PATH}) + + find_package(HIP 4.0 REQUIRED) + # Print some debug info about HIP configuration + message(STATUS "HIP PLATFORM: ${HIP_PLATFORM}") + message(STATUS "HIP COMPILER: ${HIP_COMPILER}") + message(STATUS "HIP RUNTIME: ${HIP_RUNTIME}") + message(STATUS "HIP Include dir: ${HIP_INCLUDE_DIRS}") + message(STATUS "HIP Libraries: ${HIP_LIBRARIES}") + #include_directories(${HIP_INCLUDE_DIRS}) + message(STATUS ${HIP_INCLUDE_DIRS}) + + #enable_language(HIP) + #set (CMAKE_CUDA_STANDARD 14) + #set (CMAKE_CUDA_STANDARD_REQUIRED ON) endif() # build component to interface with Ed's kronmult lib @@ -158,7 +176,7 @@ if(ASGARD_USE_CUDA) endif() if(ASGARD_USE_MKL) - if(ASGARD_USE_CUDA) + if(ASGARD_USE_HIP) set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --compiler-options -fopenmp") else() target_compile_options (kronmult_cuda PRIVATE "-fopenmp") # CMAKE doesn't handle MKL openmp link properly @@ -330,8 +348,8 @@ target_link_libraries (quadrature PRIVATE matlab_utilities tensors) target_link_libraries (solver PRIVATE distribution fast_math lib_dispatch tensors) target_link_libraries (tensors PRIVATE lib_dispatch) -if (ASGARD_USE_CUDA) - target_link_libraries (tensors PRIVATE ${CUDA_LIBRARIES}) +if (ASGARD_USE_HIP) + target_link_libraries (tensors PRIVATE ${HIP_LIBRARIES}) endif () if (ASGARD_USE_SCALAPACK) add_compile_definitions (ASGARD_USE_SCALAPACK) From 0e65742bc27602d6c222cbde1902703b80a13b71 Mon Sep 17 00:00:00 2001 From: Cole Kendrick Date: Tue, 2 Feb 2021 11:19:32 -0500 Subject: [PATCH 03/36] Hipifying some missed parts --- src/device/kronmult_cuda.cpp | 13 ++-- src/lib_dispatch.cpp | 112 +++++++++++++++++------------------ src/tensors.hpp | 30 +++++----- 3 files changed, 78 insertions(+), 77 deletions(-) diff --git a/src/device/kronmult_cuda.cpp b/src/device/kronmult_cuda.cpp index c543531a0..9e5fcb139 100644 --- a/src/device/kronmult_cuda.cpp +++ b/src/device/kronmult_cuda.cpp @@ -1,3 +1,4 @@ +#include "hip/hip_runtime.h" #include "kronmult_cuda.hpp" #include "build_info.hpp" @@ -100,8 +101,8 @@ void stage_inputs_kronmult(P const *const x, P *const workspace, hipLaunchKernelGGL(HIP_KERNEL_NAME(stage_inputs_kronmult_kernel
<P>
), dim3(num_blocks), dim3(num_threads), 0, 0, x, workspace, num_elems, num_copies); - auto const stat = cudaDeviceSynchronize(); - expect(stat == cudaSuccess); + auto const stat = hipDeviceSynchronize(); + expect(stat == hipSuccess); #else stage_inputs_kronmult_kernel(x, workspace, num_elems, num_copies); #endif @@ -283,8 +284,8 @@ void prepare_kronmult(int const *const flattened_table, flattened_table, operators, operator_lda, element_x, element_work, fx, operator_ptrs, work_ptrs, input_ptrs, output_ptrs, degree, num_terms, num_dims, elem_row_start, elem_row_stop, elem_col_start, elem_col_stop); - auto const stat = cudaDeviceSynchronize(); - expect(stat == cudaSuccess); + auto const stat = hipDeviceSynchronize(); + expect(stat == hipSuccess); #else prepare_kronmult_kernel( flattened_table, operators, operator_lda, element_x, element_work, fx, @@ -341,8 +342,8 @@ void call_kronmult(int const n, P *x_ptrs[], P *output_ptrs[], P *work_ptrs[], // ------------------------------------------- // note important to wait for kernel to finish // ------------------------------------------- - auto const stat = cudaDeviceSynchronize(); - expect(stat == cudaSuccess); + auto const stat = hipDeviceSynchronize(); + expect(stat == hipSuccess); } #else diff --git a/src/lib_dispatch.cpp b/src/lib_dispatch.cpp index e39e952ba..d6dbbe1cf 100644 --- a/src/lib_dispatch.cpp +++ b/src/lib_dispatch.cpp @@ -61,8 +61,8 @@ extern "C" #endif #ifdef ASGARD_USE_HIP -#include #include +#include #endif #ifdef ASGARD_USE_SCALAPACK @@ -94,11 +94,11 @@ struct device_handler device_handler() { #ifdef ASGARD_USE_CUDA - auto success = cublasCreate(&handle); - expect(success == CUBLAS_STATUS_SUCCESS); + auto success = hipblasCreate(&handle); + expect(success == HIPBLAS_STATUS_SUCCESS); - success = cublasSetPointerMode(handle, CUBLAS_POINTER_MODE_HOST); - expect(success == CUBLAS_STATUS_SUCCESS); + success = hipblasSetPointerMode(handle, HIPBLAS_POINTER_MODE_HOST); + expect(success == HIPBLAS_STATUS_SUCCESS); #endif } @@ -106,22 +106,22 @@ struct device_handler { #ifdef ASGARD_USE_HIP int num_devices; - auto success = cudaGetDeviceCount(&num_devices); + auto success = hipGetDeviceCount(&num_devices); - expect(success == cudaSuccess); + expect(success == hipSuccess); expect(local_rank >= 0); expect(local_rank < num_devices); if (handle) { - auto const cublas_success = cublasDestroy(handle); - expect(cublas_success == CUBLAS_STATUS_SUCCESS); + auto const cublas_success = hipblasDestroy(handle); + expect(cublas_success == HIPBLAS_STATUS_SUCCESS); } - success = cudaSetDevice(local_rank); - expect(success == cudaSuccess); - auto const cublas_success = cublasCreate(&handle); - expect(cublas_success == CUBLAS_STATUS_SUCCESS); + success = hipSetDevice(local_rank); + expect(success == hipSuccess); + auto const cublas_success = hipblasCreate(&handle); + expect(cublas_success == HIPBLAS_STATUS_SUCCESS); #else ignore(local_rank); @@ -129,7 +129,7 @@ struct device_handler } #ifdef ASGARD_USE_CUDA - cublasHandle_t const &get_handle() const { return handle; } + hipblasHandle_t const &get_handle() const { return handle; } #endif ~device_handler() { @@ -186,12 +186,12 @@ void rotg(P *a, P *b, P *c, P *s, resource const resrc) // function instantiated for these two fp types if constexpr (std::is_same::value) { - auto const success = cublasDrotg(device.get_handle(), a, b, c, s); + auto const success = hipblasDrotg(device.get_handle(), a, b, c, s); expect(success == 0); } else if constexpr (std::is_same::value) { - auto const success = 
cublasSrotg(device.get_handle(), a, b, c, s); + auto const success = hipblasSrotg(device.get_handle(), a, b, c, s); expect(success == 0); } return; @@ -227,13 +227,13 @@ P nrm2(int *n, P *x, int *incx, resource const resrc) if constexpr (std::is_same::value) { auto const success = - cublasDnrm2(device.get_handle(), *n, x, *incx, &norm); + hipblasDnrm2(device.get_handle(), *n, x, *incx, &norm); expect(success == 0); } else if constexpr (std::is_same::value) { auto const success = - cublasSnrm2(device.get_handle(), *n, x, *incx, &norm); + hipblasSnrm2(device.get_handle(), *n, x, *incx, &norm); expect(success == 0); } return norm; @@ -280,13 +280,13 @@ void copy(int *n, P *x, int *incx, P *y, int *incy, resource const resrc) if constexpr (std::is_same::value) { auto const success = - cublasDcopy(device.get_handle(), *n, x, *incx, y, *incy); + hipblasDcopy(device.get_handle(), *n, x, *incx, y, *incy); expect(success == 0); } else if constexpr (std::is_same::value) { auto const success = - cublasScopy(device.get_handle(), *n, x, *incx, y, *incy); + hipblasScopy(device.get_handle(), *n, x, *incx, y, *incy); expect(success == 0); } return; @@ -332,13 +332,13 @@ P dot(int *n, P *x, int *incx, P *y, int *incy, resource const resrc) if constexpr (std::is_same::value) { auto const success = - cublasDdot(device.get_handle(), *n, x, *incx, y, *incy, &result); + hipblasDdot(device.get_handle(), *n, x, *incx, y, *incy, &result); expect(success == 0); } else if constexpr (std::is_same::value) { auto const success = - cublasSdot(device.get_handle(), *n, x, *incx, y, *incy, &result); + hipblasSdot(device.get_handle(), *n, x, *incx, y, *incy, &result); expect(success == 0); } return result; @@ -387,13 +387,13 @@ void axpy(int *n, P *alpha, P *x, int *incx, P *y, int *incy, if constexpr (std::is_same::value) { auto const success = - cublasDaxpy(device.get_handle(), *n, alpha, x, *incx, y, *incy); + hipblasDaxpy(device.get_handle(), *n, alpha, x, *incx, y, *incy); expect(success == 0); } else if constexpr (std::is_same::value) { auto const success = - cublasSaxpy(device.get_handle(), *n, alpha, x, *incx, y, *incy); + hipblasSaxpy(device.get_handle(), *n, alpha, x, *incx, y, *incy); expect(success == 0); } return; @@ -437,13 +437,13 @@ void scal(int *n, P *alpha, P *x, int *incx, resource const resrc) if constexpr (std::is_same::value) { auto const success = - cublasDscal(device.get_handle(), *n, alpha, x, *incx); + hipblasDscal(device.get_handle(), *n, alpha, x, *incx); expect(success == 0); } else if constexpr (std::is_same::value) { auto const success = - cublasSscal(device.get_handle(), *n, alpha, x, *incx); + hipblasSscal(device.get_handle(), *n, alpha, x, *incx); expect(success == 0); } return; @@ -570,14 +570,14 @@ void gemv(char const *trans, int *m, int *n, P *alpha, P *A, int *lda, P *x, { auto const success = hipblasDgemv(device.get_handle(), cublas_trans(*trans), *m, *n, alpha, - A, *lda, x, *incx, beta, y, *incy); + A, *lda, x, *incx, beta, y, *incy); expect(success == 0); } else if constexpr (std::is_same::value) { auto const success = hipblasSgemv(device.get_handle(), cublas_trans(*trans), *m, *n, alpha, - A, *lda, x, *incx, beta, y, *incy); + A, *lda, x, *incx, beta, y, *incy); expect(success == 0); } return; @@ -694,9 +694,9 @@ void getrf(int *m, int *n, P *A, int *lda, int *ipiv, int *info, ignore(m); P **A_d; - auto stat = cudaMalloc((void **)&A_d, sizeof(P *)); + auto stat = hipMalloc((void **)&A_d, sizeof(P *)); expect(stat == 0); - stat = cudaMemcpy(A_d, &A, sizeof(P *), 
cudaMemcpyHostToDevice); + stat = hipMemcpy(A_d, &A, sizeof(P *), hipMemcpyHostToDevice); expect(stat == 0); // instantiated for these two fp types @@ -757,14 +757,14 @@ void getri(int *n, P *A, int *lda, int *ipiv, P *work, int *lwork, int *info, P const **A_d; P **work_d; - auto stat = cudaMalloc((void **)&A_d, sizeof(P *)); + auto stat = hipMalloc((void **)&A_d, sizeof(P *)); expect(stat == 0); - stat = cudaMalloc((void **)&work_d, sizeof(P *)); + stat = hipMalloc((void **)&work_d, sizeof(P *)); expect(stat == 0); - stat = cudaMemcpy(A_d, &A, sizeof(P *), cudaMemcpyHostToDevice); + stat = hipMemcpy(A_d, &A, sizeof(P *), hipMemcpyHostToDevice); expect(stat == 0); - stat = cudaMemcpy(work_d, &work, sizeof(P *), cudaMemcpyHostToDevice); + stat = hipMemcpy(work_d, &work, sizeof(P *), hipMemcpyHostToDevice); expect(stat == 0); // instantiated for these two fp types @@ -833,17 +833,17 @@ void batched_gemm(P **const &a, int *lda, char const *transa, P **const &b, P **c_d; size_t const list_size = *num_batch * sizeof(P *); - auto stat = cudaMalloc((void **)&a_d, list_size); + auto stat = hipMalloc((void **)&a_d, list_size); expect(stat == 0); - stat = cudaMalloc((void **)&b_d, list_size); + stat = hipMalloc((void **)&b_d, list_size); expect(stat == 0); - stat = cudaMalloc((void **)&c_d, list_size); + stat = hipMalloc((void **)&c_d, list_size); expect(stat == 0); - stat = cudaMemcpy(a_d, a, list_size, cudaMemcpyHostToDevice); + stat = hipMemcpy(a_d, a, list_size, hipMemcpyHostToDevice); expect(stat == 0); - stat = cudaMemcpy(b_d, b, list_size, cudaMemcpyHostToDevice); + stat = hipMemcpy(b_d, b, list_size, hipMemcpyHostToDevice); expect(stat == 0); - stat = cudaMemcpy(c_d, c, list_size, cudaMemcpyHostToDevice); + stat = hipMemcpy(c_d, c, list_size, hipMemcpyHostToDevice); expect(stat == 0); // instantiated for these two fp types @@ -852,7 +852,7 @@ void batched_gemm(P **const &a, int *lda, char const *transa, P **const &b, auto const success = hipblasDgemmBatched( device.get_handle(), cublas_trans(*transa), cublas_trans(*transb), *m, *n, *k, alpha, a_d, *lda, b_d, *ldb, beta, c_d, *ldc, *num_batch); - auto const cuda_stat = cudaDeviceSynchronize(); + auto const cuda_stat = hipDeviceSynchronize(); expect(cuda_stat == 0); expect(success == 0); } @@ -861,16 +861,16 @@ void batched_gemm(P **const &a, int *lda, char const *transa, P **const &b, auto const success = hipblasSgemmBatched( device.get_handle(), cublas_trans(*transa), cublas_trans(*transb), *m, *n, *k, alpha, a_d, *lda, b_d, *ldb, beta, c_d, *ldc, *num_batch); - auto const cuda_stat = cudaDeviceSynchronize(); + auto const cuda_stat = hipDeviceSynchronize(); expect(cuda_stat == 0); expect(success == 0); } - stat = cudaFree(a_d); + stat = hipFree(a_d); expect(stat == 0); - stat = cudaFree(b_d); + stat = hipFree(b_d); expect(stat == 0); - stat = cudaFree(c_d); + stat = hipFree(c_d); expect(stat == 0); return; @@ -929,17 +929,17 @@ void batched_gemv(P **const &a, int *lda, char const *trans, P **const &x, P **y_d; size_t const list_size = *num_batch * sizeof(P *); - auto stat = cudaMalloc((void **)&a_d, list_size); + auto stat = hipMalloc((void **)&a_d, list_size); expect(stat == 0); - stat = cudaMalloc((void **)&x_d, list_size); + stat = hipMalloc((void **)&x_d, list_size); expect(stat == 0); - stat = cudaMalloc((void **)&y_d, list_size); + stat = hipMalloc((void **)&y_d, list_size); expect(stat == 0); - stat = cudaMemcpy(a_d, a, list_size, cudaMemcpyHostToDevice); + stat = hipMemcpy(a_d, a, list_size, hipMemcpyHostToDevice); expect(stat == 
0); - stat = cudaMemcpy(x_d, x, list_size, cudaMemcpyHostToDevice); + stat = hipMemcpy(x_d, x, list_size, hipMemcpyHostToDevice); expect(stat == 0); - stat = cudaMemcpy(y_d, y, list_size, cudaMemcpyHostToDevice); + stat = hipMemcpy(y_d, y, list_size, hipMemcpyHostToDevice); expect(stat == 0); // instantiated for these two fp types @@ -949,7 +949,7 @@ void batched_gemv(P **const &a, int *lda, char const *trans, P **const &x, device.get_handle(), cublas_trans(*trans), cublas_trans(transb), gemm_m, gemm_n, gemm_k, alpha, a_d, *lda, x_d, ldb, beta, y_d, ldc, *num_batch); - auto const cuda_stat = cudaDeviceSynchronize(); + auto const cuda_stat = hipDeviceSynchronize(); expect(cuda_stat == 0); expect(success == 0); } @@ -959,16 +959,16 @@ void batched_gemv(P **const &a, int *lda, char const *trans, P **const &x, device.get_handle(), cublas_trans(*trans), cublas_trans(transb), gemm_m, gemm_n, gemm_k, alpha, a_d, *lda, x_d, ldb, beta, y_d, ldc, *num_batch); - auto const cuda_stat = cudaDeviceSynchronize(); + auto const cuda_stat = hipDeviceSynchronize(); expect(cuda_stat == 0); expect(success == 0); } - stat = cudaFree(a_d); + stat = hipFree(a_d); expect(stat == 0); - stat = cudaFree(x_d); + stat = hipFree(x_d); expect(stat == 0); - stat = cudaFree(y_d); + stat = hipFree(y_d); expect(stat == 0); return; diff --git a/src/tensors.hpp b/src/tensors.hpp index 7b3560250..1e08065b1 100644 --- a/src/tensors.hpp +++ b/src/tensors.hpp @@ -640,8 +640,8 @@ inline void allocate_device(P *&ptr, int const num_elems, bool const initialize = true) { #ifdef ASGARD_USE_CUDA - auto success = cudaMalloc((void **)&ptr, num_elems * sizeof(P)); - assert(success == cudaSuccess); + auto success = hipMalloc((void **)&ptr, num_elems * sizeof(P)); + assert(success == hipSuccess); if (num_elems > 0) { expect(ptr != nullptr); @@ -649,8 +649,8 @@ allocate_device(P *&ptr, int const num_elems, bool const initialize = true) if (initialize) { - success = cudaMemset((void *)ptr, 0, num_elems * sizeof(P)); - expect(success == cudaSuccess); + success = hipMemset((void *)ptr, 0, num_elems * sizeof(P)); + expect(success == hipSuccess); } #else @@ -669,11 +669,11 @@ template inline void delete_device(P *const ptr) { #ifdef ASGARD_USE_CUDA - auto const success = cudaFree(ptr); + auto const success = hipFree(ptr); // the device runtime may be unloaded at process shut down // (when static storage duration destructors are called) // returning a cudartUnloading error code. 
- expect((success == cudaSuccess) || (success == cudaErrorCudartUnloading)); + expect((success == hipSuccess) || (success == hipErrorDeinitialized)); #else delete[] ptr; #endif @@ -685,8 +685,8 @@ copy_on_device(P *const dest, P const *const source, int const num_elems) { #ifdef ASGARD_USE_HIP auto const success = - cudaMemcpy(dest, source, num_elems * sizeof(P), cudaMemcpyDeviceToDevice); - expect(success == cudaSuccess); + hipMemcpy(dest, source, num_elems * sizeof(P), hipMemcpyDeviceToDevice); + expect(success == hipSuccess); #else std::copy(source, source + num_elems, dest); #endif @@ -698,8 +698,8 @@ copy_to_device(P *const dest, P const *const source, int const num_elems) { #ifdef ASGARD_USE_HIP auto const success = - cudaMemcpy(dest, source, num_elems * sizeof(P), cudaMemcpyHostToDevice); - expect(success == cudaSuccess); + hipMemcpy(dest, source, num_elems * sizeof(P), hipMemcpyHostToDevice); + expect(success == hipSuccess); #else std::copy(source, source + num_elems, dest); #endif @@ -711,8 +711,8 @@ copy_to_host(P *const dest, P const *const source, int const num_elems) { #ifdef ASGARD_USE_HIP auto const success = - cudaMemcpy(dest, source, num_elems * sizeof(P), cudaMemcpyDeviceToHost); - expect(success == cudaSuccess); + hipMemcpy(dest, source, num_elems * sizeof(P), hipMemcpyDeviceToHost); + expect(success == hipSuccess); #else std::copy(source, source + num_elems, dest); #endif @@ -730,7 +730,7 @@ copy_matrix_on_device(fk::matrix &dest, auto const success = hipMemcpy2D(dest.data(), dest.stride() * sizeof(P), source.data(), source.stride() * sizeof(P), source.nrows() * sizeof(P), - source.ncols(), cudaMemcpyDeviceToDevice); + source.ncols(), hipMemcpyDeviceToDevice); expect(success == 0); #else std::copy(source.begin(), source.end(), dest.begin()); @@ -749,7 +749,7 @@ copy_matrix_to_device(fk::matrix &dest, auto const success = hipMemcpy2D(dest.data(), dest.stride() * sizeof(P), source.data(), source.stride() * sizeof(P), source.nrows() * sizeof(P), - source.ncols(), cudaMemcpyHostToDevice); + source.ncols(), hipMemcpyHostToDevice); expect(success == 0); #else std::copy(source.begin(), source.end(), dest.begin()); @@ -768,7 +768,7 @@ copy_matrix_to_host(fk::matrix &dest, auto const success = hipMemcpy2D(dest.data(), dest.stride() * sizeof(P), source.data(), source.stride() * sizeof(P), source.nrows() * sizeof(P), - source.ncols(), cudaMemcpyDeviceToHost); + source.ncols(), hipMemcpyDeviceToHost); expect(success == 0); #else std::copy(source.begin(), source.end(), dest.begin()); From 3daf6459552207c13391db2a71a6416cdc9a672e Mon Sep 17 00:00:00 2001 From: Cole Kendrick Date: Tue, 2 Feb 2021 12:03:51 -0500 Subject: [PATCH 04/36] Updating formatting for hipified files --- src/device/kronmult_cuda.cpp | 52 +++++++++++++-------- src/tensors.hpp | 88 ++++++++++++++++++------------------ 2 files changed, 78 insertions(+), 62 deletions(-) diff --git a/src/device/kronmult_cuda.cpp b/src/device/kronmult_cuda.cpp index 9e5fcb139..52bf40f90 100644 --- a/src/device/kronmult_cuda.cpp +++ b/src/device/kronmult_cuda.cpp @@ -1,6 +1,6 @@ -#include "hip/hip_runtime.h" #include "kronmult_cuda.hpp" #include "build_info.hpp" +#include "hip/hip_runtime.h" #ifdef ASGARD_USE_HIP #include @@ -99,7 +99,9 @@ void stage_inputs_kronmult(P const *const x, P *const workspace, auto const total_copies = static_cast(num_elems) * num_copies; auto const num_blocks = (total_copies + num_threads - 1) / num_threads; - hipLaunchKernelGGL(HIP_KERNEL_NAME(stage_inputs_kronmult_kernel
<P>
), dim3(num_blocks), dim3(num_threads), 0, 0, x, workspace, num_elems, num_copies); + hipLaunchKernelGGL(HIP_KERNEL_NAME(stage_inputs_kronmult_kernel
<P>
), + dim3(num_blocks), dim3(num_threads), 0, 0, x, workspace, + num_elems, num_copies); auto const stat = hipDeviceSynchronize(); expect(stat == hipSuccess); @@ -280,10 +282,12 @@ void prepare_kronmult(int const *const flattened_table, static_cast(elem_col_stop - elem_col_start + 1) * (elem_row_stop - elem_row_start + 1); auto const num_blocks = (num_krons / num_threads) + 1; - hipLaunchKernelGGL(HIP_KERNEL_NAME(prepare_kronmult_kernel
<P>
), dim3(num_blocks), dim3(num_threads), 0, 0, - flattened_table, operators, operator_lda, element_x, element_work, fx, - operator_ptrs, work_ptrs, input_ptrs, output_ptrs, degree, num_terms, - num_dims, elem_row_start, elem_row_stop, elem_col_start, elem_col_stop); + hipLaunchKernelGGL(HIP_KERNEL_NAME(prepare_kronmult_kernel
<P>
), + dim3(num_blocks), dim3(num_threads), 0, 0, flattened_table, + operators, operator_lda, element_x, element_work, fx, + operator_ptrs, work_ptrs, input_ptrs, output_ptrs, degree, + num_terms, num_dims, elem_row_start, elem_row_stop, + elem_col_start, elem_col_stop); auto const stat = hipDeviceSynchronize(); expect(stat == hipSuccess); #else @@ -312,28 +316,40 @@ void call_kronmult(int const n, P *x_ptrs[], P *output_ptrs[], P *work_ptrs[], switch (num_dims) { case 1: - hipLaunchKernelGGL(HIP_KERNEL_NAME(kronmult1_xbatched
<P>
), dim3(num_krons), dim3(num_threads), 0, 0, - n, operator_ptrs, lda, x_ptrs, output_ptrs, work_ptrs, num_krons); + hipLaunchKernelGGL(HIP_KERNEL_NAME(kronmult1_xbatched
<P>
), + dim3(num_krons), dim3(num_threads), 0, 0, n, + operator_ptrs, lda, x_ptrs, output_ptrs, work_ptrs, + num_krons); break; case 2: - hipLaunchKernelGGL(HIP_KERNEL_NAME(kronmult2_xbatched
<P>
), dim3(num_krons), dim3(num_threads), 0, 0, - n, operator_ptrs, lda, x_ptrs, output_ptrs, work_ptrs, num_krons); + hipLaunchKernelGGL(HIP_KERNEL_NAME(kronmult2_xbatched
<P>
), + dim3(num_krons), dim3(num_threads), 0, 0, n, + operator_ptrs, lda, x_ptrs, output_ptrs, work_ptrs, + num_krons); break; case 3: - hipLaunchKernelGGL(HIP_KERNEL_NAME(kronmult3_xbatched
<P>
), dim3(num_krons), dim3(num_threads), 0, 0, - n, operator_ptrs, lda, x_ptrs, output_ptrs, work_ptrs, num_krons); + hipLaunchKernelGGL(HIP_KERNEL_NAME(kronmult3_xbatched
<P>
), + dim3(num_krons), dim3(num_threads), 0, 0, n, + operator_ptrs, lda, x_ptrs, output_ptrs, work_ptrs, + num_krons); break; case 4: - hipLaunchKernelGGL(HIP_KERNEL_NAME(kronmult4_xbatched
<P>
), dim3(num_krons), dim3(num_threads), 0, 0, - n, operator_ptrs, lda, x_ptrs, output_ptrs, work_ptrs, num_krons); + hipLaunchKernelGGL(HIP_KERNEL_NAME(kronmult4_xbatched
<P>
), + dim3(num_krons), dim3(num_threads), 0, 0, n, + operator_ptrs, lda, x_ptrs, output_ptrs, work_ptrs, + num_krons); break; case 5: - hipLaunchKernelGGL(HIP_KERNEL_NAME(kronmult5_xbatched
<P>
), dim3(num_krons), dim3(num_threads), 0, 0, - n, operator_ptrs, lda, x_ptrs, output_ptrs, work_ptrs, num_krons); + hipLaunchKernelGGL(HIP_KERNEL_NAME(kronmult5_xbatched
<P>
), + dim3(num_krons), dim3(num_threads), 0, 0, n, + operator_ptrs, lda, x_ptrs, output_ptrs, work_ptrs, + num_krons); break; case 6: - hipLaunchKernelGGL(HIP_KERNEL_NAME(kronmult6_xbatched
<P>
), dim3(num_krons), dim3(num_threads), 0, 0, - n, operator_ptrs, lda, x_ptrs, output_ptrs, work_ptrs, num_krons); + hipLaunchKernelGGL(HIP_KERNEL_NAME(kronmult6_xbatched
<P>
), + dim3(num_krons), dim3(num_threads), 0, 0, n, + operator_ptrs, lda, x_ptrs, output_ptrs, work_ptrs, + num_krons); break; default: expect(false); diff --git a/src/tensors.hpp b/src/tensors.hpp index 1e08065b1..dd27cbf07 100644 --- a/src/tensors.hpp +++ b/src/tensors.hpp @@ -240,10 +240,10 @@ class vector template> vector
<P>
operator-(vector const &right) const; template> - P operator*(vector const &)const; + P operator*(vector const &) const; template> - vector
<P>
operator*(matrix const &)const; + vector
<P>
operator*(matrix const &) const; template> vector
<P>
operator*(P const) const; @@ -475,9 +475,9 @@ class matrix template> matrix
<P>
operator*(P const) const; template> - vector
<P>
operator*(vector const &)const; + vector
<P>
operator*(vector const &) const; template> - matrix
<P>
operator*(matrix const &)const; + matrix
<P>
operator*(matrix const &) const; template> matrix
<P>
operator+(matrix const &) const; template> @@ -729,8 +729,8 @@ copy_matrix_on_device(fk::matrix &dest, #ifdef ASGARD_USE_HIP auto const success = hipMemcpy2D(dest.data(), dest.stride() * sizeof(P), source.data(), - source.stride() * sizeof(P), source.nrows() * sizeof(P), - source.ncols(), hipMemcpyDeviceToDevice); + source.stride() * sizeof(P), source.nrows() * sizeof(P), + source.ncols(), hipMemcpyDeviceToDevice); expect(success == 0); #else std::copy(source.begin(), source.end(), dest.begin()); @@ -748,8 +748,8 @@ copy_matrix_to_device(fk::matrix &dest, #ifdef ASGARD_USE_CUDA auto const success = hipMemcpy2D(dest.data(), dest.stride() * sizeof(P), source.data(), - source.stride() * sizeof(P), source.nrows() * sizeof(P), - source.ncols(), hipMemcpyHostToDevice); + source.stride() * sizeof(P), source.nrows() * sizeof(P), + source.ncols(), hipMemcpyHostToDevice); expect(success == 0); #else std::copy(source.begin(), source.end(), dest.begin()); @@ -767,8 +767,8 @@ copy_matrix_to_host(fk::matrix &dest, #ifdef ASGARD_USE_CUDA auto const success = hipMemcpy2D(dest.data(), dest.stride() * sizeof(P), source.data(), - source.stride() * sizeof(P), source.nrows() * sizeof(P), - source.ncols(), hipMemcpyDeviceToHost); + source.stride() * sizeof(P), source.nrows() * sizeof(P), + source.ncols(), hipMemcpyDeviceToHost); expect(success == 0); #else std::copy(source.begin(), source.end(), dest.begin()); @@ -996,8 +996,8 @@ fk::vector::vector(vector const &a) // http://stackoverflow.com/questions/3279543/what-is-the-copy-and-swap-idiom // template -fk::vector &fk::vector:: -operator=(vector const &a) +fk::vector & +fk::vector::operator=(vector const &a) { static_assert(mem != mem_type::const_view, "cannot copy assign into const_view!"); @@ -1042,8 +1042,8 @@ fk::vector::vector(vector &&a) // vector move assignment // template -fk::vector &fk::vector:: -operator=(vector &&a) +fk::vector & +fk::vector::operator=(vector &&a) { static_assert(mem != mem_type::const_view, "cannot move assign into const_view!"); @@ -1089,8 +1089,8 @@ fk::vector::vector(vector const &a) // template template -fk::vector &fk::vector:: -operator=(vector const &a) +fk::vector & +fk::vector::operator=(vector const &a) { expect(size() == a.size()); @@ -1124,8 +1124,8 @@ fk::vector::vector(vector const &a) // assignment owner <-> view template template -fk::vector &fk::vector:: -operator=(vector const &a) +fk::vector & +fk::vector::operator=(vector const &a) { expect(size() == a.size()); if constexpr (resrc == resource::host) @@ -1191,8 +1191,8 @@ fk::vector &fk::vector::transfer_from( // template template -fk::vector &fk::vector:: -operator=(std::vector
<P>
const &v) +fk::vector & +fk::vector::operator=(std::vector
<P>
const &v) { expect(size() == static_cast(v.size())); std::memcpy(data_, v.data(), v.size() * sizeof(P)); @@ -1278,8 +1278,8 @@ bool fk::vector::operator<(vector const &other) const // template template -fk::vector
<P>
fk::vector:: -operator+(vector const &right) const +fk::vector
<P>
+fk::vector::operator+(vector const &right) const { expect(size() == right.size()); vector
<P>
ans(size()); @@ -1293,8 +1293,8 @@ operator+(vector const &right) const // template template -fk::vector
<P>
fk::vector:: -operator-(vector const &right) const +fk::vector
<P>
+fk::vector::operator-(vector const &right) const { expect(size() == right.size()); vector
<P>
ans(size()); @@ -1323,8 +1323,8 @@ P fk::vector::operator*(vector const &right) const // template template -fk::vector
<P>
fk::vector:: -operator*(fk::matrix const &A) const +fk::vector
<P>
+fk::vector::operator*(fk::matrix const &A) const { // check dimension compatibility expect(size() == A.nrows()); @@ -1780,8 +1780,8 @@ fk::matrix::matrix(matrix const &a) // http://stackoverflow.com/questions/3279543/what-is-the-copy-and-swap-idiom // template -fk::matrix &fk::matrix:: -operator=(matrix const &a) +fk::matrix & +fk::matrix::operator=(matrix const &a) { static_assert(mem != mem_type::const_view, "cannot copy assign into const_view!"); @@ -1834,8 +1834,8 @@ fk::matrix::matrix(matrix const &a) // assignment owner <-> view template template -fk::matrix &fk::matrix:: -operator=(matrix const &a) +fk::matrix & +fk::matrix::operator=(matrix const &a) { expect(nrows() == a.nrows()); expect(ncols() == a.ncols()); @@ -1874,8 +1874,8 @@ fk::matrix::matrix(matrix const &a) // template template -fk::matrix &fk::matrix:: -operator=(matrix const &a) +fk::matrix & +fk::matrix::operator=(matrix const &a) { expect((nrows() == a.nrows()) && (ncols() == a.ncols())); @@ -1966,8 +1966,8 @@ fk::matrix::matrix(matrix &&a) // matrix move assignment // template -fk::matrix &fk::matrix:: -operator=(matrix &&a) +fk::matrix & +fk::matrix::operator=(matrix &&a) { static_assert(mem != mem_type::const_view, "cannot move assign into const_view!"); @@ -1998,8 +1998,8 @@ operator=(matrix &&a) // template template -fk::matrix &fk::matrix:: -operator=(fk::vector const &v) +fk::matrix & +fk::matrix::operator=(fk::vector const &v) { expect(nrows() * ncols() == v.size()); @@ -2085,8 +2085,8 @@ bool fk::matrix::operator<(matrix const &other) const // template template -fk::matrix
<P>
fk::matrix:: -operator+(matrix const &right) const +fk::matrix
<P>
+fk::matrix::operator+(matrix const &right) const { expect(nrows() == right.nrows() && ncols() == right.ncols()); @@ -2106,8 +2106,8 @@ operator+(matrix const &right) const // template template -fk::matrix
<P>
fk::matrix:: -operator-(matrix const &right) const +fk::matrix
<P>
+fk::matrix::operator-(matrix const &right) const { expect(nrows() == right.nrows() && ncols() == right.ncols()); @@ -2145,8 +2145,8 @@ fk::matrix
<P>
fk::matrix::operator*(P const right) const // template template -fk::vector
<P>
fk::matrix:: -operator*(fk::vector const &right) const +fk::vector
<P>
+fk::matrix::operator*(fk::vector const &right) const { // check dimension compatibility expect(ncols() == right.size()); @@ -2172,8 +2172,8 @@ operator*(fk::vector const &right) const // template template -fk::matrix
<P>
fk::matrix:: -operator*(matrix const &B) const +fk::matrix
<P>
+fk::matrix::operator*(matrix const &B) const { expect(ncols() == B.nrows()); // k == k From e2491dc2debf7479d4d892be18ba4bd81b2956bd Mon Sep 17 00:00:00 2001 From: Cole Kendrick Date: Mon, 15 Feb 2021 14:18:02 -0500 Subject: [PATCH 05/36] Minor cmake adjustments for hip --- CMakeLists.txt | 27 ++++++++++++++++++--------- src/lib_dispatch.cpp | 6 +++--- src/tensors.hpp | 8 ++++---- 3 files changed, 25 insertions(+), 16 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 5fc2ccdc5..da414be06 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -157,18 +157,26 @@ if(ASGARD_USE_HIP) #include_directories(${HIP_INCLUDE_DIRS}) message(STATUS ${HIP_INCLUDE_DIRS}) - #enable_language(HIP) + #enable_language(HIP) # no cmake support yet? #set (CMAKE_CUDA_STANDARD 14) #set (CMAKE_CUDA_STANDARD_REQUIRED ON) + + + #find_package(hipBLAS) + #if (hipBLAS_FOUND) + # message(STATUS "Found rocBLAS version ${rocBLAS_VERSION}") + #endif() + endif() # build component to interface with Ed's kronmult lib ##TODO: link to kronmult as interface library add_library(kronmult_cuda SHARED src/device/kronmult_cuda.cpp) -if(ASGARD_USE_CUDA) - set_source_files_properties( src/device/kronmult_cuda.cpp PROPERTIES LANGUAGE CUDA ) # no .cu extension - set_target_properties( kronmult_cuda PROPERTIES CUDA_ARCHITECTURES OFF) - set_target_properties( kronmult_cuda PROPERTIES COMPILE_FLAGS "-arch sm_70 -g -lineinfo --ptxas-options=-O3") +if(ASGARD_USE_HIP) + # there is no hip language property in cmake yet? + #set_source_files_properties( src/device/kronmult_cuda.cpp PROPERTIES LANGUAGE CUDA ) # no .cu extension + #set_target_properties( kronmult_cuda PROPERTIES CUDA_ARCHITECTURES OFF) + #set_target_properties( kronmult_cuda PROPERTIES COMPILE_FLAGS "-arch sm_70 -g -lineinfo --ptxas-options=-O3") set_target_properties( kronmult_cuda PROPERTIES LINK_FLAGS "-Wl,-rpath,${CMAKE_BINARY_DIR}") # Turn on GPU support in kronmult. 
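For reference, CMake 3.21 and newer ship first-class HIP language support, which is what the commented-out enable_language(HIP) line earlier in this series anticipates; it can eventually replace the FindHIP module machinery used in these patches. A minimal sketch of that route, assuming ROCm's hip-config.cmake is discoverable via CMAKE_PREFIX_PATH (the standard level and target wiring below are illustrative, not part of the patch series):

    cmake_minimum_required(VERSION 3.21)
    enable_language(HIP)                       # HIP is a first-class language in CMake >= 3.21
    set(CMAKE_HIP_STANDARD 14)

    find_package(hip REQUIRED)                 # ROCm package config; exports hip::host / hip::device

    # compile the kronmult device source as HIP without the FindHIP macros
    set_source_files_properties(src/device/kronmult_cuda.cpp PROPERTIES LANGUAGE HIP)
    target_link_libraries(kronmult_cuda PRIVATE hip::device)
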
@@ -320,9 +328,10 @@ else () target_link_libraries (lib_dispatch PRIVATE LINALG::LINALG) endif () -if (ASGARD_USE_CUDA) - target_link_libraries(lib_dispatch PRIVATE ${CUDA_LIBRARIES} - ${CUDA_CUBLAS_LIBRARIES}) +if (ASGARD_USE_HIP) + target_link_libraries(lib_dispatch PRIVATE ${HIP_LIBRARIES}) + target_include_directories(lib_dispatch PRIVATE ${HIP_INCLUDE_DIRS}) + target_include_directories(kronmult_cuda PRIVATE ${HIP_INCLUDE_DIRS}) endif() if (ASGARD_USE_OPENMP AND NOT ASGARD_USE_MKL) @@ -450,7 +459,7 @@ if (ASGARD_BUILD_TESTS) target_link_libraries (${component}-tests PRIVATE ${component} MPI::MPI_CXX) if (${component} IN_LIST mpi_test_components) set(test_ranks "4") - if (ASGARD_USE_CUDA) + if (ASGARD_USE_HIP) set(test_ranks "1") endif () if (${ASGARD_TESTING_RANKS}) diff --git a/src/lib_dispatch.cpp b/src/lib_dispatch.cpp index d6dbbe1cf..0a274a87c 100644 --- a/src/lib_dispatch.cpp +++ b/src/lib_dispatch.cpp @@ -93,7 +93,7 @@ struct device_handler { device_handler() { -#ifdef ASGARD_USE_CUDA +#ifdef ASGARD_USE_HIP auto success = hipblasCreate(&handle); expect(success == HIPBLAS_STATUS_SUCCESS); @@ -128,7 +128,7 @@ struct device_handler #endif } -#ifdef ASGARD_USE_CUDA +#ifdef ASGARD_USE_HIP hipblasHandle_t const &get_handle() const { return handle; } #endif ~device_handler() @@ -147,7 +147,7 @@ static device_handler device; void initialize_libraries(int const local_rank) { -#ifdef ASGARD_USE_CUDA +#ifdef ASGARD_USE_HIP expect(local_rank >= 0); device.set_device(local_rank); #else diff --git a/src/tensors.hpp b/src/tensors.hpp index dd27cbf07..87234e1a9 100644 --- a/src/tensors.hpp +++ b/src/tensors.hpp @@ -639,7 +639,7 @@ template inline void allocate_device(P *&ptr, int const num_elems, bool const initialize = true) { -#ifdef ASGARD_USE_CUDA +#ifdef ASGARD_USE_HIP auto success = hipMalloc((void **)&ptr, num_elems * sizeof(P)); assert(success == hipSuccess); if (num_elems > 0) @@ -668,7 +668,7 @@ allocate_device(P *&ptr, int const num_elems, bool const initialize = true) template inline void delete_device(P *const ptr) { -#ifdef ASGARD_USE_CUDA +#ifdef ASGARD_USE_HIP auto const success = hipFree(ptr); // the device runtime may be unloaded at process shut down // (when static storage duration destructors are called) @@ -745,7 +745,7 @@ copy_matrix_to_device(fk::matrix &dest, { expect(source.nrows() == dest.nrows()); expect(source.ncols() == dest.ncols()); -#ifdef ASGARD_USE_CUDA +#ifdef ASGARD_USE_HIP auto const success = hipMemcpy2D(dest.data(), dest.stride() * sizeof(P), source.data(), source.stride() * sizeof(P), source.nrows() * sizeof(P), @@ -764,7 +764,7 @@ copy_matrix_to_host(fk::matrix &dest, { expect(source.nrows() == dest.nrows()); expect(source.ncols() == dest.ncols()); -#ifdef ASGARD_USE_CUDA +#ifdef ASGARD_USE_HIP auto const success = hipMemcpy2D(dest.data(), dest.stride() * sizeof(P), source.data(), source.stride() * sizeof(P), source.nrows() * sizeof(P), From 4b94cc87edd56df3466b4ca9367056d4fbb29498 Mon Sep 17 00:00:00 2001 From: Cole Kendrick Date: Thu, 11 Mar 2021 15:37:21 -0500 Subject: [PATCH 06/36] Add initial hip platform configuration --- CMakeLists.txt | 38 ++++++++++++++++++++++++++++++++++++-- 1 file changed, 36 insertions(+), 2 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index da414be06..d06059a14 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -136,6 +136,8 @@ if(ASGARD_USE_OPENMP) endif() endif() +set(ASGARD_PLATFORM_NVCC 0) +set(ASGARD_PLATFORM_HCC 0) if(ASGARD_USE_HIP) # find the HIP directory for the cmake files, taken from HIP samples 
if(NOT DEFINED HIP_PATH) @@ -145,6 +147,34 @@ if(ASGARD_USE_HIP) set(HIP_PATH $ENV{HIP_PATH} CACHE PATH "Path to which HIP has been installed") endif() endif() + + find_program(ASGARD_HIPCONFIG_PATH hipconfig HINTS "${HIP_PATH}/bin") + if(ASGARD_HIPCONFIG_PATH) + execute_process(COMMAND ${ASGARD_HIPCONFIG_PATH} --platform OUTPUT_VARIABLE ASGARD_HIP_PLATFORM) + elseif(DEFINED ENV{HIP_PLATFORM}) + set(ASGARD_HIP_PLATFORM "$ENV{HIP_PLATFORM}") + else() + message(FATAL_ERROR "Could not determine HIP platform, make sure HIP_PLATFORM is set") + endif() + + message(STATUS "HIP Platform configured for ${ASGARD_HIP_PLATFORM}") + if(ASGARD_HIP_PLATFORM STREQUAL "hcc") + set(ASGARD_PLATFORM_HCC 1) + elseif(ASGARD_HIP_PLATFORM STREQUAL "nvcc") + set(ASGARD_PLATFORM_NVCC 1) + endif() + + # double check for cuda path since HIP uses it internally + if(ASGARD_PLATFORM_NVCC) + if (NOT DEFINED ENV{CUDA_PATH}) + find_path(ASGARD_HIP_DEFAULT_CUDA_PATH "cuda.h" PATH /usr/local/cuda/include NO_DEFAULT_PATH) + if (NOT ASGARD_HIP_DEFAULT_CUDA_PATH) + message(FATAL_ERROR "Make sure the CUDA_PATH env is set to locate for HIP") + endif() + endif() + message(STATUS "Found CUDA_PATH: $ENV{CUDA_PATH}") + endif() + set(CMAKE_MODULE_PATH "${HIP_PATH}/cmake" ${CMAKE_MODULE_PATH}) find_package(HIP 4.0 REQUIRED) @@ -174,7 +204,7 @@ endif() add_library(kronmult_cuda SHARED src/device/kronmult_cuda.cpp) if(ASGARD_USE_HIP) # there is no hip language property in cmake yet? - #set_source_files_properties( src/device/kronmult_cuda.cpp PROPERTIES LANGUAGE CUDA ) # no .cu extension + #set_source_files_properties( src/device/kronmult_cuda.cpp PROPERTIES LANGUAGE HIP ) # no .cu extension #set_target_properties( kronmult_cuda PROPERTIES CUDA_ARCHITECTURES OFF) #set_target_properties( kronmult_cuda PROPERTIES COMPILE_FLAGS "-arch sm_70 -g -lineinfo --ptxas-options=-O3") set_target_properties( kronmult_cuda PROPERTIES LINK_FLAGS "-Wl,-rpath,${CMAKE_BINARY_DIR}") @@ -307,7 +337,11 @@ if (ASGARD_USE_OPENMP AND NOT ASGARD_USE_MKL) target_link_libraries(kronmult PRIVATE OpenMP::OpenMP_CXX) endif () -target_link_libraries(kronmult_cuda PUBLIC kron) +if (ASGARD_USE_HIP) + target_link_libraries(kronmult_cuda PUBLIC kron hip::device) +else () + target_link_libraries(kronmult_cuda PUBLIC kron) +endif() if (ASGARD_USE_OPENMP AND NOT ASGARD_USE_MKL) target_link_libraries(kronmult_cuda PRIVATE OpenMP::OpenMP_CXX) From e8161031e60c3f6a8c88246d39a1d3a4af290551 Mon Sep 17 00:00:00 2001 From: Cole Kendrick Date: Tue, 16 Mar 2021 14:29:56 -0400 Subject: [PATCH 07/36] Update cmake cuda options and add hipblas --- CMakeLists.txt | 49 +++++++++++++++++++++++++++++++++++-------------- 1 file changed, 35 insertions(+), 14 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index d06059a14..141f2dd1e 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -148,6 +148,14 @@ if(ASGARD_USE_HIP) endif() endif() + if(NOT DEFINED HIPBLAS_PATH) + if(NOT DEFINED ENV{HIPBLAS_PATH}) + set(HIPBLAS_PATH "${HIP_PATH}/../hipblas" CACHE PATH "Path to which HIPBLAS has been installed") + else() + set(HIPBLAS_PATH $ENV{HIPBLAS_PATH} CACHE PATH "Path to which HIPBLAS has been installed") + endif() +endif() + find_program(ASGARD_HIPCONFIG_PATH hipconfig HINTS "${HIP_PATH}/bin") if(ASGARD_HIPCONFIG_PATH) execute_process(COMMAND ${ASGARD_HIPCONFIG_PATH} --platform OUTPUT_VARIABLE ASGARD_HIP_PLATFORM) @@ -176,8 +184,10 @@ if(ASGARD_USE_HIP) endif() set(CMAKE_MODULE_PATH "${HIP_PATH}/cmake" ${CMAKE_MODULE_PATH}) + list(APPEND CMAKE_PREFIX_PATH "${HIPBLAS_PATH}/lib/cmake") 
find_package(HIP 4.0 REQUIRED) + find_package(hipblas REQUIRED) # Print some debug info about HIP configuration message(STATUS "HIP PLATFORM: ${HIP_PLATFORM}") message(STATUS "HIP COMPILER: ${HIP_COMPILER}") @@ -188,14 +198,16 @@ if(ASGARD_USE_HIP) message(STATUS ${HIP_INCLUDE_DIRS}) #enable_language(HIP) # no cmake support yet? - #set (CMAKE_CUDA_STANDARD 14) - #set (CMAKE_CUDA_STANDARD_REQUIRED ON) - + if(ASGARD_PLATFORM_NVCC) + enable_language(CUDA) + set (CMAKE_CUDA_STANDARD 14) + set (CMAKE_CUDA_STANDARD_REQUIRED ON) + endif() #find_package(hipBLAS) - #if (hipBLAS_FOUND) - # message(STATUS "Found rocBLAS version ${rocBLAS_VERSION}") - #endif() + if (hipBLAS_FOUND) + message(STATUS "Found rocBLAS version ${rocBLAS_VERSION}: ${HIPBLAS_INCLUDE_DIRS}") + endif() endif() @@ -203,10 +215,14 @@ endif() ##TODO: link to kronmult as interface library add_library(kronmult_cuda SHARED src/device/kronmult_cuda.cpp) if(ASGARD_USE_HIP) - # there is no hip language property in cmake yet? - #set_source_files_properties( src/device/kronmult_cuda.cpp PROPERTIES LANGUAGE HIP ) # no .cu extension - #set_target_properties( kronmult_cuda PROPERTIES CUDA_ARCHITECTURES OFF) - #set_target_properties( kronmult_cuda PROPERTIES COMPILE_FLAGS "-arch sm_70 -g -lineinfo --ptxas-options=-O3") + if(ASGARD_PLATFORM_HCC) + set_source_files_properties( src/device/kronmult_cuda.cpp PROPERTIES LANGUAGE HIP ) # no .cu extension + elseif(ASGARD_PLATFORM_NVCC) + # there is no hip language property in cmake yet? + set_source_files_properties( src/device/kronmult_cuda.cpp PROPERTIES LANGUAGE CUDA ) # no .cu extension + set_target_properties( kronmult_cuda PROPERTIES CUDA_ARCHITECTURES OFF) + set_target_properties( kronmult_cuda PROPERTIES COMPILE_FLAGS "-arch sm_70 -g -lineinfo --ptxas-options=-O3") + endif() set_target_properties( kronmult_cuda PROPERTIES LINK_FLAGS "-Wl,-rpath,${CMAKE_BINARY_DIR}") # Turn on GPU support in kronmult. 
@@ -338,7 +354,7 @@ if (ASGARD_USE_OPENMP AND NOT ASGARD_USE_MKL) endif () if (ASGARD_USE_HIP) - target_link_libraries(kronmult_cuda PUBLIC kron hip::device) + target_link_libraries(kronmult_cuda PUBLIC kron)# hip::device) else () target_link_libraries(kronmult_cuda PUBLIC kron) endif() @@ -363,9 +379,14 @@ else () endif () if (ASGARD_USE_HIP) - target_link_libraries(lib_dispatch PRIVATE ${HIP_LIBRARIES}) - target_include_directories(lib_dispatch PRIVATE ${HIP_INCLUDE_DIRS}) - target_include_directories(kronmult_cuda PRIVATE ${HIP_INCLUDE_DIRS}) + if(ASGARD_PLATFORM_HCC) + target_link_libraries(lib_dispatch PRIVATE hip::device) + elseif(ASGARD_PLATFORM_NVCC) + target_link_libraries(lib_dispatch PRIVATE ${CUDA_LIBRARIES}) + endif() + target_link_libraries(lib_dispatch PRIVATE roc::hipblas) + target_include_directories(lib_dispatch PUBLIC ${HIP_INCLUDE_DIRS} ${HIPBLAS_INCLUDE_DIRS}) + target_include_directories(kronmult_cuda PUBLIC ${HIP_INCLUDE_DIRS} ${HIPBLAS_INCLUDE_DIRS}) endif() if (ASGARD_USE_OPENMP AND NOT ASGARD_USE_MKL) From 0d986487804969e25eb7de9238e02d3a5b86a514 Mon Sep 17 00:00:00 2001 From: Cole Kendrick Date: Mon, 26 Apr 2021 10:14:11 -0400 Subject: [PATCH 08/36] CMake adjustments and initial platform options --- CMakeLists.txt | 37 ++++++++++++++++++++++++------------- 1 file changed, 24 insertions(+), 13 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 141f2dd1e..5f2aed325 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -136,10 +136,11 @@ if(ASGARD_USE_OPENMP) endif() endif() +# convenience flags for which HIP platform has been setup set(ASGARD_PLATFORM_NVCC 0) set(ASGARD_PLATFORM_HCC 0) if(ASGARD_USE_HIP) - # find the HIP directory for the cmake files, taken from HIP samples + # search for HIP and libraries if(NOT DEFINED HIP_PATH) if(NOT DEFINED ENV{HIP_PATH}) set(HIP_PATH "/opt/rocm/hip" CACHE PATH "Path to which HIP has been installed") @@ -154,8 +155,9 @@ if(ASGARD_USE_HIP) else() set(HIPBLAS_PATH $ENV{HIPBLAS_PATH} CACHE PATH "Path to which HIPBLAS has been installed") endif() -endif() + endif() + # try to find hipconfig executable which can help detect platforms and include dirs find_program(ASGARD_HIPCONFIG_PATH hipconfig HINTS "${HIP_PATH}/bin") if(ASGARD_HIPCONFIG_PATH) execute_process(COMMAND ${ASGARD_HIPCONFIG_PATH} --platform OUTPUT_VARIABLE ASGARD_HIP_PLATFORM) @@ -165,7 +167,7 @@ endif() message(FATAL_ERROR "Could not determine HIP platform, make sure HIP_PLATFORM is set") endif() - message(STATUS "HIP Platform configured for ${ASGARD_HIP_PLATFORM}") + message(STATUS "HIP platform has been detected as ${ASGARD_HIP_PLATFORM}") if(ASGARD_HIP_PLATFORM STREQUAL "hcc") set(ASGARD_PLATFORM_HCC 1) elseif(ASGARD_HIP_PLATFORM STREQUAL "nvcc") @@ -183,7 +185,12 @@ endif() message(STATUS "Found CUDA_PATH: $ENV{CUDA_PATH}") endif() - set(CMAKE_MODULE_PATH "${HIP_PATH}/cmake" ${CMAKE_MODULE_PATH}) + # look for HIP cmake configs in different locations + list(APPEND CMAKE_MODULE_PATH "${HIP_PATH}/cmake") + if(ASGARD_PLATFORM_HCC) + # note: causes issues on nvidia, but might be needed on amd platforms? 
+ list(APPEND CMAKE_PREFIX_PATH "${HIP_PATH}/lib/cmake") + endif() list(APPEND CMAKE_PREFIX_PATH "${HIPBLAS_PATH}/lib/cmake") find_package(HIP 4.0 REQUIRED) @@ -192,23 +199,27 @@ endif() message(STATUS "HIP PLATFORM: ${HIP_PLATFORM}") message(STATUS "HIP COMPILER: ${HIP_COMPILER}") message(STATUS "HIP RUNTIME: ${HIP_RUNTIME}") - message(STATUS "HIP Include dir: ${HIP_INCLUDE_DIRS}") + message(STATUS "HIP Includes: ${HIP_INCLUDE_DIRS}") message(STATUS "HIP Libraries: ${HIP_LIBRARIES}") #include_directories(${HIP_INCLUDE_DIRS}) - message(STATUS ${HIP_INCLUDE_DIRS}) - #enable_language(HIP) # no cmake support yet? if(ASGARD_PLATFORM_NVCC) - enable_language(CUDA) - set (CMAKE_CUDA_STANDARD 14) - set (CMAKE_CUDA_STANDARD_REQUIRED ON) + enable_language(CUDA) # TODO: check if these are necessary with HIP on nvidia? + #set (CMAKE_CUDA_STANDARD 14) + #set (CMAKE_CUDA_STANDARD_REQUIRED ON) + elseif(ASGARD_PLATFORM_HCC) + #enable_language(HIP) # not yet added to latest cmake, but should be available soon endif() - #find_package(hipBLAS) if (hipBLAS_FOUND) message(STATUS "Found rocBLAS version ${rocBLAS_VERSION}: ${HIPBLAS_INCLUDE_DIRS}") endif() + set(ASGARD_HIP_FLAGS "-std=c++14") + set(ASGARD_AMD_ARCH "gfx_906") + set(ASGARD_AMD_FLAGS "") # amdgpu specific options + set(ASGARD_NVCC_ARCH "sm_86") + set(ASGARD_NVCC_FLAGS "") # nvcc specific options endif() # build component to interface with Ed's kronmult lib @@ -216,12 +227,12 @@ endif() add_library(kronmult_cuda SHARED src/device/kronmult_cuda.cpp) if(ASGARD_USE_HIP) if(ASGARD_PLATFORM_HCC) - set_source_files_properties( src/device/kronmult_cuda.cpp PROPERTIES LANGUAGE HIP ) # no .cu extension + set_source_files_properties( src/device/kronmult_cuda.cpp PROPERTIES LANGUAGE HIP ) elseif(ASGARD_PLATFORM_NVCC) # there is no hip language property in cmake yet? set_source_files_properties( src/device/kronmult_cuda.cpp PROPERTIES LANGUAGE CUDA ) # no .cu extension set_target_properties( kronmult_cuda PROPERTIES CUDA_ARCHITECTURES OFF) - set_target_properties( kronmult_cuda PROPERTIES COMPILE_FLAGS "-arch sm_70 -g -lineinfo --ptxas-options=-O3") + set_target_properties( kronmult_cuda PROPERTIES COMPILE_FLAGS "-arch sm_86 -g -lineinfo --ptxas-options=-O3") endif() set_target_properties( kronmult_cuda PROPERTIES LINK_FLAGS "-Wl,-rpath,${CMAKE_BINARY_DIR}") From 0a78c9e39a979129d143d72bbfe3ffdb3ee89d71 Mon Sep 17 00:00:00 2001 From: Cole Kendrick Date: Mon, 17 May 2021 13:18:42 -0400 Subject: [PATCH 09/36] Add in nvidia platform update for hip 4.2 --- CMakeLists.txt | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 5f2aed325..45861bfe5 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -170,7 +170,8 @@ if(ASGARD_USE_HIP) message(STATUS "HIP platform has been detected as ${ASGARD_HIP_PLATFORM}") if(ASGARD_HIP_PLATFORM STREQUAL "hcc") set(ASGARD_PLATFORM_HCC 1) - elseif(ASGARD_HIP_PLATFORM STREQUAL "nvcc") + # hip <= 4.1 uses "nvcc" to identify nvidia platforms, >= 4.2 uses "nvidia" + elseif(ASGARD_HIP_PLATFORM STREQUAL "nvcc" OR ASGARD_HIP_PLATFORM STREQUAL "nvidia") set(ASGARD_PLATFORM_NVCC 1) endif() @@ -204,11 +205,15 @@ if(ASGARD_USE_HIP) #include_directories(${HIP_INCLUDE_DIRS}) if(ASGARD_PLATFORM_NVCC) + find_package(CUDA 9.0 REQUIRED) + include_directories(${CUDA_INCLUDE_DIRS}) enable_language(CUDA) # TODO: check if these are necessary with HIP on nvidia? 
- #set (CMAKE_CUDA_STANDARD 14) - #set (CMAKE_CUDA_STANDARD_REQUIRED ON) + set (CMAKE_CUDA_STANDARD 14) + set (CMAKE_CUDA_STANDARD_REQUIRED ON) + add_compile_definitions(__HIP_PLATFORM_NVCC__) elseif(ASGARD_PLATFORM_HCC) #enable_language(HIP) # not yet added to latest cmake, but should be available soon + add_compile_definitions(__HIP_PLATFORM_AMD__) endif() if (hipBLAS_FOUND) @@ -220,6 +225,8 @@ if(ASGARD_USE_HIP) set(ASGARD_AMD_FLAGS "") # amdgpu specific options set(ASGARD_NVCC_ARCH "sm_86") set(ASGARD_NVCC_FLAGS "") # nvcc specific options + + #hip_add_library(kronmult_cuda SHARED src/device/kronmult_cuda.cpp) endif() # build component to interface with Ed's kronmult lib From b397160625dc8491bb608c0ac67cdaf5579b1f23 Mon Sep 17 00:00:00 2001 From: Cole Kendrick Date: Mon, 17 May 2021 16:33:19 -0400 Subject: [PATCH 10/36] Update tensor linking and missed cublas calls --- CMakeLists.txt | 13 ++++++++++--- src/lib_dispatch.cpp | 44 ++++++++++++++++++++++---------------------- 2 files changed, 32 insertions(+), 25 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 45861bfe5..f10a4bdaf 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -226,6 +226,10 @@ if(ASGARD_USE_HIP) set(ASGARD_NVCC_ARCH "sm_86") set(ASGARD_NVCC_FLAGS "") # nvcc specific options + # assume this include path since ${HIP_INCLUDE_DIRS} not being set on nvidia platform + include_directories("${HIP_PATH}/include") + include_directories(${HIPBLAS_INCLUDE_DIRS}) + #hip_add_library(kronmult_cuda SHARED src/device/kronmult_cuda.cpp) endif() @@ -403,8 +407,6 @@ if (ASGARD_USE_HIP) target_link_libraries(lib_dispatch PRIVATE ${CUDA_LIBRARIES}) endif() target_link_libraries(lib_dispatch PRIVATE roc::hipblas) - target_include_directories(lib_dispatch PUBLIC ${HIP_INCLUDE_DIRS} ${HIPBLAS_INCLUDE_DIRS}) - target_include_directories(kronmult_cuda PUBLIC ${HIP_INCLUDE_DIRS} ${HIPBLAS_INCLUDE_DIRS}) endif() if (ASGARD_USE_OPENMP AND NOT ASGARD_USE_MKL) @@ -431,7 +433,12 @@ target_link_libraries (solver PRIVATE distribution fast_math lib_dispatch tensor target_link_libraries (tensors PRIVATE lib_dispatch) if (ASGARD_USE_HIP) - target_link_libraries (tensors PRIVATE ${HIP_LIBRARIES}) + if(ASGARD_PLATFORM_HCC) + target_link_libraries(tensors PRIVATE hip::device) + elseif(ASGARD_PLATFORM_NVCC) + target_link_libraries(tensors PRIVATE ${CUDA_LIBRARIES}) + endif() + target_link_libraries (tensors PRIVATE roc::hipblas) endif () if (ASGARD_USE_SCALAPACK) add_compile_definitions (ASGARD_USE_SCALAPACK) diff --git a/src/lib_dispatch.cpp b/src/lib_dispatch.cpp index 0a274a87c..3bbf840f8 100644 --- a/src/lib_dispatch.cpp +++ b/src/lib_dispatch.cpp @@ -156,7 +156,7 @@ void initialize_libraries(int const local_rank) } #ifdef ASGARD_USE_HIP -inline hipblasOperation_t cublas_trans(char trans) +inline hipblasOperation_t hipblas_trans(char trans) { if (trans == 'N' || trans == 'n') { @@ -569,15 +569,15 @@ void gemv(char const *trans, int *m, int *n, P *alpha, P *A, int *lda, P *x, if constexpr (std::is_same::value) { auto const success = - hipblasDgemv(device.get_handle(), cublas_trans(*trans), *m, *n, alpha, - A, *lda, x, *incx, beta, y, *incy); + hipblasDgemv(device.get_handle(), hipblas_trans(*trans), *m, *n, + alpha, A, *lda, x, *incx, beta, y, *incy); expect(success == 0); } else if constexpr (std::is_same::value) { auto const success = - hipblasSgemv(device.get_handle(), cublas_trans(*trans), *m, *n, alpha, - A, *lda, x, *incx, beta, y, *incy); + hipblasSgemv(device.get_handle(), hipblas_trans(*trans), *m, *n, + alpha, A, *lda, x, *incx, 
beta, y, *incy); expect(success == 0); } return; @@ -635,15 +635,15 @@ void gemm(char const *transa, char const *transb, int *m, int *n, int *k, if constexpr (std::is_same::value) { auto const success = hipblasDgemm( - device.get_handle(), cublas_trans(*transa), cublas_trans(*transb), *m, - *n, *k, alpha, A, *lda, B, *ldb, beta, C, *ldc); + device.get_handle(), hipblas_trans(*transa), hipblas_trans(*transb), + *m, *n, *k, alpha, A, *lda, B, *ldb, beta, C, *ldc); expect(success == 0); } else if constexpr (std::is_same::value) { auto const success = hipblasSgemm( - device.get_handle(), cublas_trans(*transa), cublas_trans(*transb), *m, - *n, *k, alpha, A, *lda, B, *ldb, beta, C, *ldc); + device.get_handle(), hipblas_trans(*transa), hipblas_trans(*transb), + *m, *n, *k, alpha, A, *lda, B, *ldb, beta, C, *ldc); expect(success == 0); } return; @@ -702,14 +702,14 @@ void getrf(int *m, int *n, P *A, int *lda, int *ipiv, int *info, // instantiated for these two fp types if constexpr (std::is_same::value) { - auto const success = cublasDgetrfBatched(device.get_handle(), *n, A_d, - *lda, ipiv, info, 1); + auto const success = hipblasDgetrfBatched(device.get_handle(), *n, A_d, + *lda, ipiv, info, 1); expect(success == 0); } else if constexpr (std::is_same::value) { - auto const success = cublasSgetrfBatched(device.get_handle(), *n, A_d, - *lda, ipiv, info, 1); + auto const success = hipblasSgetrfBatched(device.get_handle(), *n, A_d, + *lda, ipiv, info, 1); expect(success == 0); } return; @@ -755,7 +755,7 @@ void getri(int *n, P *A, int *lda, int *ipiv, P *work, int *lwork, int *info, expect(*lwork == (*n) * (*n)); ignore(lwork); - P const **A_d; + P **A_d; // hipBlas loses const to DgetriBatched P **work_d; auto stat = hipMalloc((void **)&A_d, sizeof(P *)); expect(stat == 0); @@ -770,13 +770,13 @@ void getri(int *n, P *A, int *lda, int *ipiv, P *work, int *lwork, int *info, // instantiated for these two fp types if constexpr (std::is_same::value) { - auto const success = cublasDgetriBatched( + auto const success = hipblasDgetriBatched( device.get_handle(), *n, A_d, *lda, nullptr, work_d, *n, info, 1); expect(success == 0); } else if constexpr (std::is_same::value) { - auto const success = cublasSgetriBatched( + auto const success = hipblasSgetriBatched( device.get_handle(), *n, A_d, *lda, nullptr, work_d, *n, info, 1); expect(success == 0); } @@ -850,8 +850,8 @@ void batched_gemm(P **const &a, int *lda, char const *transa, P **const &b, if constexpr (std::is_same::value) { auto const success = hipblasDgemmBatched( - device.get_handle(), cublas_trans(*transa), cublas_trans(*transb), *m, - *n, *k, alpha, a_d, *lda, b_d, *ldb, beta, c_d, *ldc, *num_batch); + device.get_handle(), hipblas_trans(*transa), hipblas_trans(*transb), + *m, *n, *k, alpha, a_d, *lda, b_d, *ldb, beta, c_d, *ldc, *num_batch); auto const cuda_stat = hipDeviceSynchronize(); expect(cuda_stat == 0); expect(success == 0); @@ -859,8 +859,8 @@ void batched_gemm(P **const &a, int *lda, char const *transa, P **const &b, else if constexpr (std::is_same::value) { auto const success = hipblasSgemmBatched( - device.get_handle(), cublas_trans(*transa), cublas_trans(*transb), *m, - *n, *k, alpha, a_d, *lda, b_d, *ldb, beta, c_d, *ldc, *num_batch); + device.get_handle(), hipblas_trans(*transa), hipblas_trans(*transb), + *m, *n, *k, alpha, a_d, *lda, b_d, *ldb, beta, c_d, *ldc, *num_batch); auto const cuda_stat = hipDeviceSynchronize(); expect(cuda_stat == 0); expect(success == 0); @@ -946,7 +946,7 @@ void batched_gemv(P **const &a, int *lda, char 
const *trans, P **const &x, if constexpr (std::is_same::value) { auto const success = hipblasDgemmBatched( - device.get_handle(), cublas_trans(*trans), cublas_trans(transb), + device.get_handle(), hipblas_trans(*trans), hipblas_trans(transb), gemm_m, gemm_n, gemm_k, alpha, a_d, *lda, x_d, ldb, beta, y_d, ldc, *num_batch); auto const cuda_stat = hipDeviceSynchronize(); @@ -956,7 +956,7 @@ void batched_gemv(P **const &a, int *lda, char const *trans, P **const &x, else if constexpr (std::is_same::value) { auto const success = hipblasSgemmBatched( - device.get_handle(), cublas_trans(*trans), cublas_trans(transb), + device.get_handle(), hipblas_trans(*trans), hipblas_trans(transb), gemm_m, gemm_n, gemm_k, alpha, a_d, *lda, x_d, ldb, beta, y_d, ldc, *num_batch); auto const cuda_stat = hipDeviceSynchronize(); From 15a9d3598973d49c5f1db7f5c172d8239dc2e46a Mon Sep 17 00:00:00 2001 From: Cole Kendrick Date: Wed, 26 May 2021 15:41:07 -0400 Subject: [PATCH 11/36] Update hip compiler option variables, hip_add_library for kronmult --- CMakeLists.txt | 32 ++++++++++++++++++++------------ 1 file changed, 20 insertions(+), 12 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index f10a4bdaf..3a7c16154 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -194,6 +194,16 @@ if(ASGARD_USE_HIP) endif() list(APPEND CMAKE_PREFIX_PATH "${HIPBLAS_PATH}/lib/cmake") + set(ASGARD_HIP_FLAGS "-std=c++14" CACHE STRING "HIP compiler flags for both AMD and NVIDIA") + set(ASGARD_AMD_ARCH "906" CACHE STRING "AMD GPU architecture number") + set(ASGARD_AMD_FLAGS "--amdgpu-target=gfx_${ASGARD_AMD_ARCH}" CACHE STRING "Flags to pass to amd-clang for HIP") # amdgpu specific options + set(ASGARD_NVCC_ARCH "86" CACHE STRING "Nvidia architecture number") # -gencode arch=compute_xx,code=compute_xx + set(ASGARD_NVCC_FLAGS "-gencode arch=compute_${ASGARD_NVCC_ARCH},code=compute_${ASGARD_NVCC_ARCH}" CACHE STRING "Flags to pass to NVCC") # nvcc specific options + + set(HIP_HIPCC_FLAGS "${HIP_HIPCC_FLAGS} ${ASGARD_HIP_FLAGS}" CACHE STRING "") + set(HIP_CLANG_FLAGS "${HIP_CLANG_FLAGS} ${ASGARD_AMD_FLAGS}" CACHE STRING "") + set(HIP_NVCC_FLAGS "${HIP_NVCC_FLAGS} ${ASGARD_NVCC_FLAGS}" CACHE STRING "") + find_package(HIP 4.0 REQUIRED) find_package(hipblas REQUIRED) # Print some debug info about HIP configuration @@ -202,7 +212,6 @@ if(ASGARD_USE_HIP) message(STATUS "HIP RUNTIME: ${HIP_RUNTIME}") message(STATUS "HIP Includes: ${HIP_INCLUDE_DIRS}") message(STATUS "HIP Libraries: ${HIP_LIBRARIES}") - #include_directories(${HIP_INCLUDE_DIRS}) if(ASGARD_PLATFORM_NVCC) find_package(CUDA 9.0 REQUIRED) @@ -220,22 +229,21 @@ if(ASGARD_USE_HIP) message(STATUS "Found rocBLAS version ${rocBLAS_VERSION}: ${HIPBLAS_INCLUDE_DIRS}") endif() - set(ASGARD_HIP_FLAGS "-std=c++14") - set(ASGARD_AMD_ARCH "gfx_906") - set(ASGARD_AMD_FLAGS "") # amdgpu specific options - set(ASGARD_NVCC_ARCH "sm_86") - set(ASGARD_NVCC_FLAGS "") # nvcc specific options - - # assume this include path since ${HIP_INCLUDE_DIRS} not being set on nvidia platform + include_directories(${HIP_INCLUDE_DIRS}) + # assume this include path since HIP_INCLUDE_DIRS is not being set on nvidia platform include_directories("${HIP_PATH}/include") include_directories(${HIPBLAS_INCLUDE_DIRS}) - #hip_add_library(kronmult_cuda SHARED src/device/kronmult_cuda.cpp) + hip_add_library(kronmult_cuda SHARED src/device/kronmult_cuda.cpp + HIPCC_OPTIONS "${ASGARD_HIP_FLAGS}" + NVCC_OPTIONS "${ASGARD_NVCC_FLAGS}" + CLANG_OPTIONS "${ASGARD_AMD_FLAGS}") +else() + # build component to interface with Ed's 
kronmult lib + ##TODO: link to kronmult as interface library + add_library(kronmult_cuda SHARED src/device/kronmult_cuda.cpp) endif() -# build component to interface with Ed's kronmult lib -##TODO: link to kronmult as interface library -add_library(kronmult_cuda SHARED src/device/kronmult_cuda.cpp) if(ASGARD_USE_HIP) if(ASGARD_PLATFORM_HCC) set_source_files_properties( src/device/kronmult_cuda.cpp PROPERTIES LANGUAGE HIP ) From b0a81a1b5c190f3e9159a325d5063fd198238e3e Mon Sep 17 00:00:00 2001 From: Cole Kendrick Date: Wed, 9 Jun 2021 11:56:25 -0400 Subject: [PATCH 12/36] Update nvidia arch flag for kronmult_cuda --- CMakeLists.txt | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 3a7c16154..74611902e 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -194,6 +194,8 @@ if(ASGARD_USE_HIP) endif() list(APPEND CMAKE_PREFIX_PATH "${HIPBLAS_PATH}/lib/cmake") + set(HIP_VERBOSE_BUILD ON CACHE STRING "Verbose compilation for HIP") + set(ASGARD_HIP_FLAGS "-std=c++14" CACHE STRING "HIP compiler flags for both AMD and NVIDIA") set(ASGARD_AMD_ARCH "906" CACHE STRING "AMD GPU architecture number") set(ASGARD_AMD_FLAGS "--amdgpu-target=gfx_${ASGARD_AMD_ARCH}" CACHE STRING "Flags to pass to amd-clang for HIP") # amdgpu specific options @@ -251,7 +253,7 @@ if(ASGARD_USE_HIP) # there is no hip language property in cmake yet? set_source_files_properties( src/device/kronmult_cuda.cpp PROPERTIES LANGUAGE CUDA ) # no .cu extension set_target_properties( kronmult_cuda PROPERTIES CUDA_ARCHITECTURES OFF) - set_target_properties( kronmult_cuda PROPERTIES COMPILE_FLAGS "-arch sm_86 -g -lineinfo --ptxas-options=-O3") + set_target_properties( kronmult_cuda PROPERTIES COMPILE_FLAGS "-arch sm_${ASGARD_NVCC_ARCH} -g -lineinfo --ptxas-options=-O3") endif() set_target_properties( kronmult_cuda PROPERTIES LINK_FLAGS "-Wl,-rpath,${CMAKE_BINARY_DIR}") @@ -415,6 +417,7 @@ if (ASGARD_USE_HIP) target_link_libraries(lib_dispatch PRIVATE ${CUDA_LIBRARIES}) endif() target_link_libraries(lib_dispatch PRIVATE roc::hipblas) + #set_target_properties(lib_dispatch PROPERTIES LINKER_LANGUAGE HIP) endif() if (ASGARD_USE_OPENMP AND NOT ASGARD_USE_MKL) @@ -447,6 +450,7 @@ if (ASGARD_USE_HIP) target_link_libraries(tensors PRIVATE ${CUDA_LIBRARIES}) endif() target_link_libraries (tensors PRIVATE roc::hipblas) + #set_target_properties(tensors PROPERTIES LINKER_LANGUAGE HIP) endif () if (ASGARD_USE_SCALAPACK) add_compile_definitions (ASGARD_USE_SCALAPACK) From 0889a35395e745b7b066c8ae63c2e52f41bb84cd Mon Sep 17 00:00:00 2001 From: Cole Kendrick Date: Wed, 16 Jun 2021 13:17:39 -0400 Subject: [PATCH 13/36] Set hip_clang_path in cmake to find for amd platform --- CMakeLists.txt | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 74611902e..f38c5d135 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -149,6 +149,15 @@ if(ASGARD_USE_HIP) endif() endif() + # set HIP_CLANG_PATH for potential installs in non-standard locations (such as rocm with spack) + if (NOT DEFINED HIP_CLANG_PATH) + if(NOT DEFINED ENV{HIP_CLANG_PATH}) + set(HIP_CLANG_PATH "${ROCM_PATH}/llvm/bin" CACHE PATH "Path to HIP clang binaries") + else() + set(HIP_CLANG_PATH $ENV{HIP_CLANG_PATH} CACHE PATH "Path to HIP clang binaries") + endif() + endif() + if(NOT DEFINED HIPBLAS_PATH) if(NOT DEFINED ENV{HIPBLAS_PATH}) set(HIPBLAS_PATH "${HIP_PATH}/../hipblas" CACHE PATH "Path to which HIPBLAS has been installed") @@ -187,10 +196,10 @@ if(ASGARD_USE_HIP) endif() # 
look for HIP cmake configs in different locations - list(APPEND CMAKE_MODULE_PATH "${HIP_PATH}/cmake") + list(APPEND CMAKE_MODULE_PATH "${HIP_PATH}/cmake" "${ROCM_PATH}") if(ASGARD_PLATFORM_HCC) # note: causes issues on nvidia, but might be needed on amd platforms? - list(APPEND CMAKE_PREFIX_PATH "${HIP_PATH}/lib/cmake") + list(APPEND CMAKE_PREFIX_PATH "${HIP_PATH}/lib/cmake" "${ROCM_PATH}") endif() list(APPEND CMAKE_PREFIX_PATH "${HIPBLAS_PATH}/lib/cmake") From 3d2f4666ec679156cf1f4f22c21e2e6eb8cc9385 Mon Sep 17 00:00:00 2001 From: Cole Kendrick Date: Fri, 18 Jun 2021 11:40:03 -0400 Subject: [PATCH 14/36] Set the hip_clang_include_path needed for spack installs --- CMakeLists.txt | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index f38c5d135..11262f95e 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -153,9 +153,18 @@ if(ASGARD_USE_HIP) if (NOT DEFINED HIP_CLANG_PATH) if(NOT DEFINED ENV{HIP_CLANG_PATH}) set(HIP_CLANG_PATH "${ROCM_PATH}/llvm/bin" CACHE PATH "Path to HIP clang binaries") - else() + else() set(HIP_CLANG_PATH $ENV{HIP_CLANG_PATH} CACHE PATH "Path to HIP clang binaries") - endif() + endif() + endif() + + # note: could probably grab this path directly using hipconfig? + if (NOT DEFINED HIP_CLANG_INCLUDE_PATH) + if(NOT DEFINED ENV{HIP_CLANG_INCLUDE_PATH}) + set(HIP_CLANG_INCLUDE_PATH "${HIP_CLANG_PATH}/../lib/clang/12.0.0/include" CACHE PATH "Path to HIP clang include directory") + else() + set(HIP_CLANG_INCLUDE_PATH $ENV{HIP_CLANG_INCLUDE_PATH} CACHE PATH "Path to HIP clang include directory") + endif() endif() if(NOT DEFINED HIPBLAS_PATH) From 119cbb2069ce377e13d5abd5435b2159501e84cb Mon Sep 17 00:00:00 2001 From: Cole Kendrick Date: Mon, 21 Jun 2021 13:51:27 -0600 Subject: [PATCH 15/36] Update hip build for amd platform --- CMakeLists.txt | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 11262f95e..6644787d6 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -186,7 +186,7 @@ if(ASGARD_USE_HIP) endif() message(STATUS "HIP platform has been detected as ${ASGARD_HIP_PLATFORM}") - if(ASGARD_HIP_PLATFORM STREQUAL "hcc") + if(ASGARD_HIP_PLATFORM STREQUAL "hcc" OR ASGARD_HIP_PLATFORM STREQUAL "amd") set(ASGARD_PLATFORM_HCC 1) # hip <= 4.1 uses "nvcc" to identify nvidia platforms, >= 4.2 uses "nvidia" elseif(ASGARD_HIP_PLATFORM STREQUAL "nvcc" OR ASGARD_HIP_PLATFORM STREQUAL "nvidia") @@ -266,9 +266,9 @@ endif() if(ASGARD_USE_HIP) if(ASGARD_PLATFORM_HCC) - set_source_files_properties( src/device/kronmult_cuda.cpp PROPERTIES LANGUAGE HIP ) + #set_source_files_properties( src/device/kronmult_cuda.cpp PROPERTIES LANGUAGE HIP ) # should work after cmake 3.21 release? + set_source_files_properties( src/device/kronmult_cuda.cpp PROPERTIES HIP_SOURCE_PROPERTY_FORMAT 1) elseif(ASGARD_PLATFORM_NVCC) - # there is no hip language property in cmake yet? 
set_source_files_properties( src/device/kronmult_cuda.cpp PROPERTIES LANGUAGE CUDA ) # no .cu extension set_target_properties( kronmult_cuda PROPERTIES CUDA_ARCHITECTURES OFF) set_target_properties( kronmult_cuda PROPERTIES COMPILE_FLAGS "-arch sm_${ASGARD_NVCC_ARCH} -g -lineinfo --ptxas-options=-O3") @@ -280,8 +280,8 @@ if(ASGARD_USE_HIP) endif() if(ASGARD_USE_MKL) - if(ASGARD_USE_HIP) - set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --compiler-options -fopenmp") + if(ASGARD_USE_HIP AND ASGARD_PLATFORM_NVCC) + set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --compiler-options -fopenmp") else() target_compile_options (kronmult_cuda PRIVATE "-fopenmp") # CMAKE doesn't handle MKL openmp link properly if(APPLE) # Need to link against the same openmp library as the MKL. @@ -403,8 +403,8 @@ if (ASGARD_USE_OPENMP AND NOT ASGARD_USE_MKL) target_link_libraries(kronmult PRIVATE OpenMP::OpenMP_CXX) endif () -if (ASGARD_USE_HIP) - target_link_libraries(kronmult_cuda PUBLIC kron)# hip::device) +if (ASGARD_USE_HIP AND ASGARD_PLATFORM_HCC) + target_link_libraries(kronmult_cuda PUBLIC kron hip::device) else () target_link_libraries(kronmult_cuda PUBLIC kron) endif() From b9002c6d1c797cedf650e45045cb2dd9cb407d32 Mon Sep 17 00:00:00 2001 From: Cole Kendrick Date: Mon, 28 Jun 2021 10:13:54 -0600 Subject: [PATCH 16/36] Add hcc platform def for backwards compatability --- CMakeLists.txt | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 6644787d6..735f05066 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -186,6 +186,7 @@ if(ASGARD_USE_HIP) endif() message(STATUS "HIP platform has been detected as ${ASGARD_HIP_PLATFORM}") + # hip >= 4.2 is now using "amd" to identify platform if(ASGARD_HIP_PLATFORM STREQUAL "hcc" OR ASGARD_HIP_PLATFORM STREQUAL "amd") set(ASGARD_PLATFORM_HCC 1) # hip <= 4.1 uses "nvcc" to identify nvidia platforms, >= 4.2 uses "nvidia" @@ -218,7 +219,7 @@ if(ASGARD_USE_HIP) set(ASGARD_AMD_ARCH "906" CACHE STRING "AMD GPU architecture number") set(ASGARD_AMD_FLAGS "--amdgpu-target=gfx_${ASGARD_AMD_ARCH}" CACHE STRING "Flags to pass to amd-clang for HIP") # amdgpu specific options set(ASGARD_NVCC_ARCH "86" CACHE STRING "Nvidia architecture number") # -gencode arch=compute_xx,code=compute_xx - set(ASGARD_NVCC_FLAGS "-gencode arch=compute_${ASGARD_NVCC_ARCH},code=compute_${ASGARD_NVCC_ARCH}" CACHE STRING "Flags to pass to NVCC") # nvcc specific options + set(ASGARD_NVCC_FLAGS "-gencode arch=compute_${ASGARD_NVCC_ARCH},code=compute_${ASGARD_NVCC_ARCH};--ptxas-options=-O3;-lineinfo;-g" CACHE STRING "Flags to pass to NVCC") # nvcc specific options set(HIP_HIPCC_FLAGS "${HIP_HIPCC_FLAGS} ${ASGARD_HIP_FLAGS}" CACHE STRING "") set(HIP_CLANG_FLAGS "${HIP_CLANG_FLAGS} ${ASGARD_AMD_FLAGS}" CACHE STRING "") @@ -236,13 +237,15 @@ if(ASGARD_USE_HIP) if(ASGARD_PLATFORM_NVCC) find_package(CUDA 9.0 REQUIRED) include_directories(${CUDA_INCLUDE_DIRS}) - enable_language(CUDA) # TODO: check if these are necessary with HIP on nvidia? 
+ enable_language(CUDA) set (CMAKE_CUDA_STANDARD 14) set (CMAKE_CUDA_STANDARD_REQUIRED ON) - add_compile_definitions(__HIP_PLATFORM_NVCC__) + add_compile_definitions(__HIP_PLATFORM_NVCC__ __HIP_PLATFORM_NVIDIA__) elseif(ASGARD_PLATFORM_HCC) - #enable_language(HIP) # not yet added to latest cmake, but should be available soon - add_compile_definitions(__HIP_PLATFORM_AMD__) + #enable_language(HIP) # not yet added to latest cmake, but should be available in 3.21 + # these compile definitions should be added automatically if using amd's clang, but + # may not necessarily be added if compiling with gcc or others + add_compile_definitions(__HIP_PLATFORM_HCC__ __HIP_PLATFORM_AMD__) endif() if (hipBLAS_FOUND) From d08af1e01f6075579149da73da076d9ea907a1eb Mon Sep 17 00:00:00 2001 From: Cole Kendrick Date: Mon, 28 Jun 2021 14:28:33 -0600 Subject: [PATCH 17/36] Add temporary workaround for kronmult hip interface --- src/device/kronmult_cuda.cpp | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) diff --git a/src/device/kronmult_cuda.cpp b/src/device/kronmult_cuda.cpp index 52bf40f90..0a346268b 100644 --- a/src/device/kronmult_cuda.cpp +++ b/src/device/kronmult_cuda.cpp @@ -1,8 +1,10 @@ #include "kronmult_cuda.hpp" #include "build_info.hpp" -#include "hip/hip_runtime.h" #ifdef ASGARD_USE_HIP +// temporary workaround to use hip only on nvidia +// this can be removed once the hipified kronmult is used +#ifdef __HIP_PLATFORM_NVCC__ #include #define USE_GPU #define GLOBAL_FUNCTION __global__ @@ -11,6 +13,14 @@ #define DEVICE_FUNCTION __device__ #define HOST_FUNCTION __host__ #else +#undef ASGARD_USE_HIP +#define GLOBAL_FUNCTION +#define SYNCTHREADS +#define SHARED_MEMORY +#define DEVICE_FUNCTION +#define HOST_FUNCTION +#endif +#else #define GLOBAL_FUNCTION #define SYNCTHREADS #define SHARED_MEMORY @@ -397,6 +407,13 @@ void call_kronmult(int const n, P *x_ptrs[], P *output_ptrs[], P *work_ptrs[], #endif } +// note - temporary workaround for compiling kronmult on amd platforms +// this needs to be removed when hipified kronmult is used +#ifndef __HIP_PLATFORM_NVCC__ +// redefine hip flag +#define ASGARD_USE_HIP +#endif + template void stage_inputs_kronmult(float const *const x, float *const workspace, int const num_elems, int const num_copies); From 3dc539d4b854bef85699fbc8b850c82d2f8ae59d Mon Sep 17 00:00:00 2001 From: Cole Kendrick Date: Mon, 28 Jun 2021 10:07:45 -0600 Subject: [PATCH 18/36] Temporarily suppress clang compiler warnings --- CMakeLists.txt | 2 ++ 1 file changed, 2 insertions(+) diff --git a/CMakeLists.txt b/CMakeLists.txt index 735f05066..7f46c0a4c 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -246,6 +246,8 @@ if(ASGARD_USE_HIP) # these compile definitions should be added automatically if using amd's clang, but # may not necessarily be added if compiling with gcc or others add_compile_definitions(__HIP_PLATFORM_HCC__ __HIP_PLATFORM_AMD__) + # temporarily disable warnings generated when using clang + add_compile_options("-Wno-dtor-name" "-Wno-gnu-anonymous-struct" "-Wno-nested-anon-types") endif() if (hipBLAS_FOUND) From ef16b563445926197093f762401b654152e06670 Mon Sep 17 00:00:00 2001 From: Cole Kendrick Date: Tue, 6 Jul 2021 08:26:03 -0600 Subject: [PATCH 19/36] Fix shared flags for hip platform, amd gpu target archs --- CMakeLists.txt | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 7f46c0a4c..9c3a86ce2 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -215,16 +215,19 @@ if(ASGARD_USE_HIP) 
set(HIP_VERBOSE_BUILD ON CACHE STRING "Verbose compilation for HIP") - set(ASGARD_HIP_FLAGS "-std=c++14" CACHE STRING "HIP compiler flags for both AMD and NVIDIA") + set(ASGARD_HIP_FLAGS "-std=c++14 -g" CACHE STRING "HIP compiler flags for both AMD and NVIDIA") set(ASGARD_AMD_ARCH "906" CACHE STRING "AMD GPU architecture number") - set(ASGARD_AMD_FLAGS "--amdgpu-target=gfx_${ASGARD_AMD_ARCH}" CACHE STRING "Flags to pass to amd-clang for HIP") # amdgpu specific options + set(ASGARD_AMD_FLAGS "${ASGARD_HIP_FLAGS} --amdgpu-target=gfx${ASGARD_AMD_ARCH} -O3" CACHE STRING "Flags to pass to amd-clang for HIP") # amdgpu specific options set(ASGARD_NVCC_ARCH "86" CACHE STRING "Nvidia architecture number") # -gencode arch=compute_xx,code=compute_xx - set(ASGARD_NVCC_FLAGS "-gencode arch=compute_${ASGARD_NVCC_ARCH},code=compute_${ASGARD_NVCC_ARCH};--ptxas-options=-O3;-lineinfo;-g" CACHE STRING "Flags to pass to NVCC") # nvcc specific options + set(ASGARD_NVCC_FLAGS "${ASGARD_HIP_FLAGS} -gencode arch=compute_${ASGARD_NVCC_ARCH},code=compute_${ASGARD_NVCC_ARCH};--ptxas-options=-O3;-lineinfo" CACHE STRING "Flags to pass to NVCC") # nvcc specific options set(HIP_HIPCC_FLAGS "${HIP_HIPCC_FLAGS} ${ASGARD_HIP_FLAGS}" CACHE STRING "") set(HIP_CLANG_FLAGS "${HIP_CLANG_FLAGS} ${ASGARD_AMD_FLAGS}" CACHE STRING "") set(HIP_NVCC_FLAGS "${HIP_NVCC_FLAGS} ${ASGARD_NVCC_FLAGS}" CACHE STRING "") + set(AMDGPU_TARGETS "gfx${ASGARD_AMD_ARCH}") + set(GPU_TARGETS "gfx${ASGARD_AMD_ARCH}") + find_package(HIP 4.0 REQUIRED) find_package(hipblas REQUIRED) # Print some debug info about HIP configuration From a745fbe57bf7bf39ed9f6d4973bf2a7773a80515 Mon Sep 17 00:00:00 2001 From: Cole Kendrick Date: Tue, 6 Jul 2021 13:31:16 -0600 Subject: [PATCH 20/36] Move kronmult source properties before add_lib --- CMakeLists.txt | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 9c3a86ce2..13ae90442 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -215,11 +215,11 @@ if(ASGARD_USE_HIP) set(HIP_VERBOSE_BUILD ON CACHE STRING "Verbose compilation for HIP") - set(ASGARD_HIP_FLAGS "-std=c++14 -g" CACHE STRING "HIP compiler flags for both AMD and NVIDIA") + set(ASGARD_HIP_FLAGS "-std=c++14;-g" CACHE STRING "HIP compiler flags for both AMD and NVIDIA") set(ASGARD_AMD_ARCH "906" CACHE STRING "AMD GPU architecture number") - set(ASGARD_AMD_FLAGS "${ASGARD_HIP_FLAGS} --amdgpu-target=gfx${ASGARD_AMD_ARCH} -O3" CACHE STRING "Flags to pass to amd-clang for HIP") # amdgpu specific options + set(ASGARD_AMD_FLAGS "${ASGARD_HIP_FLAGS}; --amdgpu-target=gfx${ASGARD_AMD_ARCH};-O3" CACHE STRING "Flags to pass to amd-clang for HIP") # amdgpu specific options set(ASGARD_NVCC_ARCH "86" CACHE STRING "Nvidia architecture number") # -gencode arch=compute_xx,code=compute_xx - set(ASGARD_NVCC_FLAGS "${ASGARD_HIP_FLAGS} -gencode arch=compute_${ASGARD_NVCC_ARCH},code=compute_${ASGARD_NVCC_ARCH};--ptxas-options=-O3;-lineinfo" CACHE STRING "Flags to pass to NVCC") # nvcc specific options + set(ASGARD_NVCC_FLAGS "${ASGARD_HIP_FLAGS}; -gencode arch=compute_${ASGARD_NVCC_ARCH},code=compute_${ASGARD_NVCC_ARCH};--ptxas-options=-O3;-lineinfo" CACHE STRING "Flags to pass to NVCC") # nvcc specific options set(HIP_HIPCC_FLAGS "${HIP_HIPCC_FLAGS} ${ASGARD_HIP_FLAGS}" CACHE STRING "") set(HIP_CLANG_FLAGS "${HIP_CLANG_FLAGS} ${ASGARD_AMD_FLAGS}" CACHE STRING "") @@ -262,6 +262,14 @@ if(ASGARD_USE_HIP) include_directories("${HIP_PATH}/include") include_directories(${HIPBLAS_INCLUDE_DIRS}) + # set source file 
language properties + if(ASGARD_PLATFORM_HCC) + #set_source_files_properties( src/device/kronmult_cuda.cpp PROPERTIES LANGUAGE HIP ) # should work after cmake 3.21 release? + set_source_files_properties( src/device/kronmult_cuda.cpp PROPERTIES HIP_SOURCE_PROPERTY_FORMAT 1) + elseif(ASGARD_PLATFORM_NVCC) + set_source_files_properties( src/device/kronmult_cuda.cpp PROPERTIES LANGUAGE CUDA ) # no .cu extension + endif() + hip_add_library(kronmult_cuda SHARED src/device/kronmult_cuda.cpp HIPCC_OPTIONS "${ASGARD_HIP_FLAGS}" NVCC_OPTIONS "${ASGARD_NVCC_FLAGS}" @@ -273,11 +281,7 @@ else() endif() if(ASGARD_USE_HIP) - if(ASGARD_PLATFORM_HCC) - #set_source_files_properties( src/device/kronmult_cuda.cpp PROPERTIES LANGUAGE HIP ) # should work after cmake 3.21 release? - set_source_files_properties( src/device/kronmult_cuda.cpp PROPERTIES HIP_SOURCE_PROPERTY_FORMAT 1) - elseif(ASGARD_PLATFORM_NVCC) - set_source_files_properties( src/device/kronmult_cuda.cpp PROPERTIES LANGUAGE CUDA ) # no .cu extension + if(ASGARD_PLATFORM_NVCC) set_target_properties( kronmult_cuda PROPERTIES CUDA_ARCHITECTURES OFF) set_target_properties( kronmult_cuda PROPERTIES COMPILE_FLAGS "-arch sm_${ASGARD_NVCC_ARCH} -g -lineinfo --ptxas-options=-O3") endif() From e591cd92527fa48a7c119576979c8a1f6e472fdf Mon Sep 17 00:00:00 2001 From: Cole Kendrick Date: Thu, 15 Jul 2021 08:54:11 -0600 Subject: [PATCH 21/36] Add check for empty matrix to avoid hipmemcpy2d error --- CMakeLists.txt | 2 +- src/lib_dispatch.cpp | 24 ++++++++++++------------ src/tensors.hpp | 9 +++++++++ 3 files changed, 22 insertions(+), 13 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 13ae90442..9aff4c384 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -219,7 +219,7 @@ if(ASGARD_USE_HIP) set(ASGARD_AMD_ARCH "906" CACHE STRING "AMD GPU architecture number") set(ASGARD_AMD_FLAGS "${ASGARD_HIP_FLAGS}; --amdgpu-target=gfx${ASGARD_AMD_ARCH};-O3" CACHE STRING "Flags to pass to amd-clang for HIP") # amdgpu specific options set(ASGARD_NVCC_ARCH "86" CACHE STRING "Nvidia architecture number") # -gencode arch=compute_xx,code=compute_xx - set(ASGARD_NVCC_FLAGS "${ASGARD_HIP_FLAGS}; -gencode arch=compute_${ASGARD_NVCC_ARCH},code=compute_${ASGARD_NVCC_ARCH};--ptxas-options=-O3;-lineinfo" CACHE STRING "Flags to pass to NVCC") # nvcc specific options + set(ASGARD_NVCC_FLAGS "${ASGARD_HIP_FLAGS}; -gencode arch=compute_${ASGARD_NVCC_ARCH},code=compute_${ASGARD_NVCC_ARCH} --ptxas-options=-O3 -lineinfo" CACHE STRING "Flags to pass to NVCC") # nvcc specific options set(HIP_HIPCC_FLAGS "${HIP_HIPCC_FLAGS} ${ASGARD_HIP_FLAGS}" CACHE STRING "") set(HIP_CLANG_FLAGS "${HIP_CLANG_FLAGS} ${ASGARD_AMD_FLAGS}" CACHE STRING "") diff --git a/src/lib_dispatch.cpp b/src/lib_dispatch.cpp index 3bbf840f8..b94ee451a 100644 --- a/src/lib_dispatch.cpp +++ b/src/lib_dispatch.cpp @@ -114,14 +114,14 @@ struct device_handler if (handle) { - auto const cublas_success = hipblasDestroy(handle); - expect(cublas_success == HIPBLAS_STATUS_SUCCESS); + auto const hipblas_success = hipblasDestroy(handle); + expect(hipblas_success == HIPBLAS_STATUS_SUCCESS); } success = hipSetDevice(local_rank); expect(success == hipSuccess); - auto const cublas_success = hipblasCreate(&handle); - expect(cublas_success == HIPBLAS_STATUS_SUCCESS); + auto const hipblas_success = hipblasCreate(&handle); + expect(hipblas_success == HIPBLAS_STATUS_SUCCESS); #else ignore(local_rank); @@ -852,8 +852,8 @@ void batched_gemm(P **const &a, int *lda, char const *transa, P **const &b, auto const success = 
hipblasDgemmBatched( device.get_handle(), hipblas_trans(*transa), hipblas_trans(*transb), *m, *n, *k, alpha, a_d, *lda, b_d, *ldb, beta, c_d, *ldc, *num_batch); - auto const cuda_stat = hipDeviceSynchronize(); - expect(cuda_stat == 0); + auto const hip_stat = hipDeviceSynchronize(); + expect(hip_stat == 0); expect(success == 0); } else if constexpr (std::is_same::value) @@ -861,8 +861,8 @@ void batched_gemm(P **const &a, int *lda, char const *transa, P **const &b, auto const success = hipblasSgemmBatched( device.get_handle(), hipblas_trans(*transa), hipblas_trans(*transb), *m, *n, *k, alpha, a_d, *lda, b_d, *ldb, beta, c_d, *ldc, *num_batch); - auto const cuda_stat = hipDeviceSynchronize(); - expect(cuda_stat == 0); + auto const hip_stat = hipDeviceSynchronize(); + expect(hip_stat == 0); expect(success == 0); } @@ -949,8 +949,8 @@ void batched_gemv(P **const &a, int *lda, char const *trans, P **const &x, device.get_handle(), hipblas_trans(*trans), hipblas_trans(transb), gemm_m, gemm_n, gemm_k, alpha, a_d, *lda, x_d, ldb, beta, y_d, ldc, *num_batch); - auto const cuda_stat = hipDeviceSynchronize(); - expect(cuda_stat == 0); + auto const hip_stat = hipDeviceSynchronize(); + expect(hip_stat == 0); expect(success == 0); } else if constexpr (std::is_same::value) @@ -959,8 +959,8 @@ void batched_gemv(P **const &a, int *lda, char const *trans, P **const &x, device.get_handle(), hipblas_trans(*trans), hipblas_trans(transb), gemm_m, gemm_n, gemm_k, alpha, a_d, *lda, x_d, ldb, beta, y_d, ldc, *num_batch); - auto const cuda_stat = hipDeviceSynchronize(); - expect(cuda_stat == 0); + auto const hip_stat = hipDeviceSynchronize(); + expect(hip_stat == 0); expect(success == 0); } diff --git a/src/tensors.hpp b/src/tensors.hpp index 87234e1a9..d6bbb7986 100644 --- a/src/tensors.hpp +++ b/src/tensors.hpp @@ -727,6 +727,9 @@ copy_matrix_on_device(fk::matrix &dest, expect(source.ncols() == dest.ncols()); #ifdef ASGARD_USE_HIP + // on AMD, hipMemcpy2D will give throw an error if dpitch or spitch is 0 + if (source.stride() == 0) + return; auto const success = hipMemcpy2D(dest.data(), dest.stride() * sizeof(P), source.data(), source.stride() * sizeof(P), source.nrows() * sizeof(P), @@ -746,6 +749,9 @@ copy_matrix_to_device(fk::matrix &dest, expect(source.nrows() == dest.nrows()); expect(source.ncols() == dest.ncols()); #ifdef ASGARD_USE_HIP + // on AMD, hipMemcpy2D will give throw an error if dpitch or spitch is 0 + if (source.stride() == 0) + return; auto const success = hipMemcpy2D(dest.data(), dest.stride() * sizeof(P), source.data(), source.stride() * sizeof(P), source.nrows() * sizeof(P), @@ -765,6 +771,9 @@ copy_matrix_to_host(fk::matrix &dest, expect(source.nrows() == dest.nrows()); expect(source.ncols() == dest.ncols()); #ifdef ASGARD_USE_HIP + // on AMD, hipMemcpy2D will give throw an error if dpitch or spitch is 0 + if (source.stride() == 0) + return; auto const success = hipMemcpy2D(dest.data(), dest.stride() * sizeof(P), source.data(), source.stride() * sizeof(P), source.nrows() * sizeof(P), From 770a040d4584e460a3ec038e62787be9f11a397e Mon Sep 17 00:00:00 2001 From: Cole Kendrick Date: Tue, 20 Jul 2021 13:49:07 -0600 Subject: [PATCH 22/36] Fix hardcoded amd clang version and update target flags --- CMakeLists.txt | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 9aff4c384..a29d7e3ee 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -78,7 +78,7 @@ option (ASGARD_PROFILE_PERF "enable profiling support for using linux perf" "") 
option (ASGARD_PROFILE_VALGRIND "enable profiling support for using valgrind" "") option (ASGARD_GRAPHVIZ_PATH "optional location of bin/ containing dot executable" "") option (ASGARD_IO_HIGHFIVE "Use the HighFive HDF5 header library for I/O" OFF) -option (ASGARD_USE_HIP "Optional HIP support for asgard" ON) +option (ASGARD_USE_HIP "Optional HIP support for asgard" OFF) option (ASGARD_USE_OPENMP "Optional openMP support for asgard" ON) option (ASGARD_USE_MPI "Optional distributed computing support for asgard" OFF) include(CMakeDependentOption) @@ -161,7 +161,8 @@ if(ASGARD_USE_HIP) # note: could probably grab this path directly using hipconfig? if (NOT DEFINED HIP_CLANG_INCLUDE_PATH) if(NOT DEFINED ENV{HIP_CLANG_INCLUDE_PATH}) - set(HIP_CLANG_INCLUDE_PATH "${HIP_CLANG_PATH}/../lib/clang/12.0.0/include" CACHE PATH "Path to HIP clang include directory") + # probably need a better way to get the compiler version.. this will cause non-existent paths for non-clang compilers + set(HIP_CLANG_INCLUDE_PATH "${HIP_CLANG_PATH}/../lib/clang/${CMAKE_CXX_COMPILER_VERSION}/include" CACHE PATH "Path to HIP clang include directory") else() set(HIP_CLANG_INCLUDE_PATH $ENV{HIP_CLANG_INCLUDE_PATH} CACHE PATH "Path to HIP clang include directory") endif() @@ -210,6 +211,11 @@ if(ASGARD_USE_HIP) if(ASGARD_PLATFORM_HCC) # note: causes issues on nvidia, but might be needed on amd platforms? list(APPEND CMAKE_PREFIX_PATH "${HIP_PATH}/lib/cmake" "${ROCM_PATH}") + + # output a warning if compiling for AMD without using amd-clang + if(NOT CMAKE_CXX_COMPILER_ID MATCHES "Clang") + message(WARNING "Compiling HIP for AMD without using AMD clang might not work. Use -DCMAKE_CXX_COMPILER=clang++") + endif() endif() list(APPEND CMAKE_PREFIX_PATH "${HIPBLAS_PATH}/lib/cmake") @@ -225,8 +231,8 @@ if(ASGARD_USE_HIP) set(HIP_CLANG_FLAGS "${HIP_CLANG_FLAGS} ${ASGARD_AMD_FLAGS}" CACHE STRING "") set(HIP_NVCC_FLAGS "${HIP_NVCC_FLAGS} ${ASGARD_NVCC_FLAGS}" CACHE STRING "") - set(AMDGPU_TARGETS "gfx${ASGARD_AMD_ARCH}") - set(GPU_TARGETS "gfx${ASGARD_AMD_ARCH}") + set(AMDGPU_TARGETS "gfx${ASGARD_AMD_ARCH}" CACHE STRING "GPU target architectures to compile for") + set(GPU_TARGETS "gfx${ASGARD_AMD_ARCH}" CACHE STRING "GPU target architectures to compile for") find_package(HIP 4.0 REQUIRED) find_package(hipblas REQUIRED) @@ -249,8 +255,6 @@ if(ASGARD_USE_HIP) # these compile definitions should be added automatically if using amd's clang, but # may not necessarily be added if compiling with gcc or others add_compile_definitions(__HIP_PLATFORM_HCC__ __HIP_PLATFORM_AMD__) - # temporarily disable warnings generated when using clang - add_compile_options("-Wno-dtor-name" "-Wno-gnu-anonymous-struct" "-Wno-nested-anon-types") endif() if (hipBLAS_FOUND) From c318c8926ce4dfd89b722428cd915286f29c1dd2 Mon Sep 17 00:00:00 2001 From: Cole Kendrick Date: Wed, 21 Jul 2021 11:07:41 -0600 Subject: [PATCH 23/36] Modify include dirs to hide deprecated cuda messages --- CMakeLists.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index a29d7e3ee..ba7c16b1c 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -261,9 +261,9 @@ if(ASGARD_USE_HIP) message(STATUS "Found rocBLAS version ${rocBLAS_VERSION}: ${HIPBLAS_INCLUDE_DIRS}") endif() - include_directories(${HIP_INCLUDE_DIRS}) + include_directories(SYSTEM ${HIP_INCLUDE_DIRS}) # assume this include path since HIP_INCLUDE_DIRS is not being set on nvidia platform - include_directories("${HIP_PATH}/include") + include_directories(SYSTEM "${HIP_PATH}/include") 
include_directories(${HIPBLAS_INCLUDE_DIRS}) # set source file language properties From cf7c04a7f69508cea896a3cd87cbb80fc3f922c5 Mon Sep 17 00:00:00 2001 From: Cole Kendrick Date: Wed, 21 Jul 2021 12:21:13 -0600 Subject: [PATCH 24/36] Update device test tol for batch tests --- src/batch_tests.cpp | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/batch_tests.cpp b/src/batch_tests.cpp index 757b3d4e8..4739b0f79 100644 --- a/src/batch_tests.cpp +++ b/src/batch_tests.cpp @@ -864,7 +864,11 @@ void test_batched_gemv(int const m, int const n, int const lda, batched_gemv(a_batch, x_batch, y_batch, alpha, beta); - P const tol_factor = 1e-17; + P tol_factor = 1e-17; + if constexpr (resrc == resource::device) + { + tol_factor = 1e-7; + } for (int i = 0; i < num_batch; ++i) { if constexpr (resrc == resource::host) From d5f34cab7999286aa013cc1383d1256c4045af0b Mon Sep 17 00:00:00 2001 From: Cole Kendrick Date: Thu, 29 Jul 2021 14:07:29 -0600 Subject: [PATCH 25/36] Fix clang formatting issues --- src/tensors.hpp | 76 ++++++++++++++++++++++++------------------------- 1 file changed, 38 insertions(+), 38 deletions(-) diff --git a/src/tensors.hpp b/src/tensors.hpp index d6bbb7986..0fff81f60 100644 --- a/src/tensors.hpp +++ b/src/tensors.hpp @@ -240,10 +240,10 @@ class vector template> vector

   operator-(vector const &right) const;
   template>
-  P operator*(vector const &) const;
+  P operator*(vector const &)const;
   template>
-  vector operator*(matrix const &) const;
+  vector operator*(matrix const &)const;
   template>
   vector operator*(P const) const;
@@ -475,9 +475,9 @@ class matrix
   template>
   matrix operator*(P const) const;
   template>
-  vector operator*(vector const &) const;
+  vector operator*(vector const &)const;
   template>
-  matrix operator*(matrix const &) const;
+  matrix operator*(matrix const &)const;
   template>
   matrix
operator+(matrix const &) const; template> @@ -1005,8 +1005,8 @@ fk::vector::vector(vector const &a) // http://stackoverflow.com/questions/3279543/what-is-the-copy-and-swap-idiom // template -fk::vector & -fk::vector::operator=(vector const &a) +fk::vector &fk::vector:: +operator=(vector const &a) { static_assert(mem != mem_type::const_view, "cannot copy assign into const_view!"); @@ -1051,8 +1051,8 @@ fk::vector::vector(vector &&a) // vector move assignment // template -fk::vector & -fk::vector::operator=(vector &&a) +fk::vector &fk::vector:: +operator=(vector &&a) { static_assert(mem != mem_type::const_view, "cannot move assign into const_view!"); @@ -1098,8 +1098,8 @@ fk::vector::vector(vector const &a) // template template -fk::vector & -fk::vector::operator=(vector const &a) +fk::vector &fk::vector:: +operator=(vector const &a) { expect(size() == a.size()); @@ -1133,8 +1133,8 @@ fk::vector::vector(vector const &a) // assignment owner <-> view template template -fk::vector & -fk::vector::operator=(vector const &a) +fk::vector &fk::vector:: +operator=(vector const &a) { expect(size() == a.size()); if constexpr (resrc == resource::host) @@ -1200,8 +1200,8 @@ fk::vector &fk::vector::transfer_from( // template template -fk::vector & -fk::vector::operator=(std::vector

const &v)
+fk::vector &fk::vector::
+operator=(std::vector const &v)
 {
   expect(size() == static_cast(v.size()));
   std::memcpy(data_, v.data(), v.size() * sizeof(P));
@@ -1287,8 +1287,8 @@ bool fk::vector::operator<(vector const &other) const
 //
 template
 template
-fk::vector
-fk::vector::operator+(vector const &right) const
+fk::vector fk::vector::
+operator+(vector const &right) const
 {
   expect(size() == right.size());
   vector ans(size());
@@ -1302,8 +1302,8 @@ fk::vector::operator+(vector const &right) const
 //
 template
 template
-fk::vector
-fk::vector::operator-(vector const &right) const
+fk::vector fk::vector::
+operator-(vector const &right) const
 {
   expect(size() == right.size());
   vector ans(size());
@@ -1332,8 +1332,8 @@ P fk::vector::operator*(vector const &right) const
 //
 template
 template
-fk::vector
-fk::vector::operator*(fk::matrix const &A) const
+fk::vector
fk::vector:: +operator*(fk::matrix const &A) const { // check dimension compatibility expect(size() == A.nrows()); @@ -1789,8 +1789,8 @@ fk::matrix::matrix(matrix const &a) // http://stackoverflow.com/questions/3279543/what-is-the-copy-and-swap-idiom // template -fk::matrix & -fk::matrix::operator=(matrix const &a) +fk::matrix &fk::matrix:: +operator=(matrix const &a) { static_assert(mem != mem_type::const_view, "cannot copy assign into const_view!"); @@ -1843,8 +1843,8 @@ fk::matrix::matrix(matrix const &a) // assignment owner <-> view template template -fk::matrix & -fk::matrix::operator=(matrix const &a) +fk::matrix &fk::matrix:: +operator=(matrix const &a) { expect(nrows() == a.nrows()); expect(ncols() == a.ncols()); @@ -1883,8 +1883,8 @@ fk::matrix::matrix(matrix const &a) // template template -fk::matrix & -fk::matrix::operator=(matrix const &a) +fk::matrix &fk::matrix:: +operator=(matrix const &a) { expect((nrows() == a.nrows()) && (ncols() == a.ncols())); @@ -1975,8 +1975,8 @@ fk::matrix::matrix(matrix &&a) // matrix move assignment // template -fk::matrix & -fk::matrix::operator=(matrix &&a) +fk::matrix &fk::matrix:: +operator=(matrix &&a) { static_assert(mem != mem_type::const_view, "cannot move assign into const_view!"); @@ -2007,8 +2007,8 @@ fk::matrix::operator=(matrix &&a) // template template -fk::matrix & -fk::matrix::operator=(fk::vector const &v) +fk::matrix &fk::matrix:: +operator=(fk::vector const &v) { expect(nrows() * ncols() == v.size()); @@ -2094,8 +2094,8 @@ bool fk::matrix::operator<(matrix const &other) const // template template -fk::matrix

-fk::matrix::operator+(matrix const &right) const
+fk::matrix fk::matrix::
+operator+(matrix const &right) const
 {
   expect(nrows() == right.nrows() && ncols() == right.ncols());
@@ -2115,8 +2115,8 @@ fk::matrix::operator+(matrix const &right) const
 //
 template
 template
-fk::matrix
-fk::matrix::operator-(matrix const &right) const
+fk::matrix fk::matrix::
+operator-(matrix const &right) const
 {
   expect(nrows() == right.nrows() && ncols() == right.ncols());
@@ -2154,8 +2154,8 @@ fk::matrix fk::matrix::operator*(P const right) const
 //
 template
 template
-fk::vector
-fk::matrix::operator*(fk::vector const &right) const
+fk::vector fk::matrix::
+operator*(fk::vector const &right) const
 {
   // check dimension compatibility
   expect(ncols() == right.size());
@@ -2181,8 +2181,8 @@ fk::matrix::operator*(fk::vector const &right) const
 //
 template
 template
-fk::matrix
-fk::matrix::operator*(matrix const &B) const
+fk::matrix
fk::matrix:: +operator*(matrix const &B) const { expect(ncols() == B.nrows()); // k == k From 8c69e5702f813099c2e1b1157d4dd4bf069021af Mon Sep 17 00:00:00 2001 From: Cole Kendrick Date: Wed, 1 Sep 2021 11:48:44 -0600 Subject: [PATCH 26/36] Re-enable kronmult for amd platforms --- src/device/kronmult_cuda.cpp | 18 ------------------ 1 file changed, 18 deletions(-) diff --git a/src/device/kronmult_cuda.cpp b/src/device/kronmult_cuda.cpp index 0a346268b..fe6573a2c 100644 --- a/src/device/kronmult_cuda.cpp +++ b/src/device/kronmult_cuda.cpp @@ -2,9 +2,6 @@ #include "build_info.hpp" #ifdef ASGARD_USE_HIP -// temporary workaround to use hip only on nvidia -// this can be removed once the hipified kronmult is used -#ifdef __HIP_PLATFORM_NVCC__ #include #define USE_GPU #define GLOBAL_FUNCTION __global__ @@ -13,14 +10,6 @@ #define DEVICE_FUNCTION __device__ #define HOST_FUNCTION __host__ #else -#undef ASGARD_USE_HIP -#define GLOBAL_FUNCTION -#define SYNCTHREADS -#define SHARED_MEMORY -#define DEVICE_FUNCTION -#define HOST_FUNCTION -#endif -#else #define GLOBAL_FUNCTION #define SYNCTHREADS #define SHARED_MEMORY @@ -407,13 +396,6 @@ void call_kronmult(int const n, P *x_ptrs[], P *output_ptrs[], P *work_ptrs[], #endif } -// note - temporary workaround for compiling kronmult on amd platforms -// this needs to be removed when hipified kronmult is used -#ifndef __HIP_PLATFORM_NVCC__ -// redefine hip flag -#define ASGARD_USE_HIP -#endif - template void stage_inputs_kronmult(float const *const x, float *const workspace, int const num_elems, int const num_copies); From 49f45dca4a7009eae3a3ca1066236617474c40f0 Mon Sep 17 00:00:00 2001 From: Cole Kendrick Date: Wed, 15 Sep 2021 11:22:17 -0400 Subject: [PATCH 27/36] Rename hip platform from hcc to amd --- CMakeLists.txt | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index ba7c16b1c..7d08981ea 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -138,7 +138,7 @@ endif() # convenience flags for which HIP platform has been setup set(ASGARD_PLATFORM_NVCC 0) -set(ASGARD_PLATFORM_HCC 0) +set(ASGARD_PLATFORM_AMD 0) if(ASGARD_USE_HIP) # search for HIP and libraries if(NOT DEFINED HIP_PATH) @@ -189,7 +189,7 @@ if(ASGARD_USE_HIP) message(STATUS "HIP platform has been detected as ${ASGARD_HIP_PLATFORM}") # hip >= 4.2 is now using "amd" to identify platform if(ASGARD_HIP_PLATFORM STREQUAL "hcc" OR ASGARD_HIP_PLATFORM STREQUAL "amd") - set(ASGARD_PLATFORM_HCC 1) + set(ASGARD_PLATFORM_AMD 1) # hip <= 4.1 uses "nvcc" to identify nvidia platforms, >= 4.2 uses "nvidia" elseif(ASGARD_HIP_PLATFORM STREQUAL "nvcc" OR ASGARD_HIP_PLATFORM STREQUAL "nvidia") set(ASGARD_PLATFORM_NVCC 1) @@ -208,7 +208,7 @@ if(ASGARD_USE_HIP) # look for HIP cmake configs in different locations list(APPEND CMAKE_MODULE_PATH "${HIP_PATH}/cmake" "${ROCM_PATH}") - if(ASGARD_PLATFORM_HCC) + if(ASGARD_PLATFORM_AMD) # note: causes issues on nvidia, but might be needed on amd platforms? 
list(APPEND CMAKE_PREFIX_PATH "${HIP_PATH}/lib/cmake" "${ROCM_PATH}") @@ -250,7 +250,7 @@ if(ASGARD_USE_HIP) set (CMAKE_CUDA_STANDARD 14) set (CMAKE_CUDA_STANDARD_REQUIRED ON) add_compile_definitions(__HIP_PLATFORM_NVCC__ __HIP_PLATFORM_NVIDIA__) - elseif(ASGARD_PLATFORM_HCC) + elseif(ASGARD_PLATFORM_AMD) #enable_language(HIP) # not yet added to latest cmake, but should be available in 3.21 # these compile definitions should be added automatically if using amd's clang, but # may not necessarily be added if compiling with gcc or others @@ -267,7 +267,7 @@ if(ASGARD_USE_HIP) include_directories(${HIPBLAS_INCLUDE_DIRS}) # set source file language properties - if(ASGARD_PLATFORM_HCC) + if(ASGARD_PLATFORM_AMD) #set_source_files_properties( src/device/kronmult_cuda.cpp PROPERTIES LANGUAGE HIP ) # should work after cmake 3.21 release? set_source_files_properties( src/device/kronmult_cuda.cpp PROPERTIES HIP_SOURCE_PROPERTY_FORMAT 1) elseif(ASGARD_PLATFORM_NVCC) @@ -419,7 +419,7 @@ if (ASGARD_USE_OPENMP AND NOT ASGARD_USE_MKL) target_link_libraries(kronmult PRIVATE OpenMP::OpenMP_CXX) endif () -if (ASGARD_USE_HIP AND ASGARD_PLATFORM_HCC) +if (ASGARD_USE_HIP AND ASGARD_PLATFORM_AMD) target_link_libraries(kronmult_cuda PUBLIC kron hip::device) else () target_link_libraries(kronmult_cuda PUBLIC kron) @@ -445,7 +445,7 @@ else () endif () if (ASGARD_USE_HIP) - if(ASGARD_PLATFORM_HCC) + if(ASGARD_PLATFORM_AMD) target_link_libraries(lib_dispatch PRIVATE hip::device) elseif(ASGARD_PLATFORM_NVCC) target_link_libraries(lib_dispatch PRIVATE ${CUDA_LIBRARIES}) @@ -478,7 +478,7 @@ target_link_libraries (solver PRIVATE distribution fast_math lib_dispatch tensor target_link_libraries (tensors PRIVATE lib_dispatch) if (ASGARD_USE_HIP) - if(ASGARD_PLATFORM_HCC) + if(ASGARD_PLATFORM_AMD) target_link_libraries(tensors PRIVATE hip::device) elseif(ASGARD_PLATFORM_NVCC) target_link_libraries(tensors PRIVATE ${CUDA_LIBRARIES}) From 8cd9e452a09424511b833c8084ff60b27aea6191 Mon Sep 17 00:00:00 2001 From: Cole Kendrick Date: Tue, 5 Oct 2021 16:13:56 -0400 Subject: [PATCH 28/36] Adjust lib dispatch device test tol for amd --- src/lib_dispatch_tests.cpp | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/lib_dispatch_tests.cpp b/src/lib_dispatch_tests.cpp index a6095cec0..6ffa36aaf 100644 --- a/src/lib_dispatch_tests.cpp +++ b/src/lib_dispatch_tests.cpp @@ -1179,7 +1179,11 @@ TEMPLATE_TEST_CASE_SIG("batched gemv", "[lib_dispatch]", (double, resource::host), (double, resource::device), (float, resource::host), (float, resource::device)) { - TestType const tol_factor = 1e-18; + TestType tol_factor = 1e-18; + if constexpr (resrc == resource::device) + { + tol_factor = 1e-8; + } SECTION("batched gemv: no trans, alpha = 1.0, beta = 0.0") { From 2de03a3bf848f2608aaff475e3f54783e49f7ede Mon Sep 17 00:00:00 2001 From: Cole Kendrick Date: Fri, 8 Oct 2021 10:29:41 -0400 Subject: [PATCH 29/36] Clean up cmake, add hipblas version check for amd --- CMakeLists.txt | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 7d08981ea..02bf50b8e 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,4 +1,4 @@ -cmake_minimum_required (VERSION 3.19) +cmake_minimum_required (VERSION 3.21) project (asgard VERSION 0.3.0 @@ -198,7 +198,7 @@ if(ASGARD_USE_HIP) # double check for cuda path since HIP uses it internally if(ASGARD_PLATFORM_NVCC) if (NOT DEFINED ENV{CUDA_PATH}) - find_path(ASGARD_HIP_DEFAULT_CUDA_PATH "cuda.h" PATH /usr/local/cuda/include 
NO_DEFAULT_PATH) + find_path(ASGARD_HIP_DEFAULT_CUDA_PATH "cuda.h" PATH /usr/local/cuda/include) if (NOT ASGARD_HIP_DEFAULT_CUDA_PATH) message(FATAL_ERROR "Make sure the CUDA_PATH env is set to locate for HIP") endif() @@ -224,7 +224,7 @@ if(ASGARD_USE_HIP) set(ASGARD_HIP_FLAGS "-std=c++14;-g" CACHE STRING "HIP compiler flags for both AMD and NVIDIA") set(ASGARD_AMD_ARCH "906" CACHE STRING "AMD GPU architecture number") set(ASGARD_AMD_FLAGS "${ASGARD_HIP_FLAGS}; --amdgpu-target=gfx${ASGARD_AMD_ARCH};-O3" CACHE STRING "Flags to pass to amd-clang for HIP") # amdgpu specific options - set(ASGARD_NVCC_ARCH "86" CACHE STRING "Nvidia architecture number") # -gencode arch=compute_xx,code=compute_xx + set(ASGARD_NVCC_ARCH "70" CACHE STRING "Nvidia architecture number") # -gencode arch=compute_xx,code=compute_xx set(ASGARD_NVCC_FLAGS "${ASGARD_HIP_FLAGS}; -gencode arch=compute_${ASGARD_NVCC_ARCH},code=compute_${ASGARD_NVCC_ARCH} --ptxas-options=-O3 -lineinfo" CACHE STRING "Flags to pass to NVCC") # nvcc specific options set(HIP_HIPCC_FLAGS "${HIP_HIPCC_FLAGS} ${ASGARD_HIP_FLAGS}" CACHE STRING "") @@ -234,8 +234,14 @@ if(ASGARD_USE_HIP) set(AMDGPU_TARGETS "gfx${ASGARD_AMD_ARCH}" CACHE STRING "GPU target architectures to compile for") set(GPU_TARGETS "gfx${ASGARD_AMD_ARCH}" CACHE STRING "GPU target architectures to compile for") - find_package(HIP 4.0 REQUIRED) - find_package(hipblas REQUIRED) + if(ASGARD_PLATFORM_AMD) + # need a much later version for AMD since ipiv=nullptr fix not in hipblas until >4.3.1 + find_package(HIP 4.3.0 REQUIRED) + find_package(hipblas 0.49 REQUIRED) + else() + find_package(HIP 4.0 REQUIRED) + find_package(hipblas REQUIRED) + endif() # Print some debug info about HIP configuration message(STATUS "HIP PLATFORM: ${HIP_PLATFORM}") message(STATUS "HIP COMPILER: ${HIP_COMPILER}") @@ -451,7 +457,6 @@ if (ASGARD_USE_HIP) target_link_libraries(lib_dispatch PRIVATE ${CUDA_LIBRARIES}) endif() target_link_libraries(lib_dispatch PRIVATE roc::hipblas) - #set_target_properties(lib_dispatch PROPERTIES LINKER_LANGUAGE HIP) endif() if (ASGARD_USE_OPENMP AND NOT ASGARD_USE_MKL) @@ -484,7 +489,6 @@ if (ASGARD_USE_HIP) target_link_libraries(tensors PRIVATE ${CUDA_LIBRARIES}) endif() target_link_libraries (tensors PRIVATE roc::hipblas) - #set_target_properties(tensors PROPERTIES LINKER_LANGUAGE HIP) endif () if (ASGARD_USE_SCALAPACK) add_compile_definitions (ASGARD_USE_SCALAPACK) From c5ed1fd43587d49b0811e849398e4167b7fd5b6f Mon Sep 17 00:00:00 2001 From: Cole Kendrick Date: Tue, 12 Oct 2021 13:48:16 -0400 Subject: [PATCH 30/36] Pass gpu arch to kronmult, set amd flags only on amd --- CMakeLists.txt | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 02bf50b8e..3683d7c13 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -231,10 +231,10 @@ if(ASGARD_USE_HIP) set(HIP_CLANG_FLAGS "${HIP_CLANG_FLAGS} ${ASGARD_AMD_FLAGS}" CACHE STRING "") set(HIP_NVCC_FLAGS "${HIP_NVCC_FLAGS} ${ASGARD_NVCC_FLAGS}" CACHE STRING "") - set(AMDGPU_TARGETS "gfx${ASGARD_AMD_ARCH}" CACHE STRING "GPU target architectures to compile for") - set(GPU_TARGETS "gfx${ASGARD_AMD_ARCH}" CACHE STRING "GPU target architectures to compile for") - if(ASGARD_PLATFORM_AMD) + set(AMDGPU_TARGETS "gfx${ASGARD_AMD_ARCH}" CACHE STRING "GPU target architectures to compile for") + set(GPU_TARGETS "gfx${ASGARD_AMD_ARCH}" CACHE STRING "GPU target architectures to compile for") + # need a much later version for AMD since ipiv=nullptr fix not in hipblas until >4.3.1 
find_package(HIP 4.3.0 REQUIRED) find_package(hipblas 0.49 REQUIRED) @@ -299,6 +299,13 @@ if(ASGARD_USE_HIP) # Turn on GPU support in kronmult. set (USE_GPU ON CACHE BOOL "Turn on kronmult gpu support" FORCE) + + # pass gpu arch code to kronmult + if(ASGARD_PLATFORM_NVCC) + set(GPU_ARCH "${ASGARD_NVCC_ARCH}" CACHE STRING "GPU architecture code for AMD/NVIDIA" FORCE) + else() + set(GPU_ARCH "${ASGARD_AMD_ARCH}" CACHE STRING "GPU architecture code for AMD/NVIDIA" FORCE) + endif() endif() if(ASGARD_USE_MKL) From c79997b6a0f335bcda3e85a27fb1891717e9b5a8 Mon Sep 17 00:00:00 2001 From: Cole Kendrick Date: Thu, 28 Oct 2021 14:37:53 -0400 Subject: [PATCH 31/36] Change kronmult fetch content to after configuration --- CMakeLists.txt | 43 ++++++++++++++++++++++++------------------- 1 file changed, 24 insertions(+), 19 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 3683d7c13..f11cb87b2 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -12,7 +12,7 @@ include (FetchContent) find_package (Git) # Define a macro to register new projects. -function (register_project name dir url default_tag) +function (register_project name dir url default_tag make_avail) set (BUILD_TAG_${dir} ${default_tag} CACHE STRING "Name of the tag to checkout.") set (BUILD_REPO_${dir} ${url} CACHE STRING "URL of the repo to clone.") @@ -24,7 +24,9 @@ function (register_project name dir url default_tag) SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/contrib/${dir} ) - FetchContent_MakeAvailable(${name}) + if (${make_avail}) + FetchContent_MakeAvailable(${name}) + endif() endfunction () # Changes to the current version of kromult should proceed through a pull @@ -33,6 +35,7 @@ register_project (kronmult KRONMULT https://github.com/project-asgard/kronmult.git f941819685bbd3026a85145dde286f593683c1f4 + OFF ) ############################################################################### @@ -280,23 +283,6 @@ if(ASGARD_USE_HIP) set_source_files_properties( src/device/kronmult_cuda.cpp PROPERTIES LANGUAGE CUDA ) # no .cu extension endif() - hip_add_library(kronmult_cuda SHARED src/device/kronmult_cuda.cpp - HIPCC_OPTIONS "${ASGARD_HIP_FLAGS}" - NVCC_OPTIONS "${ASGARD_NVCC_FLAGS}" - CLANG_OPTIONS "${ASGARD_AMD_FLAGS}") -else() - # build component to interface with Ed's kronmult lib - ##TODO: link to kronmult as interface library - add_library(kronmult_cuda SHARED src/device/kronmult_cuda.cpp) -endif() - -if(ASGARD_USE_HIP) - if(ASGARD_PLATFORM_NVCC) - set_target_properties( kronmult_cuda PROPERTIES CUDA_ARCHITECTURES OFF) - set_target_properties( kronmult_cuda PROPERTIES COMPILE_FLAGS "-arch sm_${ASGARD_NVCC_ARCH} -g -lineinfo --ptxas-options=-O3") - endif() - set_target_properties( kronmult_cuda PROPERTIES LINK_FLAGS "-Wl,-rpath,${CMAKE_BINARY_DIR}") - # Turn on GPU support in kronmult. 
set (USE_GPU ON CACHE BOOL "Turn on kronmult gpu support" FORCE) @@ -308,6 +294,25 @@ if(ASGARD_USE_HIP) endif() endif() +# Fetch kronmult after configuring everything, but before adding libraries +FetchContent_MakeAvailable(kronmult) + +if(ASGARD_USE_HIP) + hip_add_library(kronmult_cuda SHARED src/device/kronmult_cuda.cpp + HIPCC_OPTIONS "${ASGARD_HIP_FLAGS}" + NVCC_OPTIONS "${ASGARD_NVCC_FLAGS}" + CLANG_OPTIONS "${ASGARD_AMD_FLAGS}") + if(ASGARD_PLATFORM_NVCC) + set_target_properties( kronmult_cuda PROPERTIES CUDA_ARCHITECTURES OFF) + set_target_properties( kronmult_cuda PROPERTIES COMPILE_FLAGS "-arch sm_${ASGARD_NVCC_ARCH} -g -lineinfo --ptxas-options=-O3") + endif() + set_target_properties( kronmult_cuda PROPERTIES LINK_FLAGS "-Wl,-rpath,${CMAKE_BINARY_DIR}") +else() + # build component to interface with Ed's kronmult lib + ##TODO: link to kronmult as interface library + add_library(kronmult_cuda SHARED src/device/kronmult_cuda.cpp) +endif() + if(ASGARD_USE_MKL) if(ASGARD_USE_HIP AND ASGARD_PLATFORM_NVCC) set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --compiler-options -fopenmp") From 91b0626a24fd77dcf869c40a9421c14eb2c65ac9 Mon Sep 17 00:00:00 2001 From: Cole Kendrick Date: Thu, 28 Oct 2021 16:15:25 -0400 Subject: [PATCH 32/36] Fix kron linking on nvidia --- CMakeLists.txt | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index f11cb87b2..9844ad117 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -395,9 +395,7 @@ if (build_hdf5) add_dependencies (io hdf5-ext) endif () -if (build_kron) - add_dependencies (kronmult_cuda kronmult-ext) -endif () +add_dependencies (kronmult_cuda kron) if (ASGARD_USE_SCALAPACK) target_link_libraries (tensors PRIVATE scalapack_matrix_info cblacs_grid) @@ -440,7 +438,7 @@ endif () if (ASGARD_USE_HIP AND ASGARD_PLATFORM_AMD) target_link_libraries(kronmult_cuda PUBLIC kron hip::device) else () - target_link_libraries(kronmult_cuda PUBLIC kron) + target_link_libraries(kronmult_cuda PRIVATE kron) endif() if (ASGARD_USE_OPENMP AND NOT ASGARD_USE_MKL) From acf6305daea8cf314ae3a99a349e3962ccc94870 Mon Sep 17 00:00:00 2001 From: Cole Kendrick Date: Mon, 1 Nov 2021 16:40:26 -0400 Subject: [PATCH 33/36] Consolidate gpu arch flags and reorder hip flags --- CMakeLists.txt | 28 ++++++++++------------------ 1 file changed, 10 insertions(+), 18 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 9844ad117..53aa08939 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -225,18 +225,11 @@ if(ASGARD_USE_HIP) set(HIP_VERBOSE_BUILD ON CACHE STRING "Verbose compilation for HIP") set(ASGARD_HIP_FLAGS "-std=c++14;-g" CACHE STRING "HIP compiler flags for both AMD and NVIDIA") - set(ASGARD_AMD_ARCH "906" CACHE STRING "AMD GPU architecture number") - set(ASGARD_AMD_FLAGS "${ASGARD_HIP_FLAGS}; --amdgpu-target=gfx${ASGARD_AMD_ARCH};-O3" CACHE STRING "Flags to pass to amd-clang for HIP") # amdgpu specific options - set(ASGARD_NVCC_ARCH "70" CACHE STRING "Nvidia architecture number") # -gencode arch=compute_xx,code=compute_xx - set(ASGARD_NVCC_FLAGS "${ASGARD_HIP_FLAGS}; -gencode arch=compute_${ASGARD_NVCC_ARCH},code=compute_${ASGARD_NVCC_ARCH} --ptxas-options=-O3 -lineinfo" CACHE STRING "Flags to pass to NVCC") # nvcc specific options - - set(HIP_HIPCC_FLAGS "${HIP_HIPCC_FLAGS} ${ASGARD_HIP_FLAGS}" CACHE STRING "") - set(HIP_CLANG_FLAGS "${HIP_CLANG_FLAGS} ${ASGARD_AMD_FLAGS}" CACHE STRING "") - set(HIP_NVCC_FLAGS "${HIP_NVCC_FLAGS} ${ASGARD_NVCC_FLAGS}" CACHE STRING "") + set(GPU_ARCH "70" CACHE STRING "AMD/NVIDIA GPU 
architecture number (such as 906 or 70)") if(ASGARD_PLATFORM_AMD) - set(AMDGPU_TARGETS "gfx${ASGARD_AMD_ARCH}" CACHE STRING "GPU target architectures to compile for") - set(GPU_TARGETS "gfx${ASGARD_AMD_ARCH}" CACHE STRING "GPU target architectures to compile for") + set(AMDGPU_TARGETS "gfx${GPU_ARCH}" CACHE STRING "GPU target architectures to compile for" FORCE) + set(GPU_TARGETS "gfx${GPU_ARCH}" CACHE STRING "GPU target architectures to compile for" FORCE) # need a much later version for AMD since ipiv=nullptr fix not in hipblas until >4.3.1 find_package(HIP 4.3.0 REQUIRED) @@ -259,11 +252,13 @@ if(ASGARD_USE_HIP) set (CMAKE_CUDA_STANDARD 14) set (CMAKE_CUDA_STANDARD_REQUIRED ON) add_compile_definitions(__HIP_PLATFORM_NVCC__ __HIP_PLATFORM_NVIDIA__) + set(ASGARD_NVCC_FLAGS "${ASGARD_HIP_FLAGS}; -gencode arch=compute_${GPU_ARCH},code=compute_${GPU_ARCH} --ptxas-options=-O3 -lineinfo" CACHE STRING "Flags to pass to NVCC" FORCE) # nvcc specific options elseif(ASGARD_PLATFORM_AMD) #enable_language(HIP) # not yet added to latest cmake, but should be available in 3.21 # these compile definitions should be added automatically if using amd's clang, but # may not necessarily be added if compiling with gcc or others add_compile_definitions(__HIP_PLATFORM_HCC__ __HIP_PLATFORM_AMD__) + set(ASGARD_AMD_FLAGS "${ASGARD_HIP_FLAGS}; --amdgpu-target=gfx${GPU_ARCH};-O3" CACHE STRING "Flags to pass to amd-clang for HIP" FORCE) # amdgpu specific options endif() if (hipBLAS_FOUND) @@ -285,26 +280,23 @@ if(ASGARD_USE_HIP) # Turn on GPU support in kronmult. set (USE_GPU ON CACHE BOOL "Turn on kronmult gpu support" FORCE) - - # pass gpu arch code to kronmult - if(ASGARD_PLATFORM_NVCC) - set(GPU_ARCH "${ASGARD_NVCC_ARCH}" CACHE STRING "GPU architecture code for AMD/NVIDIA" FORCE) - else() - set(GPU_ARCH "${ASGARD_AMD_ARCH}" CACHE STRING "GPU architecture code for AMD/NVIDIA" FORCE) - endif() endif() # Fetch kronmult after configuring everything, but before adding libraries FetchContent_MakeAvailable(kronmult) if(ASGARD_USE_HIP) + set(HIP_HIPCC_FLAGS "${HIP_HIPCC_FLAGS} ${ASGARD_HIP_FLAGS}" CACHE STRING "") + set(HIP_CLANG_FLAGS "${HIP_CLANG_FLAGS} ${ASGARD_AMD_FLAGS}" CACHE STRING "") + set(HIP_NVCC_FLAGS "${HIP_NVCC_FLAGS} ${ASGARD_NVCC_FLAGS}" CACHE STRING "") + hip_add_library(kronmult_cuda SHARED src/device/kronmult_cuda.cpp HIPCC_OPTIONS "${ASGARD_HIP_FLAGS}" NVCC_OPTIONS "${ASGARD_NVCC_FLAGS}" CLANG_OPTIONS "${ASGARD_AMD_FLAGS}") if(ASGARD_PLATFORM_NVCC) set_target_properties( kronmult_cuda PROPERTIES CUDA_ARCHITECTURES OFF) - set_target_properties( kronmult_cuda PROPERTIES COMPILE_FLAGS "-arch sm_${ASGARD_NVCC_ARCH} -g -lineinfo --ptxas-options=-O3") + set_target_properties( kronmult_cuda PROPERTIES COMPILE_FLAGS "-arch sm_${GPU_ARCH} -g -lineinfo --ptxas-options=-O3") endif() set_target_properties( kronmult_cuda PROPERTIES LINK_FLAGS "-Wl,-rpath,${CMAKE_BINARY_DIR}") else() From 37ebc252fafa6ecaeccead2f98324dca0e133aa5 Mon Sep 17 00:00:00 2001 From: Cole Kendrick Date: Tue, 2 Nov 2021 11:59:41 -0400 Subject: [PATCH 34/36] Add new register project arg when building openblas --- contrib/FindLINALG.cmake | 1 + 1 file changed, 1 insertion(+) diff --git a/contrib/FindLINALG.cmake b/contrib/FindLINALG.cmake index b92d94607..7f1f06af4 100644 --- a/contrib/FindLINALG.cmake +++ b/contrib/FindLINALG.cmake @@ -54,6 +54,7 @@ if (${ASGARD_BUILD_OPENBLAS}) OPENBLAS https://github.com/xianyi/OpenBLAS.git v0.3.18 + ON ) # Fetch content does not run the install phase so the headers for openblas are From 
8909eab6b283369037db47230d217130c459fe04 Mon Sep 17 00:00:00 2001 From: Cole Kendrick Date: Tue, 9 Nov 2021 12:03:24 -0500 Subject: [PATCH 35/36] Decrease batched gemv test tol for amd gpu --- src/lib_dispatch_tests.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/lib_dispatch_tests.cpp b/src/lib_dispatch_tests.cpp index 6ffa36aaf..8c0eb3ca9 100644 --- a/src/lib_dispatch_tests.cpp +++ b/src/lib_dispatch_tests.cpp @@ -1182,7 +1182,7 @@ TEMPLATE_TEST_CASE_SIG("batched gemv", "[lib_dispatch]", TestType tol_factor = 1e-18; if constexpr (resrc == resource::device) { - tol_factor = 1e-8; + tol_factor = 1e-7; } SECTION("batched gemv: no trans, alpha = 1.0, beta = 0.0") From ef3c11b8ad00c3fb11fa4eff98ddb74532870211 Mon Sep 17 00:00:00 2001 From: Cole Kendrick Date: Tue, 23 Nov 2021 11:01:46 -0500 Subject: [PATCH 36/36] Change CMake HIP linking --- CMakeLists.txt | 98 ++++++++++++++++++++++++++++++++++++++++---------- 1 file changed, 80 insertions(+), 18 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 53aa08939..ff5d239d5 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -246,8 +246,6 @@ if(ASGARD_USE_HIP) message(STATUS "HIP Libraries: ${HIP_LIBRARIES}") if(ASGARD_PLATFORM_NVCC) - find_package(CUDA 9.0 REQUIRED) - include_directories(${CUDA_INCLUDE_DIRS}) enable_language(CUDA) set (CMAKE_CUDA_STANDARD 14) set (CMAKE_CUDA_STANDARD_REQUIRED ON) @@ -265,11 +263,6 @@ if(ASGARD_USE_HIP) message(STATUS "Found rocBLAS version ${rocBLAS_VERSION}: ${HIPBLAS_INCLUDE_DIRS}") endif() - include_directories(SYSTEM ${HIP_INCLUDE_DIRS}) - # assume this include path since HIP_INCLUDE_DIRS is not being set on nvidia platform - include_directories(SYSTEM "${HIP_PATH}/include") - include_directories(${HIPBLAS_INCLUDE_DIRS}) - # set source file language properties if(ASGARD_PLATFORM_AMD) #set_source_files_properties( src/device/kronmult_cuda.cpp PROPERTIES LANGUAGE HIP ) # should work after cmake 3.21 release? 
@@ -286,6 +279,7 @@ endif() FetchContent_MakeAvailable(kronmult) if(ASGARD_USE_HIP) + set(CMAKE_HIP_ARCHITECTURES OFF) set(HIP_HIPCC_FLAGS "${HIP_HIPCC_FLAGS} ${ASGARD_HIP_FLAGS}" CACHE STRING "") set(HIP_CLANG_FLAGS "${HIP_CLANG_FLAGS} ${ASGARD_AMD_FLAGS}" CACHE STRING "") set(HIP_NVCC_FLAGS "${HIP_NVCC_FLAGS} ${ASGARD_NVCC_FLAGS}" CACHE STRING "") @@ -294,6 +288,10 @@ if(ASGARD_USE_HIP) HIPCC_OPTIONS "${ASGARD_HIP_FLAGS}" NVCC_OPTIONS "${ASGARD_NVCC_FLAGS}" CLANG_OPTIONS "${ASGARD_AMD_FLAGS}") + target_include_directories(kronmult_cuda PUBLIC ${HIP_INCLUDE_DIRS}) + # assume this include path since HIP_INCLUDE_DIRS is not being set on nvidia platform + target_include_directories(kronmult_cuda PUBLIC "${HIP_PATH}/include") + target_include_directories(kronmult_cuda PUBLIC ${HIPBLAS_INCLUDE_DIRS}) if(ASGARD_PLATFORM_NVCC) set_target_properties( kronmult_cuda PROPERTIES CUDA_ARCHITECTURES OFF) set_target_properties( kronmult_cuda PROPERTIES COMPILE_FLAGS "-arch sm_${GPU_ARCH} -g -lineinfo --ptxas-options=-O3") @@ -376,7 +374,22 @@ if (ASGARD_USE_MATLAB) endif () foreach (component IN LISTS components) - add_library (${component} src/${component}.cpp) + if (ASGARD_USE_HIP) + hip_add_library (${component} src/${component}.cpp + HIPCC_OPTIONS "${ASGARD_HIP_FLAGS}" + NVCC_OPTIONS "${ASGARD_NVCC_FLAGS} -lhipblas" + CLANG_OPTIONS "${ASGARD_AMD_FLAGS}") + set_target_properties(${component} PROPERTIES LINKER_LANGUAGE HIP) + target_include_directories(${component} SYSTEM PUBLIC ${HIP_INCLUDE_DIRS}) + target_include_directories(${component} SYSTEM PUBLIC "${HIP_PATH}/include") + target_include_directories(${component} SYSTEM PUBLIC ${HIPBLAS_INCLUDE_DIRS}) + if(ASGARD_PLATFORM_NVCC) + target_include_directories(${component} SYSTEM PUBLIC ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES}) + endif() + target_link_directories(${component} PRIVATE ${HIPBLAS_PATH}/lib/) # roc::hipblas target isn't linking properly on nvidia platforms + else() + add_library (${component} src/${component}.cpp) + endif() target_include_directories (${component} PRIVATE ${CMAKE_BINARY_DIR}) if(ASGARD_USE_MKL) target_compile_options (${component} PRIVATE "-fopenmp") # CMAKE doesn't handle MKL openmp link properly @@ -455,10 +468,8 @@ endif () if (ASGARD_USE_HIP) if(ASGARD_PLATFORM_AMD) target_link_libraries(lib_dispatch PRIVATE hip::device) - elseif(ASGARD_PLATFORM_NVCC) - target_link_libraries(lib_dispatch PRIVATE ${CUDA_LIBRARIES}) + target_link_libraries(lib_dispatch PRIVATE roc::hipblas) endif() - target_link_libraries(lib_dispatch PRIVATE roc::hipblas) endif() if (ASGARD_USE_OPENMP AND NOT ASGARD_USE_MKL) @@ -487,10 +498,8 @@ target_link_libraries (tensors PRIVATE lib_dispatch) if (ASGARD_USE_HIP) if(ASGARD_PLATFORM_AMD) target_link_libraries(tensors PRIVATE hip::device) - elseif(ASGARD_PLATFORM_NVCC) - target_link_libraries(tensors PRIVATE ${CUDA_LIBRARIES}) + target_link_libraries (tensors PRIVATE roc::hipblas) endif() - target_link_libraries (tensors PRIVATE roc::hipblas) endif () if (ASGARD_USE_SCALAPACK) add_compile_definitions (ASGARD_USE_SCALAPACK) @@ -504,7 +513,16 @@ target_link_libraries (transformations quadrature tensors) # define the main application and its linking -add_executable (asgard src/main.cpp) +if (ASGARD_USE_HIP) + hip_add_executable (asgard src/main.cpp + HIPCC_OPTIONS "${ASGARD_HIP_FLAGS}" + NVCC_OPTIONS "${ASGARD_NVCC_FLAGS} -lhipblas" + CLANG_OPTIONS "${ASGARD_AMD_FLAGS}") + set_target_properties(asgard PROPERTIES LINKER_LANGUAGE HIP) + target_link_directories(asgard PRIVATE ${HIPBLAS_PATH}/lib/) 
+else() + add_executable (asgard src/main.cpp) +endif() # link in components needed directly by main set (main_app_link_deps @@ -559,7 +577,21 @@ if (ASGARD_BUILD_TESTS) enable_testing () # Define ctest tests and their executables - add_library (tests_general testing/tests_general.cpp) + if (ASGARD_USE_HIP) + hip_add_library (tests_general testing/tests_general.cpp + HIPCC_OPTIONS "${ASGARD_HIP_FLAGS}" + NVCC_OPTIONS "${ASGARD_NVCC_FLAGS}" + CLANG_OPTIONS "${ASGARD_AMD_FLAGS}") + target_include_directories(tests_general SYSTEM PUBLIC ${HIP_INCLUDE_DIRS}) + target_include_directories(tests_general SYSTEM PUBLIC "${HIP_PATH}/include") + target_include_directories(tests_general SYSTEM PUBLIC ${HIPBLAS_INCLUDE_DIRS}) + if (ASGARD_PLATFORM_NVCC) + target_include_directories(tests_general SYSTEM PUBLIC ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES}) + target_link_libraries(tests_general PRIVATE ${CMAKE_CUDA_RUNTIME_LIBRARY}) + endif() + else() + add_library (tests_general testing/tests_general.cpp) + endif() target_link_libraries (tests_general PUBLIC Catch PRIVATE pde program_options ) target_include_directories(tests_general PRIVATE ${CMAKE_BINARY_DIR}) @@ -575,7 +607,22 @@ if (ASGARD_BUILD_TESTS) endif() foreach (component IN LISTS components) - add_executable (${component}-tests src/${component}_tests.cpp) + if (ASGARD_USE_HIP) + hip_add_executable (${component}-tests src/${component}_tests.cpp + HIPCC_OPTIONS "${ASGARD_HIP_FLAGS}" + NVCC_OPTIONS "${ASGARD_NVCC_FLAGS} -lhipblas" + CLANG_OPTIONS "${ASGARD_AMD_FLAGS}") + if (ASGARD_PLATFORM_NVCC) + target_include_directories(${component}-tests SYSTEM PUBLIC ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES}) + target_link_libraries(${component}-tests PRIVATE ${CMAKE_CUDA_RUNTIME_LIBRARY}) + target_link_directories(${component}-tests PRIVATE ${HIPBLAS_PATH}/lib/) + endif() + target_include_directories(${component}-tests SYSTEM PUBLIC ${HIP_INCLUDE_DIRS}) + target_include_directories(${component}-tests SYSTEM PUBLIC "${HIP_PATH}/include") + target_include_directories(${component}-tests SYSTEM PUBLIC ${HIPBLAS_INCLUDE_DIRS}) + else() + add_executable (${component}-tests src/${component}_tests.cpp) + endif() target_include_directories (${component}-tests PRIVATE ${CMAKE_SOURCE_DIR}/testing) target_include_directories (${component}-tests PRIVATE ${CMAKE_BINARY_DIR}) target_link_libraries (${component}-tests PRIVATE ${component} tests_general) @@ -609,7 +656,22 @@ if (ASGARD_BUILD_TESTS) WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR} ) endforeach () - add_executable (kronmult_cuda-tests src/device/kronmult_cuda_tests.cpp) + if (ASGARD_USE_HIP) + hip_add_executable (kronmult_cuda-tests src/device/kronmult_cuda_tests.cpp + HIPCC_OPTIONS "${ASGARD_HIP_FLAGS}" + NVCC_OPTIONS "${ASGARD_NVCC_FLAGS} -lhipblas" + CLANG_OPTIONS "${ASGARD_AMD_FLAGS}") + if (ASGARD_PLATFORM_NVCC) + target_include_directories(kronmult_cuda-tests SYSTEM PUBLIC ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES}) + target_link_libraries(kronmult_cuda-tests PRIVATE ${CMAKE_CUDA_RUNTIME_LIBRARY}) + target_link_directories(kronmult_cuda-tests PRIVATE ${HIPBLAS_PATH}/lib/) + endif() + target_include_directories(kronmult_cuda-tests SYSTEM PUBLIC ${HIP_INCLUDE_DIRS}) + target_include_directories(kronmult_cuda-tests SYSTEM PUBLIC "${HIP_PATH}/include") + target_include_directories(kronmult_cuda-tests SYSTEM PUBLIC ${HIPBLAS_INCLUDE_DIRS}) + else() + add_executable (kronmult_cuda-tests src/device/kronmult_cuda_tests.cpp) + endif() target_include_directories (kronmult_cuda-tests PRIVATE 
${CMAKE_SOURCE_DIR}/testing) target_include_directories (kronmult_cuda-tests PRIVATE ${CMAKE_BINARY_DIR}) target_link_libraries (kronmult_cuda-tests PRIVATE coefficients kronmult_cuda tests_general)
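
For reference, a minimal sketch of how the HIP-related cache variables introduced in these patches (ASGARD_USE_HIP, GPU_ARCH, HIP_PATH) might be seeded at configure time. The file name, the ROCm install prefix, and the chosen architecture value are illustrative assumptions, not part of the patch series itself.

# hypothetical initial-cache script, e.g. amd_gfx906.cmake,
# loaded with:  cmake -C amd_gfx906.cmake <source dir>
set(ASGARD_USE_HIP ON CACHE BOOL "Build with HIP support")
set(GPU_ARCH "906" CACHE STRING "AMD/NVIDIA GPU architecture number (such as 906 or 70)")
set(HIP_PATH "/opt/rocm/hip" CACHE PATH "Assumed HIP install prefix; adjust to the local ROCm layout")
set(ASGARD_BUILD_TESTS ON CACHE BOOL "Build the unit tests")
# On an NVIDIA machine the same script would set GPU_ARCH to a compute
# capability such as 70; platform detection (amd vs. nvidia) and the
# CUDA_PATH check are handled by the CMake logic added in the patches above.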