Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

HIP version of asgard #400

Closed
wants to merge 36 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
36 commits
Select commit. Hold shift + click to select a range
79ab7a9
Initial hipify from perl script
ckendrick Jan 12, 2021
f70d02b
WIP HIP CMake configuration
ckendrick Jan 26, 2021
0e65742
Hipifying some missed parts
ckendrick Feb 2, 2021
3daf645
Updating formatting for hipified files
ckendrick Feb 2, 2021
e2491dc
Minor cmake adjustments for hip
ckendrick Feb 15, 2021
4b94cc8
Add initial hip platform configuration
ckendrick Mar 11, 2021
e816103
Update cmake cuda options and add hipblas
ckendrick Mar 16, 2021
0d98648
CMake adjustments and initial platform options
ckendrick Apr 26, 2021
0a78c9e
Add in nvidia platform update for hip 4.2
ckendrick May 17, 2021
b397160
Update tensor linking and missed cublas calls
ckendrick May 17, 2021
15a9d35
Update hip compiler option variables, hip_add_library for kronmult
ckendrick May 26, 2021
b0a81a1
Update nvidia arch flag for kronmult_cuda
ckendrick Jun 9, 2021
0889a35
Set hip_clang_path in cmake to find for amd platform
ckendrick Jun 16, 2021
3d2f466
Set the hip_clang_include_path needed for spack installs
ckendrick Jun 18, 2021
119cbb2
Update hip build for amd platform
ckendrick Jun 21, 2021
b9002c6
Add hcc platform def for backwards compatibility
ckendrick Jun 28, 2021
d08af1e
Add temporary workaround for kronmult hip interface
ckendrick Jun 28, 2021
3dc539d
Temporarily suppress clang compiler warnings
ckendrick Jun 28, 2021
ef16b56
Fix shared flags for hip platform, amd gpu target archs
ckendrick Jul 6, 2021
a745fbe
Move kronmult source properties before add_lib
ckendrick Jul 6, 2021
e591cd9
Add check for empty matrix to avoid hipmemcpy2d error
ckendrick Jul 15, 2021
770a040
Fix hardcoded amd clang version and update target flags
ckendrick Jul 20, 2021
c318c89
Modify include dirs to hide deprecated cuda messages
ckendrick Jul 21, 2021
cf7c04a
Update device test tol for batch tests
ckendrick Jul 21, 2021
d5f34ca
Fix clang formatting issues
ckendrick Jul 29, 2021
8c69e57
Re-enable kronmult for amd platforms
ckendrick Sep 1, 2021
49f45dc
Rename hip platform from hcc to amd
ckendrick Sep 15, 2021
8cd9e45
Adjust lib dispatch device test tol for amd
ckendrick Oct 5, 2021
2de03a3
Clean up cmake, add hipblas version check for amd
ckendrick Oct 8, 2021
c5ed1fd
Pass gpu arch to kronmult, set amd flags only on amd
ckendrick Oct 12, 2021
c79997b
Change kronmult fetch content to after configuration
ckendrick Oct 28, 2021
91b0626
Fix kron linking on nvidia
ckendrick Oct 28, 2021
acf6305
Consolidate gpu arch flags and reorder hip flags
ckendrick Nov 1, 2021
37ebc25
Add new register project arg when building openblas
ckendrick Nov 2, 2021
8909eab
Decrease batched gemv test tol for amd gpu
ckendrick Nov 9, 2021
ef3c11b
Change CMake HIP linking
ckendrick Nov 23, 2021
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
293 changes: 257 additions & 36 deletions CMakeLists.txt

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions contrib/FindLINALG.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,7 @@ if (${ASGARD_BUILD_OPENBLAS})
OPENBLAS
https://github.com/xianyi/OpenBLAS.git
v0.3.18
ON
)

# Fetch content does not run the install phase so the headers for openblas are
Expand Down
6 changes: 5 additions & 1 deletion src/batch_tests.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -864,7 +864,11 @@ void test_batched_gemv(int const m, int const n, int const lda,

batched_gemv(a_batch, x_batch, y_batch, alpha, beta);

P const tol_factor = 1e-17;
P tol_factor = 1e-17;
if constexpr (resrc == resource::device)
{
tol_factor = 1e-7;
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

😮

}
for (int i = 0; i < num_batch; ++i)
{
if constexpr (resrc == resource::host)
Expand Down
2 changes: 1 addition & 1 deletion src/build_info.hpp.in
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
#define BUILD_TIME "@BUILD_TIME@"

#cmakedefine ASGARD_IO_HIGHFIVE
#cmakedefine ASGARD_USE_CUDA
#cmakedefine ASGARD_USE_HIP
#cmakedefine ASGARD_USE_OPENMP
#cmakedefine ASGARD_USE_MPI
#cmakedefine ASGARD_USE_MATLAB
Expand Down
80 changes: 47 additions & 33 deletions src/device/kronmult_cuda.cpp
Original file line number Diff line number Diff line change
@@ -1,9 +1,8 @@
#include "kronmult_cuda.hpp"
#include "build_info.hpp"

#ifdef ASGARD_USE_CUDA
#include <cuda.h>
#include <cuda_runtime.h>
#ifdef ASGARD_USE_HIP
#include <hip/hip_runtime.h>
#define USE_GPU
#define GLOBAL_FUNCTION __global__
#define SYNCTHREADS __syncthreads()
Expand Down Expand Up @@ -47,7 +46,7 @@ GLOBAL_FUNCTION void
stage_inputs_kronmult_kernel(P const *const x, P *const workspace,
int const num_elems, int const num_copies)
{
#ifdef ASGARD_USE_CUDA
#ifdef ASGARD_USE_HIP

expect(blockIdx.y == 0);
expect(blockIdx.z == 0);
Expand Down Expand Up @@ -90,7 +89,7 @@ void stage_inputs_kronmult(P const *const x, P *const workspace,
expect(num_elems > 0);
expect(num_copies > 0);

#ifdef ASGARD_USE_CUDA
#ifdef ASGARD_USE_HIP

auto constexpr warp_size = 32;
auto constexpr num_warps = 8;
Expand All @@ -99,11 +98,12 @@ void stage_inputs_kronmult(P const *const x, P *const workspace,
auto const total_copies = static_cast<int64_t>(num_elems) * num_copies;
auto const num_blocks = (total_copies + num_threads - 1) / num_threads;

stage_inputs_kronmult_kernel<P>
<<<num_blocks, num_threads>>>(x, workspace, num_elems, num_copies);
hipLaunchKernelGGL(HIP_KERNEL_NAME(stage_inputs_kronmult_kernel<P>),
dim3(num_blocks), dim3(num_threads), 0, 0, x, workspace,
num_elems, num_copies);

auto const stat = cudaDeviceSynchronize();
expect(stat == cudaSuccess);
auto const stat = hipDeviceSynchronize();
expect(stat == hipSuccess);
#else
stage_inputs_kronmult_kernel(x, workspace, num_elems, num_copies);
#endif
Expand Down Expand Up @@ -175,7 +175,7 @@ prepare_kronmult_kernel(int const *const flattened_table,
auto const coord_size = num_dims * 2;
auto const num_elems = static_cast<int64_t>(num_cols) * num_rows;

#ifdef ASGARD_USE_CUDA
#ifdef ASGARD_USE_HIP

expect(blockIdx.y == 0);
expect(blockIdx.z == 0);
Expand All @@ -192,7 +192,7 @@ prepare_kronmult_kernel(int const *const flattened_table,
auto const increment = 1;
#endif

#ifndef ASGARD_USE_CUDA
#ifndef ASGARD_USE_HIP
#ifdef ASGARD_USE_OPENMP
#pragma omp parallel for
#endif
Expand Down Expand Up @@ -273,20 +273,22 @@ void prepare_kronmult(int const *const flattened_table,
expect(input_ptrs);
expect(output_ptrs);

#ifdef ASGARD_USE_CUDA
#ifdef ASGARD_USE_HIP
auto constexpr warp_size = 32;
auto constexpr num_warps = 8;
auto constexpr num_threads = num_warps * warp_size;
auto const num_krons =
static_cast<int64_t>(elem_col_stop - elem_col_start + 1) *
(elem_row_stop - elem_row_start + 1);
auto const num_blocks = (num_krons / num_threads) + 1;
prepare_kronmult_kernel<P><<<num_blocks, num_threads>>>(
flattened_table, operators, operator_lda, element_x, element_work, fx,
operator_ptrs, work_ptrs, input_ptrs, output_ptrs, degree, num_terms,
num_dims, elem_row_start, elem_row_stop, elem_col_start, elem_col_stop);
auto const stat = cudaDeviceSynchronize();
expect(stat == cudaSuccess);
hipLaunchKernelGGL(HIP_KERNEL_NAME(prepare_kronmult_kernel<P>),
dim3(num_blocks), dim3(num_threads), 0, 0, flattened_table,
operators, operator_lda, element_x, element_work, fx,
operator_ptrs, work_ptrs, input_ptrs, output_ptrs, degree,
num_terms, num_dims, elem_row_start, elem_row_stop,
elem_col_start, elem_col_stop);
auto const stat = hipDeviceSynchronize();
expect(stat == hipSuccess);
#else
prepare_kronmult_kernel(
flattened_table, operators, operator_lda, element_x, element_work, fx,
Expand All @@ -304,7 +306,7 @@ void call_kronmult(int const n, P *x_ptrs[], P *output_ptrs[], P *work_ptrs[],
P const *const operator_ptrs[], int const lda,
int const num_krons, int const num_dims)
{
#ifdef ASGARD_USE_CUDA
#ifdef ASGARD_USE_HIP
{
int constexpr warpsize = 32;
int constexpr nwarps = 1;
Expand All @@ -313,28 +315,40 @@ void call_kronmult(int const n, P *x_ptrs[], P *output_ptrs[], P *work_ptrs[],
switch (num_dims)
{
case 1:
kronmult1_xbatched<P><<<num_krons, num_threads>>>(
n, operator_ptrs, lda, x_ptrs, output_ptrs, work_ptrs, num_krons);
hipLaunchKernelGGL(HIP_KERNEL_NAME(kronmult1_xbatched<P>),
dim3(num_krons), dim3(num_threads), 0, 0, n,
operator_ptrs, lda, x_ptrs, output_ptrs, work_ptrs,
num_krons);
break;
case 2:
kronmult2_xbatched<P><<<num_krons, num_threads>>>(
n, operator_ptrs, lda, x_ptrs, output_ptrs, work_ptrs, num_krons);
hipLaunchKernelGGL(HIP_KERNEL_NAME(kronmult2_xbatched<P>),
dim3(num_krons), dim3(num_threads), 0, 0, n,
operator_ptrs, lda, x_ptrs, output_ptrs, work_ptrs,
num_krons);
break;
case 3:
kronmult3_xbatched<P><<<num_krons, num_threads>>>(
n, operator_ptrs, lda, x_ptrs, output_ptrs, work_ptrs, num_krons);
hipLaunchKernelGGL(HIP_KERNEL_NAME(kronmult3_xbatched<P>),
dim3(num_krons), dim3(num_threads), 0, 0, n,
operator_ptrs, lda, x_ptrs, output_ptrs, work_ptrs,
num_krons);
break;
case 4:
kronmult4_xbatched<P><<<num_krons, num_threads>>>(
n, operator_ptrs, lda, x_ptrs, output_ptrs, work_ptrs, num_krons);
hipLaunchKernelGGL(HIP_KERNEL_NAME(kronmult4_xbatched<P>),
dim3(num_krons), dim3(num_threads), 0, 0, n,
operator_ptrs, lda, x_ptrs, output_ptrs, work_ptrs,
num_krons);
break;
case 5:
kronmult5_xbatched<P><<<num_krons, num_threads>>>(
n, operator_ptrs, lda, x_ptrs, output_ptrs, work_ptrs, num_krons);
hipLaunchKernelGGL(HIP_KERNEL_NAME(kronmult5_xbatched<P>),
dim3(num_krons), dim3(num_threads), 0, 0, n,
operator_ptrs, lda, x_ptrs, output_ptrs, work_ptrs,
num_krons);
break;
case 6:
kronmult6_xbatched<P><<<num_krons, num_threads>>>(
n, operator_ptrs, lda, x_ptrs, output_ptrs, work_ptrs, num_krons);
hipLaunchKernelGGL(HIP_KERNEL_NAME(kronmult6_xbatched<P>),
dim3(num_krons), dim3(num_threads), 0, 0, n,
operator_ptrs, lda, x_ptrs, output_ptrs, work_ptrs,
num_krons);
break;
default:
expect(false);
Expand All @@ -343,8 +357,8 @@ void call_kronmult(int const n, P *x_ptrs[], P *output_ptrs[], P *work_ptrs[],
// -------------------------------------------
// note important to wait for kernel to finish
// -------------------------------------------
auto const stat = cudaDeviceSynchronize();
expect(stat == cudaSuccess);
auto const stat = hipDeviceSynchronize();
expect(stat == hipSuccess);
}
#else

Expand Down
4 changes: 2 additions & 2 deletions src/kronmult.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,8 @@
#include "lib_dispatch.hpp"
#include "tools.hpp"

#ifdef ASGARD_USE_CUDA
#include <cuda_runtime.h>
#ifdef ASGARD_USE_HIP
#include <hip/hip_runtime.h>
#endif

#ifdef ASGARD_USE_OPENMP
Expand Down
2 changes: 1 addition & 1 deletion src/kronmult.hpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
#pragma once
#ifdef ASGARD_USE_CUDA
#ifdef ASGARD_USE_HIP
#define USE_GPU
#endif
#include "distribution.hpp"
Expand Down
Loading