Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

HIP version of asgard #400

Closed
wants to merge 36 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
36 commits
Select commit. Hold shift + click to select a range
79ab7a9
Initial hipify from perl script
ckendrick Jan 12, 2021
f70d02b
WIP HIP CMake configuration
ckendrick Jan 26, 2021
0e65742
Hipifying some missed parts
ckendrick Feb 2, 2021
3daf645
Updating formatting for hipified files
ckendrick Feb 2, 2021
e2491dc
Minor cmake adjustments for hip
ckendrick Feb 15, 2021
4b94cc8
Add initial hip platform configuration
ckendrick Mar 11, 2021
e816103
Update cmake cuda options and add hipblas
ckendrick Mar 16, 2021
0d98648
CMake adjustments and initial platform options
ckendrick Apr 26, 2021
0a78c9e
Add in nvidia platform update for hip 4.2
ckendrick May 17, 2021
b397160
Update tensor linking and missed cublas calls
ckendrick May 17, 2021
15a9d35
Update hip compiler option variables, hip_add_library for kronmult
ckendrick May 26, 2021
b0a81a1
Update nvidia arch flag for kronmult_cuda
ckendrick Jun 9, 2021
0889a35
Set hip_clang_path in cmake to find for amd platform
ckendrick Jun 16, 2021
3d2f466
Set the hip_clang_include_path needed for spack installs
ckendrick Jun 18, 2021
119cbb2
Update hip build for amd platform
ckendrick Jun 21, 2021
b9002c6
Add hcc platform def for backwards compatibility
ckendrick Jun 28, 2021
d08af1e
Add temporary workaround for kronmult hip interface
ckendrick Jun 28, 2021
3dc539d
Temporarily suppress clang compiler warnings
ckendrick Jun 28, 2021
ef16b56
Fix shared flags for hip platform, amd gpu target archs
ckendrick Jul 6, 2021
a745fbe
Move kronmult source properties before add_lib
ckendrick Jul 6, 2021
e591cd9
Add check for empty matrix to avoid hipmemcpy2d error
ckendrick Jul 15, 2021
770a040
Fix hardcoded amd clang version and update target flags
ckendrick Jul 20, 2021
c318c89
Modify include dirs to hide deprecated cuda messages
ckendrick Jul 21, 2021
cf7c04a
Update device test tol for batch tests
ckendrick Jul 21, 2021
d5f34ca
Fix clang formatting issues
ckendrick Jul 29, 2021
8c69e57
Re-enable kronmult for amd platforms
ckendrick Sep 1, 2021
49f45dc
Rename hip platform from hcc to amd
ckendrick Sep 15, 2021
8cd9e45
Adjust lib dispatch device test tol for amd
ckendrick Oct 5, 2021
2de03a3
Clean up cmake, add hipblas version check for amd
ckendrick Oct 8, 2021
c5ed1fd
Pass gpu arch to kronmult, set amd flags only on amd
ckendrick Oct 12, 2021
c79997b
Change kronmult fetch content to after configuration
ckendrick Oct 28, 2021
91b0626
Fix kron linking on nvidia
ckendrick Oct 28, 2021
acf6305
Consolidate gpu arch flags and reorder hip flags
ckendrick Nov 1, 2021
37ebc25
Add new register project arg when building openblas
ckendrick Nov 2, 2021
8909eab
Decrease batched gemv test tol for amd gpu
ckendrick Nov 9, 2021
ef3c11b
Change CMake HIP linking
ckendrick Nov 23, 2021
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
293 changes: 257 additions & 36 deletions CMakeLists.txt

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions contrib/FindLINALG.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,7 @@ if (${ASGARD_BUILD_OPENBLAS})
OPENBLAS
https://github.com/xianyi/OpenBLAS.git
v0.3.18
ON
)

# Fetch content does not run the install phase so the headers for openblas are
Expand Down
6 changes: 5 additions & 1 deletion src/batch_tests.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -864,7 +864,11 @@ void test_batched_gemv(int const m, int const n, int const lda,

batched_gemv(a_batch, x_batch, y_batch, alpha, beta);

P const tol_factor = 1e-17;
P tol_factor = 1e-17;
if constexpr (resrc == resource::device)
{
tol_factor = 1e-7;
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

😮

}
for (int i = 0; i < num_batch; ++i)
{
if constexpr (resrc == resource::host)
Expand Down
2 changes: 1 addition & 1 deletion src/build_info.hpp.in
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
#define BUILD_TIME "@BUILD_TIME@"

#cmakedefine ASGARD_IO_HIGHFIVE
#cmakedefine ASGARD_USE_CUDA
#cmakedefine ASGARD_USE_HIP
#cmakedefine ASGARD_USE_OPENMP
#cmakedefine ASGARD_USE_MPI
#cmakedefine ASGARD_USE_MATLAB
Expand Down
80 changes: 47 additions & 33 deletions src/device/kronmult_cuda.cpp
Original file line number Diff line number Diff line change
@@ -1,9 +1,8 @@
#include "kronmult_cuda.hpp"
#include "build_info.hpp"

#ifdef ASGARD_USE_CUDA
#include <cuda.h>
#include <cuda_runtime.h>
#ifdef ASGARD_USE_HIP
#include <hip/hip_runtime.h>
#define USE_GPU
#define GLOBAL_FUNCTION __global__
#define SYNCTHREADS __syncthreads()
Expand Down Expand Up @@ -47,7 +46,7 @@ GLOBAL_FUNCTION void
stage_inputs_kronmult_kernel(P const *const x, P *const workspace,
int const num_elems, int const num_copies)
{
#ifdef ASGARD_USE_CUDA
#ifdef ASGARD_USE_HIP

expect(blockIdx.y == 0);
expect(blockIdx.z == 0);
Expand Down Expand Up @@ -90,7 +89,7 @@ void stage_inputs_kronmult(P const *const x, P *const workspace,
expect(num_elems > 0);
expect(num_copies > 0);

#ifdef ASGARD_USE_CUDA
#ifdef ASGARD_USE_HIP

auto constexpr warp_size = 32;
auto constexpr num_warps = 8;
Expand All @@ -99,11 +98,12 @@ void stage_inputs_kronmult(P const *const x, P *const workspace,
auto const total_copies = static_cast<int64_t>(num_elems) * num_copies;
auto const num_blocks = (total_copies + num_threads - 1) / num_threads;

stage_inputs_kronmult_kernel<P>
<<<num_blocks, num_threads>>>(x, workspace, num_elems, num_copies);
hipLaunchKernelGGL(HIP_KERNEL_NAME(stage_inputs_kronmult_kernel<P>),
dim3(num_blocks), dim3(num_threads), 0, 0, x, workspace,
num_elems, num_copies);

auto const stat = cudaDeviceSynchronize();
expect(stat == cudaSuccess);
auto const stat = hipDeviceSynchronize();
expect(stat == hipSuccess);
#else
stage_inputs_kronmult_kernel(x, workspace, num_elems, num_copies);
#endif
Expand Down Expand Up @@ -175,7 +175,7 @@ prepare_kronmult_kernel(int const *const flattened_table,
auto const coord_size = num_dims * 2;
auto const num_elems = static_cast<int64_t>(num_cols) * num_rows;

#ifdef ASGARD_USE_CUDA
#ifdef ASGARD_USE_HIP

expect(blockIdx.y == 0);
expect(blockIdx.z == 0);
Expand All @@ -192,7 +192,7 @@ prepare_kronmult_kernel(int const *const flattened_table,
auto const increment = 1;
#endif

#ifndef ASGARD_USE_CUDA
#ifndef ASGARD_USE_HIP
#ifdef ASGARD_USE_OPENMP
#pragma omp parallel for
#endif
Expand Down Expand Up @@ -273,20 +273,22 @@ void prepare_kronmult(int const *const flattened_table,
expect(input_ptrs);
expect(output_ptrs);

#ifdef ASGARD_USE_CUDA
#ifdef ASGARD_USE_HIP
auto constexpr warp_size = 32;
auto constexpr num_warps = 8;
auto constexpr num_threads = num_warps * warp_size;
auto const num_krons =
static_cast<int64_t>(elem_col_stop - elem_col_start + 1) *
(elem_row_stop - elem_row_start + 1);
auto const num_blocks = (num_krons / num_threads) + 1;
prepare_kronmult_kernel<P><<<num_blocks, num_threads>>>(
flattened_table, operators, operator_lda, element_x, element_work, fx,
operator_ptrs, work_ptrs, input_ptrs, output_ptrs, degree, num_terms,
num_dims, elem_row_start, elem_row_stop, elem_col_start, elem_col_stop);
auto const stat = cudaDeviceSynchronize();
expect(stat == cudaSuccess);
hipLaunchKernelGGL(HIP_KERNEL_NAME(prepare_kronmult_kernel<P>),
dim3(num_blocks), dim3(num_threads), 0, 0, flattened_table,
operators, operator_lda, element_x, element_work, fx,
operator_ptrs, work_ptrs, input_ptrs, output_ptrs, degree,
num_terms, num_dims, elem_row_start, elem_row_stop,
elem_col_start, elem_col_stop);
auto const stat = hipDeviceSynchronize();
expect(stat == hipSuccess);
#else
prepare_kronmult_kernel(
flattened_table, operators, operator_lda, element_x, element_work, fx,
Expand All @@ -304,7 +306,7 @@ void call_kronmult(int const n, P *x_ptrs[], P *output_ptrs[], P *work_ptrs[],
P const *const operator_ptrs[], int const lda,
int const num_krons, int const num_dims)
{
#ifdef ASGARD_USE_CUDA
#ifdef ASGARD_USE_HIP
{
int constexpr warpsize = 32;
int constexpr nwarps = 1;
Expand All @@ -313,28 +315,40 @@ void call_kronmult(int const n, P *x_ptrs[], P *output_ptrs[], P *work_ptrs[],
switch (num_dims)
{
case 1:
kronmult1_xbatched<P><<<num_krons, num_threads>>>(
n, operator_ptrs, lda, x_ptrs, output_ptrs, work_ptrs, num_krons);
hipLaunchKernelGGL(HIP_KERNEL_NAME(kronmult1_xbatched<P>),
dim3(num_krons), dim3(num_threads), 0, 0, n,
operator_ptrs, lda, x_ptrs, output_ptrs, work_ptrs,
num_krons);
break;
case 2:
kronmult2_xbatched<P><<<num_krons, num_threads>>>(
n, operator_ptrs, lda, x_ptrs, output_ptrs, work_ptrs, num_krons);
hipLaunchKernelGGL(HIP_KERNEL_NAME(kronmult2_xbatched<P>),
dim3(num_krons), dim3(num_threads), 0, 0, n,
operator_ptrs, lda, x_ptrs, output_ptrs, work_ptrs,
num_krons);
break;
case 3:
kronmult3_xbatched<P><<<num_krons, num_threads>>>(
n, operator_ptrs, lda, x_ptrs, output_ptrs, work_ptrs, num_krons);
hipLaunchKernelGGL(HIP_KERNEL_NAME(kronmult3_xbatched<P>),
dim3(num_krons), dim3(num_threads), 0, 0, n,
operator_ptrs, lda, x_ptrs, output_ptrs, work_ptrs,
num_krons);
break;
case 4:
kronmult4_xbatched<P><<<num_krons, num_threads>>>(
n, operator_ptrs, lda, x_ptrs, output_ptrs, work_ptrs, num_krons);
hipLaunchKernelGGL(HIP_KERNEL_NAME(kronmult4_xbatched<P>),
dim3(num_krons), dim3(num_threads), 0, 0, n,
operator_ptrs, lda, x_ptrs, output_ptrs, work_ptrs,
num_krons);
break;
case 5:
kronmult5_xbatched<P><<<num_krons, num_threads>>>(
n, operator_ptrs, lda, x_ptrs, output_ptrs, work_ptrs, num_krons);
hipLaunchKernelGGL(HIP_KERNEL_NAME(kronmult5_xbatched<P>),
dim3(num_krons), dim3(num_threads), 0, 0, n,
operator_ptrs, lda, x_ptrs, output_ptrs, work_ptrs,
num_krons);
break;
case 6:
kronmult6_xbatched<P><<<num_krons, num_threads>>>(
n, operator_ptrs, lda, x_ptrs, output_ptrs, work_ptrs, num_krons);
hipLaunchKernelGGL(HIP_KERNEL_NAME(kronmult6_xbatched<P>),
dim3(num_krons), dim3(num_threads), 0, 0, n,
operator_ptrs, lda, x_ptrs, output_ptrs, work_ptrs,
num_krons);
break;
default:
expect(false);
Expand All @@ -343,8 +357,8 @@ void call_kronmult(int const n, P *x_ptrs[], P *output_ptrs[], P *work_ptrs[],
// -------------------------------------------
// note important to wait for kernel to finish
// -------------------------------------------
auto const stat = cudaDeviceSynchronize();
expect(stat == cudaSuccess);
auto const stat = hipDeviceSynchronize();
expect(stat == hipSuccess);
}
#else

Expand Down
4 changes: 2 additions & 2 deletions src/kronmult.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,8 @@
#include "lib_dispatch.hpp"
#include "tools.hpp"

#ifdef ASGARD_USE_CUDA
#include <cuda_runtime.h>
#ifdef ASGARD_USE_HIP
#include <hip/hip_runtime.h>
#endif

#ifdef ASGARD_USE_OPENMP
Expand Down
2 changes: 1 addition & 1 deletion src/kronmult.hpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
#pragma once
#ifdef ASGARD_USE_CUDA
#ifdef ASGARD_USE_HIP
#define USE_GPU
#endif
#include "distribution.hpp"
Expand Down
Loading