Skip to content

Commit

Permalink
gfx1030 enablement (#113)
Browse files Browse the repository at this point in the history
* gfx1030 enablement

* clang-format

* fix bsrilu0
  • Loading branch information
ntrost57 authored Oct 7, 2020
1 parent 0a340ba commit 1b00b6c
Show file tree
Hide file tree
Showing 17 changed files with 602 additions and 248 deletions.
8 changes: 8 additions & 0 deletions clients/include/rocsparse_check.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -227,11 +227,19 @@ inline void near_check_general(rocsparse_int M,
{
for(rocsparse_int i = 0; i < M; ++i)
{
#ifdef __HIP_ARCH_GFX1030__
rocsparse_float_complex compare_val
= rocsparse_float_complex(std::max(std::abs(std::real(hCPU[i + j * lda]) * 1e-2f),
10 * std::numeric_limits<float>::epsilon()),
std::max(std::abs(std::imag(hCPU[i + j * lda]) * 1e-2f),
10 * std::numeric_limits<float>::epsilon()));
#else
rocsparse_float_complex compare_val
= rocsparse_float_complex(std::max(std::abs(std::real(hCPU[i + j * lda]) * 1e-3f),
10 * std::numeric_limits<float>::epsilon()),
std::max(std::abs(std::imag(hCPU[i + j * lda]) * 1e-3f),
10 * std::numeric_limits<float>::epsilon()));
#endif
#ifdef GOOGLE_TEST
if(rocsparse_isnan(hCPU[i + j * lda]))
{
Expand Down
4 changes: 4 additions & 0 deletions clients/include/testing_bsrmv.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -343,7 +343,11 @@ void testing_bsrmv(const Arguments& arg)
base,
mat,
filename.c_str(),
#ifdef __HIP_ARCH_GFX1030__
arg.timing ? false : true,
#else
false,
#endif
full_rank);

// Update BSR block dimensions from generated matrix
Expand Down
4 changes: 4 additions & 0 deletions clients/include/testing_csrmv.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -400,7 +400,11 @@ void testing_csrmv(const Arguments& arg)
base,
mat,
filename.c_str(),
#ifdef __HIP_ARCH_GFX1030__
arg.timing ? false : true,
#else
arg.timing ? false : adaptive,
#endif
full_rank);

// Allocate host memory for vectors
Expand Down
2 changes: 1 addition & 1 deletion clients/tests/test_bsrsv.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -134,7 +134,7 @@ Tests:
precision: *single_double_precisions
M: 1
N: 1
block_dim: [-1, 2, 6, 15]
block_dim: [-1, 2, 6]
alpha_alphai: *alpha_range_checkin
direction: [rocsparse_direction_column]
transA: [rocsparse_operation_none, rocsparse_operation_transpose]
Expand Down
1 change: 0 additions & 1 deletion library/src/conversion/rocsparse_csr2bsr.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,6 @@
#include "utility.h"

#include <hip/hip_runtime.h>
#include <rocprim/rocprim.hpp>

#define launch_csr2bsr_fast_kernel(T, direction, block_size, segment_size, wf_size) \
hipLaunchKernelGGL((csr2bsr_fast_kernel<T, direction, block_size, segment_size, wf_size>), \
Expand Down
118 changes: 84 additions & 34 deletions library/src/conversion/rocsparse_csx2dense.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -44,50 +44,100 @@ rocsparse_status rocsparse_csx2dense_template(rocsparse_handle handle,
{
case rocsparse_direction_row:
{
static constexpr rocsparse_int WAVEFRONT_SIZE = 64;
static constexpr rocsparse_int NROWS_PER_BLOCK = 16;
if(handle->wavefront_size == 32)
{
static constexpr rocsparse_int WAVEFRONT_SIZE = 32;
static constexpr rocsparse_int NROWS_PER_BLOCK = 16;

rocsparse_int blocks = (m - 1) / NROWS_PER_BLOCK + 1;
dim3 k_blocks(blocks), k_threads(WAVEFRONT_SIZE * NROWS_PER_BLOCK);
rocsparse_int blocks = (m - 1) / NROWS_PER_BLOCK + 1;
dim3 k_blocks(blocks), k_threads(WAVEFRONT_SIZE * NROWS_PER_BLOCK);

hipLaunchKernelGGL((csr2dense_kernel<NROWS_PER_BLOCK, WAVEFRONT_SIZE, T>),
k_blocks,
k_threads,
0,
stream,
descr->base,
m,
n,
csx_val,
csx_row_col_ptr,
csx_col_row_ind,
A,
ld);
hipLaunchKernelGGL((csr2dense_kernel<NROWS_PER_BLOCK, WAVEFRONT_SIZE, T>),
k_blocks,
k_threads,
0,
stream,
descr->base,
m,
n,
csx_val,
csx_row_col_ptr,
csx_col_row_ind,
A,
ld);
}
else
{
static constexpr rocsparse_int WAVEFRONT_SIZE = 64;
static constexpr rocsparse_int NROWS_PER_BLOCK = 16;

rocsparse_int blocks = (m - 1) / NROWS_PER_BLOCK + 1;
dim3 k_blocks(blocks), k_threads(WAVEFRONT_SIZE * NROWS_PER_BLOCK);

hipLaunchKernelGGL((csr2dense_kernel<NROWS_PER_BLOCK, WAVEFRONT_SIZE, T>),
k_blocks,
k_threads,
0,
stream,
descr->base,
m,
n,
csx_val,
csx_row_col_ptr,
csx_col_row_ind,
A,
ld);
}

return rocsparse_status_success;
}

case rocsparse_direction_column:
{
static constexpr rocsparse_int WAVEFRONT_SIZE = 64;
static constexpr rocsparse_int NCOLUMNS_PER_BLOCK = 16;
if(handle->wavefront_size == 32)
{
static constexpr rocsparse_int WAVEFRONT_SIZE = 32;
static constexpr rocsparse_int NCOLUMNS_PER_BLOCK = 16;

rocsparse_int blocks = (n - 1) / NCOLUMNS_PER_BLOCK + 1;
dim3 k_blocks(blocks), k_threads(WAVEFRONT_SIZE * NCOLUMNS_PER_BLOCK);

hipLaunchKernelGGL((csc2dense_kernel<NCOLUMNS_PER_BLOCK, WAVEFRONT_SIZE, T>),
k_blocks,
k_threads,
0,
stream,
descr->base,
m,
n,
csx_val,
csx_row_col_ptr,
csx_col_row_ind,
A,
ld);
}
else
{
static constexpr rocsparse_int WAVEFRONT_SIZE = 64;
static constexpr rocsparse_int NCOLUMNS_PER_BLOCK = 16;

rocsparse_int blocks = (n - 1) / NCOLUMNS_PER_BLOCK + 1;
dim3 k_blocks(blocks), k_threads(WAVEFRONT_SIZE * NCOLUMNS_PER_BLOCK);
rocsparse_int blocks = (n - 1) / NCOLUMNS_PER_BLOCK + 1;
dim3 k_blocks(blocks), k_threads(WAVEFRONT_SIZE * NCOLUMNS_PER_BLOCK);

hipLaunchKernelGGL((csc2dense_kernel<NCOLUMNS_PER_BLOCK, WAVEFRONT_SIZE, T>),
k_blocks,
k_threads,
0,
stream,
descr->base,
m,
n,
csx_val,
csx_row_col_ptr,
csx_col_row_ind,
A,
ld);
hipLaunchKernelGGL((csc2dense_kernel<NCOLUMNS_PER_BLOCK, WAVEFRONT_SIZE, T>),
k_blocks,
k_threads,
0,
stream,
descr->base,
m,
n,
csx_val,
csx_row_col_ptr,
csx_col_row_ind,
A,
ld);
}

return rocsparse_status_success;
}
Expand Down
116 changes: 82 additions & 34 deletions library/src/conversion/rocsparse_dense2csx.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -44,46 +44,94 @@ rocsparse_status rocsparse_dense2csx_template(rocsparse_handle handle,
{
case rocsparse_direction_row:
{
if(handle->wavefront_size == 32)
{
static constexpr rocsparse_int WF_SIZE = 32;
static constexpr rocsparse_int NROWS_PER_BLOCK = 16 / (data_ratio > 0 ? data_ratio : 1);
rocsparse_int blocks = (m - 1) / NROWS_PER_BLOCK + 1;
dim3 k_blocks(blocks), k_threads(WF_SIZE * NROWS_PER_BLOCK);
hipLaunchKernelGGL((dense2csr_kernel<NROWS_PER_BLOCK, WF_SIZE, T>),
k_blocks,
k_threads,
0,
stream,
descr->base,
m,
n,
A,
ld,
csx_val,
csx_row_col_ptr,
csx_col_row_ind);
}
else
{
static constexpr rocsparse_int WF_SIZE = 64;
static constexpr rocsparse_int NROWS_PER_BLOCK = 16 / (data_ratio > 0 ? data_ratio : 1);
rocsparse_int blocks = (m - 1) / NROWS_PER_BLOCK + 1;
dim3 k_blocks(blocks), k_threads(WF_SIZE * NROWS_PER_BLOCK);
hipLaunchKernelGGL((dense2csr_kernel<NROWS_PER_BLOCK, WF_SIZE, T>),
k_blocks,
k_threads,
0,
stream,
descr->base,
m,
n,
A,
ld,
csx_val,
csx_row_col_ptr,
csx_col_row_ind);
}

static constexpr rocsparse_int WF_SIZE = 64;
static constexpr rocsparse_int NROWS_PER_BLOCK = 16 / (data_ratio > 0 ? data_ratio : 1);
rocsparse_int blocks = (m - 1) / NROWS_PER_BLOCK + 1;
dim3 k_blocks(blocks), k_threads(WF_SIZE * NROWS_PER_BLOCK);
hipLaunchKernelGGL((dense2csr_kernel<NROWS_PER_BLOCK, WF_SIZE, T>),
k_blocks,
k_threads,
0,
stream,
descr->base,
m,
n,
A,
ld,
csx_val,
csx_row_col_ptr,
csx_col_row_ind);
return rocsparse_status_success;
}

case rocsparse_direction_column:
{
static constexpr rocsparse_int WF_SIZE = 64;
static constexpr rocsparse_int NCOLUMNS_PER_BLOCK = 16 / (data_ratio > 0 ? data_ratio : 1);
rocsparse_int blocks = (n - 1) / NCOLUMNS_PER_BLOCK + 1;
dim3 k_blocks(blocks), k_threads(WF_SIZE * NCOLUMNS_PER_BLOCK);
hipLaunchKernelGGL((dense2csc_kernel<NCOLUMNS_PER_BLOCK, WF_SIZE, T>),
k_blocks,
k_threads,
0,
stream,
descr->base,
m,
n,
A,
ld,
csx_val,
csx_row_col_ptr,
csx_col_row_ind);
if(handle->wavefront_size == 32)
{
static constexpr rocsparse_int WF_SIZE = 32;
static constexpr rocsparse_int NCOLUMNS_PER_BLOCK
= 16 / (data_ratio > 0 ? data_ratio : 1);
rocsparse_int blocks = (n - 1) / NCOLUMNS_PER_BLOCK + 1;
dim3 k_blocks(blocks), k_threads(WF_SIZE * NCOLUMNS_PER_BLOCK);
hipLaunchKernelGGL((dense2csc_kernel<NCOLUMNS_PER_BLOCK, WF_SIZE, T>),
k_blocks,
k_threads,
0,
stream,
descr->base,
m,
n,
A,
ld,
csx_val,
csx_row_col_ptr,
csx_col_row_ind);
}
else
{
static constexpr rocsparse_int WF_SIZE = 64;
static constexpr rocsparse_int NCOLUMNS_PER_BLOCK
= 16 / (data_ratio > 0 ? data_ratio : 1);
rocsparse_int blocks = (n - 1) / NCOLUMNS_PER_BLOCK + 1;
dim3 k_blocks(blocks), k_threads(WF_SIZE * NCOLUMNS_PER_BLOCK);
hipLaunchKernelGGL((dense2csc_kernel<NCOLUMNS_PER_BLOCK, WF_SIZE, T>),
k_blocks,
k_threads,
0,
stream,
descr->base,
m,
n,
A,
ld,
csx_val,
csx_row_col_ptr,
csx_col_row_ind);
}

return rocsparse_status_success;
}
Expand Down
Loading

0 comments on commit 1b00b6c

Please sign in to comment.