Skip to content

Commit

Permalink
Merge pull request #80 from ntrost57/develop
Browse files Browse the repository at this point in the history
cache usage optimizations
  • Loading branch information
ntrost57 authored Feb 4, 2019
2 parents 68467d6 + ed675e0 commit c4b97e1
Show file tree
Hide file tree
Showing 3 changed files with 16 additions and 14 deletions.
13 changes: 7 additions & 6 deletions library/src/level2/coomv_device.h
Original file line number Diff line number Diff line change
Expand Up @@ -68,8 +68,8 @@ static __device__ void coomvn_general_wf_reduce(rocsparse_int nnz,
// Initialize block buffers
if(lid == 0)
{
row_block_red[wid] = -1;
val_block_red[wid] = static_cast<T>(0);
__builtin_nontemporal_store(-1, row_block_red + wid);
__builtin_nontemporal_store(static_cast<T>(0), val_block_red + wid);
}

// Global COO array index start for current wavefront
Expand Down Expand Up @@ -105,8 +105,9 @@ static __device__ void coomvn_general_wf_reduce(rocsparse_int nnz,
// nnz % WF_SIZE != 0
if(idx < nnz)
{
row = coo_row_ind[idx] - idx_base;
val = alpha * coo_val[idx] * __ldg(x + coo_col_ind[idx] - idx_base);
row = __builtin_nontemporal_load(coo_row_ind + idx) - idx_base;
val = alpha * __builtin_nontemporal_load(coo_val + idx) *
__ldg(x + __builtin_nontemporal_load(coo_col_ind + idx) - idx_base);
}
else
{
Expand Down Expand Up @@ -173,8 +174,8 @@ static __device__ void coomvn_general_wf_reduce(rocsparse_int nnz,
// Write last entries into buffers for segmented block reduction
if(lid == WF_SIZE - 1)
{
row_block_red[wid] = row;
val_block_red[wid] = val;
__builtin_nontemporal_store(row, row_block_red + wid);
__builtin_nontemporal_store(val, val_block_red + wid);
}
}

Expand Down
8 changes: 4 additions & 4 deletions library/src/level2/csrsv_device.h
Original file line number Diff line number Diff line change
Expand Up @@ -142,7 +142,7 @@ __global__ void csrsv_analysis_kernel(rocsparse_int m,
// non-zero values. We must then ensure that the output from the row
// associated with the local_col is complete to ensure that we can
// calculate the right answer.
int local_col = csr_col_ind[j] - idx_base;
int local_col = __builtin_nontemporal_load(csr_col_ind + j) - idx_base;

// Store diagonal index
if(local_col == row)
Expand Down Expand Up @@ -394,16 +394,16 @@ __device__ void csrsv_device(rocsparse_int m,
if(lid == 0)
{
// Lane 0 initializes its local sum with alpha and x
local_sum = alpha * x[row];
local_sum = alpha * __builtin_nontemporal_load(x + row);
}

for(rocsparse_int j = row_begin + lid; j < row_end; j += WF_SIZE)
{
// Current column this lane operates on
rocsparse_int local_col = csr_col_ind[j] - idx_base;
rocsparse_int local_col = __builtin_nontemporal_load(csr_col_ind + j) - idx_base;

// Local value this lane operates with
T local_val = csr_val[j];
T local_val = __builtin_nontemporal_load(csr_val + j);

// Check for numerical zero
if(local_val == static_cast<T>(0) && local_col == row &&
Expand Down
9 changes: 5 additions & 4 deletions library/src/level2/ellmv_device.h
Original file line number Diff line number Diff line change
Expand Up @@ -53,11 +53,11 @@ static __device__ void ellmvn_device(rocsparse_int m,
for(rocsparse_int p = 0; p < ell_width; ++p)
{
rocsparse_int idx = ELL_IND(ai, p, m, ell_width);
rocsparse_int col = ell_col_ind[idx] - idx_base;
rocsparse_int col = __builtin_nontemporal_load(ell_col_ind + idx) - idx_base;

if(col >= 0 && col < n)
{
sum = fma(ell_val[idx], __ldg(x + col), sum);
sum = fma(__builtin_nontemporal_load(ell_val + idx), __ldg(x + col), sum);
}
else
{
Expand All @@ -67,11 +67,12 @@ static __device__ void ellmvn_device(rocsparse_int m,

if(beta != static_cast<T>(0))
{
y[ai] = fma(beta, y[ai], alpha * sum);
T yv = __builtin_nontemporal_load(y + ai);
__builtin_nontemporal_store(fma(beta, yv, alpha * sum), y + ai);
}
else
{
y[ai] = alpha * sum;
__builtin_nontemporal_store(alpha * sum, y + ai);
}
}

Expand Down

0 comments on commit c4b97e1

Please sign in to comment.