update Makefile and test syclomatic conversion
cjknight committed Dec 13, 2024
1 parent be03fc2 commit 179f5ba
Showing 3 changed files with 389 additions and 134 deletions.
4 changes: 2 additions & 2 deletions gpu/mini-apps/transpose/Makefile
@@ -16,7 +16,7 @@ AR = ar rcs
CPP=cpp -P -traditional
INSTALL=../

ARCH ?= polaris
ARCH ?= polaris-gnu-nvcc
include $(BASE_LIBGPU)/arch/$(ARCH)

CXXFLAGS += -I$(BASE_LIBGPU)
@@ -50,7 +50,7 @@ install: $(EXE)
cp $(EXE) $(INSTALL)
# cp $(MOD) $(FOBJ) $(INSTALL)/include

pm:
update:
cp ../../src/pm*.cpp ./

####################################################################
318 changes: 186 additions & 132 deletions gpu/mini-apps/transpose/offload_sycl.cpp
@@ -1,5 +1,6 @@
#if defined(_GPU_SYCL_CUDA)
#if defined(_GPU_SYCL) || defined(_GPU_SYCL_CUDA)

#include <sycl/sycl.hpp>
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
@@ -11,8 +12,7 @@
#define _TRANSPOSE_BLOCK_SIZE 16
#define _TRANSPOSE_NUM_ROWS 16

//#define _TILE(A,B) (A + B - 1) / B
#define _TILE(A,B) ((A + B - 1) / B) * B
#define _TILE(A,B) (A + B - 1) / B

#ifdef _SINGLE_PRECISION
typedef float real_t;
@@ -24,178 +24,232 @@ using namespace PM_NS;

class PM * pm_ = nullptr;

class Kernel_Copy_Naive;
class Kernel_Transpose_Naive;
class Kernel_Transpose_V1;
class Kernel_Transpose_V2;

// ----------------------------------------------------------------
// GPU Kernels
// ----------------------------------------------------------------

void init_pm(class PM * pm)
//https://github.com/NVIDIA-developer-blog/code-samples/blob/master/series/cuda-cpp/transpose/transpose.cu
// modified to support nonsquare matrices

void _transpose_gpu_v1(real_t * out, real_t * in, const int nrow, const int ncol,
const sycl::nd_item<3> &item_ct1,
sycl::local_accessor<real_t, 2> cache)
{
pm_ = pm;
}

// gridDim.{x,y,z} == get_group_range({0,1,2})
// blockDim.{x,y,z} == get_local_range({0,1,2})
// blockIdx.{x,y,z} == get_group({0,1,2})
// threadIdx.{x,y,z} == get_local_id({0,1,2})
int irow =
item_ct1.get_group(2) * _TRANSPOSE_BLOCK_SIZE + item_ct1.get_local_id(2);
int icol =
item_ct1.get_group(1) * _TRANSPOSE_BLOCK_SIZE + item_ct1.get_local_id(1);

void copy_naive_gpu(real_t * out, real_t * in, const int num_rows, const int num_cols)
{
sycl::queue * Q = pm_->dev_get_queue();

sycl::range<3> grid_size(num_rows, _TRANSPOSE_BLOCK_SIZE, 1);
sycl::range<3> block_size(1, _TRANSPOSE_BLOCK_SIZE, 1);
// load tile into fast local memory

sycl::nd_range<3> kernel_rng(grid_size, block_size);

Q->submit([&](sycl::handler &cgh) {

cgh.parallel_for<Kernel_Copy_Naive>(kernel_rng, [=](sycl::nd_item<3> idx) {
const int i = idx.get_group(0) * idx.get_local_range(0) + idx.get_local_id(0);

if(i >= num_rows) return;

int j = idx.get_group(1) * idx.get_local_range(1) + idx.get_local_id(1);

while(j < num_cols) {
out[i*num_cols + j] = in[i*num_cols + j];
j += idx.get_local_range(1);
}
}); // End of the kernel function

// }).wait(); // End of the queue commands
}); // End of the queue commands
const int indxi = irow * ncol + icol;
for(int i=0; i<_TRANSPOSE_BLOCK_SIZE; i+= _TRANSPOSE_NUM_ROWS) {
if(irow < nrow && (icol+i) < ncol) // nonsquare
cache[item_ct1.get_local_id(1) + i][item_ct1.get_local_id(2)] =
in[indxi + i]; // threads read chunk of a row and write as a column
}

// block to ensure reads finish

item_ct1.barrier(sycl::access::fence_space::local_space);

// swap indices

irow =
item_ct1.get_group(1) * _TRANSPOSE_BLOCK_SIZE + item_ct1.get_local_id(2);
icol =
item_ct1.get_group(2) * _TRANSPOSE_BLOCK_SIZE + item_ct1.get_local_id(1);

// write tile to global memory

const int indxo = irow * nrow + icol;
for(int i=0; i<_TRANSPOSE_BLOCK_SIZE; i+= _TRANSPOSE_NUM_ROWS) {
if(irow < ncol && (icol + i) < nrow) // nonsquare
out[indxo + i] =
cache[item_ct1.get_local_id(2)][item_ct1.get_local_id(1) + i];
}
}
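
For reference, a minimal sketch (not part of this commit) of the index mapping the converted kernel uses: the SYCLomatic conversion reverses dimensions, so CUDA's blockIdx.x/threadIdx.x pair corresponds to get_group(2)/get_local_id(2) and the .y pair to dimension 1.

inline int tile_row(const sycl::nd_item<3> &item) {
  // blockIdx.x * _TRANSPOSE_BLOCK_SIZE + threadIdx.x in the CUDA original
  return item.get_group(2) * _TRANSPOSE_BLOCK_SIZE + item.get_local_id(2);
}

inline int tile_col(const sycl::nd_item<3> &item) {
  // blockIdx.y * _TRANSPOSE_BLOCK_SIZE + threadIdx.y in the CUDA original
  return item.get_group(1) * _TRANSPOSE_BLOCK_SIZE + item.get_local_id(1);
}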

// ----------------------------------------------------------------

void transpose_naive_gpu(real_t * out, real_t * in, const int num_rows, const int num_cols)
void _transpose_gpu_v2(real_t * out, real_t * in, const int nrow, const int ncol,
const sycl::nd_item<3> &item_ct1,
sycl::local_accessor<real_t, 2> cache)
{
sycl::queue * Q = pm_->dev_get_queue();

sycl::range<3> grid_size(num_rows, _TRANSPOSE_BLOCK_SIZE, 1);
sycl::range<3> block_size(1, _TRANSPOSE_BLOCK_SIZE, 1);

sycl::nd_range<3> kernel_rng(grid_size, block_size);

Q->submit([&](sycl::handler &cgh) {

cgh.parallel_for<Kernel_Transpose_Naive>(kernel_rng, [=](sycl::nd_item<3> idx) {
const int i = idx.get_group(0) * idx.get_local_range(0) + idx.get_local_id(0);

if(i >= num_rows) return;

int j = idx.get_group(1) * idx.get_local_range(1) + idx.get_local_id(1);

while(j < num_cols) {
out[j*num_rows + i] = in[i*num_cols + j];
j += idx.get_local_range(1);
}
}); // End of the kernel function

//}).wait(); // End of the queue commands
}); // End of the queue commands
int irow =
item_ct1.get_group(2) * _TRANSPOSE_BLOCK_SIZE + item_ct1.get_local_id(2);
int icol =
item_ct1.get_group(1) * _TRANSPOSE_BLOCK_SIZE + item_ct1.get_local_id(1);

// load tile into fast local memory

const int indxi = irow * ncol + icol;
for(int i=0; i<_TRANSPOSE_BLOCK_SIZE; i+= _TRANSPOSE_NUM_ROWS) {
if(irow < nrow && (icol+i) < ncol) // nonsquare
cache[item_ct1.get_local_id(1) + i][item_ct1.get_local_id(2)] =
in[indxi + i]; // threads read chunk of a row and write as a column
}

// block to ensure reads finish

item_ct1.barrier(sycl::access::fence_space::local_space);

// swap indices

irow =
item_ct1.get_group(1) * _TRANSPOSE_BLOCK_SIZE + item_ct1.get_local_id(2);
icol =
item_ct1.get_group(2) * _TRANSPOSE_BLOCK_SIZE + item_ct1.get_local_id(1);

// write tile to global memory

const int indxo = irow * nrow + icol;
for(int i=0; i<_TRANSPOSE_BLOCK_SIZE; i+= _TRANSPOSE_NUM_ROWS) {
if(irow < ncol && (icol + i) < nrow) // nonsquare
out[indxo + i] =
cache[item_ct1.get_local_id(2)][item_ct1.get_local_id(1) + i];
}
}

// ----------------------------------------------------------------

void transpose_gpu_v1(real_t * out, real_t * in, const int num_rows, const int num_cols)
void _transpose_naive_gpu(real_t * out, real_t * in, const int nrow, const int ncol,
const sycl::nd_item<3> &item_ct1)
{
sycl::queue * Q = pm_->dev_get_queue();

sycl::range<3> grid_size(_TILE(num_rows, _TRANSPOSE_BLOCK_SIZE), _TILE(num_cols, _TRANSPOSE_NUM_ROWS), 1);
sycl::range<3> block_size(_TRANSPOSE_BLOCK_SIZE, _TRANSPOSE_NUM_ROWS, 1);
const int i = item_ct1.get_group(2) * item_ct1.get_local_range(2) +
item_ct1.get_local_id(2);

if(i >= nrow) return;

sycl::nd_range<3> kernel_rng(grid_size, block_size);
int j = item_ct1.get_group(1) * item_ct1.get_local_range(1) +
item_ct1.get_local_id(1);

while (j < ncol) {
out[j*nrow + i] = in[i*ncol + j];
j += item_ct1.get_local_range(1);
}

Q->submit([&](sycl::handler &cgh) {
}
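
For comparison, a plain CPU reference (illustrative, not part of this commit) that produces the same result as the kernels above, assuming row-major storage: element (i,j) of the nrow x ncol input ends up at (j,i) of the ncol x nrow output.

void transpose_cpu_ref(real_t * out, const real_t * in, const int nrow, const int ncol)
{
  for (int i = 0; i < nrow; ++i)
    for (int j = 0; j < ncol; ++j)
      out[j*nrow + i] = in[i*ncol + j];  // same indexing as the naive kernel
}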

sycl::local_accessor<real_t, 2> cache(sycl::range(_TRANSPOSE_BLOCK_SIZE, _TRANSPOSE_BLOCK_SIZE), cgh);

cgh.parallel_for<Kernel_Transpose_V1>(kernel_rng, [=](sycl::nd_item<3> idx) {
int irow = idx.get_group(0) * _TRANSPOSE_BLOCK_SIZE + idx.get_local_id(0);
int icol = idx.get_group(1) * _TRANSPOSE_BLOCK_SIZE + idx.get_local_id(1);
// ----------------------------------------------------------------

// load tile into fast local memory
void _copy_naive_gpu(real_t * out, real_t * in, const int nrow, const int ncol,
const sycl::nd_item<3> &item_ct1)
{
const int i = item_ct1.get_group(2) * item_ct1.get_local_range(2) +
item_ct1.get_local_id(2);

const int indxi = irow * num_cols + icol;
for(int i=0; i<_TRANSPOSE_BLOCK_SIZE; i+= _TRANSPOSE_NUM_ROWS) {
if(irow < num_rows && (icol+i) < num_cols) // nonsquare
cache[idx.get_local_id(1)+i][idx.get_local_id(0)] = in[indxi + i]; // threads read chunk of a row and write as a column
}
if(i >= nrow) return;

// block to ensure reads finish
int j = item_ct1.get_group(1) * item_ct1.get_local_range(1) +
item_ct1.get_local_id(1);

idx.barrier(sycl::access::fence_space::local_space);
while (j < ncol) {
out[i*ncol + j] = in[i*ncol + j];
j += item_ct1.get_local_range(1);
}

}

// swap indices
// ----------------------------------------------------------------
// Host-side functions
// ----------------------------------------------------------------

irow = idx.get_group(1) * _TRANSPOSE_BLOCK_SIZE + idx.get_local_id(0);
icol = idx.get_group(0) * _TRANSPOSE_BLOCK_SIZE + idx.get_local_id(1);
void init_pm(class PM * pm)
{
pm_ = pm;
}

// write tile to global memory
void copy_naive_gpu(real_t * b, real_t * a, const int num_rows, const int num_cols)
{
sycl::range<3> grid_size(1, 1, num_rows);
sycl::range<3> block_size(1, _TRANSPOSE_BLOCK_SIZE, 1);

const int indxo = irow * num_rows + icol;
for(int i=0; i<_TRANSPOSE_BLOCK_SIZE; i+= _TRANSPOSE_NUM_ROWS) {
if(irow < num_cols && (icol+i) < num_rows) // nonsquare
out[indxo + i] = cache[idx.get_local_id(0)][idx.get_local_id(1) + i];
}
}); // End of the kernel function

// }).wait(); // End of the queue commands
}); // End of the queue commands
sycl::queue * s = pm_->dev_get_queue();

// dpct::get_in_order_queue().parallel_for(
s->parallel_for(
sycl::nd_range<3>(grid_size * block_size, block_size),
[=](sycl::nd_item<3> item_ct1) {
_copy_naive_gpu(b, a, num_rows, num_cols, item_ct1);
});
}

// ----------------------------------------------------------------

void transpose_gpu_v2(real_t * out, real_t * in, const int num_rows, const int num_cols)
void transpose_naive_gpu(real_t * b, real_t * a, const int num_rows, const int num_cols)
{
sycl::queue * Q = pm_->dev_get_queue();

sycl::range<3> grid_size(_TILE(num_rows, _TRANSPOSE_BLOCK_SIZE), _TILE(num_cols, _TRANSPOSE_NUM_ROWS), 1);
sycl::range<3> block_size(_TRANSPOSE_BLOCK_SIZE, _TRANSPOSE_NUM_ROWS, 1);
sycl::range<3> grid_size(1, 1, num_rows);
sycl::range<3> block_size(1, _TRANSPOSE_BLOCK_SIZE, 1);

sycl::nd_range<3> kernel_rng(grid_size, block_size);
sycl::queue * s = pm_->dev_get_queue();

Q->submit([&](sycl::handler &cgh) {

sycl::local_accessor<real_t, 2> cache(sycl::range(_TRANSPOSE_BLOCK_SIZE, _TRANSPOSE_BLOCK_SIZE+1), cgh);

cgh.parallel_for<Kernel_Transpose_V2>(kernel_rng, [=](sycl::nd_item<3> idx) {

int irow = idx.get_group(0) * _TRANSPOSE_BLOCK_SIZE + idx.get_local_id(0);
int icol = idx.get_group(1) * _TRANSPOSE_BLOCK_SIZE + idx.get_local_id(1);
// dpct::get_in_order_queue().parallel_for(
s->parallel_for(
sycl::nd_range<3>(grid_size * block_size, block_size),
[=](sycl::nd_item<3> item_ct1) {
_transpose_naive_gpu(b, a, num_rows, num_cols, item_ct1);
});
}

// load tile into fast local memory
// ----------------------------------------------------------------

const int indxi = irow * num_cols + icol;
for(int i=0; i<_TRANSPOSE_BLOCK_SIZE; i+= _TRANSPOSE_NUM_ROWS) {
if(irow < num_rows && (icol+i) < num_cols) // nonsquare
cache[idx.get_local_id(1)+i][idx.get_local_id(0)] = in[indxi + i]; // threads read chunk of a row and write as a column
}
void transpose_gpu_v1(real_t * b, real_t * a, const int num_rows, const int num_cols)
{
sycl::range<3> grid_size(1, _TILE(num_cols, _TRANSPOSE_BLOCK_SIZE),
_TILE(num_rows, _TRANSPOSE_BLOCK_SIZE));
sycl::range<3> block_size(1, _TRANSPOSE_NUM_ROWS, _TRANSPOSE_BLOCK_SIZE);

// block to ensure reads finish
// printf("\ngrid_size= %i %i %i\n", _TILE(_NUM_ROWS, _TRANSPOSE_BLOCK_SIZE), _TILE(_NUM_COLS, _TRANSPOSE_BLOCK_SIZE), 1);
// printf("block_size= %i %i %i\n",_TRANSPOSE_BLOCK_SIZE, _TRANSPOSE_NUM_ROWS, 1);

idx.barrier(sycl::access::fence_space::local_space);
sycl::queue * s = pm_->dev_get_queue();

//dpct::get_in_order_queue().submit([&](sycl::handler &cgh) {
s->submit([&](sycl::handler &cgh) {
sycl::local_accessor<real_t, 2> cache_acc_ct1(
sycl::range<2>(_TRANSPOSE_BLOCK_SIZE,
_TRANSPOSE_BLOCK_SIZE),
cgh);

cgh.parallel_for(sycl::nd_range<3>(grid_size * block_size, block_size),
[=](sycl::nd_item<3> item_ct1) {
_transpose_gpu_v1(b, a, num_rows, num_cols, item_ct1,
cache_acc_ct1);
});
});
}
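
A short sketch of the launch arithmetic used above (not part of this commit): with the updated macro, _TILE(A,B) = (A + B - 1) / B is a ceiling division giving the number of B-wide tiles needed to cover A elements, and multiplying the resulting grid by block_size yields the global range that sycl::nd_range expects.

sycl::nd_range<3> make_transpose_range(const int num_rows, const int num_cols)
{
  // each work-group covers one _TRANSPOSE_BLOCK_SIZE x _TRANSPOSE_BLOCK_SIZE tile
  sycl::range<3> grid(1, _TILE(num_cols, _TRANSPOSE_BLOCK_SIZE),
                         _TILE(num_rows, _TRANSPOSE_BLOCK_SIZE));
  sycl::range<3> block(1, _TRANSPOSE_NUM_ROWS, _TRANSPOSE_BLOCK_SIZE);
  return sycl::nd_range<3>(grid * block, block);  // global size = tiles * work-group size
}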

// swap indices
// ----------------------------------------------------------------

irow = idx.get_group(1) * _TRANSPOSE_BLOCK_SIZE + idx.get_local_id(0);
icol = idx.get_group(0) * _TRANSPOSE_BLOCK_SIZE + idx.get_local_id(1);
void transpose_gpu_v2(real_t * b, real_t * a, const int num_rows, const int num_cols)
{
sycl::range<3> grid_size(1, _TILE(num_cols, _TRANSPOSE_BLOCK_SIZE),
_TILE(num_rows, _TRANSPOSE_BLOCK_SIZE));
sycl::range<3> block_size(1, _TRANSPOSE_NUM_ROWS, _TRANSPOSE_BLOCK_SIZE);

// write tile to global memory
// printf("\ngrid_size= %i %i %i\n", _TILE(_NUM_ROWS, _TRANSPOSE_BLOCK_SIZE), _TILE(_NUM_COLS, _TRANSPOSE_BLOCK_SIZE), 1);
// printf("block_size= %i %i %i\n",_TRANSPOSE_BLOCK_SIZE, _TRANSPOSE_NUM_ROWS, 1);

const int indxo = irow * num_rows + icol;
for(int i=0; i<_TRANSPOSE_BLOCK_SIZE; i+= _TRANSPOSE_NUM_ROWS) {
if(irow < num_cols && (icol+i) < num_rows) // nonsquare
out[indxo + i] = cache[idx.get_local_id(0)][idx.get_local_id(1) + i];
}
}); // End of the kernel function

// }).wait(); // End of the queue commands
}); // End of the queue commands
sycl::queue * s = pm_->dev_get_queue();

//dpct::get_in_order_queue().submit([&](sycl::handler &cgh) {
s->submit([&](sycl::handler &cgh) {
sycl::local_accessor<real_t, 2> cache_acc_ct1(
sycl::range<2>(_TRANSPOSE_BLOCK_SIZE,
_TRANSPOSE_BLOCK_SIZE+1),
cgh);

cgh.parallel_for(sycl::nd_range<3>(grid_size * block_size, block_size),
[=](sycl::nd_item<3> item_ct1) {
_transpose_gpu_v2(b, a, num_rows, num_cols, item_ct1,
cache_acc_ct1);
});
});
}
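
The v1 and v2 wrappers differ only in the shape of the local tile; the extra column in v2 follows the CUDA transpose sample these kernels are adapted from, where padding the shared-memory tile avoids bank conflicts when a column is read back out. Side by side (sketch, not part of this commit; cgh is the sycl::handler inside the submit lambda):

sycl::local_accessor<real_t, 2> tile_v1(
    sycl::range<2>(_TRANSPOSE_BLOCK_SIZE, _TRANSPOSE_BLOCK_SIZE), cgh);      // v1: square tile
sycl::local_accessor<real_t, 2> tile_v2(
    sycl::range<2>(_TRANSPOSE_BLOCK_SIZE, _TRANSPOSE_BLOCK_SIZE + 1), cgh);  // v2: one padding column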

// ----------------------------------------------------------------

#endif