diff --git a/gpu/mini-apps/math/.gitignore b/gpu/mini-apps/math/.gitignore
new file mode 100644
index 00000000..76a06813
--- /dev/null
+++ b/gpu/mini-apps/math/.gitignore
@@ -0,0 +1 @@
+pm*.cpp
diff --git a/gpu/mini-apps/math/Makefile.nvcc b/gpu/mini-apps/math/Makefile.nvcc
new file mode 100644
index 00000000..65908074
--- /dev/null
+++ b/gpu/mini-apps/math/Makefile.nvcc
@@ -0,0 +1,86 @@
+
+# Definition of MACROS
+
+BASE_LIBGPU=../../src
+
+BINROOT=./
+EXE=a.out
+SHELL=/bin/sh
+CXX = g++
+CXXFLAGS=
+FC=gfortran
+FCFLAGS=
+LD = $(CXX)
+LDFLAGS = -fPIC -shared
+AR = ar rcs
+CPP = cpp -P -traditional
+INSTALL=../
+
+CUDA_CXX = nvcc
+CUDA_CXXFLAGS =
+
+ARCH ?= polaris-gnu-nvcc
+include $(BASE_LIBGPU)/arch/$(ARCH)
+
+CXXFLAGS += -I$(BASE_LIBGPU)
+CUDA_CXXFLAGS += -I$(BASE_LIBGPU)
+
+$(info ARCH is [${ARCH}])
+
+# -- subset of src files with cuda kernels
+CUDA_SRC = pm_cuda.cpp offload.cpp
+CUDA_OBJ = $(CUDA_SRC:.cpp=.o)
+
+CSRC = $(filter-out $(CUDA_SRC), $(wildcard *.cpp))
+INC = $(wildcard *.h)
+COBJ = $(CSRC:.cpp=.o)
+
+FSRC = $(wildcard *.F)
+MOD = $(FSRC:.F=.mod)
+FOBJ = $(FSRC:.F=.o)
+
+# -- only copy source files; headers referenced with compiler flag
+$(shell cp ../../src/pm*.cpp ./)
+
+#
+# -- target : Dependencies
+# -- Rule to create target
+
+$(EXE): $(COBJ) $(CUDA_OBJ) $(FOBJ) $(MOD)
+	$(LD) $(LDFLAGS) -o $@ $(COBJ) $(CUDA_OBJ) $(LIB)
+
+install: $(EXE)
+	cp $(EXE) $(INSTALL)
+#	cp $(MOD) $(FOBJ) $(INSTALL)/include
+
+####################################################################
+
+$(COBJ): %.o: %.cpp
+	$(CXX) $(CXXFLAGS) -c $<
+
+$(FOBJ): %.o: %.F
+	$(FC) $(FCFLAGS) -c $<
+
+$(MOD): %.mod: %.F
+	$(FC) $(FCFLAGS) -c $<
+
+$(CUDA_OBJ): %.o: %.cpp
+	$(CUDA_CXX) -x cu $(CUDA_CXXFLAGS) -c $< -o $@
+
+#
+# -- Remove *.o and *~ from the directory
+clean:
+	rm -f *.o *.mod *~ ./$(EXE)
+	rm -f $(INSTALL)/$(EXE)
+	rm -rf $(EXE).dSYM
+#
+# -- Remove *.o, *~, and executable from the directory
+realclean:
+	rm -f *.o *.mod *~ ./$(EXE)
+	rm -f $(INSTALL)/$(EXE)
+	rm -rf $(EXE).dSYM
+	rm -f *.optrpt
+	rm -f pm*.h pm*.cpp
+
+#
+# -- Simple dependencies
diff --git a/gpu/mini-apps/math/main.cpp b/gpu/mini-apps/math/main.cpp
new file mode 100644
index 00000000..f269a57e
--- /dev/null
+++ b/gpu/mini-apps/math/main.cpp
@@ -0,0 +1,307 @@
+#include
+#include
+#include
+#include
+#include
+
+#include
+#include
+
+#include "pm.h"
+
+#define _NUM_BATCHES 100
+
+#define _NUM_ROWS_A 4
+#define _NUM_COLS_A 3
+
+#define _NUM_ROWS_B _NUM_COLS_A
+#define _NUM_COLS_B 5
+
+#define _TOL 1e-8
+#define _NUM_ITERATIONS_CPU 10
+#define _NUM_ITERATIONS_GPU 1000
+
+#ifdef _SINGLE_PRECISION
+  typedef float real_t;
+#else
+  typedef double real_t;
+#endif
+
+using namespace PM_NS;
+
+extern void init_pm(class PM *);
+
+// A is (m, k) matrix
+// B is (k, n) matrix
+// C is (m, n) matrix
+
+// Column-major ordering transposes everything:
+// to compute A.B, call the API with B.A
+
+extern "C" {
+  void dsymm_(const char*, const char*, const int*, const int*,
+              const double*, const double*, const int*,
+              const double*, const int*,
+              const double*, double*, const int*);
+
+  void dgemm_(const char * transa, const char * transb, const int * m, const int * n,
+              const int * k, const double * alpha, const double * a, const int * lda,
+              const double * b, const int * ldb, const double * beta, double * c,
+              const int * ldc);
+}
+
+// extern void transpose(real_t *, real_t *, const int, const int);
+// extern void copy_naive_gpu(real_t *, real_t *, const int, const int);
+// extern void transpose_naive_gpu(real_t *, real_t *,
const int, const int); +// extern void transpose_gpu_v1(real_t *, real_t *, const int, const int); +// extern void transpose_gpu_v2(real_t *, real_t *, const int, const int); +// extern void transpose_gpu_v3(real_t *, real_t *, const int, const int); + +// ---------------------------------------------------------------- + +void gemm_NN0_naive_cpu(const int * m_, const int * n_, const int * k_, const real_t * alpha_, + real_t * a, const int * lda_, real_t * b, const int * ldb_, + const real_t * beta_, real_t * c, const int * ldc_) +{ + double alpha = *alpha_; + double beta = *beta_; + + int m = *m_; + int n = *n_; + int k = *k_; + + int lda = *lda_; + int ldb = *ldb_; + int ldc = *ldc_; + + for(int i=0; i _TOL) err++; + } + + if(err == 0) printf("Results from %s are correct!! :) \n", name); + else printf("Results from %s are incorrect!! :( \n", name); + + return err; +} + +// ---------------------------------------------------------------- + +void print_matrix(real_t * data, int num_rows, int num_cols, const char * name) +{ + printf("\nMatrix[%s] : %i x %i \n",name, num_rows, num_cols); + for(int i=0; idev_num_devices(); + + if(me == 0) { + printf("\n# of devices= %i\n",num_devices); + pm->dev_properties(num_devices); + } + + // Device ID + + int device_id = me % num_devices; + + pm->dev_set_device(device_id); + + for(int i=0; idev_malloc(_NUM_ROWS_A * _NUM_COLS_A * sizeof(real_t)); + real_t * d_b = (real_t *) pm->dev_malloc(_NUM_ROWS_B * _NUM_COLS_B * sizeof(real_t)); + real_t * d_c = (real_t *) pm->dev_malloc(_NUM_ROWS_A * _NUM_COLS_B * sizeof(real_t)); + + pm->dev_push(d_a, a, _NUM_ROWS_A * _NUM_COLS_A * sizeof(real_t)); + pm->dev_push(d_b, b, _NUM_ROWS_B * _NUM_COLS_B * sizeof(real_t)); + + { + const double alpha = 1.0; + const double beta = 0.0; + + const int m = _NUM_COLS_B; // # rows of first matrix B^T + const int n = _NUM_ROWS_A; // # cols of second matrix A^T + const int k = _NUM_ROWS_B; // # cols of first matrix B^T + + const int ldb = _NUM_COLS_B; // lead dimension of first matrix B^T + const int lda = _NUM_COLS_A; // lead dimension of second matrix A^T + const int ldc = _NUM_COLS_B; // lead dimension of result matrix C^T + +#ifdef _SINGLE_PRECISION + sgemm_((char *) "N", (char *) "N", &m, &n, &k, &alpha, d_b, &ldb, d_a, &lda, &beta, d_c, &ldc); +#else + dgemm_((char *) "N", (char *) "N", &m, &n, &k, &alpha, d_b, &ldb, d_a, &lda, &beta, d_c, &ldc); +#endif + + pm->dev_barrier(); + + double t0 = MPI_Wtime(); + for(int i=0; i<_NUM_ITERATIONS_CPU; ++i) { +#ifdef _SINGLE_PRECISION + sgemm_((char *) "N", (char *) "N", &m, &n, &k, &alpha, d_b, &ldb, d_a, &lda, &beta, d_c, &ldc); +#else + dgemm_((char *) "N", (char *) "N", &m, &n, &k, &alpha, d_b, &ldb, d_a, &lda, &beta, d_c, &ldc); +#endif + } + pm->dev_barrier(); + t = MPI_Wtime() - t0; + } + + pm->dev_pull(d_c, c, _NUM_ROWS_A * _NUM_COLS_B * sizeof(real_t)); + + print_summary(t, _NUM_ROWS_A, _NUM_COLS_A, _NUM_COLS_B, _NUM_ITERATIONS_CPU, "LAPACK gemm"); + + check_result(r, c, _NUM_ROWS_A*_NUM_COLS_B, "lapack_dgemm_cpu"); + + // print_matrix(r, _NUM_ROWS_A, _NUM_COLS_B, "Reference r"); + + // print_matrix(c, _NUM_ROWS_A, _NUM_COLS_B, "Output c"); + + // ---------------------------------------------------------------- + + // Clean up + + pm->dev_free(d_a); + pm->dev_free(d_b); + pm->dev_free(d_c); + + delete pm; + + free(a); + free(b); + free(c); + free(r); + + MPI_Finalize(); +} diff --git a/gpu/mini-apps/math/offload_cuda.cpp b/gpu/mini-apps/math/offload_cuda.cpp new file mode 100644 index 00000000..9832b622 --- /dev/null +++ 
b/gpu/mini-apps/math/offload_cuda.cpp
@@ -0,0 +1,190 @@
+#if defined(_GPU_CUDA)
+
+#include
+#include
+#include
+#include
+#include
+
+#include "pm.h"
+
+#define _TRANSPOSE_BLOCK_SIZE 16
+#define _TRANSPOSE_NUM_ROWS 16
+
+#define _TILE(A,B) (A + B - 1) / B
+
+#ifdef _SINGLE_PRECISION
+  typedef float real_t;
+#else
+  typedef double real_t;
+#endif
+
+using namespace PM_NS;
+
+class PM * pm_ = nullptr;
+
+// ----------------------------------------------------------------
+// GPU Kernels
+// ----------------------------------------------------------------
+
+//https://github.com/NVIDIA-developer-blog/code-samples/blob/master/series/cuda-cpp/transpose/transpose.cu
+// modified to support nonsquare matrices
+
+__global__ void _transpose_gpu_v1(real_t * out, real_t * in, const int nrow, const int ncol)
+{
+  __shared__ real_t cache[_TRANSPOSE_BLOCK_SIZE][_TRANSPOSE_BLOCK_SIZE];
+
+  int irow = blockIdx.x * _TRANSPOSE_BLOCK_SIZE + threadIdx.x;
+  int icol = blockIdx.y * _TRANSPOSE_BLOCK_SIZE + threadIdx.y;
+
+  // load tile into fast local memory
+
+  const int indxi = irow * ncol + icol;
+  for(int i=0; i<_TRANSPOSE_BLOCK_SIZE; i+= _TRANSPOSE_NUM_ROWS) {
+    if(irow < nrow && (icol+i) < ncol) // nonsquare
+      cache[threadIdx.y + i][threadIdx.x] = in[indxi + i]; // threads read chunk of a row and write as a column
+  }
+
+  // block to ensure reads finish
+
+  __syncthreads();
+
+  // swap indices
+
+  irow = blockIdx.y * _TRANSPOSE_BLOCK_SIZE + threadIdx.x;
+  icol = blockIdx.x * _TRANSPOSE_BLOCK_SIZE + threadIdx.y;
+
+  // write tile to global memory
+
+  const int indxo = irow * nrow + icol;
+  for(int i=0; i<_TRANSPOSE_BLOCK_SIZE; i+= _TRANSPOSE_NUM_ROWS) {
+    if(irow < ncol && (icol + i) < nrow) // nonsquare
+      out[indxo + i] = cache[threadIdx.x][threadIdx.y + i];
+  }
+}
+
+// ----------------------------------------------------------------
+
+__global__ void _transpose_gpu_v2(real_t * out, real_t * in, const int nrow, const int ncol)
+{
+  __shared__ real_t cache[_TRANSPOSE_BLOCK_SIZE][_TRANSPOSE_BLOCK_SIZE+1];
+
+  int irow = blockIdx.x * _TRANSPOSE_BLOCK_SIZE + threadIdx.x;
+  int icol = blockIdx.y * _TRANSPOSE_BLOCK_SIZE + threadIdx.y;
+
+  // load tile into fast local memory
+
+  const int indxi = irow * ncol + icol;
+  for(int i=0; i<_TRANSPOSE_BLOCK_SIZE; i+= _TRANSPOSE_NUM_ROWS) {
+    if(irow < nrow && (icol+i) < ncol) // nonsquare
+      cache[threadIdx.y + i][threadIdx.x] = in[indxi + i]; // threads read chunk of a row and write as a column
+  }
+
+  // block to ensure reads finish
+
+  __syncthreads();
+
+  // swap indices
+
+  irow = blockIdx.y * _TRANSPOSE_BLOCK_SIZE + threadIdx.x;
+  icol = blockIdx.x * _TRANSPOSE_BLOCK_SIZE + threadIdx.y;
+
+  // write tile to global memory
+
+  const int indxo = irow * nrow + icol;
+  for(int i=0; i<_TRANSPOSE_BLOCK_SIZE; i+= _TRANSPOSE_NUM_ROWS) {
+    if(irow < ncol && (icol + i) < nrow) // nonsquare
+      out[indxo + i] = cache[threadIdx.x][threadIdx.y + i];
+  }
+}
+
+// ----------------------------------------------------------------
+
+__global__ void _transpose_naive_gpu(real_t * out, real_t * in, const int nrow, const int ncol)
+{
+  const int i = blockIdx.x * blockDim.x + threadIdx.x;
+
+  if(i >= nrow) return;
+
+  int j = blockIdx.y * blockDim.y + threadIdx.y;
+
+  while (j < ncol) {
+    out[j*nrow + i] = in[i*ncol + j];
+    j += blockDim.y;
+  }
+
+}
+
+// ----------------------------------------------------------------
+
+__global__ void _copy_naive_gpu(real_t * out, real_t * in, const int nrow, const int ncol)
+{
+  const int i = blockIdx.x * blockDim.x + threadIdx.x;
+
+  if(i >= nrow) return;
+
+  int j = blockIdx.y * blockDim.y + threadIdx.y;
+
+  while (j < ncol) {
+    out[i*ncol + j] = in[i*ncol + j];
+    j += blockDim.y;
+  }
+
+}
+
+// ----------------------------------------------------------------
+// Host-side functions
+// ----------------------------------------------------------------
+
+void init_pm(class PM * pm)
+{
+  pm_ = pm;
+}
+
+void copy_naive_gpu(real_t * b, real_t * a, const int num_rows, const int num_cols)
+{
+  dim3 grid_size(num_rows, 1, 1);
+  dim3 block_size(1, _TRANSPOSE_BLOCK_SIZE, 1);
+
+  _copy_naive_gpu<<<grid_size, block_size>>>(b, a, num_rows, num_cols);
+}
+
+// ----------------------------------------------------------------
+
+void transpose_naive_gpu(real_t * b, real_t * a, const int num_rows, const int num_cols)
+{
+  dim3 grid_size(num_rows, 1, 1);
+  dim3 block_size(1, _TRANSPOSE_BLOCK_SIZE, 1);
+
+  _transpose_naive_gpu<<<grid_size, block_size>>>(b, a, num_rows, num_cols);
+}
+
+// ----------------------------------------------------------------
+
+void transpose_gpu_v1(real_t * b, real_t * a, const int num_rows, const int num_cols)
+{
+  dim3 grid_size(_TILE(num_rows, _TRANSPOSE_BLOCK_SIZE), _TILE(num_cols, _TRANSPOSE_BLOCK_SIZE), 1);
+  dim3 block_size(_TRANSPOSE_BLOCK_SIZE, _TRANSPOSE_NUM_ROWS, 1);
+
+  // printf("\ngrid_size= %i %i %i\n", _TILE(_NUM_ROWS, _TRANSPOSE_BLOCK_SIZE), _TILE(_NUM_COLS, _TRANSPOSE_BLOCK_SIZE), 1);
+  // printf("block_size= %i %i %i\n",_TRANSPOSE_BLOCK_SIZE, _TRANSPOSE_NUM_ROWS, 1);
+
+  _transpose_gpu_v1<<<grid_size, block_size>>>(b, a, num_rows, num_cols);
+}
+
+// ----------------------------------------------------------------
+
+void transpose_gpu_v2(real_t * b, real_t * a, const int num_rows, const int num_cols)
+{
+  dim3 grid_size(_TILE(num_rows, _TRANSPOSE_BLOCK_SIZE), _TILE(num_cols, _TRANSPOSE_BLOCK_SIZE), 1);
+  dim3 block_size(_TRANSPOSE_BLOCK_SIZE, _TRANSPOSE_NUM_ROWS, 1);
+
+  // printf("\ngrid_size= %i %i %i\n", _TILE(_NUM_ROWS, _TRANSPOSE_BLOCK_SIZE), _TILE(_NUM_COLS, _TRANSPOSE_BLOCK_SIZE), 1);
+  // printf("block_size= %i %i %i\n",_TRANSPOSE_BLOCK_SIZE, _TRANSPOSE_NUM_ROWS, 1);
+
+  _transpose_gpu_v2<<<grid_size, block_size>>>(b, a, num_rows, num_cols);
+}
+
+// ----------------------------------------------------------------
+
+#endif
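Host-side use of the tiled transpose above follows the same pattern main.cpp uses for the GEMM benchmark: allocate device buffers through the PM wrapper, push the input, launch, and pull the result back. A minimal sketch, assuming the PM methods dev_malloc/dev_push/dev_pull/dev_barrier/dev_free behave as they are used in main.cpp and the default double-precision real_t (illustrative only, not part of the patch):

```
// Transpose a row-major nrow x ncol matrix on the device and copy it back.
// Requires "pm.h" and the transpose_gpu_v2() wrapper from offload_cuda.cpp.
void transpose_on_device(PM * pm, double * h_in, double * h_out,
                         const int nrow, const int ncol)
{
  const size_t bytes = (size_t) nrow * ncol * sizeof(double);

  double * d_in  = (double *) pm->dev_malloc(bytes);   // device input
  double * d_out = (double *) pm->dev_malloc(bytes);   // device output (ncol x nrow)

  pm->dev_push(d_in, h_in, bytes);                     // host -> device
  transpose_gpu_v2(d_out, d_in, nrow, ncol);           // padded shared-memory version
  pm->dev_barrier();                                   // kernel launch is asynchronous
  pm->dev_pull(d_out, h_out, bytes);                   // device -> host

  pm->dev_free(d_in);
  pm->dev_free(d_out);
}
```

The only difference between _transpose_gpu_v1 and _transpose_gpu_v2 is the +1 padding on the second dimension of the shared-memory tile; the padding staggers columns across shared-memory banks and avoids bank conflicts when the tile is read back column-wise.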
diff --git a/gpu/mini-apps/math/offload_host.cpp b/gpu/mini-apps/math/offload_host.cpp
new file mode 100644
index 00000000..7a4ddc88
--- /dev/null
+++ b/gpu/mini-apps/math/offload_host.cpp
@@ -0,0 +1,81 @@
+#if defined(_USE_CPU)
+
+#include
+#include
+#include
+#include
+#include
+
+#include "pm.h"
+
+#define _TRANSPOSE_BLOCK_SIZE 16
+#define _TRANSPOSE_NUM_ROWS 16
+
+#define _TILE(A,B) (A + B - 1) / B
+
+#ifdef _SINGLE_PRECISION
+  typedef float real_t;
+#else
+  typedef double real_t;
+#endif
+
+using namespace PM_NS;
+
+class PM * pm_ = nullptr;
+
+// ----------------------------------------------------------------
+// Host-side functions
+// ----------------------------------------------------------------
+
+void init_pm(class PM * pm)
+{
+  pm_ = pm;
+}
+
+// void copy_naive_gpu(real_t * b, real_t * a, const int num_rows, const int num_cols)
+// {
+//   dim3 grid_size(num_rows, 1, 1);
+//   dim3 block_size(1, _TRANSPOSE_BLOCK_SIZE, 1);
+
+//   _copy_naive_gpu<<>>(b, a, num_rows, num_cols);
+// }
+
+// // ----------------------------------------------------------------
+
+// void transpose_naive_gpu(real_t * b, real_t * a, const int num_rows, const int num_cols)
+// {
+//   dim3 grid_size(num_rows, 1, 1);
+//   dim3 block_size(1, _TRANSPOSE_BLOCK_SIZE, 1);
+
+//   _transpose_naive_gpu<<>>(b, a, num_rows, num_cols);
+// }
+
+// // ----------------------------------------------------------------
+
+// void transpose_gpu_v1(real_t * b, real_t * a, const int num_rows, const int num_cols)
+// {
+//   dim3 grid_size(_TILE(num_rows, _TRANSPOSE_BLOCK_SIZE), _TILE(num_cols, _TRANSPOSE_BLOCK_SIZE), 1);
+//   dim3 block_size(_TRANSPOSE_BLOCK_SIZE, _TRANSPOSE_NUM_ROWS, 1);
+
+//   // printf("\ngrid_size= %i %i %i\n", _TILE(_NUM_ROWS, _TRANSPOSE_BLOCK_SIZE), _TILE(_NUM_COLS, _TRANSPOSE_BLOCK_SIZE), 1);
+//   // printf("block_size= %i %i %i\n",_TRANSPOSE_BLOCK_SIZE, _TRANSPOSE_NUM_ROWS, 1);
+
+//   _transpose_gpu_v1<<>>(b, a, num_rows, num_cols);
+// }
+
+// // ----------------------------------------------------------------
+
+// void transpose_gpu_v2(real_t * b, real_t * a, const int num_rows, const int num_cols)
+// {
+//   dim3 grid_size(_TILE(num_rows, _TRANSPOSE_BLOCK_SIZE), _TILE(num_cols, _TRANSPOSE_BLOCK_SIZE), 1);
+//   dim3 block_size(_TRANSPOSE_BLOCK_SIZE, _TRANSPOSE_NUM_ROWS, 1);
+
+//   // printf("\ngrid_size= %i %i %i\n", _TILE(_NUM_ROWS, _TRANSPOSE_BLOCK_SIZE), _TILE(_NUM_COLS, _TRANSPOSE_BLOCK_SIZE), 1);
+//   // printf("block_size= %i %i %i\n",_TRANSPOSE_BLOCK_SIZE, _TRANSPOSE_NUM_ROWS, 1);
+
+//   _transpose_gpu_v2<<>>(b, a, num_rows, num_cols);
+// }
+
+// // ----------------------------------------------------------------
+
+#endif
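offload_host.cpp currently provides only init_pm; the transpose entry points are left commented out, so a _USE_CPU build has no fallback implementation. A plain loop version, matching the `extern void transpose(real_t *, real_t *, const int, const int)` declaration that is commented out in main.cpp, would be enough for correctness checks (hypothetical sketch, not part of the patch):

```
// Hypothetical CPU reference: out (ncol x nrow) = transpose of in (nrow x ncol), row-major.
void transpose(real_t * out, real_t * in, const int nrow, const int ncol)
{
  for(int i=0; i<nrow; ++i)
    for(int j=0; j<ncol; ++j)
      out[j*nrow + i] = in[i*ncol + j];
}
```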
diff --git a/gpu/mini-apps/math/offload_sycl.cpp b/gpu/mini-apps/math/offload_sycl.cpp
new file mode 100644
index 00000000..aa848ae4
--- /dev/null
+++ b/gpu/mini-apps/math/offload_sycl.cpp
@@ -0,0 +1,201 @@
+#if defined(_GPU_SYCL_CUDA)
+
+#include
+#include
+#include
+#include
+#include
+
+#include "pm.h"
+
+#define _TRANSPOSE_BLOCK_SIZE 16
+#define _TRANSPOSE_NUM_ROWS 16
+
+//#define _TILE(A,B) (A + B - 1) / B
+#define _TILE(A,B) ((A + B - 1) / B) * B
+
+#ifdef _SINGLE_PRECISION
+  typedef float real_t;
+#else
+  typedef double real_t;
+#endif
+
+using namespace PM_NS;
+
+class PM * pm_ = nullptr;
+
+class Kernel_Copy_Naive;
+class Kernel_Transpose_Naive;
+class Kernel_Transpose_V1;
+class Kernel_Transpose_V2;
+
+// ----------------------------------------------------------------
+
+void init_pm(class PM * pm)
+{
+  pm_ = pm;
+}
+
+// gridDim.{x,y,z}   == get_group_range({0,1,2})
+// blockDim.{x,y,z}  == get_local_range({0,1,2})
+// blockIdx.{x,y,z}  == get_group({0,1,2})
+// threadIdx.{x,y,z} == get_local_id({0,1,2})
+
+void copy_naive_gpu(real_t * out, real_t * in, const int num_rows, const int num_cols)
+{
+  sycl::queue * Q = pm_->dev_get_queue();
+
+  sycl::range<3> grid_size(num_rows, _TRANSPOSE_BLOCK_SIZE, 1);
+  sycl::range<3> block_size(1, _TRANSPOSE_BLOCK_SIZE, 1);
+
+  sycl::nd_range<3> kernel_rng(grid_size, block_size);
+
+  Q->submit([&](sycl::handler &cgh) {
+
+    cgh.parallel_for(kernel_rng, [=](sycl::nd_item<3> idx) {
+      const int i = idx.get_group(0) * idx.get_local_range(0) + idx.get_local_id(0);
+
+      if(i >= num_rows) return;
+
+      int j = idx.get_group(1) * idx.get_local_range(1) + idx.get_local_id(1);
+
+      while(j < num_cols) {
+        out[i*num_cols + j] = in[i*num_cols + j];
+        j += idx.get_local_range(1);
+      }
+    }); // End of the kernel function
+
+    // }).wait(); // End of the queue commands
+  }); // End of the queue commands
+}
+
+// ----------------------------------------------------------------
+
+void transpose_naive_gpu(real_t * out, real_t * in, const int num_rows, const int num_cols)
+{
+  sycl::queue * Q = pm_->dev_get_queue();
+
+  sycl::range<3> grid_size(num_rows, _TRANSPOSE_BLOCK_SIZE, 1);
+  sycl::range<3> block_size(1, _TRANSPOSE_BLOCK_SIZE, 1);
+
+  sycl::nd_range<3> kernel_rng(grid_size, block_size);
+
+  Q->submit([&](sycl::handler &cgh) {
+
+    cgh.parallel_for(kernel_rng, [=](sycl::nd_item<3> idx) {
+      const int i = idx.get_group(0) * idx.get_local_range(0) + idx.get_local_id(0);
+
+      if(i >= num_rows) return;
+
+      int j = idx.get_group(1) * idx.get_local_range(1) + idx.get_local_id(1);
+
+      while(j < num_cols) {
+        out[j*num_rows + i] = in[i*num_cols + j];
+        j += idx.get_local_range(1);
+      }
+    }); // End of the kernel function
+
+    //}).wait(); // End of the queue commands
+  }); // End of the queue commands
+}
+
+// ----------------------------------------------------------------
+
+void transpose_gpu_v1(real_t * out, real_t * in, const int num_rows, const int num_cols)
+{
+  sycl::queue * Q = pm_->dev_get_queue();
+
+  sycl::range<3> grid_size(_TILE(num_rows, _TRANSPOSE_BLOCK_SIZE), _TILE(num_cols, _TRANSPOSE_NUM_ROWS), 1);
+  sycl::range<3> block_size(_TRANSPOSE_BLOCK_SIZE, _TRANSPOSE_NUM_ROWS, 1);
+
+  sycl::nd_range<3> kernel_rng(grid_size, block_size);
+
+  Q->submit([&](sycl::handler &cgh) {
+
+    sycl::local_accessor<real_t, 2> cache(sycl::range(_TRANSPOSE_BLOCK_SIZE, _TRANSPOSE_BLOCK_SIZE), cgh);
+
+    cgh.parallel_for(kernel_rng, [=](sycl::nd_item<3> idx) {
+      int irow = idx.get_group(0) * _TRANSPOSE_BLOCK_SIZE + idx.get_local_id(0);
+      int icol = idx.get_group(1) * _TRANSPOSE_BLOCK_SIZE + idx.get_local_id(1);
+
+      // load tile into fast local memory
+
+      const int indxi = irow * num_cols + icol;
+      for(int i=0; i<_TRANSPOSE_BLOCK_SIZE; i+= _TRANSPOSE_NUM_ROWS) {
+        if(irow < num_rows && (icol+i) < num_cols) // nonsquare
+          cache[idx.get_local_id(1)+i][idx.get_local_id(0)] = in[indxi + i]; // threads read chunk of a row and write as a column
+      }
+
+      // block to ensure reads finish
+
+      idx.barrier(sycl::access::fence_space::local_space);
+
+      // swap indices
+
+      irow = idx.get_group(1) * _TRANSPOSE_BLOCK_SIZE + idx.get_local_id(0);
+      icol = idx.get_group(0) * _TRANSPOSE_BLOCK_SIZE + idx.get_local_id(1);
+
+      // write tile to global memory
+
+      const int indxo = irow * num_rows + icol;
+      for(int i=0; i<_TRANSPOSE_BLOCK_SIZE; i+= _TRANSPOSE_NUM_ROWS) {
+        if(irow < num_cols && (icol+i) < num_rows) // nonsquare
+          out[indxo + i] = cache[idx.get_local_id(0)][idx.get_local_id(1) + i];
+      }
+    }); // End of the kernel function
+
+    // }).wait(); // End of the queue commands
+  }); // End of the queue commands
+}
+
+// ----------------------------------------------------------------
+
+void transpose_gpu_v2(real_t * out, real_t * in, const int num_rows, const int num_cols)
+{
+  sycl::queue * Q = pm_->dev_get_queue();
+
+  sycl::range<3> grid_size(_TILE(num_rows, _TRANSPOSE_BLOCK_SIZE), _TILE(num_cols, _TRANSPOSE_NUM_ROWS), 1);
+  sycl::range<3> block_size(_TRANSPOSE_BLOCK_SIZE, _TRANSPOSE_NUM_ROWS, 1);
+
+  sycl::nd_range<3> kernel_rng(grid_size, block_size);
+
+  Q->submit([&](sycl::handler &cgh) {
+
+    sycl::local_accessor<real_t, 2> cache(sycl::range(_TRANSPOSE_BLOCK_SIZE, _TRANSPOSE_BLOCK_SIZE+1), cgh);
+
+    cgh.parallel_for(kernel_rng, [=](sycl::nd_item<3> idx) {
+
+      int irow = idx.get_group(0) * _TRANSPOSE_BLOCK_SIZE + idx.get_local_id(0);
+      int icol = idx.get_group(1) * _TRANSPOSE_BLOCK_SIZE + idx.get_local_id(1);
+
+      // load tile into fast local memory
+
+      const int indxi = irow * num_cols + icol;
+      for(int i=0; i<_TRANSPOSE_BLOCK_SIZE; i+= _TRANSPOSE_NUM_ROWS) {
+        if(irow < num_rows && (icol+i) < num_cols) // nonsquare
+          cache[idx.get_local_id(1)+i][idx.get_local_id(0)] = in[indxi + i]; // threads read chunk of a row and write as a column
+      }
+
+      // block to ensure reads finish
+
+      idx.barrier(sycl::access::fence_space::local_space);
+
+      // swap indices
+
+      irow = idx.get_group(1) * _TRANSPOSE_BLOCK_SIZE + idx.get_local_id(0);
+      icol = idx.get_group(0) * _TRANSPOSE_BLOCK_SIZE + idx.get_local_id(1);
+
+      // write tile to global memory
+
+      const int indxo = irow * num_rows + icol;
+      for(int i=0; i<_TRANSPOSE_BLOCK_SIZE; i+= _TRANSPOSE_NUM_ROWS) {
+        if(irow < num_cols && (icol+i) < num_rows) // nonsquare
+          out[indxo + i] = cache[idx.get_local_id(0)][idx.get_local_id(1) + i];
+      }
+    }); // End of the kernel function
+
+    // }).wait(); // End of the queue commands
+  }); // End of the queue commands
+}
+
+#endif
diff --git a/gpu/mini-apps/math/run.sh b/gpu/mini-apps/math/run.sh
new file mode 100755
index 00000000..71ecf524
--- /dev/null
+++ b/gpu/mini-apps/math/run.sh
@@ -0,0 +1,7 @@
+
+./a.out
+
+#nsys profile --stats=true ./a.out
+
+# WARNING :: set _NUM_ITERATIONS_GPU to something small (e.g. 5)
+#ncu --print-summary per-kernel ./a.out
diff --git a/gpu/src/pm_cuda.h b/gpu/src/pm_cuda.h
index e0dbc754..02db1d8e 100644
--- a/gpu/src/pm_cuda.h
+++ b/gpu/src/pm_cuda.h
@@ -78,6 +78,7 @@ namespace PM_NS {
     void dev_stream_destroy(cudaStream_t & s);
     void dev_stream_wait(cudaStream_t & s);
 
+  private:
     void uuid_print(cudaUUID_t);
 
   };
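main.cpp drives the benchmark by handing row-major buffers to the column-major Fortran dgemm_: as its comments note, column ordering transposes everything, so A.B is obtained by calling the routine with B.A and swapping the roles of m and n. A self-contained sketch of that convention, using the standard BLAS dgemm_ signature and sizes chosen only for illustration:

```
// Row-major C(MxN) = A(MxK) . B(KxN) via column-major dgemm_:
// pass B as the first operand and exchange m and n.
#include <cstdio>

extern "C" void dgemm_(const char*, const char*, const int*, const int*, const int*,
                       const double*, const double*, const int*,
                       const double*, const int*,
                       const double*, double*, const int*);

int main()
{
  const int M = 4, K = 3, N = 5;
  double A[M*K], B[K*N], C[M*N];
  for(int i=0; i<M*K; ++i) A[i] = i;     // arbitrary test data
  for(int i=0; i<K*N; ++i) B[i] = 1.0;

  const double alpha = 1.0, beta = 0.0;
  // "m"=N, "n"=M, "k"=K; leading dimensions are the row-major row lengths
  dgemm_("N", "N", &N, &M, &K, &alpha, B, &N, A, &K, &beta, C, &N);

  printf("C[0][0] = %f\n", C[0]);        // sum of A's first row = 0+1+2 = 3
  return 0;
}
```

This mirrors the choices in main.cpp, where m = _NUM_COLS_B, n = _NUM_ROWS_A, k = _NUM_ROWS_B and d_b is passed before d_a.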
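One detail worth noting when comparing the CUDA and SYCL versions: a CUDA launch is specified as (number of blocks) x (block size), whereas sycl::nd_range takes the total global range x local range. That is why offload_sycl.cpp redefines _TILE to round up to a multiple of the block size instead of returning a block count. A small check with hypothetical sizes:

```
// CUDA grid counts vs. SYCL global ranges for the same launch geometry.
#include <cassert>

int main()
{
  const int num_rows = 1000, block = 16;

  const int cuda_blocks = (num_rows + block - 1) / block;           // _TILE in offload_cuda.cpp
  const int sycl_global = ((num_rows + block - 1) / block) * block; // _TILE in offload_sycl.cpp

  assert(cuda_blocks == 63);
  assert(sycl_global == cuda_blocks * block);  // 1008 work-items in 63 groups of 16
  return 0;
}
```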