update Makefile and test syclomatic conversion
cjknight committed Dec 13, 2024
1 parent be03fc2 commit 179f5ba
Showing 3 changed files with 389 additions and 134 deletions.
4 changes: 2 additions & 2 deletions gpu/mini-apps/transpose/Makefile
@@ -16,7 +16,7 @@ AR = ar rcs
CPP=cpp -P -traditional
INSTALL=../

ARCH ?= polaris
ARCH ?= polaris-gnu-nvcc
include $(BASE_LIBGPU)/arch/$(ARCH)

CXXFLAGS += -I$(BASE_LIBGPU)
@@ -50,7 +50,7 @@ install: $(EXE)
cp $(EXE) $(INSTALL)
# cp $(MOD) $(FOBJ) $(INSTALL)/include

pm:
update:
cp ../../src/pm*.cpp ./

####################################################################
318 changes: 186 additions & 132 deletions gpu/mini-apps/transpose/offload_sycl.cpp
@@ -1,5 +1,6 @@
#if defined(_GPU_SYCL_CUDA)
#if defined(_GPU_SYCL) || defined(_GPU_SYCL_CUDA)

#include <sycl/sycl.hpp>
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
@@ -11,8 +12,7 @@
#define _TRANSPOSE_BLOCK_SIZE 16
#define _TRANSPOSE_NUM_ROWS 16

//#define _TILE(A,B) (A + B - 1) / B
#define _TILE(A,B) ((A + B - 1) / B) * B
#define _TILE(A,B) (A + B - 1) / B

#ifdef _SINGLE_PRECISION
typedef float real_t;
@@ -24,178 +24,232 @@ using namespace PM_NS;

class PM * pm_ = nullptr;

class Kernel_Copy_Naive;
class Kernel_Transpose_Naive;
class Kernel_Transpose_V1;
class Kernel_Transpose_V2;

// ----------------------------------------------------------------
// GPU Kernels
// ----------------------------------------------------------------

void init_pm(class PM * pm)
//https://github.com/NVIDIA-developer-blog/code-samples/blob/master/series/cuda-cpp/transpose/transpose.cu
// modified to support nonsquare matrices

void _transpose_gpu_v1(real_t * out, real_t * in, const int nrow, const int ncol,
const sycl::nd_item<3> &item_ct1,
sycl::local_accessor<real_t, 2> cache)
{
pm_ = pm;
}

// gridDim.{x,y,z} == get_group_range({0,1,2})
// blockDim.{x,y,z} == get_local_range({0,1,2})
// blockIdx.{x,y,z} == get_group({0,1,2})
// threadIdx.{x,y,z} == get_local_id({0,1,2})
int irow =
item_ct1.get_group(2) * _TRANSPOSE_BLOCK_SIZE + item_ct1.get_local_id(2);
int icol =
item_ct1.get_group(1) * _TRANSPOSE_BLOCK_SIZE + item_ct1.get_local_id(1);

void copy_naive_gpu(real_t * out, real_t * in, const int num_rows, const int num_cols)
{
sycl::queue * Q = pm_->dev_get_queue();

sycl::range<3> grid_size(num_rows, _TRANSPOSE_BLOCK_SIZE, 1);
sycl::range<3> block_size(1, _TRANSPOSE_BLOCK_SIZE, 1);
// load tile into fast local memory

sycl::nd_range<3> kernel_rng(grid_size, block_size);

Q->submit([&](sycl::handler &cgh) {

cgh.parallel_for<Kernel_Copy_Naive>(kernel_rng, [=](sycl::nd_item<3> idx) {
const int i = idx.get_group(0) * idx.get_local_range(0) + idx.get_local_id(0);

if(i >= num_rows) return;

int j = idx.get_group(1) * idx.get_local_range(1) + idx.get_local_id(1);

while(j < num_cols) {
out[i*num_cols + j] = in[i*num_cols + j];
j += idx.get_local_range(1);
}
}); // End of the kernel function

// }).wait(); // End of the queue commands
}); // End of the queue commands
const int indxi = irow * ncol + icol;
for(int i=0; i<_TRANSPOSE_BLOCK_SIZE; i+= _TRANSPOSE_NUM_ROWS) {
if(irow < nrow && (icol+i) < ncol) // nonsquare
cache[item_ct1.get_local_id(1) + i][item_ct1.get_local_id(2)] =
in[indxi + i]; // threads read chunk of a row and write as a column
}

// block to ensure reads finish

item_ct1.barrier(sycl::access::fence_space::local_space);

// swap indices

irow =
item_ct1.get_group(1) * _TRANSPOSE_BLOCK_SIZE + item_ct1.get_local_id(2);
icol =
item_ct1.get_group(2) * _TRANSPOSE_BLOCK_SIZE + item_ct1.get_local_id(1);

// write tile to global memory

const int indxo = irow * nrow + icol;
for(int i=0; i<_TRANSPOSE_BLOCK_SIZE; i+= _TRANSPOSE_NUM_ROWS) {
if(irow < ncol && (icol + i) < nrow) // nonsquare
out[indxo + i] =
cache[item_ct1.get_local_id(2)][item_ct1.get_local_id(1) + i];
}
}
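
For reference, a minimal sketch (not part of this commit) of the index mapping the converted kernel uses: the SYCLomatic conversion reverses dimensions, so CUDA's blockIdx.x/threadIdx.x pair corresponds to get_group(2)/get_local_id(2) and the .y pair to dimension 1.

inline int tile_row(const sycl::nd_item<3> &item) {
  // blockIdx.x * _TRANSPOSE_BLOCK_SIZE + threadIdx.x in the CUDA original
  return item.get_group(2) * _TRANSPOSE_BLOCK_SIZE + item.get_local_id(2);
}

inline int tile_col(const sycl::nd_item<3> &item) {
  // blockIdx.y * _TRANSPOSE_BLOCK_SIZE + threadIdx.y in the CUDA original
  return item.get_group(1) * _TRANSPOSE_BLOCK_SIZE + item.get_local_id(1);
}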

// ----------------------------------------------------------------

void transpose_naive_gpu(real_t * out, real_t * in, const int num_rows, const int num_cols)
void _transpose_gpu_v2(real_t * out, real_t * in, const int nrow, const int ncol,
const sycl::nd_item<3> &item_ct1,
sycl::local_accessor<real_t, 2> cache)
{
sycl::queue * Q = pm_->dev_get_queue();

sycl::range<3> grid_size(num_rows, _TRANSPOSE_BLOCK_SIZE, 1);
sycl::range<3> block_size(1, _TRANSPOSE_BLOCK_SIZE, 1);

sycl::nd_range<3> kernel_rng(grid_size, block_size);

Q->submit([&](sycl::handler &cgh) {

cgh.parallel_for<Kernel_Transpose_Naive>(kernel_rng, [=](sycl::nd_item<3> idx) {
const int i = idx.get_group(0) * idx.get_local_range(0) + idx.get_local_id(0);

if(i >= num_rows) return;

int j = idx.get_group(1) * idx.get_local_range(1) + idx.get_local_id(1);

while(j < num_cols) {
out[j*num_rows + i] = in[i*num_cols + j];
j += idx.get_local_range(1);
}
}); // End of the kernel function

//}).wait(); // End of the queue commands
}); // End of the queue commands
int irow =
item_ct1.get_group(2) * _TRANSPOSE_BLOCK_SIZE + item_ct1.get_local_id(2);
int icol =
item_ct1.get_group(1) * _TRANSPOSE_BLOCK_SIZE + item_ct1.get_local_id(1);

// load tile into fast local memory

const int indxi = irow * ncol + icol;
for(int i=0; i<_TRANSPOSE_BLOCK_SIZE; i+= _TRANSPOSE_NUM_ROWS) {
if(irow < nrow && (icol+i) < ncol) // nonsquare
cache[item_ct1.get_local_id(1) + i][item_ct1.get_local_id(2)] =
in[indxi + i]; // threads read chunk of a row and write as a column
}

// block to ensure reads finish

item_ct1.barrier(sycl::access::fence_space::local_space);

// swap indices

irow =
item_ct1.get_group(1) * _TRANSPOSE_BLOCK_SIZE + item_ct1.get_local_id(2);
icol =
item_ct1.get_group(2) * _TRANSPOSE_BLOCK_SIZE + item_ct1.get_local_id(1);

// write tile to global memory

const int indxo = irow * nrow + icol;
for(int i=0; i<_TRANSPOSE_BLOCK_SIZE; i+= _TRANSPOSE_NUM_ROWS) {
if(irow < ncol && (icol + i) < nrow) // nonsquare
out[indxo + i] =
cache[item_ct1.get_local_id(2)][item_ct1.get_local_id(1) + i];
}
}

// ----------------------------------------------------------------

void transpose_gpu_v1(real_t * out, real_t * in, const int num_rows, const int num_cols)
void _transpose_naive_gpu(real_t * out, real_t * in, const int nrow, const int ncol,
const sycl::nd_item<3> &item_ct1)
{
sycl::queue * Q = pm_->dev_get_queue();

sycl::range<3> grid_size(_TILE(num_rows, _TRANSPOSE_BLOCK_SIZE), _TILE(num_cols, _TRANSPOSE_NUM_ROWS), 1);
sycl::range<3> block_size(_TRANSPOSE_BLOCK_SIZE, _TRANSPOSE_NUM_ROWS, 1);
const int i = item_ct1.get_group(2) * item_ct1.get_local_range(2) +
item_ct1.get_local_id(2);

if(i >= nrow) return;

sycl::nd_range<3> kernel_rng(grid_size, block_size);
int j = item_ct1.get_group(1) * item_ct1.get_local_range(1) +
item_ct1.get_local_id(1);

while (j < ncol) {
out[j*nrow + i] = in[i*ncol + j];
j += item_ct1.get_local_range(1);
}

Q->submit([&](sycl::handler &cgh) {
}
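
For comparison, a plain CPU reference (illustrative, not part of this commit) that produces the same result as the kernels above, assuming row-major storage: element (i,j) of the nrow x ncol input ends up at (j,i) of the ncol x nrow output.

void transpose_cpu_ref(real_t * out, const real_t * in, const int nrow, const int ncol)
{
  for (int i = 0; i < nrow; ++i)
    for (int j = 0; j < ncol; ++j)
      out[j*nrow + i] = in[i*ncol + j];  // same indexing as the naive kernel
}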

sycl::local_accessor<real_t, 2> cache(sycl::range(_TRANSPOSE_BLOCK_SIZE, _TRANSPOSE_BLOCK_SIZE), cgh);

cgh.parallel_for<Kernel_Transpose_V1>(kernel_rng, [=](sycl::nd_item<3> idx) {
int irow = idx.get_group(0) * _TRANSPOSE_BLOCK_SIZE + idx.get_local_id(0);
int icol = idx.get_group(1) * _TRANSPOSE_BLOCK_SIZE + idx.get_local_id(1);
// ----------------------------------------------------------------

// load tile into fast local memory
void _copy_naive_gpu(real_t * out, real_t * in, const int nrow, const int ncol,
const sycl::nd_item<3> &item_ct1)
{
const int i = item_ct1.get_group(2) * item_ct1.get_local_range(2) +
item_ct1.get_local_id(2);

const int indxi = irow * num_cols + icol;
for(int i=0; i<_TRANSPOSE_BLOCK_SIZE; i+= _TRANSPOSE_NUM_ROWS) {
if(irow < num_rows && (icol+i) < num_cols) // nonsquare
cache[idx.get_local_id(1)+i][idx.get_local_id(0)] = in[indxi + i]; // threads read chunk of a row and write as a column
}
if(i >= nrow) return;

// block to ensure reads finish
int j = item_ct1.get_group(1) * item_ct1.get_local_range(1) +
item_ct1.get_local_id(1);

idx.barrier(sycl::access::fence_space::local_space);
while (j < ncol) {
out[i*ncol + j] = in[i*ncol + j];
j += item_ct1.get_local_range(1);
}

}

// swap indices
// ----------------------------------------------------------------
// Host-side functions
// ----------------------------------------------------------------

irow = idx.get_group(1) * _TRANSPOSE_BLOCK_SIZE + idx.get_local_id(0);
icol = idx.get_group(0) * _TRANSPOSE_BLOCK_SIZE + idx.get_local_id(1);
void init_pm(class PM * pm)
{
pm_ = pm;
}

// write tile to global memory
void copy_naive_gpu(real_t * b, real_t * a, const int num_rows, const int num_cols)
{
sycl::range<3> grid_size(1, 1, num_rows);
sycl::range<3> block_size(1, _TRANSPOSE_BLOCK_SIZE, 1);

const int indxo = irow * num_rows + icol;
for(int i=0; i<_TRANSPOSE_BLOCK_SIZE; i+= _TRANSPOSE_NUM_ROWS) {
if(irow < num_cols && (icol+i) < num_rows) // nonsquare
out[indxo + i] = cache[idx.get_local_id(0)][idx.get_local_id(1) + i];
}
}); // End of the kernel function

// }).wait(); // End of the queue commands
}); // End of the queue commands
sycl::queue * s = pm_->dev_get_queue();

// dpct::get_in_order_queue().parallel_for(
s->parallel_for(
sycl::nd_range<3>(grid_size * block_size, block_size),
[=](sycl::nd_item<3> item_ct1) {
_copy_naive_gpu(b, a, num_rows, num_cols, item_ct1);
});
}

// ----------------------------------------------------------------

void transpose_gpu_v2(real_t * out, real_t * in, const int num_rows, const int num_cols)
void transpose_naive_gpu(real_t * b, real_t * a, const int num_rows, const int num_cols)
{
sycl::queue * Q = pm_->dev_get_queue();

sycl::range<3> grid_size(_TILE(num_rows, _TRANSPOSE_BLOCK_SIZE), _TILE(num_cols, _TRANSPOSE_NUM_ROWS), 1);
sycl::range<3> block_size(_TRANSPOSE_BLOCK_SIZE, _TRANSPOSE_NUM_ROWS, 1);
sycl::range<3> grid_size(1, 1, num_rows);
sycl::range<3> block_size(1, _TRANSPOSE_BLOCK_SIZE, 1);

sycl::nd_range<3> kernel_rng(grid_size, block_size);
sycl::queue * s = pm_->dev_get_queue();

Q->submit([&](sycl::handler &cgh) {

sycl::local_accessor<real_t, 2> cache(sycl::range(_TRANSPOSE_BLOCK_SIZE, _TRANSPOSE_BLOCK_SIZE+1), cgh);

cgh.parallel_for<Kernel_Transpose_V2>(kernel_rng, [=](sycl::nd_item<3> idx) {

int irow = idx.get_group(0) * _TRANSPOSE_BLOCK_SIZE + idx.get_local_id(0);
int icol = idx.get_group(1) * _TRANSPOSE_BLOCK_SIZE + idx.get_local_id(1);
// dpct::get_in_order_queue().parallel_for(
s->parallel_for(
sycl::nd_range<3>(grid_size * block_size, block_size),
[=](sycl::nd_item<3> item_ct1) {
_transpose_naive_gpu(b, a, num_rows, num_cols, item_ct1);
});
}

// load tile into fast local memory
// ----------------------------------------------------------------

const int indxi = irow * num_cols + icol;
for(int i=0; i<_TRANSPOSE_BLOCK_SIZE; i+= _TRANSPOSE_NUM_ROWS) {
if(irow < num_rows && (icol+i) < num_cols) // nonsquare
cache[idx.get_local_id(1)+i][idx.get_local_id(0)] = in[indxi + i]; // threads read chunk of a row and write as a column
}
void transpose_gpu_v1(real_t * b, real_t * a, const int num_rows, const int num_cols)
{
sycl::range<3> grid_size(1, _TILE(num_cols, _TRANSPOSE_BLOCK_SIZE),
_TILE(num_rows, _TRANSPOSE_BLOCK_SIZE));
sycl::range<3> block_size(1, _TRANSPOSE_NUM_ROWS, _TRANSPOSE_BLOCK_SIZE);

// block to ensure reads finish
// printf("\ngrid_size= %i %i %i\n", _TILE(_NUM_ROWS, _TRANSPOSE_BLOCK_SIZE), _TILE(_NUM_COLS, _TRANSPOSE_BLOCK_SIZE), 1);
// printf("block_size= %i %i %i\n",_TRANSPOSE_BLOCK_SIZE, _TRANSPOSE_NUM_ROWS, 1);

idx.barrier(sycl::access::fence_space::local_space);
sycl::queue * s = pm_->dev_get_queue();

//dpct::get_in_order_queue().submit([&](sycl::handler &cgh) {
s->submit([&](sycl::handler &cgh) {
sycl::local_accessor<real_t, 2> cache_acc_ct1(
sycl::range<2>(_TRANSPOSE_BLOCK_SIZE,
_TRANSPOSE_BLOCK_SIZE),
cgh);

cgh.parallel_for(sycl::nd_range<3>(grid_size * block_size, block_size),
[=](sycl::nd_item<3> item_ct1) {
_transpose_gpu_v1(b, a, num_rows, num_cols, item_ct1,
cache_acc_ct1);
});
});
}
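
A short sketch of the launch arithmetic used above (not part of this commit): with the updated macro, _TILE(A,B) = (A + B - 1) / B is a ceiling division giving the number of B-wide tiles needed to cover A elements, and multiplying the resulting grid by block_size yields the global range that sycl::nd_range expects.

sycl::nd_range<3> make_transpose_range(const int num_rows, const int num_cols)
{
  // each work-group covers one _TRANSPOSE_BLOCK_SIZE x _TRANSPOSE_BLOCK_SIZE tile
  sycl::range<3> grid(1, _TILE(num_cols, _TRANSPOSE_BLOCK_SIZE),
                         _TILE(num_rows, _TRANSPOSE_BLOCK_SIZE));
  sycl::range<3> block(1, _TRANSPOSE_NUM_ROWS, _TRANSPOSE_BLOCK_SIZE);
  return sycl::nd_range<3>(grid * block, block);  // global size = tiles * work-group size
}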

// swap indices
// ----------------------------------------------------------------

irow = idx.get_group(1) * _TRANSPOSE_BLOCK_SIZE + idx.get_local_id(0);
icol = idx.get_group(0) * _TRANSPOSE_BLOCK_SIZE + idx.get_local_id(1);
void transpose_gpu_v2(real_t * b, real_t * a, const int num_rows, const int num_cols)
{
sycl::range<3> grid_size(1, _TILE(num_cols, _TRANSPOSE_BLOCK_SIZE),
_TILE(num_rows, _TRANSPOSE_BLOCK_SIZE));
sycl::range<3> block_size(1, _TRANSPOSE_NUM_ROWS, _TRANSPOSE_BLOCK_SIZE);

// write tile to global memory
// printf("\ngrid_size= %i %i %i\n", _TILE(_NUM_ROWS, _TRANSPOSE_BLOCK_SIZE), _TILE(_NUM_COLS, _TRANSPOSE_BLOCK_SIZE), 1);
// printf("block_size= %i %i %i\n",_TRANSPOSE_BLOCK_SIZE, _TRANSPOSE_NUM_ROWS, 1);

const int indxo = irow * num_rows + icol;
for(int i=0; i<_TRANSPOSE_BLOCK_SIZE; i+= _TRANSPOSE_NUM_ROWS) {
if(irow < num_cols && (icol+i) < num_rows) // nonsquare
out[indxo + i] = cache[idx.get_local_id(0)][idx.get_local_id(1) + i];
}
}); // End of the kernel function

// }).wait(); // End of the queue commands
}); // End of the queue commands
sycl::queue * s = pm_->dev_get_queue();

//dpct::get_in_order_queue().submit([&](sycl::handler &cgh) {
s->submit([&](sycl::handler &cgh) {
sycl::local_accessor<real_t, 2> cache_acc_ct1(
sycl::range<2>(_TRANSPOSE_BLOCK_SIZE,
_TRANSPOSE_BLOCK_SIZE+1),
cgh);

cgh.parallel_for(sycl::nd_range<3>(grid_size * block_size, block_size),
[=](sycl::nd_item<3> item_ct1) {
_transpose_gpu_v2(b, a, num_rows, num_cols, item_ct1,
cache_acc_ct1);
});
});
}
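
The v1 and v2 wrappers differ only in the shape of the local tile; the extra column in v2 follows the CUDA transpose sample these kernels are adapted from, where padding the shared-memory tile avoids bank conflicts when a column is read back out. Side by side (sketch, not part of this commit; cgh is the sycl::handler inside the submit lambda):

sycl::local_accessor<real_t, 2> tile_v1(
    sycl::range<2>(_TRANSPOSE_BLOCK_SIZE, _TRANSPOSE_BLOCK_SIZE), cgh);      // v1: square tile
sycl::local_accessor<real_t, 2> tile_v2(
    sycl::range<2>(_TRANSPOSE_BLOCK_SIZE, _TRANSPOSE_BLOCK_SIZE + 1), cgh);  // v2: one padding column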

// ----------------------------------------------------------------

#endif