diff --git a/gpu/mini-apps/math/.gitignore b/gpu/mini-apps/math/.gitignore
new file mode 100644
index 00000000..76a06813
--- /dev/null
+++ b/gpu/mini-apps/math/.gitignore
@@ -0,0 +1 @@
+pm*.cpp
diff --git a/gpu/mini-apps/math/Makefile.nvcc b/gpu/mini-apps/math/Makefile.nvcc
new file mode 100644
index 00000000..65908074
--- /dev/null
+++ b/gpu/mini-apps/math/Makefile.nvcc
@@ -0,0 +1,86 @@
+
+# Definition of MACROS
+
+BASE_LIBGPU=../../src
+
+BINROOT=./
+EXE=a.out
+SHELL=/bin/sh
+CXX = g++
+CXXFLAGS=
+FC=gfortran
+FCFLAGS=
+LD = $(CXX)
+LDFLAGS = -fPIC -shared
+AR = ar rcs
+CPP = cpp -P -traditional
+INSTALL=../
+
+CUDA_CXX = nvcc
+CUDA_CXXFLAGS =
+
+ARCH ?= polaris-gnu-nvcc
+include $(BASE_LIBGPU)/arch/$(ARCH)
+
+CXXFLAGS += -I$(BASE_LIBGPU)
+CUDA_CXXFLAGS += -I$(BASE_LIBGPU)
+
+$(info ARCH is [${ARCH}])
+
+# -- subset of src files with cuda kernels
+CUDA_SRC = pm_cuda.cpp offload.cpp
+CUDA_OBJ = $(CUDA_SRC:.cpp=.o)
+
+CSRC = $(filter-out $(CUDA_SRC), $(wildcard *.cpp))
+INC = $(wildcard *.h)
+COBJ = $(CSRC:.cpp=.o)
+
+FSRC = $(wildcard *.F)
+MOD = $(FSRC:.F=.mod)
+FOBJ = $(FSRC:.F=.o)
+
+# -- only copy source files; headers referenced with compiler flag
+$(shell cp ../../src/pm*.cpp ./)
+
+#
+# -- target : Dependencies
+# -- Rule to create target
+
+$(EXE): $(COBJ) $(CUDA_OBJ) $(FOBJ) $(MOD)
+	$(LD) $(LDFLAGS) -o $@ $(COBJ) $(CUDA_OBJ) $(LIB)
+
+install: $(EXE)
+	cp $(EXE) $(INSTALL)
+#	cp $(MOD) $(FOBJ) $(INSTALL)/include
+
+####################################################################
+
+$(COBJ): %.o: %.cpp
+	$(CXX) $(CXXFLAGS) -c $<
+
+$(FOBJ): %.o: %.F
+	$(FC) $(FCFLAGS) -c $<
+
+$(MOD): %.mod: %.F
+	$(FC) $(FCFLAGS) -c $<
+
+$(CUDA_OBJ): %.o: %.cpp
+	$(CUDA_CXX) -x cu $(CUDA_CXXFLAGS) -c $< -o $@
+
+#
+# -- Remove *.o and *~ from the directory
+clean:
+	rm -f *.o *.mod *~ ./$(EXE)
+	rm -f $(INSTALL)/$(EXE)
+	rm -rf $(EXE).dSYM
+#
+# -- Remove *.o, *~, and executable from the directory
+realclean:
+	rm -f *.o *.mod *~ ./$(EXE)
+	rm -f $(INSTALL)/$(EXE)
+	rm -rf $(EXE).dSYM
+	rm -f *.optrpt
+	rm -f pm*.h pm*.cpp
+
+#
+# -- Simple dependencies
diff --git a/gpu/mini-apps/math/main.cpp b/gpu/mini-apps/math/main.cpp
new file mode 100644
index 00000000..f269a57e
--- /dev/null
+++ b/gpu/mini-apps/math/main.cpp
@@ -0,0 +1,307 @@
+#include
+#include
+#include
+#include
+#include
+
+#include
+#include
+
+#include "pm.h"
+
+#define _NUM_BATCHES 100
+
+#define _NUM_ROWS_A 4
+#define _NUM_COLS_A 3
+
+#define _NUM_ROWS_B _NUM_COLS_A
+#define _NUM_COLS_B 5
+
+#define _TOL 1e-8
+#define _NUM_ITERATIONS_CPU 10
+#define _NUM_ITERATIONS_GPU 1000
+
+#ifdef _SINGLE_PRECISION
+  typedef float real_t;
+#else
+  typedef double real_t;
+#endif
+
+using namespace PM_NS;
+
+extern void init_pm(class PM *);
+
+// A is (m, k) matrix
+// B is (k, n) matrix
+// C is (m, n) matrix
+
+// Column-major ordering transposes everything:
+// to compute A.B, call the API with B.A
+
+extern "C" {
+  void dsymm_(const char*, const char*, const int*, const int*,
+              const double*, const double*, const int*,
+              const double*, const int*,
+              const double*, double*, const int*);
+
+  void dgemm_(const char * transa, const char * transb, const int * m, const int * n,
+              const int * k, const double * alpha, const double * a, const int * lda,
+              const double * b, const int * ldb, const double * beta, double * c,
+              const int * ldc);
+}
+
+// extern void transpose(real_t *, real_t *, const int, const int);
+// extern void copy_naive_gpu(real_t *, real_t *, const int, const int);
+// extern void transpose_naive_gpu(real_t *, real_t *,
const int, const int); +// extern void transpose_gpu_v1(real_t *, real_t *, const int, const int); +// extern void transpose_gpu_v2(real_t *, real_t *, const int, const int); +// extern void transpose_gpu_v3(real_t *, real_t *, const int, const int); + +// ---------------------------------------------------------------- + +void gemm_NN0_naive_cpu(const int * m_, const int * n_, const int * k_, const real_t * alpha_, + real_t * a, const int * lda_, real_t * b, const int * ldb_, + const real_t * beta_, real_t * c, const int * ldc_) +{ + double alpha = *alpha_; + double beta = *beta_; + + int m = *m_; + int n = *n_; + int k = *k_; + + int lda = *lda_; + int ldb = *ldb_; + int ldc = *ldc_; + + for(int i=0; i _TOL) err++; + } + + if(err == 0) printf("Results from %s are correct!! :) \n", name); + else printf("Results from %s are incorrect!! :( \n", name); + + return err; +} + +// ---------------------------------------------------------------- + +void print_matrix(real_t * data, int num_rows, int num_cols, const char * name) +{ + printf("\nMatrix[%s] : %i x %i \n",name, num_rows, num_cols); + for(int i=0; idev_num_devices(); + + if(me == 0) { + printf("\n# of devices= %i\n",num_devices); + pm->dev_properties(num_devices); + } + + // Device ID + + int device_id = me % num_devices; + + pm->dev_set_device(device_id); + + for(int i=0; idev_malloc(_NUM_ROWS_A * _NUM_COLS_A * sizeof(real_t)); + real_t * d_b = (real_t *) pm->dev_malloc(_NUM_ROWS_B * _NUM_COLS_B * sizeof(real_t)); + real_t * d_c = (real_t *) pm->dev_malloc(_NUM_ROWS_A * _NUM_COLS_B * sizeof(real_t)); + + pm->dev_push(d_a, a, _NUM_ROWS_A * _NUM_COLS_A * sizeof(real_t)); + pm->dev_push(d_b, b, _NUM_ROWS_B * _NUM_COLS_B * sizeof(real_t)); + + { + const double alpha = 1.0; + const double beta = 0.0; + + const int m = _NUM_COLS_B; // # rows of first matrix B^T + const int n = _NUM_ROWS_A; // # cols of second matrix A^T + const int k = _NUM_ROWS_B; // # cols of first matrix B^T + + const int ldb = _NUM_COLS_B; // lead dimension of first matrix B^T + const int lda = _NUM_COLS_A; // lead dimension of second matrix A^T + const int ldc = _NUM_COLS_B; // lead dimension of result matrix C^T + +#ifdef _SINGLE_PRECISION + sgemm_((char *) "N", (char *) "N", &m, &n, &k, &alpha, d_b, &ldb, d_a, &lda, &beta, d_c, &ldc); +#else + dgemm_((char *) "N", (char *) "N", &m, &n, &k, &alpha, d_b, &ldb, d_a, &lda, &beta, d_c, &ldc); +#endif + + pm->dev_barrier(); + + double t0 = MPI_Wtime(); + for(int i=0; i<_NUM_ITERATIONS_CPU; ++i) { +#ifdef _SINGLE_PRECISION + sgemm_((char *) "N", (char *) "N", &m, &n, &k, &alpha, d_b, &ldb, d_a, &lda, &beta, d_c, &ldc); +#else + dgemm_((char *) "N", (char *) "N", &m, &n, &k, &alpha, d_b, &ldb, d_a, &lda, &beta, d_c, &ldc); +#endif + } + pm->dev_barrier(); + t = MPI_Wtime() - t0; + } + + pm->dev_pull(d_c, c, _NUM_ROWS_A * _NUM_COLS_B * sizeof(real_t)); + + print_summary(t, _NUM_ROWS_A, _NUM_COLS_A, _NUM_COLS_B, _NUM_ITERATIONS_CPU, "LAPACK gemm"); + + check_result(r, c, _NUM_ROWS_A*_NUM_COLS_B, "lapack_dgemm_cpu"); + + // print_matrix(r, _NUM_ROWS_A, _NUM_COLS_B, "Reference r"); + + // print_matrix(c, _NUM_ROWS_A, _NUM_COLS_B, "Output c"); + + // ---------------------------------------------------------------- + + // Clean up + + pm->dev_free(d_a); + pm->dev_free(d_b); + pm->dev_free(d_c); + + delete pm; + + free(a); + free(b); + free(c); + free(r); + + MPI_Finalize(); +} diff --git a/gpu/mini-apps/math/offload_cuda.cpp b/gpu/mini-apps/math/offload_cuda.cpp new file mode 100644 index 00000000..9832b622 --- /dev/null +++ 
b/gpu/mini-apps/math/offload_cuda.cpp
@@ -0,0 +1,190 @@
+#if defined(_GPU_CUDA)
+
+#include
+#include
+#include
+#include
+#include
+
+#include "pm.h"
+
+#define _TRANSPOSE_BLOCK_SIZE 16
+#define _TRANSPOSE_NUM_ROWS 16
+
+#define _TILE(A,B) (A + B - 1) / B
+
+#ifdef _SINGLE_PRECISION
+  typedef float real_t;
+#else
+  typedef double real_t;
+#endif
+
+using namespace PM_NS;
+
+class PM * pm_ = nullptr;
+
+// ----------------------------------------------------------------
+// GPU Kernels
+// ----------------------------------------------------------------
+
+//https://github.com/NVIDIA-developer-blog/code-samples/blob/master/series/cuda-cpp/transpose/transpose.cu
+// modified to support nonsquare matrices
+
+__global__ void _transpose_gpu_v1(real_t * out, real_t * in, const int nrow, const int ncol)
+{
+  __shared__ real_t cache[_TRANSPOSE_BLOCK_SIZE][_TRANSPOSE_BLOCK_SIZE];
+
+  int irow = blockIdx.x * _TRANSPOSE_BLOCK_SIZE + threadIdx.x;
+  int icol = blockIdx.y * _TRANSPOSE_BLOCK_SIZE + threadIdx.y;
+
+  // load tile into fast local memory
+
+  const int indxi = irow * ncol + icol;
+  for(int i=0; i<_TRANSPOSE_BLOCK_SIZE; i+= _TRANSPOSE_NUM_ROWS) {
+    if(irow < nrow && (icol+i) < ncol) // nonsquare
+      cache[threadIdx.y + i][threadIdx.x] = in[indxi + i]; // threads read chunk of a row and write as a column
+  }
+
+  // block to ensure reads finish
+
+  __syncthreads();
+
+  // swap indices
+
+  irow = blockIdx.y * _TRANSPOSE_BLOCK_SIZE + threadIdx.x;
+  icol = blockIdx.x * _TRANSPOSE_BLOCK_SIZE + threadIdx.y;
+
+  // write tile to global memory
+
+  const int indxo = irow * nrow + icol;
+  for(int i=0; i<_TRANSPOSE_BLOCK_SIZE; i+= _TRANSPOSE_NUM_ROWS) {
+    if(irow < ncol && (icol + i) < nrow) // nonsquare
+      out[indxo + i] = cache[threadIdx.x][threadIdx.y + i];
+  }
+}
+
+// ----------------------------------------------------------------
+
+__global__ void _transpose_gpu_v2(real_t * out, real_t * in, const int nrow, const int ncol)
+{
+  __shared__ real_t cache[_TRANSPOSE_BLOCK_SIZE][_TRANSPOSE_BLOCK_SIZE+1];
+
+  int irow = blockIdx.x * _TRANSPOSE_BLOCK_SIZE + threadIdx.x;
+  int icol = blockIdx.y * _TRANSPOSE_BLOCK_SIZE + threadIdx.y;
+
+  // load tile into fast local memory
+
+  const int indxi = irow * ncol + icol;
+  for(int i=0; i<_TRANSPOSE_BLOCK_SIZE; i+= _TRANSPOSE_NUM_ROWS) {
+    if(irow < nrow && (icol+i) < ncol) // nonsquare
+      cache[threadIdx.y + i][threadIdx.x] = in[indxi + i]; // threads read chunk of a row and write as a column
+  }
+
+  // block to ensure reads finish
+
+  __syncthreads();
+
+  // swap indices
+
+  irow = blockIdx.y * _TRANSPOSE_BLOCK_SIZE + threadIdx.x;
+  icol = blockIdx.x * _TRANSPOSE_BLOCK_SIZE + threadIdx.y;
+
+  // write tile to global memory
+
+  const int indxo = irow * nrow + icol;
+  for(int i=0; i<_TRANSPOSE_BLOCK_SIZE; i+= _TRANSPOSE_NUM_ROWS) {
+    if(irow < ncol && (icol + i) < nrow) // nonsquare
+      out[indxo + i] = cache[threadIdx.x][threadIdx.y + i];
+  }
+}
+
+// ----------------------------------------------------------------
+
+__global__ void _transpose_naive_gpu(real_t * out, real_t * in, const int nrow, const int ncol)
+{
+  const int i = blockIdx.x * blockDim.x + threadIdx.x;
+
+  if(i >= nrow) return;
+
+  int j = blockIdx.y * blockDim.y + threadIdx.y;
+
+  while (j < ncol) {
+    out[j*nrow + i] = in[i*ncol + j];
+    j += blockDim.y;
+  }
+
+}
+
+// ----------------------------------------------------------------
+
+__global__ void _copy_naive_gpu(real_t * out, real_t * in, const int nrow, const int ncol)
+{
+  const int i = blockIdx.x * blockDim.x + threadIdx.x;
+
+  if(i >= nrow) return;
+
+  int j = blockIdx.y * blockDim.y + threadIdx.y;
+
+  while (j < ncol) {
+    out[i*ncol + j] = in[i*ncol + j];
+    j += blockDim.y;
+  }
+
+}
+
+// ----------------------------------------------------------------
+// Host-side functions
+// ----------------------------------------------------------------
+
+void init_pm(class PM * pm)
+{
+  pm_ = pm;
+}
+
+void copy_naive_gpu(real_t * b, real_t * a, const int num_rows, const int num_cols)
+{
+  dim3 grid_size(num_rows, 1, 1);
+  dim3 block_size(1, _TRANSPOSE_BLOCK_SIZE, 1);
+
+  _copy_naive_gpu<<<grid_size, block_size>>>(b, a, num_rows, num_cols);
+}
+
+// ----------------------------------------------------------------
+
+void transpose_naive_gpu(real_t * b, real_t * a, const int num_rows, const int num_cols)
+{
+  dim3 grid_size(num_rows, 1, 1);
+  dim3 block_size(1, _TRANSPOSE_BLOCK_SIZE, 1);
+
+  _transpose_naive_gpu<<<grid_size, block_size>>>(b, a, num_rows, num_cols);
+}
+
+// ----------------------------------------------------------------
+
+void transpose_gpu_v1(real_t * b, real_t * a, const int num_rows, const int num_cols)
+{
+  dim3 grid_size(_TILE(num_rows, _TRANSPOSE_BLOCK_SIZE), _TILE(num_cols, _TRANSPOSE_BLOCK_SIZE), 1);
+  dim3 block_size(_TRANSPOSE_BLOCK_SIZE, _TRANSPOSE_NUM_ROWS, 1);
+
+  // printf("\ngrid_size= %i %i %i\n", _TILE(_NUM_ROWS, _TRANSPOSE_BLOCK_SIZE), _TILE(_NUM_COLS, _TRANSPOSE_BLOCK_SIZE), 1);
+  // printf("block_size= %i %i %i\n",_TRANSPOSE_BLOCK_SIZE, _TRANSPOSE_NUM_ROWS, 1);
+
+  _transpose_gpu_v1<<<grid_size, block_size>>>(b, a, num_rows, num_cols);
+}
+
+// ----------------------------------------------------------------
+
+void transpose_gpu_v2(real_t * b, real_t * a, const int num_rows, const int num_cols)
+{
+  dim3 grid_size(_TILE(num_rows, _TRANSPOSE_BLOCK_SIZE), _TILE(num_cols, _TRANSPOSE_BLOCK_SIZE), 1);
+  dim3 block_size(_TRANSPOSE_BLOCK_SIZE, _TRANSPOSE_NUM_ROWS, 1);
+
+  // printf("\ngrid_size= %i %i %i\n", _TILE(_NUM_ROWS, _TRANSPOSE_BLOCK_SIZE), _TILE(_NUM_COLS, _TRANSPOSE_BLOCK_SIZE), 1);
+  // printf("block_size= %i %i %i\n",_TRANSPOSE_BLOCK_SIZE, _TRANSPOSE_NUM_ROWS, 1);
+
+  _transpose_gpu_v2<<<grid_size, block_size>>>(b, a, num_rows, num_cols);
+}
+
+// ----------------------------------------------------------------
+
+#endif
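Host-side use of the tiled transpose above follows the same pattern main.cpp uses for the GEMM benchmark: allocate device buffers through the PM wrapper, push the input, launch, and pull the result back. A minimal sketch, assuming the PM methods dev_malloc/dev_push/dev_pull/dev_barrier/dev_free behave as they are used in main.cpp and the default double-precision real_t (illustrative only, not part of the patch):

```
// Transpose a row-major nrow x ncol matrix on the device and copy it back.
// Requires "pm.h" and the transpose_gpu_v2() wrapper from offload_cuda.cpp.
void transpose_on_device(PM * pm, double * h_in, double * h_out,
                         const int nrow, const int ncol)
{
  const size_t bytes = (size_t) nrow * ncol * sizeof(double);

  double * d_in  = (double *) pm->dev_malloc(bytes);   // device input
  double * d_out = (double *) pm->dev_malloc(bytes);   // device output (ncol x nrow)

  pm->dev_push(d_in, h_in, bytes);                     // host -> device
  transpose_gpu_v2(d_out, d_in, nrow, ncol);           // padded shared-memory version
  pm->dev_barrier();                                   // kernel launch is asynchronous
  pm->dev_pull(d_out, h_out, bytes);                   // device -> host

  pm->dev_free(d_in);
  pm->dev_free(d_out);
}
```

The only difference between _transpose_gpu_v1 and _transpose_gpu_v2 is the +1 padding on the second dimension of the shared-memory tile; the padding staggers columns across shared-memory banks and avoids bank conflicts when the tile is read back column-wise.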
diff --git a/gpu/mini-apps/math/offload_host.cpp b/gpu/mini-apps/math/offload_host.cpp
new file mode 100644
index 00000000..7a4ddc88
--- /dev/null
+++ b/gpu/mini-apps/math/offload_host.cpp
@@ -0,0 +1,81 @@
+#if defined(_USE_CPU)
+
+#include
+#include
+#include
+#include
+#include
+
+#include "pm.h"
+
+#define _TRANSPOSE_BLOCK_SIZE 16
+#define _TRANSPOSE_NUM_ROWS 16
+
+#define _TILE(A,B) (A + B - 1) / B
+
+#ifdef _SINGLE_PRECISION
+  typedef float real_t;
+#else
+  typedef double real_t;
+#endif
+
+using namespace PM_NS;
+
+class PM * pm_ = nullptr;
+
+// ----------------------------------------------------------------
+// Host-side functions
+// ----------------------------------------------------------------
+
+void init_pm(class PM * pm)
+{
+  pm_ = pm;
+}
+
+// void copy_naive_gpu(real_t * b, real_t * a, const int num_rows, const int num_cols)
+// {
+//   dim3 grid_size(num_rows, 1, 1);
+//   dim3 block_size(1, _TRANSPOSE_BLOCK_SIZE, 1);
+
+//   _copy_naive_gpu<<>>(b, a, num_rows, num_cols);
+// }
+
+// // ----------------------------------------------------------------
+
+// void transpose_naive_gpu(real_t * b, real_t * a, const int num_rows, const int num_cols)
+// {
+//   dim3 grid_size(num_rows, 1, 1);
+//   dim3 block_size(1, _TRANSPOSE_BLOCK_SIZE, 1);
+
+//   _transpose_naive_gpu<<>>(b, a, num_rows, num_cols);
+// }
+
+// // ----------------------------------------------------------------
+
+// void transpose_gpu_v1(real_t * b, real_t * a, const int num_rows, const int num_cols)
+// {
+//   dim3 grid_size(_TILE(num_rows, _TRANSPOSE_BLOCK_SIZE), _TILE(num_cols, _TRANSPOSE_BLOCK_SIZE), 1);
+//   dim3 block_size(_TRANSPOSE_BLOCK_SIZE, _TRANSPOSE_NUM_ROWS, 1);
+
+//   // printf("\ngrid_size= %i %i %i\n", _TILE(_NUM_ROWS, _TRANSPOSE_BLOCK_SIZE), _TILE(_NUM_COLS, _TRANSPOSE_BLOCK_SIZE), 1);
+//   // printf("block_size= %i %i %i\n",_TRANSPOSE_BLOCK_SIZE, _TRANSPOSE_NUM_ROWS, 1);
+
+//   _transpose_gpu_v1<<>>(b, a, num_rows, num_cols);
+// }
+
+// // ----------------------------------------------------------------
+
+// void transpose_gpu_v2(real_t * b, real_t * a, const int num_rows, const int num_cols)
+// {
+//   dim3 grid_size(_TILE(num_rows, _TRANSPOSE_BLOCK_SIZE), _TILE(num_cols, _TRANSPOSE_BLOCK_SIZE), 1);
+//   dim3 block_size(_TRANSPOSE_BLOCK_SIZE, _TRANSPOSE_NUM_ROWS, 1);
+
+//   // printf("\ngrid_size= %i %i %i\n", _TILE(_NUM_ROWS, _TRANSPOSE_BLOCK_SIZE), _TILE(_NUM_COLS, _TRANSPOSE_BLOCK_SIZE), 1);
+//   // printf("block_size= %i %i %i\n",_TRANSPOSE_BLOCK_SIZE, _TRANSPOSE_NUM_ROWS, 1);
+
+//   _transpose_gpu_v2<<>>(b, a, num_rows, num_cols);
+// }
+
+// // ----------------------------------------------------------------
+
+#endif
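offload_host.cpp currently provides only init_pm; the transpose entry points are left commented out, so a _USE_CPU build has no fallback implementation. A plain loop version, matching the `extern void transpose(real_t *, real_t *, const int, const int)` declaration that is commented out in main.cpp, would be enough for correctness checks (hypothetical sketch, not part of the patch):

```
// Hypothetical CPU reference: out (ncol x nrow) = transpose of in (nrow x ncol), row-major.
void transpose(real_t * out, real_t * in, const int nrow, const int ncol)
{
  for(int i=0; i<nrow; ++i)
    for(int j=0; j<ncol; ++j)
      out[j*nrow + i] = in[i*ncol + j];
}
```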
diff --git a/gpu/mini-apps/math/offload_sycl.cpp b/gpu/mini-apps/math/offload_sycl.cpp
new file mode 100644
index 00000000..aa848ae4
--- /dev/null
+++ b/gpu/mini-apps/math/offload_sycl.cpp
@@ -0,0 +1,201 @@
+#if defined(_GPU_SYCL_CUDA)
+
+#include
+#include
+#include
+#include
+#include
+
+#include "pm.h"
+
+#define _TRANSPOSE_BLOCK_SIZE 16
+#define _TRANSPOSE_NUM_ROWS 16
+
+//#define _TILE(A,B) (A + B - 1) / B
+#define _TILE(A,B) ((A + B - 1) / B) * B
+
+#ifdef _SINGLE_PRECISION
+  typedef float real_t;
+#else
+  typedef double real_t;
+#endif
+
+using namespace PM_NS;
+
+class PM * pm_ = nullptr;
+
+class Kernel_Copy_Naive;
+class Kernel_Transpose_Naive;
+class Kernel_Transpose_V1;
+class Kernel_Transpose_V2;
+
+// ----------------------------------------------------------------
+
+void init_pm(class PM * pm)
+{
+  pm_ = pm;
+}
+
+// gridDim.{x,y,z}   == get_group_range({0,1,2})
+// blockDim.{x,y,z}  == get_local_range({0,1,2})
+// blockIdx.{x,y,z}  == get_group({0,1,2})
+// threadIdx.{x,y,z} == get_local_id({0,1,2})
+
+void copy_naive_gpu(real_t * out, real_t * in, const int num_rows, const int num_cols)
+{
+  sycl::queue * Q = pm_->dev_get_queue();
+
+  sycl::range<3> grid_size(num_rows, _TRANSPOSE_BLOCK_SIZE, 1);
+  sycl::range<3> block_size(1, _TRANSPOSE_BLOCK_SIZE, 1);
+
+  sycl::nd_range<3> kernel_rng(grid_size, block_size);
+
+  Q->submit([&](sycl::handler &cgh) {
+
+    cgh.parallel_for(kernel_rng, [=](sycl::nd_item<3> idx) {
+      const int i = idx.get_group(0) * idx.get_local_range(0) + idx.get_local_id(0);
+
+      if(i >= num_rows) return;
+
+      int j = idx.get_group(1) * idx.get_local_range(1) + idx.get_local_id(1);
+
+      while(j < num_cols) {
+        out[i*num_cols + j] = in[i*num_cols + j];
+        j += idx.get_local_range(1);
+      }
+    }); // End of the kernel function
+
+    // }).wait(); // End of the queue commands
+  }); // End of the queue commands
+}
+
+// ----------------------------------------------------------------
+
+void transpose_naive_gpu(real_t * out, real_t * in, const int num_rows, const int num_cols)
+{
+  sycl::queue * Q = pm_->dev_get_queue();
+
+  sycl::range<3> grid_size(num_rows, _TRANSPOSE_BLOCK_SIZE, 1);
+  sycl::range<3> block_size(1, _TRANSPOSE_BLOCK_SIZE, 1);
+
+  sycl::nd_range<3> kernel_rng(grid_size, block_size);
+
+  Q->submit([&](sycl::handler &cgh) {
+
+    cgh.parallel_for(kernel_rng, [=](sycl::nd_item<3> idx) {
+      const int i = idx.get_group(0) * idx.get_local_range(0) + idx.get_local_id(0);
+
+      if(i >= num_rows) return;
+
+      int j = idx.get_group(1) * idx.get_local_range(1) + idx.get_local_id(1);
+
+      while(j < num_cols) {
+        out[j*num_rows + i] = in[i*num_cols + j];
+        j += idx.get_local_range(1);
+      }
+    }); // End of the kernel function
+
+    //}).wait(); // End of the queue commands
+  }); // End of the queue commands
+}
+
+// ----------------------------------------------------------------
+
+void transpose_gpu_v1(real_t * out, real_t * in, const int num_rows, const int num_cols)
+{
+  sycl::queue * Q = pm_->dev_get_queue();
+
+  sycl::range<3> grid_size(_TILE(num_rows, _TRANSPOSE_BLOCK_SIZE), _TILE(num_cols, _TRANSPOSE_NUM_ROWS), 1);
+  sycl::range<3> block_size(_TRANSPOSE_BLOCK_SIZE, _TRANSPOSE_NUM_ROWS, 1);
+
+  sycl::nd_range<3> kernel_rng(grid_size, block_size);
+
+  Q->submit([&](sycl::handler &cgh) {
+
+    sycl::local_accessor<real_t, 2> cache(sycl::range(_TRANSPOSE_BLOCK_SIZE, _TRANSPOSE_BLOCK_SIZE), cgh);
+
+    cgh.parallel_for(kernel_rng, [=](sycl::nd_item<3> idx) {
+      int irow = idx.get_group(0) * _TRANSPOSE_BLOCK_SIZE + idx.get_local_id(0);
+      int icol = idx.get_group(1) * _TRANSPOSE_BLOCK_SIZE + idx.get_local_id(1);
+
+      // load tile into fast local memory
+
+      const int indxi = irow * num_cols + icol;
+      for(int i=0; i<_TRANSPOSE_BLOCK_SIZE; i+= _TRANSPOSE_NUM_ROWS) {
+        if(irow < num_rows && (icol+i) < num_cols) // nonsquare
+          cache[idx.get_local_id(1)+i][idx.get_local_id(0)] = in[indxi + i]; // threads read chunk of a row and write as a column
+      }
+
+      // block to ensure reads finish
+
+      idx.barrier(sycl::access::fence_space::local_space);
+
+      // swap indices
+
+      irow = idx.get_group(1) * _TRANSPOSE_BLOCK_SIZE + idx.get_local_id(0);
+      icol = idx.get_group(0) * _TRANSPOSE_BLOCK_SIZE + idx.get_local_id(1);
+
+      // write tile to global memory
+
+      const int indxo = irow * num_rows + icol;
+      for(int i=0; i<_TRANSPOSE_BLOCK_SIZE; i+= _TRANSPOSE_NUM_ROWS) {
+        if(irow < num_cols && (icol+i) < num_rows) // nonsquare
+          out[indxo + i] = cache[idx.get_local_id(0)][idx.get_local_id(1) + i];
+      }
+    }); // End of the kernel function
+
+    // }).wait(); // End of the queue commands
+  }); // End of the queue commands
+}
+
+// ----------------------------------------------------------------
+
+void transpose_gpu_v2(real_t * out, real_t * in, const int num_rows, const int num_cols)
+{
+  sycl::queue * Q = pm_->dev_get_queue();
+
+  sycl::range<3> grid_size(_TILE(num_rows, _TRANSPOSE_BLOCK_SIZE), _TILE(num_cols, _TRANSPOSE_NUM_ROWS), 1);
+  sycl::range<3> block_size(_TRANSPOSE_BLOCK_SIZE, _TRANSPOSE_NUM_ROWS, 1);
+
+  sycl::nd_range<3> kernel_rng(grid_size, block_size);
+
+  Q->submit([&](sycl::handler &cgh) {
+
+    sycl::local_accessor<real_t, 2> cache(sycl::range(_TRANSPOSE_BLOCK_SIZE, _TRANSPOSE_BLOCK_SIZE+1), cgh);
+
+    cgh.parallel_for(kernel_rng, [=](sycl::nd_item<3> idx) {
+
+      int irow = idx.get_group(0) * _TRANSPOSE_BLOCK_SIZE + idx.get_local_id(0);
+      int icol = idx.get_group(1) * _TRANSPOSE_BLOCK_SIZE + idx.get_local_id(1);
+
+      // load tile into fast local memory
+
+      const int indxi = irow * num_cols + icol;
+      for(int i=0; i<_TRANSPOSE_BLOCK_SIZE; i+= _TRANSPOSE_NUM_ROWS) {
+        if(irow < num_rows && (icol+i) < num_cols) // nonsquare
+          cache[idx.get_local_id(1)+i][idx.get_local_id(0)] = in[indxi + i]; // threads read chunk of a row and write as a column
+      }
+
+      // block to ensure reads finish
+
+      idx.barrier(sycl::access::fence_space::local_space);
+
+      // swap indices
+
+      irow = idx.get_group(1) * _TRANSPOSE_BLOCK_SIZE + idx.get_local_id(0);
+      icol = idx.get_group(0) * _TRANSPOSE_BLOCK_SIZE + idx.get_local_id(1);
+
+      // write tile to global memory
+
+      const int indxo = irow * num_rows + icol;
+      for(int i=0; i<_TRANSPOSE_BLOCK_SIZE; i+= _TRANSPOSE_NUM_ROWS) {
+        if(irow < num_cols && (icol+i) < num_rows) // nonsquare
+          out[indxo + i] = cache[idx.get_local_id(0)][idx.get_local_id(1) + i];
+      }
+    }); // End of the kernel function
+
+    // }).wait(); // End of the queue commands
+  }); // End of the queue commands
+}
+
+#endif
diff --git a/gpu/mini-apps/math/run.sh b/gpu/mini-apps/math/run.sh
new file mode 100755
index 00000000..71ecf524
--- /dev/null
+++ b/gpu/mini-apps/math/run.sh
@@ -0,0 +1,7 @@
+
+./a.out
+
+#nsys profile --stats=true ./a.out
+
+# WARNING :: set _NUM_ITERATIONS_GPU to something small (e.g. 5)
+#ncu --print-summary per-kernel ./a.out
diff --git a/gpu/src/pm_cuda.h b/gpu/src/pm_cuda.h
index e0dbc754..02db1d8e 100644
--- a/gpu/src/pm_cuda.h
+++ b/gpu/src/pm_cuda.h
@@ -78,6 +78,7 @@ namespace PM_NS {
     void dev_stream_destroy(cudaStream_t & s);
     void dev_stream_wait(cudaStream_t & s);
 
+  private:
     void uuid_print(cudaUUID_t);
 
   };
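main.cpp drives the benchmark by handing row-major buffers to the column-major Fortran dgemm_: as its comments note, column ordering transposes everything, so A.B is obtained by calling the routine with B.A and swapping the roles of m and n. A self-contained sketch of that convention, using the standard BLAS dgemm_ signature and sizes chosen only for illustration:

```
// Row-major C(MxN) = A(MxK) . B(KxN) via column-major dgemm_:
// pass B as the first operand and exchange m and n.
#include <cstdio>

extern "C" void dgemm_(const char*, const char*, const int*, const int*, const int*,
                       const double*, const double*, const int*,
                       const double*, const int*,
                       const double*, double*, const int*);

int main()
{
  const int M = 4, K = 3, N = 5;
  double A[M*K], B[K*N], C[M*N];
  for(int i=0; i<M*K; ++i) A[i] = i;     // arbitrary test data
  for(int i=0; i<K*N; ++i) B[i] = 1.0;

  const double alpha = 1.0, beta = 0.0;
  // "m"=N, "n"=M, "k"=K; leading dimensions are the row-major row lengths
  dgemm_("N", "N", &N, &M, &K, &alpha, B, &N, A, &K, &beta, C, &N);

  printf("C[0][0] = %f\n", C[0]);        // sum of A's first row = 0+1+2 = 3
  return 0;
}
```

This mirrors the choices in main.cpp, where m = _NUM_COLS_B, n = _NUM_ROWS_A, k = _NUM_ROWS_B and d_b is passed before d_a.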
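One detail worth noting when comparing the CUDA and SYCL versions: a CUDA launch is specified as (number of blocks) x (block size), whereas sycl::nd_range takes the total global range x local range. That is why offload_sycl.cpp redefines _TILE to round up to a multiple of the block size instead of returning a block count. A small check with hypothetical sizes:

```
// CUDA grid counts vs. SYCL global ranges for the same launch geometry.
#include <cassert>

int main()
{
  const int num_rows = 1000, block = 16;

  const int cuda_blocks = (num_rows + block - 1) / block;           // _TILE in offload_cuda.cpp
  const int sycl_global = ((num_rows + block - 1) / block) * block; // _TILE in offload_sycl.cpp

  assert(cuda_blocks == 63);
  assert(sycl_global == cuda_blocks * block);  // 1008 work-items in 63 groups of 16
  return 0;
}
```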