From 6708d1c8fa751cf5f0eb4c8da1f8ab89fd33b190 Mon Sep 17 00:00:00 2001
From: Pedro Costa <p.simoes.costa@gmail.com>
Date: Fri, 15 Mar 2024 13:54:40 +0100
Subject: [PATCH] Single GPU version without cuDecomp dependency.

---
 build.conf               |   4 +-
 configs/libs.mk          |   5 +-
 dependencies/external.mk |   4 +-
 src/bound.f90            |  36 ++++----
 src/common_cudecomp.f90  |  14 ++--
 src/initmpi.f90          | 175 ++++++++++++++++++++------------------
 src/param.f90            |   4 +-
 src/solver_gpu.f90       | 176 ++++++++++++++++++++++++++++++---------
 src/utils.f90            |   5 +-
 src/workspaces.f90       |  42 ++++++----
 10 files changed, 292 insertions(+), 173 deletions(-)

diff --git a/build.conf b/build.conf
index bff572e0..72059bf2 100644
--- a/build.conf
+++ b/build.conf
@@ -1,7 +1,7 @@
 #
 # compiler and compiling profile
 #
-FCOMP=GNU           # options: GNU, NVIDIA, INTEL
+FCOMP=NVIDIA        # options: GNU, NVIDIA, INTEL
 FFLAGS_OPT=1        # for production runs
 FFLAGS_OPT_MAX=0    # for production runs (more aggressive optimization)
 FFLAGS_DEBUG=0      # for debugging
@@ -18,5 +18,5 @@ SINGLE_PRECISION=0         # perform the whole calculation in single precision
 #
 # GPU-related
 #
-GPU=0
+GPU=1
 USE_NVTX=0                 # use the NVTX-enabled Git branch 'with-nvtx' to see the markers
diff --git a/configs/libs.mk b/configs/libs.mk
index b6e9ba5c..6f12af13 100644
--- a/configs/libs.mk
+++ b/configs/libs.mk
@@ -2,8 +2,9 @@ override LIBS += -L$(LIBS_DIR)/2decomp-fft -ldecomp2d
 override INCS += -I$(LIBS_DIR)/2decomp-fft/mod
 
 ifeq ($(strip $(GPU)),1)
-override LIBS += -L$(LIBS_DIR)/cuDecomp/build/lib -lcudecomp -lcudecomp_fort -cudalib=cufft
-override INCS += -I$(LIBS_DIR)/cuDecomp/build/include
+#override LIBS += -L$(LIBS_DIR)/cuDecomp/build/lib -lcudecomp -lcudecomp_fort -cudalib=cufft
+override LIBS += -cudalib=cufft
+#override INCS += -I$(LIBS_DIR)/cuDecomp/build/include
 endif
 
 ifeq ($(strip $(USE_NVTX)),1)
diff --git a/dependencies/external.mk b/dependencies/external.mk
index 52666aa3..32cfe32f 100644
--- a/dependencies/external.mk
+++ b/dependencies/external.mk
@@ -4,10 +4,10 @@
 ifeq ($(strip $(GPU)),1)
 libs: $(wildcard $(LIBS_DIR)/2decomp-fft/src/*.f90)
 	cd $(LIBS_DIR)/2decomp-fft && make
-	cd $(LIBS_DIR)/cuDecomp && make lib -j
+#	cd $(LIBS_DIR)/cuDecomp && make lib -j
 libsclean: $(wildcard $(LIBS_DIR)/2decomp-fft/src/*.f90)
 	cd $(LIBS_DIR)/2decomp-fft && make clean
-	cd $(LIBS_DIR)/cuDecomp && make clean
+#	cd $(LIBS_DIR)/cuDecomp && make clean
 else
 libs: $(wildcard $(LIBS_DIR)/2decomp-fft/src/*.f90)
 	cd $(LIBS_DIR)/2decomp-fft && make
diff --git a/src/bound.f90 b/src/bound.f90
index 16e56284..56ede3ea 100644
--- a/src/bound.f90
+++ b/src/bound.f90
@@ -530,29 +530,29 @@ end subroutine updthalo
 #if defined(_OPENACC)
   subroutine updthalo_gpu(nh,periods,p)
     use mod_types
-    use cudecomp
-    use mod_common_cudecomp, only: work => work_halo, &
-                                   ch => handle,gd => gd_halo, &
-                                   dtype => cudecomp_real_rp, &
-                                   istream => istream_acc_queue_1
+    !!!PC use cudecomp
+    !!!PC use mod_common_cudecomp, only: work => work_halo, &
+    !!!PC                                ch => handle,gd => gd_halo, &
+    !!!PC                                dtype => cudecomp_real_rp, &
+    !!!PC                                istream => istream_acc_queue_1
     implicit none
     integer , intent(in) :: nh
     logical , intent(in) :: periods(3)
     real(rp), intent(inout), dimension(1-nh:,1-nh:,1-nh:) :: p
     integer :: istat
-    !$acc host_data use_device(p,work)
-    select case(ipencil_axis)
-    case(1)
-      istat = cudecompUpdateHalosX(ch,gd,p,work,dtype,[nh,nh,nh],periods,2,stream=istream)
-      istat = cudecompUpdateHalosX(ch,gd,p,work,dtype,[nh,nh,nh],periods,3,stream=istream)
-    case(2)
-      istat = cudecompUpdateHalosY(ch,gd,p,work,dtype,[nh,nh,nh],periods,1,stream=istream)
-      istat = cudecompUpdateHalosY(ch,gd,p,work,dtype,[nh,nh,nh],periods,3,stream=istream)
-    case(3)
-      istat = cudecompUpdateHalosZ(ch,gd,p,work,dtype,[nh,nh,nh],periods,1,stream=istream)
-      istat = cudecompUpdateHalosZ(ch,gd,p,work,dtype,[nh,nh,nh],periods,2,stream=istream)
-    end select
-    !$acc end host_data
+    !!!PC !$acc host_data use_device(p,work)
+    !!!PC select case(ipencil_axis)
+    !!!PC case(1)
+    !!!PC   istat = cudecompUpdateHalosX(ch,gd,p,work,dtype,[nh,nh,nh],periods,2,stream=istream)
+    !!!PC   istat = cudecompUpdateHalosX(ch,gd,p,work,dtype,[nh,nh,nh],periods,3,stream=istream)
+    !!!PC case(2)
+    !!!PC   istat = cudecompUpdateHalosY(ch,gd,p,work,dtype,[nh,nh,nh],periods,1,stream=istream)
+    !!!PC   istat = cudecompUpdateHalosY(ch,gd,p,work,dtype,[nh,nh,nh],periods,3,stream=istream)
+    !!!PC case(3)
+    !!!PC   istat = cudecompUpdateHalosZ(ch,gd,p,work,dtype,[nh,nh,nh],periods,1,stream=istream)
+    !!!PC   istat = cudecompUpdateHalosZ(ch,gd,p,work,dtype,[nh,nh,nh],periods,2,stream=istream)
+    !!!PC end select
+    !!!PC !$acc end host_data
   end subroutine updthalo_gpu
 #endif
 end module mod_bound
diff --git a/src/common_cudecomp.f90 b/src/common_cudecomp.f90
index 47985d1f..b89dbab9 100644
--- a/src/common_cudecomp.f90
+++ b/src/common_cudecomp.f90
@@ -8,22 +8,22 @@ module mod_common_cudecomp
 #if defined(_OPENACC)
   use mod_types
   !@cuf use cudafor
-  use cudecomp
+  !!!PC use cudecomp
   use openacc
   use mod_param, only: cudecomp_is_t_in_place
   implicit none
   public
-  integer :: cudecomp_real_rp
-  type(cudecompHandle)     :: handle
-  type(cudecompGridDesc)   :: gd_halo,gd_poi
-  type(cudecompPencilInfo) :: ap_x,ap_y,ap_z,ap_x_poi,ap_y_poi,ap_z_poi
+!!!PC  integer :: cudecomp_real_rp
+!!!PC  type(cudecompHandle)     :: handle
+!!!PC  type(cudecompGridDesc)   :: gd_halo,gd_poi
+!!!PC  type(cudecompPencilInfo) :: ap_x,ap_y,ap_z,ap_x_poi,ap_y_poi,ap_z_poi
   !
   ! workspace stuff
   !
   integer(i8) :: wsize_fft
   real(rp), pointer, contiguous, dimension(:) :: work     ,work_cuda
-  real(rp), pointer, contiguous, dimension(:) :: work_halo,work_halo_cuda
-  !@cuf attributes(device) :: work_cuda,work_halo_cuda
+!!!PC  real(rp), pointer, contiguous, dimension(:) :: work_halo,work_halo_cuda
+  !@cuf attributes(device) :: work_cuda !!!PC,work_halo_cuda
   real(rp), target, allocatable, dimension(:) :: solver_buf_0,solver_buf_1
 #if !defined(_IMPDIFF_1D)
   real(rp), allocatable, dimension(:,:,:) :: pz_aux_1,pz_aux_2
diff --git a/src/initmpi.f90 b/src/initmpi.f90
index a689b489..0adcd216 100644
--- a/src/initmpi.f90
+++ b/src/initmpi.f90
@@ -10,8 +10,8 @@ module mod_initmpi
   use mod_common_mpi, only: myid,ierr,halo,ipencil => ipencil_axis
   use mod_types
   !@acc use openacc
-  !@acc use cudecomp
-  !@cuf use cudafor, only: cudaGetDeviceCount,cudaSetDevice
+  !!!PC !@acc use cudecomp
+  !!!PC !@cuf use cudafor, only: cudaGetDeviceCount,cudaSetDevice
 #if defined(_OPENACC)
   use mod_common_cudecomp, only: cudecomp_real_rp, &
                                  ch => handle,gd => gd_halo,gd_poi, &
@@ -40,8 +40,8 @@ subroutine initmpi(ng,dims,bc,lo,hi,n,n_x_fft,n_y_fft,lo_z,hi_z,n_z,nb,is_bound)
     integer(acc_device_kind) ::dev_type
     integer :: local_comm,mydev,ndev
     integer :: istat
-    type(cudecompGridDescConfig)          :: conf,conf_poi
-    type(cudecompGridDescAutotuneOptions) :: atune_conf
+!!!PC    type(cudecompGridDescConfig)          :: conf,conf_poi
+!!!PC    type(cudecompGridDescAutotuneOptions) :: atune_conf
 #else
     integer :: comm_cart
 #endif
@@ -53,7 +53,7 @@ subroutine initmpi(ng,dims,bc,lo,hi,n,n_x_fft,n_y_fft,lo_z,hi_z,n_z,nb,is_bound)
     call MPI_COMM_SPLIT_TYPE(MPI_COMM_WORLD,MPI_COMM_TYPE_SHARED,0,MPI_INFO_NULL,local_comm,ierr)
     call MPI_COMM_RANK(local_comm,mydev,ierr)
     dev_type = acc_get_device_type()
-#if 1
+#if 0
     istat = cudaGetDeviceCount(ndev)      ! may be tweaked with environment variable CUDA_VISIBLE_DEVICES
     mydev = mod(mydev,ndev)
     istat = cudaSetDevice(mydev)
@@ -65,52 +65,52 @@ subroutine initmpi(ng,dims,bc,lo,hi,n,n_x_fft,n_y_fft,lo_z,hi_z,n_z,nb,is_bound)
     call acc_set_device_num(mydev,dev_type)
     call acc_init(dev_type)
     !
-    istat = cudecompInit(ch,MPI_COMM_WORLD)
+    !!!PCistat = cudecompInit(ch,MPI_COMM_WORLD)
     !
     ! setup descriptor for the Poisson solver
     !
-    istat = cudecompGridDescConfigSetDefaults(conf)
-    conf%transpose_comm_backend = cudecomp_t_comm_backend
-    conf%transpose_axis_contiguous(:) = [.true.,.true.,.false.]
-    conf%gdims(:)      = [2*(ng(1)/2+1),2*(ng(2)/2+1),ng(3)]
-    conf%pdims(:)      = dims(1:2)
-    conf%gdims_dist(:) = ng(:)
-    istat = cudecompGridDescAutotuneOptionsSetDefaults(atune_conf)
-    if(rp == dp) then
-      cudecomp_real_rp = CUDECOMP_DOUBLE
-    else
-      cudecomp_real_rp = CUDECOMP_FLOAT
-    end if
-    atune_conf%dtype = cudecomp_real_rp
-    atune_conf%grid_mode = CUDECOMP_AUTOTUNE_GRID_TRANSPOSE
-    atune_conf%autotune_transpose_backend = cudecomp_is_t_comm_autotune
-    atune_conf%disable_nccl_backends    = .not.cudecomp_is_t_enable_nccl
-    atune_conf%disable_nvshmem_backends = .not.cudecomp_is_t_enable_nvshmem
-    istat = cudecompGridDescCreate(ch,gd_poi,conf,atune_conf)
-    conf_poi = conf
-    dims(:) = conf%pdims
-    !
-    ! setup descriptor for halo exchanges
-    !
-    istat = cudecompGridDescConfigSetDefaults(conf)
-    conf%gdims(:)      = ng(:)
-    conf%pdims(:)      = dims(1:2)
-    conf%halo_comm_backend = cudecomp_h_comm_backend
-    conf%transpose_axis_contiguous(:) = .false.
-    istat = cudecompGridDescAutotuneOptionsSetDefaults(atune_conf)
-    atune_conf%halo_extents(:) = 1
-    atune_conf%halo_periods(:) = periods(:)
-    atune_conf%dtype = cudecomp_real_rp
-    atune_conf%autotune_halo_backend = cudecomp_is_h_comm_autotune
-    atune_conf%disable_nccl_backends    = .not.cudecomp_is_t_enable_nccl
-    atune_conf%disable_nvshmem_backends = .not.cudecomp_is_t_enable_nvshmem
-    if(all(conf_poi%transpose_comm_backend /= [CUDECOMP_TRANSPOSE_COMM_NVSHMEM,CUDECOMP_TRANSPOSE_COMM_NVSHMEM_PL])) then
-      !
-      ! disable NVSHMEM halo backend autotuning when NVSHMEM is NOT used for transposes
-      !
-      atune_conf%disable_nvshmem_backends = .true.
-    end if
-    istat = cudecompGridDescCreate(ch,gd,conf,atune_conf)
+    !!!PC istat = cudecompGridDescConfigSetDefaults(conf)
+    !!!PC conf%transpose_comm_backend = cudecomp_t_comm_backend
+    !!!PC conf%transpose_axis_contiguous(:) = [.true.,.true.,.false.]
+    !!!PC conf%gdims(:)      = [2*(ng(1)/2+1),2*(ng(2)/2+1),ng(3)]
+    !!!PC conf%pdims(:)      = dims(1:2)
+    !!!PC conf%gdims_dist(:) = ng(:)
+    !!!PC istat = cudecompGridDescAutotuneOptionsSetDefaults(atune_conf)
+    !!!PC if(rp == dp) then
+    !!!PC   cudecomp_real_rp = CUDECOMP_DOUBLE
+    !!!PC else
+    !!!PC   cudecomp_real_rp = CUDECOMP_FLOAT
+    !!!PC end if
+    !!!PC atune_conf%dtype = cudecomp_real_rp
+    !!!PC atune_conf%grid_mode = CUDECOMP_AUTOTUNE_GRID_TRANSPOSE
+    !!!PC atune_conf%autotune_transpose_backend = cudecomp_is_t_comm_autotune
+    !!!PC atune_conf%disable_nccl_backends    = .not.cudecomp_is_t_enable_nccl
+    !!!PC atune_conf%disable_nvshmem_backends = .not.cudecomp_is_t_enable_nvshmem
+    !!!PC istat = cudecompGridDescCreate(ch,gd_poi,conf,atune_conf)
+    !!!PC conf_poi = conf
+    !!!PC dims(:) = conf%pdims
+    !!!PC !
+    !!!PC ! setup descriptor for halo exchanges
+    !!!PC !
+    !!!PC istat = cudecompGridDescConfigSetDefaults(conf)
+    !!!PC conf%gdims(:)      = ng(:)
+    !!!PC conf%pdims(:)      = dims(1:2)
+    !!!PC conf%halo_comm_backend = cudecomp_h_comm_backend
+    !!!PC conf%transpose_axis_contiguous(:) = .false.
+    !!!PC istat = cudecompGridDescAutotuneOptionsSetDefaults(atune_conf)
+    !!!PC atune_conf%halo_extents(:) = 1
+    !!!PC atune_conf%halo_periods(:) = periods(:)
+    !!!PC atune_conf%dtype = cudecomp_real_rp
+    !!!PC atune_conf%autotune_halo_backend = cudecomp_is_h_comm_autotune
+    !!!PC atune_conf%disable_nccl_backends    = .not.cudecomp_is_t_enable_nccl
+    !!!PC atune_conf%disable_nvshmem_backends = .not.cudecomp_is_t_enable_nvshmem
+    !!!PC if(all(conf_poi%transpose_comm_backend /= [CUDECOMP_TRANSPOSE_COMM_NVSHMEM,CUDECOMP_TRANSPOSE_COMM_NVSHMEM_PL])) then
+    !!!PC   !
+    !!!PC   ! disable NVSHMEM halo backend autotuning when NVSHMEM is NOT used for transposes
+    !!!PC   !
+    !!!PC   atune_conf%disable_nvshmem_backends = .true.
+    !!!PC end if
+    !!!PC istat = cudecompGridDescCreate(ch,gd,conf,atune_conf)
 #endif
     call decomp_2d_init(ng(1),ng(2),ng(3),dims(1),dims(2),periods)
 #if !defined(_DECOMP_Y) && !defined(_DECOMP_Z)
@@ -123,40 +123,51 @@ subroutine initmpi(ng,dims,bc,lo,hi,n,n_x_fft,n_y_fft,lo_z,hi_z,n_z,nb,is_bound)
     ipencil_t(:) = pack([1,2,3],[1,2,3] /= ipencil)
     is_bound(:,:) = .false.
 #if defined(_OPENACC)
-    !
-    ! fetch lo(:), hi(:), n(:) and n_z(:) from cuDecomp (should match the modified 2decomp one)
-    !
-    istat = cudecompGetPencilInfo(ch,gd    ,ap_x    ,1)
-    istat = cudecompGetPencilInfo(ch,gd    ,ap_y    ,2)
-    istat = cudecompGetPencilInfo(ch,gd    ,ap_z    ,3)
-    istat = cudecompGetPencilInfo(ch,gd_poi,ap_x_poi,1)
-    istat = cudecompGetPencilInfo(ch,gd_poi,ap_y_poi,2)
-    istat = cudecompGetPencilInfo(ch,gd_poi,ap_z_poi,3)
-    select case(ipencil)
-    case(1)
-      lo(:) = ap_x%lo(:)
-      hi(:) = ap_x%hi(:)
-    case(2)
-      lo(:) = ap_y%lo(:)
-      hi(:) = ap_y%hi(:)
-    case(3)
-      lo(:) = ap_z%lo(:)
-      hi(:) = ap_z%hi(:)
-    end select
-    n(:)       = hi(:)-lo(:)+1
-    n_x_fft(:) = ap_x_poi%shape(:)
-    n_y_fft(:) = ap_y_poi%shape(:)
-    lo_z(:)    = ap_z%lo(:)
-    hi_z(:)    = ap_z%hi(:)
-    n_z(:)     = ap_z%shape(:)
-    nb(:,ipencil) = CUDECOMP_RANK_NULL
-    associate(ip_t => ipencil_t)
-      istat = cudecompGetShiftedRank(ch,gd,ipencil,ip_t(1),-1,periods(ip_t(1)),nb(0,ip_t(1)))
-      istat = cudecompGetShiftedRank(ch,gd,ipencil,ip_t(1), 1,periods(ip_t(1)),nb(1,ip_t(1)))
-      istat = cudecompGetShiftedRank(ch,gd,ipencil,ip_t(2),-1,periods(ip_t(2)),nb(0,ip_t(2)))
-      istat = cudecompGetShiftedRank(ch,gd,ipencil,ip_t(2), 1,periods(ip_t(2)),nb(1,ip_t(2)))
-    end associate
-    where(nb(:,:) == CUDECOMP_RANK_NULL) is_bound(:,:) = .true.
+    !!!PC !
+    !!!PC ! fetch lo(:), hi(:), n(:) and n_z(:) from cuDecomp (should match the modified 2decomp one)
+    !!!PC !
+    !!!PC istat = cudecompGetPencilInfo(ch,gd    ,ap_x    ,1)
+    !!!PC istat = cudecompGetPencilInfo(ch,gd    ,ap_y    ,2)
+    !!!PC istat = cudecompGetPencilInfo(ch,gd    ,ap_z    ,3)
+    !!!PC istat = cudecompGetPencilInfo(ch,gd_poi,ap_x_poi,1)
+    !!!PC istat = cudecompGetPencilInfo(ch,gd_poi,ap_y_poi,2)
+    !!!PC istat = cudecompGetPencilInfo(ch,gd_poi,ap_z_poi,3)
+    !!!PC select case(ipencil)
+    !!!PC case(1)
+    !!!PC   lo(:) = ap_x%lo(:)
+    !!!PC   hi(:) = ap_x%hi(:)
+    !!!PC case(2)
+    !!!PC   lo(:) = ap_y%lo(:)
+    !!!PC   hi(:) = ap_y%hi(:)
+    !!!PC case(3)
+    !!!PC   lo(:) = ap_z%lo(:)
+    !!!PC   hi(:) = ap_z%hi(:)
+    !!!PC end select
+    !!!PC n(:)       = hi(:)-lo(:)+1
+    !!!PC n_x_fft(:) = ap_x_poi%shape(:)
+    !!!PC n_y_fft(:) = ap_y_poi%shape(:)
+    !!!PC lo_z(:)    = ap_z%lo(:)
+    !!!PC hi_z(:)    = ap_z%hi(:)
+    !!!PC n_z(:)     = ap_z%shape(:)
+    !!!PC nb(:,ipencil) = CUDECOMP_RANK_NULL
+    !!!PC associate(ip_t => ipencil_t)
+    !!!PC   istat = cudecompGetShiftedRank(ch,gd,ipencil,ip_t(1),-1,periods(ip_t(1)),nb(0,ip_t(1)))
+    !!!PC   istat = cudecompGetShiftedRank(ch,gd,ipencil,ip_t(1), 1,periods(ip_t(1)),nb(1,ip_t(1)))
+    !!!PC   istat = cudecompGetShiftedRank(ch,gd,ipencil,ip_t(2),-1,periods(ip_t(2)),nb(0,ip_t(2)))
+    !!!PC   istat = cudecompGetShiftedRank(ch,gd,ipencil,ip_t(2), 1,periods(ip_t(2)),nb(1,ip_t(2)))
+    !!!PC end associate
+    !!!PC where(nb(:,:) == CUDECOMP_RANK_NULL) is_bound(:,:) = .true.
+    lo(:) = 1                                                   !!!PC
+    hi(:) = ng(:)                                               !!!PC
+    n(:)  = hi(:)-lo(:)+1                                       !!!PC
+    n_x_fft(:) = [2*(ng(1)/2+1),2*(ng(2)/2+1),ng(3)]            !!!PC
+    n_y_fft(:) = [2*(ng(2)/2+1),ng(3),2*(ng(1)/2+1)]            !!!PC
+    lo_z(:)    = lo(:)                                          !!!PC
+    hi_z(:)    = hi(:)                                          !!!PC
+    n_z(:)     = n(:)                                           !!!PC
+    nb(:,ipencil) = CUDECOMP_RANK_NULL                          !!!PC
+    where(nb(:,:) == CUDECOMP_RANK_NULL) is_bound(:,:) = .true. !!!PC
+    is_bound(:,:) = .true.                                      !!!PC
 #else
     select case(ipencil)
     case(1)
diff --git a/src/param.f90 b/src/param.f90
index 80b319a1..62d95af8 100644
--- a/src/param.f90
+++ b/src/param.f90
@@ -6,7 +6,7 @@
 ! -
 module mod_param
 use mod_types
-!@acc use cudecomp
+!!@acc use cudecomp
 implicit none
 public
 !
@@ -112,7 +112,7 @@ subroutine read_input(myid)
     dl(:) = l(:)/(1.*ng(:))
     dli(:) = dl(:)**(-1)
     visc = visci**(-1)
-#if defined(_OPENACC)
+#if 0
     !
     ! read cuDecomp parameter file cudecomp.in, if it exists
     !
diff --git a/src/solver_gpu.f90 b/src/solver_gpu.f90
index 4d80b5dd..6f32372c 100644
--- a/src/solver_gpu.f90
+++ b/src/solver_gpu.f90
@@ -7,19 +7,19 @@
 module mod_solver_gpu
 #if defined(_OPENACC)
   use, intrinsic :: iso_c_binding, only: C_PTR
-  use cudecomp
+  !!!PC use cudecomp
   use mod_fft  ,  only: signal_processing,fftf_gpu,fftb_gpu
   use mod_types
   use mod_common_mpi     , only: ipencil_axis
-  use mod_common_cudecomp, only: dtype_rp => cudecomp_real_rp, &
+  use mod_common_cudecomp, only:  & !!!PC dtype_rp => cudecomp_real_rp, &
                                  cudecomp_is_t_in_place, &
                                  solver_buf_0,solver_buf_1, &
                                  pz_aux_1,pz_aux_2,work, &
-                                 ap_x   => ap_x_poi, &
-                                 ap_y   => ap_y_poi, &
-                                 ap_z   => ap_z_poi, &
-                                 ap_z_0 => ap_z    , &
-                                 ch => handle,gd => gd_poi, &
+                                 !ap_x   => ap_x_poi, &
+                                 !ap_y   => ap_y_poi, &
+                                 !ap_z   => ap_z_poi, &
+                                 !ap_z_0 => ap_z    , &
+                                 !ch => handle,gd => gd_poi, &
                                  istream => istream_acc_queue_1
   implicit none
   private
@@ -42,11 +42,19 @@ subroutine solver_gpu(n,ng,arrplan,normfft,lambdaxy,a,b,c,bc,c_or_f,p)
     integer :: q
     integer, dimension(3) :: n_x,n_y,n_z,n_z_0
     integer :: istat
+    integer :: i,j,k,n_x_1,n_x_2,n_x_3 !!!PC
     !
-    n_z_0(:) = ap_z_0%shape(:)
-    n_x(:) = ap_x%shape(:)
-    n_y(:) = ap_y%shape(:)
-    n_z(:) = ap_z%shape(:)
+    n_z_0(:) = ng(:) !!!PC
+    n_x(:) = [2*(ng(1)/2+1),2*(ng(2)/2+1),ng(3)] !!!PC
+    n_y(:) = [2*(ng(2)/2+1),ng(3),2*(ng(1)/2+1)] !!!PC
+    n_z(:) = [2*(ng(1)/2+1),2*(ng(2)/2+1),ng(3)] !!!PC
+    n_x_1 = n_x(1) !!!PC
+    n_x_2 = n_x(2) !!!PC
+    n_x_3 = n_x(3) !!!PC
+    !!!PC n_z_0(:) = ap_z_0%shape(:)
+    !!!PC n_x(:) = ap_x%shape(:)
+    !!!PC n_y(:) = ap_y%shape(:)
+    !!!PC n_z(:) = ap_z%shape(:)
     px(1:n_x(1),1:n_x(2),1:n_x(3)) => solver_buf_0(1:product(n_x(:)))
     if(cudecomp_is_t_in_place) then
       py(1:n_y(1),1:n_y(2),1:n_y(3)) => solver_buf_0(1:product(n_y(:)))
@@ -64,7 +72,7 @@ subroutine solver_gpu(n,ng,arrplan,normfft,lambdaxy,a,b,c,bc,c_or_f,p)
       !$acc end kernels
     case(2)
       block
-        integer :: i,j,k
+        !!!PC integer :: i,j,k
         !
         ! transpose p -> py to axis-contiguous layout
         !
@@ -78,36 +86,81 @@ subroutine solver_gpu(n,ng,arrplan,normfft,lambdaxy,a,b,c,bc,c_or_f,p)
           end do
         end do
       end block
-      !$acc host_data use_device(py,px,work)
-      istat = cudecompTransposeYtoX(ch,gd,py,px,work,dtype_rp,stream=istream)
-      !$acc end host_data
+      !!!PC !$acc host_data use_device(py,px,work)
+      !!!PC istat = cudecompTransposeYtoX(ch,gd,py,px,work,dtype_rp,stream=istream)
+      !!!PC !$acc end host_data
+      !$acc parallel loop collapse(3) default(present) async(1) !!!PC
+      !$OMP PARALLEL DO   COLLAPSE(3) DEFAULT(shared)           !!!PC
+      do k=1,n_x_3                                              !!!PC
+        do j=1,n_x_2                                            !!!PC
+          do i=1,n_x_1                                          !!!PC
+            px(i,j,k) = py(j,k,i)                               !!!PC
+          end do                                                !!!PC
+        end do                                                  !!!PC
+      end do                                                    !!!PC
     case(3)
       !$acc kernels default(present) async(1)
       !$OMP PARALLEL WORKSHARE
       pz(1:n(1),1:n(2),1:n(3)) = p(1:n(1),1:n(2),1:n(3))
       !$OMP END PARALLEL WORKSHARE
       !$acc end kernels
-      !$acc host_data use_device(pz,py,px,work)
-      istat = cudecompTransposeZtoY(ch,gd,pz,py,work,dtype_rp,stream=istream)
-      istat = cudecompTransposeYtoX(ch,gd,py,px,work,dtype_rp,stream=istream)
-      !$acc end host_data
+      !!!PC !$acc host_data use_device(pz,py,px,work)
+      !!!PC istat = cudecompTransposeZtoY(ch,gd,pz,py,work,dtype_rp,stream=istream)
+      !!!PC istat = cudecompTransposeYtoX(ch,gd,py,px,work,dtype_rp,stream=istream)
+      !!!PC !$acc end host_data
+      !$acc parallel loop collapse(3) default(present) async(1) !!!PC
+      !$OMP PARALLEL DO   COLLAPSE(3) DEFAULT(shared)           !!!PC
+      do k=1,n_x_3                                              !!!PC
+        do j=1,n_x_2                                            !!!PC
+          do i=1,n_x_1                                          !!!PC
+            py(j,k,i) = pz(i,j,k)                               !!!PC
+          end do                                                !!!PC
+        end do                                                  !!!PC
+      end do                                                    !!!PC
+      !$acc parallel loop collapse(3) default(present) async(1) !!!PC
+      !$OMP PARALLEL DO   COLLAPSE(3) DEFAULT(shared)           !!!PC
+      do k=1,n_x_3                                              !!!PC
+        do j=1,n_x_2                                            !!!PC
+          do i=1,n_x_1                                          !!!PC
+            px(i,j,k) = py(j,k,i)                               !!!PC
+          end do                                                !!!PC
+        end do                                                  !!!PC
+      end do                                                    !!!PC
     end select
     !
     call signal_processing(0,'F',bc(0,1)//bc(1,1),c_or_f(1),ng(1),n_x,1,px)
     call fftf_gpu(arrplan(1,1),px)
     call signal_processing(1,'F',bc(0,1)//bc(1,1),c_or_f(1),ng(1),n_x,1,px)
     !
-    !$acc host_data use_device(px,py,work)
-    istat = cudecompTransposeXtoY(ch,gd,px,py,work,dtype_rp,stream=istream)
-    !$acc end host_data
+    !!!PC !$acc host_data use_device(px,py,work)
+    !!!PC istat = cudecompTransposeXtoY(ch,gd,px,py,work,dtype_rp,stream=istream)
+    !!!PC !$acc end host_data
+    !$acc parallel loop collapse(3) default(present) async(1) !!!PC
+    !$OMP PARALLEL DO   COLLAPSE(3) DEFAULT(shared)           !!!PC
+    do k=1,n_x_3                                              !!!PC
+      do j=1,n_x_2                                            !!!PC
+        do i=1,n_x_1                                          !!!PC
+          py(j,k,i) = px(i,j,k)                               !!!PC
+        end do                                                !!!PC
+      end do                                                  !!!PC
+    end do                                                    !!!PC
     !
     call signal_processing(0,'F',bc(0,2)//bc(1,2),c_or_f(2),ng(2),n_y,1,py)
     call fftf_gpu(arrplan(1,2),py)
     call signal_processing(1,'F',bc(0,2)//bc(1,2),c_or_f(2),ng(2),n_y,1,py)
     !
-    !$acc host_data use_device(py,pz,work)
-    istat = cudecompTransposeYtoZ(ch,gd,py,pz,work,dtype_rp,stream=istream)
-    !$acc end host_data
+    !!!PC !$acc host_data use_device(py,pz,work)
+    !!!PC istat = cudecompTransposeYtoZ(ch,gd,py,pz,work,dtype_rp,stream=istream)
+    !!!PC !$acc end host_data
+    !$acc parallel loop collapse(3) default(present) async(1) !!!PC
+    !$OMP PARALLEL DO   COLLAPSE(3) DEFAULT(shared)           !!!PC
+    do k=1,n_x_3                                              !!!PC
+      do j=1,n_x_2                                            !!!PC
+        do i=1,n_x_1                                          !!!PC
+          pz(i,j,k) = py(j,k,i)                               !!!PC
+        end do                                                !!!PC
+      end do                                                  !!!PC
+    end do                                                    !!!PC
     !
     q = 0
     if(c_or_f(3) == 'f'.and.bc(1,3) == 'D') q = 1
@@ -117,17 +170,35 @@ subroutine solver_gpu(n,ng,arrplan,normfft,lambdaxy,a,b,c,bc,c_or_f,p)
       call gaussel_gpu(         n_z_0(1),n_z_0(2),n_z_0(3)-q,0,a,b,c,pz,work,lambdaxy)
     end if
     !
-    !$acc host_data use_device(pz,py,work)
-    istat = cudecompTransposeZtoY(ch,gd,pz,py,work,dtype_rp,stream=istream)
-    !$acc end host_data
+    !!!PC !$acc host_data use_device(pz,py,work)
+    !!!PC istat = cudecompTransposeZtoY(ch,gd,pz,py,work,dtype_rp,stream=istream)
+    !!!PC !$acc end host_data
+    !$acc parallel loop collapse(3) default(present) async(1) !!!PC
+    !$OMP PARALLEL DO   COLLAPSE(3) DEFAULT(shared)           !!!PC
+    do k=1,n_x_3                                              !!!PC
+      do j=1,n_x_2                                            !!!PC
+        do i=1,n_x_1                                          !!!PC
+          py(j,k,i) = pz(i,j,k)                               !!!PC
+        end do                                                !!!PC
+      end do                                                  !!!PC
+    end do                                                    !!!PC
     !
     call signal_processing(0,'B',bc(0,2)//bc(1,2),c_or_f(2),ng(2),n_y,1,py)
     call fftb_gpu(arrplan(2,2),py)
     call signal_processing(1,'B',bc(0,2)//bc(1,2),c_or_f(2),ng(2),n_y,1,py)
     !
-    !$acc host_data use_device(py,px,work)
-    istat = cudecompTransposeYtoX(ch,gd,py,px,work,dtype_rp,stream=istream)
-    !$acc end host_data
+    !!!PC !$acc host_data use_device(py,px,work)
+    !!!PC istat = cudecompTransposeYtoX(ch,gd,py,px,work,dtype_rp,stream=istream)
+    !!!PC !$acc end host_data
+    !$acc parallel loop collapse(3) default(present) async(1) !!!PC
+    !$OMP PARALLEL DO   COLLAPSE(3) DEFAULT(shared)           !!!PC
+    do k=1,n_x_3                                              !!!PC
+      do j=1,n_x_2                                            !!!PC
+        do i=1,n_x_1                                          !!!PC
+          px(i,j,k) = py(j,k,i)                               !!!PC
+        end do                                                !!!PC
+      end do                                                  !!!PC
+    end do                                                    !!!PC
     !
     call signal_processing(0,'B',bc(0,1)//bc(1,1),c_or_f(1),ng(1),n_x,1,px)
     call fftb_gpu(arrplan(2,1),px)
@@ -141,11 +212,20 @@ subroutine solver_gpu(n,ng,arrplan,normfft,lambdaxy,a,b,c,bc,c_or_f,p)
       !$OMP END PARALLEL WORKSHARE
       !$acc end kernels
     case(2)
-      !$acc host_data use_device(px,py,work)
-      istat = cudecompTransposeXtoY(ch,gd,px,py,work,dtype_rp,stream=istream)
-      !$acc end host_data
+      !!!PC   !$acc host_data use_device(px,py,work)
+      !!!PC   istat = cudecompTransposeXtoY(ch,gd,px,py,work,dtype_rp,stream=istream)
+      !!!PC   !$acc end host_data
+      !$acc parallel loop collapse(3) default(present) async(1) !!!PC
+      !$OMP PARALLEL DO   COLLAPSE(3) DEFAULT(shared)           !!!PC
+      do k=1,n_x_3                                              !!!PC
+        do j=1,n_x_2                                            !!!PC
+          do i=1,n_x_1                                          !!!PC
+            py(j,k,i) = px(i,j,k)                               !!!PC
+          end do                                                !!!PC
+        end do                                                  !!!PC
+      end do                                                    !!!PC
       block
-        integer :: i,j,k
+        !!!PC integer :: i,j,k
         !
         ! transpose py -> p to default layout
         !
@@ -160,10 +240,28 @@ subroutine solver_gpu(n,ng,arrplan,normfft,lambdaxy,a,b,c,bc,c_or_f,p)
         end do
       end block
     case(3)
-      !$acc host_data use_device(px,py,pz,work)
-      istat = cudecompTransposeXtoY(ch,gd,px,py,work,dtype_rp,stream=istream)
-      istat = cudecompTransposeYtoZ(ch,gd,py,pz,work,dtype_rp,stream=istream)
-      !$acc end host_data
+      !!!PC !$acc host_data use_device(px,py,pz,work)
+      !!!PC istat = cudecompTransposeXtoY(ch,gd,px,py,work,dtype_rp,stream=istream)
+      !!!PC istat = cudecompTransposeYtoZ(ch,gd,py,pz,work,dtype_rp,stream=istream)
+      !!!PC !$acc end host_data
+      !$acc parallel loop collapse(3) default(present) async(1) !!!PC
+      !$OMP PARALLEL DO   COLLAPSE(3) DEFAULT(shared)           !!!PC
+      do k=1,n_x_3                                              !!!PC
+        do j=1,n_x_2                                            !!!PC
+          do i=1,n_x_1                                          !!!PC
+            py(j,k,i) = px(i,j,k)                               !!!PC
+          end do                                                !!!PC
+        end do                                                  !!!PC
+      end do                                                    !!!PC
+      !$acc parallel loop collapse(3) default(present) async(1) !!!PC
+      !$OMP PARALLEL DO   COLLAPSE(3) DEFAULT(shared)           !!!PC
+      do k=1,n_x_3                                              !!!PC
+        do j=1,n_x_2                                            !!!PC
+          do i=1,n_x_1                                          !!!PC
+            pz(i,j,k) = py(j,k,i)                               !!!PC
+          end do                                                !!!PC
+        end do                                                  !!!PC
+      end do                                                    !!!PC
       !$acc kernels default(present) async(1)
       !$OMP PARALLEL WORKSHARE
       p(1:n(1),1:n(2),1:n(3)) = pz(1:n(1),1:n(2),1:n(3))*normfft
diff --git a/src/utils.f90 b/src/utils.f90
index 849c1316..c634302b 100644
--- a/src/utils.f90
+++ b/src/utils.f90
@@ -116,8 +116,9 @@ function device_memory_footprint(n,n_z) result(itotal)
     !    taken directly from `mod_common_cudecomp`
     !
     block
-      use mod_common_cudecomp, only: work,work_halo,solver_buf_0,solver_buf_1,pz_aux_1,pz_aux_2
-      itemp = storage_size(work        ,i8)*size(work        ) + storage_size(work_halo   ,i8)*size(work_halo   ) + &
+      !!!PC use mod_common_cudecomp, only: work,work_halo,solver_buf_0,solver_buf_1,pz_aux_1,pz_aux_2
+      use mod_common_cudecomp, only: work,solver_buf_0,solver_buf_1,pz_aux_1,pz_aux_2 !!!PC
+      itemp = storage_size(work        ,i8)*size(work        ) + & !!!PC storage_size(work_halo   ,i8)*size(work_halo   ) + &
               storage_size(solver_buf_0,i8)*size(solver_buf_0) + storage_size(solver_buf_1,i8)*size(solver_buf_1) + &
               storage_size(pz_aux_1    ,i8)*size(pz_aux_1    ) + storage_size(pz_aux_2    ,i8)*size(pz_aux_2    )
       itotal = itotal + itemp/8
diff --git a/src/workspaces.f90 b/src/workspaces.f90
index e33f5603..fdf8a08c 100644
--- a/src/workspaces.f90
+++ b/src/workspaces.f90
@@ -7,62 +7,70 @@
 module mod_workspaces
 #if defined(_OPENACC)
   use mod_types
-  use mod_common_cudecomp, only: work,work_cuda,work_halo,work_halo_cuda
+  use mod_common_cudecomp, only: work,work_cuda!!!PC,work_halo,work_halo_cuda
   use mod_utils, only: f_sizeof
   implicit none
   private
   public init_wspace_arrays,set_cufft_wspace
 contains
   subroutine init_wspace_arrays
-    use mod_common_cudecomp, only: handle,gd_halo,gd_poi     , &
-                                   ap_x_poi,ap_y_poi,ap_z_poi, &
+    use mod_common_cudecomp, only: & !!!PC handle,gd_halo,gd_poi     , &
+                                     !!!PC ap_x_poi,ap_y_poi,ap_z_poi, &
                                    solver_buf_0,solver_buf_1, &
-                                   ap_z,pz_aux_1,pz_aux_2, &
+                                   !!!PC ap_z,pz_aux_1,pz_aux_2, &
+                                   pz_aux_1,pz_aux_2, & !!!PC
                                    istream_acc_queue_1
     use mod_common_mpi     , only: ipencil => ipencil_axis
     use mod_fft            , only: wsize_fft
     use mod_param          , only: cudecomp_is_t_in_place,cbcpre
-    use cudecomp
+    use mod_param          , only: ng !!!PC
+!!!PC    use cudecomp
     use openacc
+    !!!PC use cudafor !!!PC
     implicit none
     integer :: istat
     integer(i8) :: wsize,max_wsize
-    integer :: nh(3)
+!!!PC    integer :: nh(3)
     !
     ! allocate cuDecomp workspace buffer for transposes (reused for FFTs)
     !
     max_wsize = -1
     wsize     = wsize_fft
     max_wsize = max(max_wsize,wsize)
-    istat = cudecompGetTransposeWorkspaceSize(handle,gd_poi,wsize)
-    max_wsize = max(max_wsize,wsize)
+!!!PC    istat = cudecompGetTransposeWorkspaceSize(handle,gd_poi,wsize)
+!!!PC    max_wsize = max(max_wsize,wsize)
     !
     ! one can append more checks here (e.g., for spectra calculation)
     !
     allocate(work(max_wsize))
-    istat = cudecompMalloc(handle,gd_poi,work_cuda,max_wsize)
+!!!PC    istat = cudecompMalloc(handle,gd_poi,work_cuda,max_wsize)
+    allocate(work_cuda,mold=work) !!!PC
+    !istat = cudaMalloc(c_devloc(work_cuda),max_wsize*f_sizeof(work(1))) !!!PC
     call acc_map_data(work,work_cuda,max_wsize*f_sizeof(work(1)))
     !
     ! allocate cuDecomp workspace buffer for halos
     ! (needs to be different due to the possible need of an NVSHMEM allocator
     !  in one of the descriptors, rather than a simple cudaMaloc)
     !
-    nh(:) = 1
-    istat = cudecompGetHaloWorkspaceSize(handle,gd_halo,ipencil,nh,max_wsize)
-    allocate(work_halo(max_wsize))
+!!!PC    nh(:) = 1
+!!!PC    istat = cudecompGetHaloWorkspaceSize(handle,gd_halo,ipencil,nh,max_wsize)
+!!!PC    allocate(work_halo(max_wsize))
     !
-    istat = cudecompMalloc(handle,gd_halo,work_halo_cuda,max_wsize)
-    call acc_map_data(work_halo,work_halo_cuda,max_wsize*f_sizeof(work_halo(1)))
+!!!PC    istat = cudecompMalloc(handle,gd_halo,work_halo_cuda,max_wsize)
+!!!PC    call acc_map_data(work_halo,work_halo_cuda,max_wsize*f_sizeof(work_halo(1)))
     !
     ! allocate transpose buffers
     !
-    wsize = max(ap_x_poi%size,ap_y_poi%size,ap_z_poi%size)
+!!!PC    wsize = max(ap_x_poi%size,ap_y_poi%size,ap_z_poi%size)
+    wsize = product([2*(ng(1)/2+1),2*(ng(2)/2+1),ng(3)]) !!!PC
     allocate(solver_buf_0(wsize))
     if(.not.cudecomp_is_t_in_place) allocate(solver_buf_1,mold=solver_buf_0)
     !$acc enter data create(solver_buf_0,solver_buf_1)
     if(cbcpre(0,3)//cbcpre(1,3) == 'PP') then
-      allocate(pz_aux_1(ap_z%shape(1),ap_z%shape(2),ap_z%shape(3)), &
-               pz_aux_2(ap_z%shape(1),ap_z%shape(2),ap_z%shape(3)))
+!!!PC      allocate(pz_aux_1(ap_z%shape(1),ap_z%shape(2),ap_z%shape(3)), &
+!!!PC               pz_aux_2(ap_z%shape(1),ap_z%shape(2),ap_z%shape(3)))
+      allocate(pz_aux_1(ng(1),ng(2),ng(3)), & !!!PC
+               pz_aux_2(ng(1),ng(2),ng(3)))   !!!PC
       !$acc enter data create(pz_aux_1,pz_aux_2)
     end if
     istream_acc_queue_1 = acc_get_cuda_stream(1) ! fetch CUDA stream of OpenACC queue 1