From 6708d1c8fa751cf5f0eb4c8da1f8ab89fd33b190 Mon Sep 17 00:00:00 2001
From: Pedro Costa
Date: Fri, 15 Mar 2024 13:54:40 +0100
Subject: [PATCH] Single GPU version without cuDecomp dependency.
---
build.conf | 4 +-
configs/libs.mk | 5 +-
dependencies/external.mk | 4 +-
src/bound.f90 | 36 ++++----
src/common_cudecomp.f90 | 14 ++--
src/initmpi.f90 | 175 ++++++++++++++++++++------------------
src/param.f90 | 4 +-
src/solver_gpu.f90 | 176 ++++++++++++++++++++++++++++++---------
src/utils.f90 | 5 +-
src/workspaces.f90 | 42 ++++++----
10 files changed, 292 insertions(+), 173 deletions(-)
diff --git a/build.conf b/build.conf
index bff572e0..72059bf2 100644
--- a/build.conf
+++ b/build.conf
@@ -1,7 +1,7 @@
#
# compiler and compiling profile
#
-FCOMP=GNU # options: GNU, NVIDIA, INTEL
+FCOMP=NVIDIA # options: GNU, NVIDIA, INTEL
FFLAGS_OPT=1 # for production runs
FFLAGS_OPT_MAX=0 # for production runs (more aggressive optimization)
FFLAGS_DEBUG=0 # for debugging
@@ -18,5 +18,5 @@ SINGLE_PRECISION=0 # perform the whole calculation in single precision
#
# GPU-related
#
-GPU=0
+GPU=1
USE_NVTX=0 # use the NVTX-enabled Git branch 'with-nvtx' to see the markers
diff --git a/configs/libs.mk b/configs/libs.mk
index b6e9ba5c..6f12af13 100644
--- a/configs/libs.mk
+++ b/configs/libs.mk
@@ -2,8 +2,9 @@ override LIBS += -L$(LIBS_DIR)/2decomp-fft -ldecomp2d
override INCS += -I$(LIBS_DIR)/2decomp-fft/mod
ifeq ($(strip $(GPU)),1)
-override LIBS += -L$(LIBS_DIR)/cuDecomp/build/lib -lcudecomp -lcudecomp_fort -cudalib=cufft
-override INCS += -I$(LIBS_DIR)/cuDecomp/build/include
+#override LIBS += -L$(LIBS_DIR)/cuDecomp/build/lib -lcudecomp -lcudecomp_fort -cudalib=cufft
+override LIBS += -cudalib=cufft
+#override INCS += -I$(LIBS_DIR)/cuDecomp/build/include
endif
ifeq ($(strip $(USE_NVTX)),1)
diff --git a/dependencies/external.mk b/dependencies/external.mk
index 52666aa3..32cfe32f 100644
--- a/dependencies/external.mk
+++ b/dependencies/external.mk
@@ -4,10 +4,10 @@
ifeq ($(strip $(GPU)),1)
libs: $(wildcard $(LIBS_DIR)/2decomp-fft/src/*.f90)
cd $(LIBS_DIR)/2decomp-fft && make
- cd $(LIBS_DIR)/cuDecomp && make lib -j
+# cd $(LIBS_DIR)/cuDecomp && make lib -j
libsclean: $(wildcard $(LIBS_DIR)/2decomp-fft/src/*.f90)
cd $(LIBS_DIR)/2decomp-fft && make clean
- cd $(LIBS_DIR)/cuDecomp && make clean
+# cd $(LIBS_DIR)/cuDecomp && make clean
else
libs: $(wildcard $(LIBS_DIR)/2decomp-fft/src/*.f90)
cd $(LIBS_DIR)/2decomp-fft && make
diff --git a/src/bound.f90 b/src/bound.f90
index 16e56284..56ede3ea 100644
--- a/src/bound.f90
+++ b/src/bound.f90
@@ -530,29 +530,29 @@ end subroutine updthalo
#if defined(_OPENACC)
subroutine updthalo_gpu(nh,periods,p)
use mod_types
- use cudecomp
- use mod_common_cudecomp, only: work => work_halo, &
- ch => handle,gd => gd_halo, &
- dtype => cudecomp_real_rp, &
- istream => istream_acc_queue_1
+ !!!PC use cudecomp
+ !!!PC use mod_common_cudecomp, only: work => work_halo, &
+ !!!PC ch => handle,gd => gd_halo, &
+ !!!PC dtype => cudecomp_real_rp, &
+ !!!PC istream => istream_acc_queue_1
implicit none
integer , intent(in) :: nh
logical , intent(in) :: periods(3)
real(rp), intent(inout), dimension(1-nh:,1-nh:,1-nh:) :: p
integer :: istat
- !$acc host_data use_device(p,work)
- select case(ipencil_axis)
- case(1)
- istat = cudecompUpdateHalosX(ch,gd,p,work,dtype,[nh,nh,nh],periods,2,stream=istream)
- istat = cudecompUpdateHalosX(ch,gd,p,work,dtype,[nh,nh,nh],periods,3,stream=istream)
- case(2)
- istat = cudecompUpdateHalosY(ch,gd,p,work,dtype,[nh,nh,nh],periods,1,stream=istream)
- istat = cudecompUpdateHalosY(ch,gd,p,work,dtype,[nh,nh,nh],periods,3,stream=istream)
- case(3)
- istat = cudecompUpdateHalosZ(ch,gd,p,work,dtype,[nh,nh,nh],periods,1,stream=istream)
- istat = cudecompUpdateHalosZ(ch,gd,p,work,dtype,[nh,nh,nh],periods,2,stream=istream)
- end select
- !$acc end host_data
+ !!!PC !$acc host_data use_device(p,work)
+ !!!PC select case(ipencil_axis)
+ !!!PC case(1)
+ !!!PC istat = cudecompUpdateHalosX(ch,gd,p,work,dtype,[nh,nh,nh],periods,2,stream=istream)
+ !!!PC istat = cudecompUpdateHalosX(ch,gd,p,work,dtype,[nh,nh,nh],periods,3,stream=istream)
+ !!!PC case(2)
+ !!!PC istat = cudecompUpdateHalosY(ch,gd,p,work,dtype,[nh,nh,nh],periods,1,stream=istream)
+ !!!PC istat = cudecompUpdateHalosY(ch,gd,p,work,dtype,[nh,nh,nh],periods,3,stream=istream)
+ !!!PC case(3)
+ !!!PC istat = cudecompUpdateHalosZ(ch,gd,p,work,dtype,[nh,nh,nh],periods,1,stream=istream)
+ !!!PC istat = cudecompUpdateHalosZ(ch,gd,p,work,dtype,[nh,nh,nh],periods,2,stream=istream)
+ !!!PC end select
+ !!!PC !$acc end host_data
end subroutine updthalo_gpu
#endif
end module mod_bound
diff --git a/src/common_cudecomp.f90 b/src/common_cudecomp.f90
index 47985d1f..b89dbab9 100644
--- a/src/common_cudecomp.f90
+++ b/src/common_cudecomp.f90
@@ -8,22 +8,22 @@ module mod_common_cudecomp
#if defined(_OPENACC)
use mod_types
!@cuf use cudafor
- use cudecomp
+ !!!PC use cudecomp
use openacc
use mod_param, only: cudecomp_is_t_in_place
implicit none
public
- integer :: cudecomp_real_rp
- type(cudecompHandle) :: handle
- type(cudecompGridDesc) :: gd_halo,gd_poi
- type(cudecompPencilInfo) :: ap_x,ap_y,ap_z,ap_x_poi,ap_y_poi,ap_z_poi
+!!!PC integer :: cudecomp_real_rp
+!!!PC type(cudecompHandle) :: handle
+!!!PC type(cudecompGridDesc) :: gd_halo,gd_poi
+!!!PC type(cudecompPencilInfo) :: ap_x,ap_y,ap_z,ap_x_poi,ap_y_poi,ap_z_poi
!
! workspace stuff
!
integer(i8) :: wsize_fft
real(rp), pointer, contiguous, dimension(:) :: work ,work_cuda
- real(rp), pointer, contiguous, dimension(:) :: work_halo,work_halo_cuda
- !@cuf attributes(device) :: work_cuda,work_halo_cuda
+!!!PC real(rp), pointer, contiguous, dimension(:) :: work_halo,work_halo_cuda
+ !@cuf attributes(device) :: work_cuda !!!PC,work_halo_cuda
real(rp), target, allocatable, dimension(:) :: solver_buf_0,solver_buf_1
#if !defined(_IMPDIFF_1D)
real(rp), allocatable, dimension(:,:,:) :: pz_aux_1,pz_aux_2
diff --git a/src/initmpi.f90 b/src/initmpi.f90
index a689b489..0adcd216 100644
--- a/src/initmpi.f90
+++ b/src/initmpi.f90
@@ -10,8 +10,8 @@ module mod_initmpi
use mod_common_mpi, only: myid,ierr,halo,ipencil => ipencil_axis
use mod_types
!@acc use openacc
- !@acc use cudecomp
- !@cuf use cudafor, only: cudaGetDeviceCount,cudaSetDevice
+ !!!PC !@acc use cudecomp
+ !!!PC !@cuf use cudafor, only: cudaGetDeviceCount,cudaSetDevice
#if defined(_OPENACC)
use mod_common_cudecomp, only: cudecomp_real_rp, &
ch => handle,gd => gd_halo,gd_poi, &
@@ -40,8 +40,8 @@ subroutine initmpi(ng,dims,bc,lo,hi,n,n_x_fft,n_y_fft,lo_z,hi_z,n_z,nb,is_bound)
integer(acc_device_kind) ::dev_type
integer :: local_comm,mydev,ndev
integer :: istat
- type(cudecompGridDescConfig) :: conf,conf_poi
- type(cudecompGridDescAutotuneOptions) :: atune_conf
+!!!PC type(cudecompGridDescConfig) :: conf,conf_poi
+!!!PC type(cudecompGridDescAutotuneOptions) :: atune_conf
#else
integer :: comm_cart
#endif
@@ -53,7 +53,7 @@ subroutine initmpi(ng,dims,bc,lo,hi,n,n_x_fft,n_y_fft,lo_z,hi_z,n_z,nb,is_bound)
call MPI_COMM_SPLIT_TYPE(MPI_COMM_WORLD,MPI_COMM_TYPE_SHARED,0,MPI_INFO_NULL,local_comm,ierr)
call MPI_COMM_RANK(local_comm,mydev,ierr)
dev_type = acc_get_device_type()
-#if 1
+#if 0
istat = cudaGetDeviceCount(ndev) ! may be tweaked with environment variable CUDA_VISIBLE_DEVICES
mydev = mod(mydev,ndev)
istat = cudaSetDevice(mydev)
@@ -65,52 +65,52 @@ subroutine initmpi(ng,dims,bc,lo,hi,n,n_x_fft,n_y_fft,lo_z,hi_z,n_z,nb,is_bound)
call acc_set_device_num(mydev,dev_type)
call acc_init(dev_type)
!
- istat = cudecompInit(ch,MPI_COMM_WORLD)
+ !!!PCistat = cudecompInit(ch,MPI_COMM_WORLD)
!
! setup descriptor for the Poisson solver
!
- istat = cudecompGridDescConfigSetDefaults(conf)
- conf%transpose_comm_backend = cudecomp_t_comm_backend
- conf%transpose_axis_contiguous(:) = [.true.,.true.,.false.]
- conf%gdims(:) = [2*(ng(1)/2+1),2*(ng(2)/2+1),ng(3)]
- conf%pdims(:) = dims(1:2)
- conf%gdims_dist(:) = ng(:)
- istat = cudecompGridDescAutotuneOptionsSetDefaults(atune_conf)
- if(rp == dp) then
- cudecomp_real_rp = CUDECOMP_DOUBLE
- else
- cudecomp_real_rp = CUDECOMP_FLOAT
- end if
- atune_conf%dtype = cudecomp_real_rp
- atune_conf%grid_mode = CUDECOMP_AUTOTUNE_GRID_TRANSPOSE
- atune_conf%autotune_transpose_backend = cudecomp_is_t_comm_autotune
- atune_conf%disable_nccl_backends = .not.cudecomp_is_t_enable_nccl
- atune_conf%disable_nvshmem_backends = .not.cudecomp_is_t_enable_nvshmem
- istat = cudecompGridDescCreate(ch,gd_poi,conf,atune_conf)
- conf_poi = conf
- dims(:) = conf%pdims
- !
- ! setup descriptor for halo exchanges
- !
- istat = cudecompGridDescConfigSetDefaults(conf)
- conf%gdims(:) = ng(:)
- conf%pdims(:) = dims(1:2)
- conf%halo_comm_backend = cudecomp_h_comm_backend
- conf%transpose_axis_contiguous(:) = .false.
- istat = cudecompGridDescAutotuneOptionsSetDefaults(atune_conf)
- atune_conf%halo_extents(:) = 1
- atune_conf%halo_periods(:) = periods(:)
- atune_conf%dtype = cudecomp_real_rp
- atune_conf%autotune_halo_backend = cudecomp_is_h_comm_autotune
- atune_conf%disable_nccl_backends = .not.cudecomp_is_t_enable_nccl
- atune_conf%disable_nvshmem_backends = .not.cudecomp_is_t_enable_nvshmem
- if(all(conf_poi%transpose_comm_backend /= [CUDECOMP_TRANSPOSE_COMM_NVSHMEM,CUDECOMP_TRANSPOSE_COMM_NVSHMEM_PL])) then
- !
- ! disable NVSHMEM halo backend autotuning when NVSHMEM is NOT used for transposes
- !
- atune_conf%disable_nvshmem_backends = .true.
- end if
- istat = cudecompGridDescCreate(ch,gd,conf,atune_conf)
+ !!!PC istat = cudecompGridDescConfigSetDefaults(conf)
+ !!!PC conf%transpose_comm_backend = cudecomp_t_comm_backend
+ !!!PC conf%transpose_axis_contiguous(:) = [.true.,.true.,.false.]
+ !!!PC conf%gdims(:) = [2*(ng(1)/2+1),2*(ng(2)/2+1),ng(3)]
+ !!!PC conf%pdims(:) = dims(1:2)
+ !!!PC conf%gdims_dist(:) = ng(:)
+ !!!PC istat = cudecompGridDescAutotuneOptionsSetDefaults(atune_conf)
+ !!!PC if(rp == dp) then
+ !!!PC cudecomp_real_rp = CUDECOMP_DOUBLE
+ !!!PC else
+ !!!PC cudecomp_real_rp = CUDECOMP_FLOAT
+ !!!PC end if
+ !!!PC atune_conf%dtype = cudecomp_real_rp
+ !!!PC atune_conf%grid_mode = CUDECOMP_AUTOTUNE_GRID_TRANSPOSE
+ !!!PC atune_conf%autotune_transpose_backend = cudecomp_is_t_comm_autotune
+ !!!PC atune_conf%disable_nccl_backends = .not.cudecomp_is_t_enable_nccl
+ !!!PC atune_conf%disable_nvshmem_backends = .not.cudecomp_is_t_enable_nvshmem
+ !!!PC istat = cudecompGridDescCreate(ch,gd_poi,conf,atune_conf)
+ !!!PC conf_poi = conf
+ !!!PC dims(:) = conf%pdims
+ !!!PC !
+ !!!PC ! setup descriptor for halo exchanges
+ !!!PC !
+ !!!PC istat = cudecompGridDescConfigSetDefaults(conf)
+ !!!PC conf%gdims(:) = ng(:)
+ !!!PC conf%pdims(:) = dims(1:2)
+ !!!PC conf%halo_comm_backend = cudecomp_h_comm_backend
+ !!!PC conf%transpose_axis_contiguous(:) = .false.
+ !!!PC istat = cudecompGridDescAutotuneOptionsSetDefaults(atune_conf)
+ !!!PC atune_conf%halo_extents(:) = 1
+ !!!PC atune_conf%halo_periods(:) = periods(:)
+ !!!PC atune_conf%dtype = cudecomp_real_rp
+ !!!PC atune_conf%autotune_halo_backend = cudecomp_is_h_comm_autotune
+ !!!PC atune_conf%disable_nccl_backends = .not.cudecomp_is_t_enable_nccl
+ !!!PC atune_conf%disable_nvshmem_backends = .not.cudecomp_is_t_enable_nvshmem
+ !!!PC if(all(conf_poi%transpose_comm_backend /= [CUDECOMP_TRANSPOSE_COMM_NVSHMEM,CUDECOMP_TRANSPOSE_COMM_NVSHMEM_PL])) then
+ !!!PC !
+ !!!PC ! disable NVSHMEM halo backend autotuning when NVSHMEM is NOT used for transposes
+ !!!PC !
+ !!!PC atune_conf%disable_nvshmem_backends = .true.
+ !!!PC end if
+ !!!PC istat = cudecompGridDescCreate(ch,gd,conf,atune_conf)
#endif
call decomp_2d_init(ng(1),ng(2),ng(3),dims(1),dims(2),periods)
#if !defined(_DECOMP_Y) && !defined(_DECOMP_Z)
@@ -123,40 +123,51 @@ subroutine initmpi(ng,dims,bc,lo,hi,n,n_x_fft,n_y_fft,lo_z,hi_z,n_z,nb,is_bound)
ipencil_t(:) = pack([1,2,3],[1,2,3] /= ipencil)
is_bound(:,:) = .false.
#if defined(_OPENACC)
- !
- ! fetch lo(:), hi(:), n(:) and n_z(:) from cuDecomp (should match the modified 2decomp one)
- !
- istat = cudecompGetPencilInfo(ch,gd ,ap_x ,1)
- istat = cudecompGetPencilInfo(ch,gd ,ap_y ,2)
- istat = cudecompGetPencilInfo(ch,gd ,ap_z ,3)
- istat = cudecompGetPencilInfo(ch,gd_poi,ap_x_poi,1)
- istat = cudecompGetPencilInfo(ch,gd_poi,ap_y_poi,2)
- istat = cudecompGetPencilInfo(ch,gd_poi,ap_z_poi,3)
- select case(ipencil)
- case(1)
- lo(:) = ap_x%lo(:)
- hi(:) = ap_x%hi(:)
- case(2)
- lo(:) = ap_y%lo(:)
- hi(:) = ap_y%hi(:)
- case(3)
- lo(:) = ap_z%lo(:)
- hi(:) = ap_z%hi(:)
- end select
- n(:) = hi(:)-lo(:)+1
- n_x_fft(:) = ap_x_poi%shape(:)
- n_y_fft(:) = ap_y_poi%shape(:)
- lo_z(:) = ap_z%lo(:)
- hi_z(:) = ap_z%hi(:)
- n_z(:) = ap_z%shape(:)
- nb(:,ipencil) = CUDECOMP_RANK_NULL
- associate(ip_t => ipencil_t)
- istat = cudecompGetShiftedRank(ch,gd,ipencil,ip_t(1),-1,periods(ip_t(1)),nb(0,ip_t(1)))
- istat = cudecompGetShiftedRank(ch,gd,ipencil,ip_t(1), 1,periods(ip_t(1)),nb(1,ip_t(1)))
- istat = cudecompGetShiftedRank(ch,gd,ipencil,ip_t(2),-1,periods(ip_t(2)),nb(0,ip_t(2)))
- istat = cudecompGetShiftedRank(ch,gd,ipencil,ip_t(2), 1,periods(ip_t(2)),nb(1,ip_t(2)))
- end associate
- where(nb(:,:) == CUDECOMP_RANK_NULL) is_bound(:,:) = .true.
+ !!!PC !
+ !!!PC ! fetch lo(:), hi(:), n(:) and n_z(:) from cuDecomp (should match the modified 2decomp one)
+ !!!PC !
+ !!!PC istat = cudecompGetPencilInfo(ch,gd ,ap_x ,1)
+ !!!PC istat = cudecompGetPencilInfo(ch,gd ,ap_y ,2)
+ !!!PC istat = cudecompGetPencilInfo(ch,gd ,ap_z ,3)
+ !!!PC istat = cudecompGetPencilInfo(ch,gd_poi,ap_x_poi,1)
+ !!!PC istat = cudecompGetPencilInfo(ch,gd_poi,ap_y_poi,2)
+ !!!PC istat = cudecompGetPencilInfo(ch,gd_poi,ap_z_poi,3)
+ !!!PC select case(ipencil)
+ !!!PC case(1)
+ !!!PC lo(:) = ap_x%lo(:)
+ !!!PC hi(:) = ap_x%hi(:)
+ !!!PC case(2)
+ !!!PC lo(:) = ap_y%lo(:)
+ !!!PC hi(:) = ap_y%hi(:)
+ !!!PC case(3)
+ !!!PC lo(:) = ap_z%lo(:)
+ !!!PC hi(:) = ap_z%hi(:)
+ !!!PC end select
+ !!!PC n(:) = hi(:)-lo(:)+1
+ !!!PC n_x_fft(:) = ap_x_poi%shape(:)
+ !!!PC n_y_fft(:) = ap_y_poi%shape(:)
+ !!!PC lo_z(:) = ap_z%lo(:)
+ !!!PC hi_z(:) = ap_z%hi(:)
+ !!!PC n_z(:) = ap_z%shape(:)
+ !!!PC nb(:,ipencil) = CUDECOMP_RANK_NULL
+ !!!PC associate(ip_t => ipencil_t)
+ !!!PC istat = cudecompGetShiftedRank(ch,gd,ipencil,ip_t(1),-1,periods(ip_t(1)),nb(0,ip_t(1)))
+ !!!PC istat = cudecompGetShiftedRank(ch,gd,ipencil,ip_t(1), 1,periods(ip_t(1)),nb(1,ip_t(1)))
+ !!!PC istat = cudecompGetShiftedRank(ch,gd,ipencil,ip_t(2),-1,periods(ip_t(2)),nb(0,ip_t(2)))
+ !!!PC istat = cudecompGetShiftedRank(ch,gd,ipencil,ip_t(2), 1,periods(ip_t(2)),nb(1,ip_t(2)))
+ !!!PC end associate
+ !!!PC where(nb(:,:) == CUDECOMP_RANK_NULL) is_bound(:,:) = .true.
+ lo(:) = 1 !!!PC
+ hi(:) = ng(:) !!!PC
+ n(:) = hi(:)-lo(:)+1 !!!PC
+ n_x_fft(:) = [2*(ng(1)/2+1),2*(ng(2)/2+1),ng(3)] !!!PC
+ n_y_fft(:) = [2*(ng(2)/2+1),ng(3),2*(ng(1)/2+1)] !!!PC
+ lo_z(:) = lo(:) !!!PC
+ hi_z(:) = hi(:) !!!PC
+ n_z(:) = n(:) !!!PC
+ nb(:,ipencil) = CUDECOMP_RANK_NULL !!!PC
+ where(nb(:,:) == CUDECOMP_RANK_NULL) is_bound(:,:) = .true. !!!PC
+ is_bound(:,:) = .true. !!!PC
#else
select case(ipencil)
case(1)
diff --git a/src/param.f90 b/src/param.f90
index 80b319a1..62d95af8 100644
--- a/src/param.f90
+++ b/src/param.f90
@@ -6,7 +6,7 @@
! -
module mod_param
use mod_types
-!@acc use cudecomp
+!!@acc use cudecomp
implicit none
public
!
@@ -112,7 +112,7 @@ subroutine read_input(myid)
dl(:) = l(:)/(1.*ng(:))
dli(:) = dl(:)**(-1)
visc = visci**(-1)
-#if defined(_OPENACC)
+#if 0
!
! read cuDecomp parameter file cudecomp.in, if it exists
!
diff --git a/src/solver_gpu.f90 b/src/solver_gpu.f90
index 4d80b5dd..6f32372c 100644
--- a/src/solver_gpu.f90
+++ b/src/solver_gpu.f90
@@ -7,19 +7,19 @@
module mod_solver_gpu
#if defined(_OPENACC)
use, intrinsic :: iso_c_binding, only: C_PTR
- use cudecomp
+ !!!PC use cudecomp
use mod_fft , only: signal_processing,fftf_gpu,fftb_gpu
use mod_types
use mod_common_mpi , only: ipencil_axis
- use mod_common_cudecomp, only: dtype_rp => cudecomp_real_rp, &
+ use mod_common_cudecomp, only: & !!!PC dtype_rp => cudecomp_real_rp, &
cudecomp_is_t_in_place, &
solver_buf_0,solver_buf_1, &
pz_aux_1,pz_aux_2,work, &
- ap_x => ap_x_poi, &
- ap_y => ap_y_poi, &
- ap_z => ap_z_poi, &
- ap_z_0 => ap_z , &
- ch => handle,gd => gd_poi, &
+ !ap_x => ap_x_poi, &
+ !ap_y => ap_y_poi, &
+ !ap_z => ap_z_poi, &
+ !ap_z_0 => ap_z , &
+ !ch => handle,gd => gd_poi, &
istream => istream_acc_queue_1
implicit none
private
@@ -42,11 +42,19 @@ subroutine solver_gpu(n,ng,arrplan,normfft,lambdaxy,a,b,c,bc,c_or_f,p)
integer :: q
integer, dimension(3) :: n_x,n_y,n_z,n_z_0
integer :: istat
+ integer :: i,j,k,n_x_1,n_x_2,n_x_3 !!!PC
!
- n_z_0(:) = ap_z_0%shape(:)
- n_x(:) = ap_x%shape(:)
- n_y(:) = ap_y%shape(:)
- n_z(:) = ap_z%shape(:)
+ n_z_0(:) = ng(:) !!!PC
+ n_x(:) = [2*(ng(1)/2+1),2*(ng(2)/2+1),ng(3)] !!!PC
+ n_y(:) = [2*(ng(2)/2+1),ng(3),2*(ng(1)/2+1)] !!!PC
+ n_z(:) = [2*(ng(1)/2+1),2*(ng(2)/2+1),ng(3)] !!!PC
+ n_x_1 = n_x(1) !!!PC
+ n_x_2 = n_x(2) !!!PC
+ n_x_3 = n_x(3) !!!PC
+ !!!PC n_z_0(:) = ap_z_0%shape(:)
+ !!!PC n_x(:) = ap_x%shape(:)
+ !!!PC n_y(:) = ap_y%shape(:)
+ !!!PC n_z(:) = ap_z%shape(:)
px(1:n_x(1),1:n_x(2),1:n_x(3)) => solver_buf_0(1:product(n_x(:)))
if(cudecomp_is_t_in_place) then
py(1:n_y(1),1:n_y(2),1:n_y(3)) => solver_buf_0(1:product(n_y(:)))
@@ -64,7 +72,7 @@ subroutine solver_gpu(n,ng,arrplan,normfft,lambdaxy,a,b,c,bc,c_or_f,p)
!$acc end kernels
case(2)
block
- integer :: i,j,k
+ !!!PC integer :: i,j,k
!
! transpose p -> py to axis-contiguous layout
!
@@ -78,36 +86,81 @@ subroutine solver_gpu(n,ng,arrplan,normfft,lambdaxy,a,b,c,bc,c_or_f,p)
end do
end do
end block
- !$acc host_data use_device(py,px,work)
- istat = cudecompTransposeYtoX(ch,gd,py,px,work,dtype_rp,stream=istream)
- !$acc end host_data
+ !!!PC !$acc host_data use_device(py,px,work)
+ !!!PC istat = cudecompTransposeYtoX(ch,gd,py,px,work,dtype_rp,stream=istream)
+ !!!PC !$acc end host_data
+ !$acc parallel loop collapse(3) default(present) async(1) !!!PC
+ !$OMP PARALLEL DO COLLAPSE(3) DEFAULT(shared) !!!PC
+ do k=1,n_x_3 !!!PC
+ do j=1,n_x_2 !!!PC
+ do i=1,n_x_1 !!!PC
+ px(i,j,k) = py(j,k,i) !!!PC
+ end do !!!PC
+ end do !!!PC
+ end do !!!PC
case(3)
!$acc kernels default(present) async(1)
!$OMP PARALLEL WORKSHARE
pz(1:n(1),1:n(2),1:n(3)) = p(1:n(1),1:n(2),1:n(3))
!$OMP END PARALLEL WORKSHARE
!$acc end kernels
- !$acc host_data use_device(pz,py,px,work)
- istat = cudecompTransposeZtoY(ch,gd,pz,py,work,dtype_rp,stream=istream)
- istat = cudecompTransposeYtoX(ch,gd,py,px,work,dtype_rp,stream=istream)
- !$acc end host_data
+ !!!PC !$acc host_data use_device(pz,py,px,work)
+ !!!PC istat = cudecompTransposeZtoY(ch,gd,pz,py,work,dtype_rp,stream=istream)
+ !!!PC istat = cudecompTransposeYtoX(ch,gd,py,px,work,dtype_rp,stream=istream)
+ !!!PC !$acc end host_data
+ !$acc parallel loop collapse(3) default(present) async(1) !!!PC
+ !$OMP PARALLEL DO COLLAPSE(3) DEFAULT(shared) !!!PC
+ do k=1,n_x_3 !!!PC
+ do j=1,n_x_2 !!!PC
+ do i=1,n_x_1 !!!PC
+ py(j,k,i) = pz(i,j,k) !!!PC
+ end do !!!PC
+ end do !!!PC
+ end do !!!PC
+ !$acc parallel loop collapse(3) default(present) async(1) !!!PC
+ !$OMP PARALLEL DO COLLAPSE(3) DEFAULT(shared) !!!PC
+ do k=1,n_x_3 !!!PC
+ do j=1,n_x_2 !!!PC
+ do i=1,n_x_1 !!!PC
+ px(i,j,k) = py(j,k,i) !!!PC
+ end do !!!PC
+ end do !!!PC
+ end do !!!PC
end select
!
call signal_processing(0,'F',bc(0,1)//bc(1,1),c_or_f(1),ng(1),n_x,1,px)
call fftf_gpu(arrplan(1,1),px)
call signal_processing(1,'F',bc(0,1)//bc(1,1),c_or_f(1),ng(1),n_x,1,px)
!
- !$acc host_data use_device(px,py,work)
- istat = cudecompTransposeXtoY(ch,gd,px,py,work,dtype_rp,stream=istream)
- !$acc end host_data
+ !!!PC !$acc host_data use_device(px,py,work)
+ !!!PC istat = cudecompTransposeXtoY(ch,gd,px,py,work,dtype_rp,stream=istream)
+ !!!PC !$acc end host_data
+ !$acc parallel loop collapse(3) default(present) async(1) !!!PC
+ !$OMP PARALLEL DO COLLAPSE(3) DEFAULT(shared) !!!PC
+ do k=1,n_x_3 !!!PC
+ do j=1,n_x_2 !!!PC
+ do i=1,n_x_1 !!!PC
+ py(j,k,i) = px(i,j,k) !!!PC
+ end do !!!PC
+ end do !!!PC
+ end do !!!PC
!
call signal_processing(0,'F',bc(0,2)//bc(1,2),c_or_f(2),ng(2),n_y,1,py)
call fftf_gpu(arrplan(1,2),py)
call signal_processing(1,'F',bc(0,2)//bc(1,2),c_or_f(2),ng(2),n_y,1,py)
!
- !$acc host_data use_device(py,pz,work)
- istat = cudecompTransposeYtoZ(ch,gd,py,pz,work,dtype_rp,stream=istream)
- !$acc end host_data
+ !!!PC !$acc host_data use_device(py,pz,work)
+ !!!PC istat = cudecompTransposeYtoZ(ch,gd,py,pz,work,dtype_rp,stream=istream)
+ !!!PC !$acc end host_data
+ !$acc parallel loop collapse(3) default(present) async(1) !!!PC
+ !$OMP PARALLEL DO COLLAPSE(3) DEFAULT(shared) !!!PC
+ do k=1,n_x_3 !!!PC
+ do j=1,n_x_2 !!!PC
+ do i=1,n_x_1 !!!PC
+ pz(i,j,k) = py(j,k,i) !!!PC
+ end do !!!PC
+ end do !!!PC
+ end do !!!PC
!
q = 0
if(c_or_f(3) == 'f'.and.bc(1,3) == 'D') q = 1
@@ -117,17 +170,35 @@ subroutine solver_gpu(n,ng,arrplan,normfft,lambdaxy,a,b,c,bc,c_or_f,p)
call gaussel_gpu( n_z_0(1),n_z_0(2),n_z_0(3)-q,0,a,b,c,pz,work,lambdaxy)
end if
!
- !$acc host_data use_device(pz,py,work)
- istat = cudecompTransposeZtoY(ch,gd,pz,py,work,dtype_rp,stream=istream)
- !$acc end host_data
+ !!!PC !$acc host_data use_device(pz,py,work)
+ !!!PC istat = cudecompTransposeZtoY(ch,gd,pz,py,work,dtype_rp,stream=istream)
+ !!!PC !$acc end host_data
+ !$acc parallel loop collapse(3) default(present) async(1) !!!PC
+ !$OMP PARALLEL DO COLLAPSE(3) DEFAULT(shared) !!!PC
+ do k=1,n_x_3 !!!PC
+ do j=1,n_x_2 !!!PC
+ do i=1,n_x_1 !!!PC
+ py(j,k,i) = pz(i,j,k) !!!PC
+ end do !!!PC
+ end do !!!PC
+ end do !!!PC
!
call signal_processing(0,'B',bc(0,2)//bc(1,2),c_or_f(2),ng(2),n_y,1,py)
call fftb_gpu(arrplan(2,2),py)
call signal_processing(1,'B',bc(0,2)//bc(1,2),c_or_f(2),ng(2),n_y,1,py)
!
- !$acc host_data use_device(py,px,work)
- istat = cudecompTransposeYtoX(ch,gd,py,px,work,dtype_rp,stream=istream)
- !$acc end host_data
+ !!!PC !$acc host_data use_device(py,px,work)
+ !!!PC istat = cudecompTransposeYtoX(ch,gd,py,px,work,dtype_rp,stream=istream)
+ !!!PC !$acc end host_data
+ !$acc parallel loop collapse(3) default(present) async(1) !!!PC
+ !$OMP PARALLEL DO COLLAPSE(3) DEFAULT(shared) !!!PC
+ do k=1,n_x_3 !!!PC
+ do j=1,n_x_2 !!!PC
+ do i=1,n_x_1 !!!PC
+ px(i,j,k) = py(j,k,i) !!!PC
+ end do !!!PC
+ end do !!!PC
+ end do !!!PC
!
call signal_processing(0,'B',bc(0,1)//bc(1,1),c_or_f(1),ng(1),n_x,1,px)
call fftb_gpu(arrplan(2,1),px)
@@ -141,11 +212,20 @@ subroutine solver_gpu(n,ng,arrplan,normfft,lambdaxy,a,b,c,bc,c_or_f,p)
!$OMP END PARALLEL WORKSHARE
!$acc end kernels
case(2)
- !$acc host_data use_device(px,py,work)
- istat = cudecompTransposeXtoY(ch,gd,px,py,work,dtype_rp,stream=istream)
- !$acc end host_data
+ !!!PC !$acc host_data use_device(px,py,work)
+ !!!PC istat = cudecompTransposeXtoY(ch,gd,px,py,work,dtype_rp,stream=istream)
+ !!!PC !$acc end host_data
+ !$acc parallel loop collapse(3) default(present) async(1) !!!PC
+ !$OMP PARALLEL DO COLLAPSE(3) DEFAULT(shared) !!!PC
+ do k=1,n_x_3 !!!PC
+ do j=1,n_x_2 !!!PC
+ do i=1,n_x_1 !!!PC
+ py(j,k,i) = px(i,j,k) !!!PC
+ end do !!!PC
+ end do !!!PC
+ end do !!!PC
block
- integer :: i,j,k
+ !!!PC integer :: i,j,k
!
! transpose py -> p to default layout
!
@@ -160,10 +240,28 @@ subroutine solver_gpu(n,ng,arrplan,normfft,lambdaxy,a,b,c,bc,c_or_f,p)
end do
end block
case(3)
- !$acc host_data use_device(px,py,pz,work)
- istat = cudecompTransposeXtoY(ch,gd,px,py,work,dtype_rp,stream=istream)
- istat = cudecompTransposeYtoZ(ch,gd,py,pz,work,dtype_rp,stream=istream)
- !$acc end host_data
+ !!!PC !$acc host_data use_device(px,py,pz,work)
+ !!!PC istat = cudecompTransposeXtoY(ch,gd,px,py,work,dtype_rp,stream=istream)
+ !!!PC istat = cudecompTransposeYtoZ(ch,gd,py,pz,work,dtype_rp,stream=istream)
+ !!!PC !$acc end host_data
+ !$acc parallel loop collapse(3) default(present) async(1) !!!PC
+ !$OMP PARALLEL DO COLLAPSE(3) DEFAULT(shared) !!!PC
+ do k=1,n_x_3 !!!PC
+ do j=1,n_x_2 !!!PC
+ do i=1,n_x_1 !!!PC
+ py(j,k,i) = px(i,j,k) !!!PC
+ end do !!!PC
+ end do !!!PC
+ end do !!!PC
+ !$acc parallel loop collapse(3) default(present) async(1) !!!PC
+ !$OMP PARALLEL DO COLLAPSE(3) DEFAULT(shared) !!!PC
+ do k=1,n_x_3 !!!PC
+ do j=1,n_x_2 !!!PC
+ do i=1,n_x_1 !!!PC
+ pz(i,j,k) = py(j,k,i) !!!PC
+ end do !!!PC
+ end do !!!PC
+ end do !!!PC
!$acc kernels default(present) async(1)
!$OMP PARALLEL WORKSHARE
p(1:n(1),1:n(2),1:n(3)) = pz(1:n(1),1:n(2),1:n(3))*normfft
diff --git a/src/utils.f90 b/src/utils.f90
index 849c1316..c634302b 100644
--- a/src/utils.f90
+++ b/src/utils.f90
@@ -116,8 +116,9 @@ function device_memory_footprint(n,n_z) result(itotal)
! taken directly from `mod_common_cudecomp`
!
block
- use mod_common_cudecomp, only: work,work_halo,solver_buf_0,solver_buf_1,pz_aux_1,pz_aux_2
- itemp = storage_size(work ,i8)*size(work ) + storage_size(work_halo ,i8)*size(work_halo ) + &
+ !!!PC use mod_common_cudecomp, only: work,work_halo,solver_buf_0,solver_buf_1,pz_aux_1,pz_aux_2
+ use mod_common_cudecomp, only: work,solver_buf_0,solver_buf_1,pz_aux_1,pz_aux_2 !!!PC
+ itemp = storage_size(work ,i8)*size(work ) + & !!!PC storage_size(work_halo ,i8)*size(work_halo ) + &
storage_size(solver_buf_0,i8)*size(solver_buf_0) + storage_size(solver_buf_1,i8)*size(solver_buf_1) + &
storage_size(pz_aux_1 ,i8)*size(pz_aux_1 ) + storage_size(pz_aux_2 ,i8)*size(pz_aux_2 )
itotal = itotal + itemp/8
diff --git a/src/workspaces.f90 b/src/workspaces.f90
index e33f5603..fdf8a08c 100644
--- a/src/workspaces.f90
+++ b/src/workspaces.f90
@@ -7,62 +7,70 @@
module mod_workspaces
#if defined(_OPENACC)
use mod_types
- use mod_common_cudecomp, only: work,work_cuda,work_halo,work_halo_cuda
+ use mod_common_cudecomp, only: work,work_cuda!!!PC,work_halo,work_halo_cuda
use mod_utils, only: f_sizeof
implicit none
private
public init_wspace_arrays,set_cufft_wspace
contains
subroutine init_wspace_arrays
- use mod_common_cudecomp, only: handle,gd_halo,gd_poi , &
- ap_x_poi,ap_y_poi,ap_z_poi, &
+ use mod_common_cudecomp, only: & !!!PC handle,gd_halo,gd_poi , &
+ !!!PC ap_x_poi,ap_y_poi,ap_z_poi, &
solver_buf_0,solver_buf_1, &
- ap_z,pz_aux_1,pz_aux_2, &
+ !!!PC ap_z,pz_aux_1,pz_aux_2, &
+ pz_aux_1,pz_aux_2, & !!!PC
istream_acc_queue_1
use mod_common_mpi , only: ipencil => ipencil_axis
use mod_fft , only: wsize_fft
use mod_param , only: cudecomp_is_t_in_place,cbcpre
- use cudecomp
+ use mod_param , only: ng !!!PC
+!!!PC use cudecomp
use openacc
+ !!!PC use cudafor !!!PC
implicit none
integer :: istat
integer(i8) :: wsize,max_wsize
- integer :: nh(3)
+!!!PC integer :: nh(3)
!
! allocate cuDecomp workspace buffer for transposes (reused for FFTs)
!
max_wsize = -1
wsize = wsize_fft
max_wsize = max(max_wsize,wsize)
- istat = cudecompGetTransposeWorkspaceSize(handle,gd_poi,wsize)
- max_wsize = max(max_wsize,wsize)
+!!!PC istat = cudecompGetTransposeWorkspaceSize(handle,gd_poi,wsize)
+!!!PC max_wsize = max(max_wsize,wsize)
!
! one can append more checks here (e.g., for spectra calculation)
!
allocate(work(max_wsize))
- istat = cudecompMalloc(handle,gd_poi,work_cuda,max_wsize)
+!!!PC istat = cudecompMalloc(handle,gd_poi,work_cuda,max_wsize)
+ allocate(work_cuda,mold=work) !!!PC
+ !istat = cudaMalloc(c_devloc(work_cuda),max_wsize*f_sizeof(work(1))) !!!PC
call acc_map_data(work,work_cuda,max_wsize*f_sizeof(work(1)))
!
! allocate cuDecomp workspace buffer for halos
! (needs to be different due to the possible need of an NVSHMEM allocator
! in one of the descriptors, rather than a simple cudaMaloc)
!
- nh(:) = 1
- istat = cudecompGetHaloWorkspaceSize(handle,gd_halo,ipencil,nh,max_wsize)
- allocate(work_halo(max_wsize))
+!!!PC nh(:) = 1
+!!!PC istat = cudecompGetHaloWorkspaceSize(handle,gd_halo,ipencil,nh,max_wsize)
+!!!PC allocate(work_halo(max_wsize))
!
- istat = cudecompMalloc(handle,gd_halo,work_halo_cuda,max_wsize)
- call acc_map_data(work_halo,work_halo_cuda,max_wsize*f_sizeof(work_halo(1)))
+!!!PC istat = cudecompMalloc(handle,gd_halo,work_halo_cuda,max_wsize)
+!!!PC call acc_map_data(work_halo,work_halo_cuda,max_wsize*f_sizeof(work_halo(1)))
!
! allocate transpose buffers
!
- wsize = max(ap_x_poi%size,ap_y_poi%size,ap_z_poi%size)
+!!!PC wsize = max(ap_x_poi%size,ap_y_poi%size,ap_z_poi%size)
+ wsize = product([2*(ng(1)/2+1),2*(ng(2)/2+1),ng(3)]) !!!PC
allocate(solver_buf_0(wsize))
if(.not.cudecomp_is_t_in_place) allocate(solver_buf_1,mold=solver_buf_0)
!$acc enter data create(solver_buf_0,solver_buf_1)
if(cbcpre(0,3)//cbcpre(1,3) == 'PP') then
- allocate(pz_aux_1(ap_z%shape(1),ap_z%shape(2),ap_z%shape(3)), &
- pz_aux_2(ap_z%shape(1),ap_z%shape(2),ap_z%shape(3)))
+!!!PC allocate(pz_aux_1(ap_z%shape(1),ap_z%shape(2),ap_z%shape(3)), &
+!!!PC pz_aux_2(ap_z%shape(1),ap_z%shape(2),ap_z%shape(3)))
+ allocate(pz_aux_1(ng(1),ng(2),ng(3)), & !!!PC
+ pz_aux_2(ng(1),ng(2),ng(3))) !!!PC
!$acc enter data create(pz_aux_1,pz_aux_2)
end if
istream_acc_queue_1 = acc_get_cuda_stream(1) ! fetch CUDA stream of OpenACC queue 1