From 6708d1c8fa751cf5f0eb4c8da1f8ab89fd33b190 Mon Sep 17 00:00:00 2001 From: Pedro Costa Date: Fri, 15 Mar 2024 13:54:40 +0100 Subject: [PATCH] Single GPU version without cuDecomp dependency. --- build.conf | 4 +- configs/libs.mk | 5 +- dependencies/external.mk | 4 +- src/bound.f90 | 36 ++++---- src/common_cudecomp.f90 | 14 ++-- src/initmpi.f90 | 175 ++++++++++++++++++++------------------ src/param.f90 | 4 +- src/solver_gpu.f90 | 176 ++++++++++++++++++++++++++++++--------- src/utils.f90 | 5 +- src/workspaces.f90 | 42 ++++++---- 10 files changed, 292 insertions(+), 173 deletions(-) diff --git a/build.conf b/build.conf index bff572e0..72059bf2 100644 --- a/build.conf +++ b/build.conf @@ -1,7 +1,7 @@ # # compiler and compiling profile # -FCOMP=GNU # options: GNU, NVIDIA, INTEL +FCOMP=NVIDIA # options: GNU, NVIDIA, INTEL FFLAGS_OPT=1 # for production runs FFLAGS_OPT_MAX=0 # for production runs (more aggressive optimization) FFLAGS_DEBUG=0 # for debugging @@ -18,5 +18,5 @@ SINGLE_PRECISION=0 # perform the whole calculation in single precision # # GPU-related # -GPU=0 +GPU=1 USE_NVTX=0 # use the NVTX-enabled Git branch 'with-nvtx' to see the markers diff --git a/configs/libs.mk b/configs/libs.mk index b6e9ba5c..6f12af13 100644 --- a/configs/libs.mk +++ b/configs/libs.mk @@ -2,8 +2,9 @@ override LIBS += -L$(LIBS_DIR)/2decomp-fft -ldecomp2d override INCS += -I$(LIBS_DIR)/2decomp-fft/mod ifeq ($(strip $(GPU)),1) -override LIBS += -L$(LIBS_DIR)/cuDecomp/build/lib -lcudecomp -lcudecomp_fort -cudalib=cufft -override INCS += -I$(LIBS_DIR)/cuDecomp/build/include +#override LIBS += -L$(LIBS_DIR)/cuDecomp/build/lib -lcudecomp -lcudecomp_fort -cudalib=cufft +override LIBS += -cudalib=cufft +#override INCS += -I$(LIBS_DIR)/cuDecomp/build/include endif ifeq ($(strip $(USE_NVTX)),1) diff --git a/dependencies/external.mk b/dependencies/external.mk index 52666aa3..32cfe32f 100644 --- a/dependencies/external.mk +++ b/dependencies/external.mk @@ -4,10 +4,10 @@ ifeq ($(strip $(GPU)),1) libs: $(wildcard $(LIBS_DIR)/2decomp-fft/src/*.f90) cd $(LIBS_DIR)/2decomp-fft && make - cd $(LIBS_DIR)/cuDecomp && make lib -j +# cd $(LIBS_DIR)/cuDecomp && make lib -j libsclean: $(wildcard $(LIBS_DIR)/2decomp-fft/src/*.f90) cd $(LIBS_DIR)/2decomp-fft && make clean - cd $(LIBS_DIR)/cuDecomp && make clean +# cd $(LIBS_DIR)/cuDecomp && make clean else libs: $(wildcard $(LIBS_DIR)/2decomp-fft/src/*.f90) cd $(LIBS_DIR)/2decomp-fft && make diff --git a/src/bound.f90 b/src/bound.f90 index 16e56284..56ede3ea 100644 --- a/src/bound.f90 +++ b/src/bound.f90 @@ -530,29 +530,29 @@ end subroutine updthalo #if defined(_OPENACC) subroutine updthalo_gpu(nh,periods,p) use mod_types - use cudecomp - use mod_common_cudecomp, only: work => work_halo, & - ch => handle,gd => gd_halo, & - dtype => cudecomp_real_rp, & - istream => istream_acc_queue_1 + !!!PC use cudecomp + !!!PC use mod_common_cudecomp, only: work => work_halo, & + !!!PC ch => handle,gd => gd_halo, & + !!!PC dtype => cudecomp_real_rp, & + !!!PC istream => istream_acc_queue_1 implicit none integer , intent(in) :: nh logical , intent(in) :: periods(3) real(rp), intent(inout), dimension(1-nh:,1-nh:,1-nh:) :: p integer :: istat - !$acc host_data use_device(p,work) - select case(ipencil_axis) - case(1) - istat = cudecompUpdateHalosX(ch,gd,p,work,dtype,[nh,nh,nh],periods,2,stream=istream) - istat = cudecompUpdateHalosX(ch,gd,p,work,dtype,[nh,nh,nh],periods,3,stream=istream) - case(2) - istat = cudecompUpdateHalosY(ch,gd,p,work,dtype,[nh,nh,nh],periods,1,stream=istream) - istat = cudecompUpdateHalosY(ch,gd,p,work,dtype,[nh,nh,nh],periods,3,stream=istream) - case(3) - istat = cudecompUpdateHalosZ(ch,gd,p,work,dtype,[nh,nh,nh],periods,1,stream=istream) - istat = cudecompUpdateHalosZ(ch,gd,p,work,dtype,[nh,nh,nh],periods,2,stream=istream) - end select - !$acc end host_data + !!!PC !$acc host_data use_device(p,work) + !!!PC select case(ipencil_axis) + !!!PC case(1) + !!!PC istat = cudecompUpdateHalosX(ch,gd,p,work,dtype,[nh,nh,nh],periods,2,stream=istream) + !!!PC istat = cudecompUpdateHalosX(ch,gd,p,work,dtype,[nh,nh,nh],periods,3,stream=istream) + !!!PC case(2) + !!!PC istat = cudecompUpdateHalosY(ch,gd,p,work,dtype,[nh,nh,nh],periods,1,stream=istream) + !!!PC istat = cudecompUpdateHalosY(ch,gd,p,work,dtype,[nh,nh,nh],periods,3,stream=istream) + !!!PC case(3) + !!!PC istat = cudecompUpdateHalosZ(ch,gd,p,work,dtype,[nh,nh,nh],periods,1,stream=istream) + !!!PC istat = cudecompUpdateHalosZ(ch,gd,p,work,dtype,[nh,nh,nh],periods,2,stream=istream) + !!!PC end select + !!!PC !$acc end host_data end subroutine updthalo_gpu #endif end module mod_bound diff --git a/src/common_cudecomp.f90 b/src/common_cudecomp.f90 index 47985d1f..b89dbab9 100644 --- a/src/common_cudecomp.f90 +++ b/src/common_cudecomp.f90 @@ -8,22 +8,22 @@ module mod_common_cudecomp #if defined(_OPENACC) use mod_types !@cuf use cudafor - use cudecomp + !!!PC use cudecomp use openacc use mod_param, only: cudecomp_is_t_in_place implicit none public - integer :: cudecomp_real_rp - type(cudecompHandle) :: handle - type(cudecompGridDesc) :: gd_halo,gd_poi - type(cudecompPencilInfo) :: ap_x,ap_y,ap_z,ap_x_poi,ap_y_poi,ap_z_poi +!!!PC integer :: cudecomp_real_rp +!!!PC type(cudecompHandle) :: handle +!!!PC type(cudecompGridDesc) :: gd_halo,gd_poi +!!!PC type(cudecompPencilInfo) :: ap_x,ap_y,ap_z,ap_x_poi,ap_y_poi,ap_z_poi ! ! workspace stuff ! integer(i8) :: wsize_fft real(rp), pointer, contiguous, dimension(:) :: work ,work_cuda - real(rp), pointer, contiguous, dimension(:) :: work_halo,work_halo_cuda - !@cuf attributes(device) :: work_cuda,work_halo_cuda +!!!PC real(rp), pointer, contiguous, dimension(:) :: work_halo,work_halo_cuda + !@cuf attributes(device) :: work_cuda !!!PC,work_halo_cuda real(rp), target, allocatable, dimension(:) :: solver_buf_0,solver_buf_1 #if !defined(_IMPDIFF_1D) real(rp), allocatable, dimension(:,:,:) :: pz_aux_1,pz_aux_2 diff --git a/src/initmpi.f90 b/src/initmpi.f90 index a689b489..0adcd216 100644 --- a/src/initmpi.f90 +++ b/src/initmpi.f90 @@ -10,8 +10,8 @@ module mod_initmpi use mod_common_mpi, only: myid,ierr,halo,ipencil => ipencil_axis use mod_types !@acc use openacc - !@acc use cudecomp - !@cuf use cudafor, only: cudaGetDeviceCount,cudaSetDevice + !!!PC !@acc use cudecomp + !!!PC !@cuf use cudafor, only: cudaGetDeviceCount,cudaSetDevice #if defined(_OPENACC) use mod_common_cudecomp, only: cudecomp_real_rp, & ch => handle,gd => gd_halo,gd_poi, & @@ -40,8 +40,8 @@ subroutine initmpi(ng,dims,bc,lo,hi,n,n_x_fft,n_y_fft,lo_z,hi_z,n_z,nb,is_bound) integer(acc_device_kind) ::dev_type integer :: local_comm,mydev,ndev integer :: istat - type(cudecompGridDescConfig) :: conf,conf_poi - type(cudecompGridDescAutotuneOptions) :: atune_conf +!!!PC type(cudecompGridDescConfig) :: conf,conf_poi +!!!PC type(cudecompGridDescAutotuneOptions) :: atune_conf #else integer :: comm_cart #endif @@ -53,7 +53,7 @@ subroutine initmpi(ng,dims,bc,lo,hi,n,n_x_fft,n_y_fft,lo_z,hi_z,n_z,nb,is_bound) call MPI_COMM_SPLIT_TYPE(MPI_COMM_WORLD,MPI_COMM_TYPE_SHARED,0,MPI_INFO_NULL,local_comm,ierr) call MPI_COMM_RANK(local_comm,mydev,ierr) dev_type = acc_get_device_type() -#if 1 +#if 0 istat = cudaGetDeviceCount(ndev) ! may be tweaked with environment variable CUDA_VISIBLE_DEVICES mydev = mod(mydev,ndev) istat = cudaSetDevice(mydev) @@ -65,52 +65,52 @@ subroutine initmpi(ng,dims,bc,lo,hi,n,n_x_fft,n_y_fft,lo_z,hi_z,n_z,nb,is_bound) call acc_set_device_num(mydev,dev_type) call acc_init(dev_type) ! - istat = cudecompInit(ch,MPI_COMM_WORLD) + !!!PCistat = cudecompInit(ch,MPI_COMM_WORLD) ! ! setup descriptor for the Poisson solver ! - istat = cudecompGridDescConfigSetDefaults(conf) - conf%transpose_comm_backend = cudecomp_t_comm_backend - conf%transpose_axis_contiguous(:) = [.true.,.true.,.false.] - conf%gdims(:) = [2*(ng(1)/2+1),2*(ng(2)/2+1),ng(3)] - conf%pdims(:) = dims(1:2) - conf%gdims_dist(:) = ng(:) - istat = cudecompGridDescAutotuneOptionsSetDefaults(atune_conf) - if(rp == dp) then - cudecomp_real_rp = CUDECOMP_DOUBLE - else - cudecomp_real_rp = CUDECOMP_FLOAT - end if - atune_conf%dtype = cudecomp_real_rp - atune_conf%grid_mode = CUDECOMP_AUTOTUNE_GRID_TRANSPOSE - atune_conf%autotune_transpose_backend = cudecomp_is_t_comm_autotune - atune_conf%disable_nccl_backends = .not.cudecomp_is_t_enable_nccl - atune_conf%disable_nvshmem_backends = .not.cudecomp_is_t_enable_nvshmem - istat = cudecompGridDescCreate(ch,gd_poi,conf,atune_conf) - conf_poi = conf - dims(:) = conf%pdims - ! - ! setup descriptor for halo exchanges - ! - istat = cudecompGridDescConfigSetDefaults(conf) - conf%gdims(:) = ng(:) - conf%pdims(:) = dims(1:2) - conf%halo_comm_backend = cudecomp_h_comm_backend - conf%transpose_axis_contiguous(:) = .false. - istat = cudecompGridDescAutotuneOptionsSetDefaults(atune_conf) - atune_conf%halo_extents(:) = 1 - atune_conf%halo_periods(:) = periods(:) - atune_conf%dtype = cudecomp_real_rp - atune_conf%autotune_halo_backend = cudecomp_is_h_comm_autotune - atune_conf%disable_nccl_backends = .not.cudecomp_is_t_enable_nccl - atune_conf%disable_nvshmem_backends = .not.cudecomp_is_t_enable_nvshmem - if(all(conf_poi%transpose_comm_backend /= [CUDECOMP_TRANSPOSE_COMM_NVSHMEM,CUDECOMP_TRANSPOSE_COMM_NVSHMEM_PL])) then - ! - ! disable NVSHMEM halo backend autotuning when NVSHMEM is NOT used for transposes - ! - atune_conf%disable_nvshmem_backends = .true. - end if - istat = cudecompGridDescCreate(ch,gd,conf,atune_conf) + !!!PC istat = cudecompGridDescConfigSetDefaults(conf) + !!!PC conf%transpose_comm_backend = cudecomp_t_comm_backend + !!!PC conf%transpose_axis_contiguous(:) = [.true.,.true.,.false.] + !!!PC conf%gdims(:) = [2*(ng(1)/2+1),2*(ng(2)/2+1),ng(3)] + !!!PC conf%pdims(:) = dims(1:2) + !!!PC conf%gdims_dist(:) = ng(:) + !!!PC istat = cudecompGridDescAutotuneOptionsSetDefaults(atune_conf) + !!!PC if(rp == dp) then + !!!PC cudecomp_real_rp = CUDECOMP_DOUBLE + !!!PC else + !!!PC cudecomp_real_rp = CUDECOMP_FLOAT + !!!PC end if + !!!PC atune_conf%dtype = cudecomp_real_rp + !!!PC atune_conf%grid_mode = CUDECOMP_AUTOTUNE_GRID_TRANSPOSE + !!!PC atune_conf%autotune_transpose_backend = cudecomp_is_t_comm_autotune + !!!PC atune_conf%disable_nccl_backends = .not.cudecomp_is_t_enable_nccl + !!!PC atune_conf%disable_nvshmem_backends = .not.cudecomp_is_t_enable_nvshmem + !!!PC istat = cudecompGridDescCreate(ch,gd_poi,conf,atune_conf) + !!!PC conf_poi = conf + !!!PC dims(:) = conf%pdims + !!!PC ! + !!!PC ! setup descriptor for halo exchanges + !!!PC ! + !!!PC istat = cudecompGridDescConfigSetDefaults(conf) + !!!PC conf%gdims(:) = ng(:) + !!!PC conf%pdims(:) = dims(1:2) + !!!PC conf%halo_comm_backend = cudecomp_h_comm_backend + !!!PC conf%transpose_axis_contiguous(:) = .false. + !!!PC istat = cudecompGridDescAutotuneOptionsSetDefaults(atune_conf) + !!!PC atune_conf%halo_extents(:) = 1 + !!!PC atune_conf%halo_periods(:) = periods(:) + !!!PC atune_conf%dtype = cudecomp_real_rp + !!!PC atune_conf%autotune_halo_backend = cudecomp_is_h_comm_autotune + !!!PC atune_conf%disable_nccl_backends = .not.cudecomp_is_t_enable_nccl + !!!PC atune_conf%disable_nvshmem_backends = .not.cudecomp_is_t_enable_nvshmem + !!!PC if(all(conf_poi%transpose_comm_backend /= [CUDECOMP_TRANSPOSE_COMM_NVSHMEM,CUDECOMP_TRANSPOSE_COMM_NVSHMEM_PL])) then + !!!PC ! + !!!PC ! disable NVSHMEM halo backend autotuning when NVSHMEM is NOT used for transposes + !!!PC ! + !!!PC atune_conf%disable_nvshmem_backends = .true. + !!!PC end if + !!!PC istat = cudecompGridDescCreate(ch,gd,conf,atune_conf) #endif call decomp_2d_init(ng(1),ng(2),ng(3),dims(1),dims(2),periods) #if !defined(_DECOMP_Y) && !defined(_DECOMP_Z) @@ -123,40 +123,51 @@ subroutine initmpi(ng,dims,bc,lo,hi,n,n_x_fft,n_y_fft,lo_z,hi_z,n_z,nb,is_bound) ipencil_t(:) = pack([1,2,3],[1,2,3] /= ipencil) is_bound(:,:) = .false. #if defined(_OPENACC) - ! - ! fetch lo(:), hi(:), n(:) and n_z(:) from cuDecomp (should match the modified 2decomp one) - ! - istat = cudecompGetPencilInfo(ch,gd ,ap_x ,1) - istat = cudecompGetPencilInfo(ch,gd ,ap_y ,2) - istat = cudecompGetPencilInfo(ch,gd ,ap_z ,3) - istat = cudecompGetPencilInfo(ch,gd_poi,ap_x_poi,1) - istat = cudecompGetPencilInfo(ch,gd_poi,ap_y_poi,2) - istat = cudecompGetPencilInfo(ch,gd_poi,ap_z_poi,3) - select case(ipencil) - case(1) - lo(:) = ap_x%lo(:) - hi(:) = ap_x%hi(:) - case(2) - lo(:) = ap_y%lo(:) - hi(:) = ap_y%hi(:) - case(3) - lo(:) = ap_z%lo(:) - hi(:) = ap_z%hi(:) - end select - n(:) = hi(:)-lo(:)+1 - n_x_fft(:) = ap_x_poi%shape(:) - n_y_fft(:) = ap_y_poi%shape(:) - lo_z(:) = ap_z%lo(:) - hi_z(:) = ap_z%hi(:) - n_z(:) = ap_z%shape(:) - nb(:,ipencil) = CUDECOMP_RANK_NULL - associate(ip_t => ipencil_t) - istat = cudecompGetShiftedRank(ch,gd,ipencil,ip_t(1),-1,periods(ip_t(1)),nb(0,ip_t(1))) - istat = cudecompGetShiftedRank(ch,gd,ipencil,ip_t(1), 1,periods(ip_t(1)),nb(1,ip_t(1))) - istat = cudecompGetShiftedRank(ch,gd,ipencil,ip_t(2),-1,periods(ip_t(2)),nb(0,ip_t(2))) - istat = cudecompGetShiftedRank(ch,gd,ipencil,ip_t(2), 1,periods(ip_t(2)),nb(1,ip_t(2))) - end associate - where(nb(:,:) == CUDECOMP_RANK_NULL) is_bound(:,:) = .true. + !!!PC ! + !!!PC ! fetch lo(:), hi(:), n(:) and n_z(:) from cuDecomp (should match the modified 2decomp one) + !!!PC ! + !!!PC istat = cudecompGetPencilInfo(ch,gd ,ap_x ,1) + !!!PC istat = cudecompGetPencilInfo(ch,gd ,ap_y ,2) + !!!PC istat = cudecompGetPencilInfo(ch,gd ,ap_z ,3) + !!!PC istat = cudecompGetPencilInfo(ch,gd_poi,ap_x_poi,1) + !!!PC istat = cudecompGetPencilInfo(ch,gd_poi,ap_y_poi,2) + !!!PC istat = cudecompGetPencilInfo(ch,gd_poi,ap_z_poi,3) + !!!PC select case(ipencil) + !!!PC case(1) + !!!PC lo(:) = ap_x%lo(:) + !!!PC hi(:) = ap_x%hi(:) + !!!PC case(2) + !!!PC lo(:) = ap_y%lo(:) + !!!PC hi(:) = ap_y%hi(:) + !!!PC case(3) + !!!PC lo(:) = ap_z%lo(:) + !!!PC hi(:) = ap_z%hi(:) + !!!PC end select + !!!PC n(:) = hi(:)-lo(:)+1 + !!!PC n_x_fft(:) = ap_x_poi%shape(:) + !!!PC n_y_fft(:) = ap_y_poi%shape(:) + !!!PC lo_z(:) = ap_z%lo(:) + !!!PC hi_z(:) = ap_z%hi(:) + !!!PC n_z(:) = ap_z%shape(:) + !!!PC nb(:,ipencil) = CUDECOMP_RANK_NULL + !!!PC associate(ip_t => ipencil_t) + !!!PC istat = cudecompGetShiftedRank(ch,gd,ipencil,ip_t(1),-1,periods(ip_t(1)),nb(0,ip_t(1))) + !!!PC istat = cudecompGetShiftedRank(ch,gd,ipencil,ip_t(1), 1,periods(ip_t(1)),nb(1,ip_t(1))) + !!!PC istat = cudecompGetShiftedRank(ch,gd,ipencil,ip_t(2),-1,periods(ip_t(2)),nb(0,ip_t(2))) + !!!PC istat = cudecompGetShiftedRank(ch,gd,ipencil,ip_t(2), 1,periods(ip_t(2)),nb(1,ip_t(2))) + !!!PC end associate + !!!PC where(nb(:,:) == CUDECOMP_RANK_NULL) is_bound(:,:) = .true. + lo(:) = 1 !!!PC + hi(:) = ng(:) !!!PC + n(:) = hi(:)-lo(:)+1 !!!PC + n_x_fft(:) = [2*(ng(1)/2+1),2*(ng(2)/2+1),ng(3)] !!!PC + n_y_fft(:) = [2*(ng(2)/2+1),ng(3),2*(ng(1)/2+1)] !!!PC + lo_z(:) = lo(:) !!!PC + hi_z(:) = hi(:) !!!PC + n_z(:) = n(:) !!!PC + nb(:,ipencil) = CUDECOMP_RANK_NULL !!!PC + where(nb(:,:) == CUDECOMP_RANK_NULL) is_bound(:,:) = .true. !!!PC + is_bound(:,:) = .true. !!!PC #else select case(ipencil) case(1) diff --git a/src/param.f90 b/src/param.f90 index 80b319a1..62d95af8 100644 --- a/src/param.f90 +++ b/src/param.f90 @@ -6,7 +6,7 @@ ! - module mod_param use mod_types -!@acc use cudecomp +!!@acc use cudecomp implicit none public ! @@ -112,7 +112,7 @@ subroutine read_input(myid) dl(:) = l(:)/(1.*ng(:)) dli(:) = dl(:)**(-1) visc = visci**(-1) -#if defined(_OPENACC) +#if 0 ! ! read cuDecomp parameter file cudecomp.in, if it exists ! diff --git a/src/solver_gpu.f90 b/src/solver_gpu.f90 index 4d80b5dd..6f32372c 100644 --- a/src/solver_gpu.f90 +++ b/src/solver_gpu.f90 @@ -7,19 +7,19 @@ module mod_solver_gpu #if defined(_OPENACC) use, intrinsic :: iso_c_binding, only: C_PTR - use cudecomp + !!!PC use cudecomp use mod_fft , only: signal_processing,fftf_gpu,fftb_gpu use mod_types use mod_common_mpi , only: ipencil_axis - use mod_common_cudecomp, only: dtype_rp => cudecomp_real_rp, & + use mod_common_cudecomp, only: & !!!PC dtype_rp => cudecomp_real_rp, & cudecomp_is_t_in_place, & solver_buf_0,solver_buf_1, & pz_aux_1,pz_aux_2,work, & - ap_x => ap_x_poi, & - ap_y => ap_y_poi, & - ap_z => ap_z_poi, & - ap_z_0 => ap_z , & - ch => handle,gd => gd_poi, & + !ap_x => ap_x_poi, & + !ap_y => ap_y_poi, & + !ap_z => ap_z_poi, & + !ap_z_0 => ap_z , & + !ch => handle,gd => gd_poi, & istream => istream_acc_queue_1 implicit none private @@ -42,11 +42,19 @@ subroutine solver_gpu(n,ng,arrplan,normfft,lambdaxy,a,b,c,bc,c_or_f,p) integer :: q integer, dimension(3) :: n_x,n_y,n_z,n_z_0 integer :: istat + integer :: i,j,k,n_x_1,n_x_2,n_x_3 !!!PC ! - n_z_0(:) = ap_z_0%shape(:) - n_x(:) = ap_x%shape(:) - n_y(:) = ap_y%shape(:) - n_z(:) = ap_z%shape(:) + n_z_0(:) = ng(:) !!!PC + n_x(:) = [2*(ng(1)/2+1),2*(ng(2)/2+1),ng(3)] !!!PC + n_y(:) = [2*(ng(2)/2+1),ng(3),2*(ng(1)/2+1)] !!!PC + n_z(:) = [2*(ng(1)/2+1),2*(ng(2)/2+1),ng(3)] !!!PC + n_x_1 = n_x(1) !!!PC + n_x_2 = n_x(2) !!!PC + n_x_3 = n_x(3) !!!PC + !!!PC n_z_0(:) = ap_z_0%shape(:) + !!!PC n_x(:) = ap_x%shape(:) + !!!PC n_y(:) = ap_y%shape(:) + !!!PC n_z(:) = ap_z%shape(:) px(1:n_x(1),1:n_x(2),1:n_x(3)) => solver_buf_0(1:product(n_x(:))) if(cudecomp_is_t_in_place) then py(1:n_y(1),1:n_y(2),1:n_y(3)) => solver_buf_0(1:product(n_y(:))) @@ -64,7 +72,7 @@ subroutine solver_gpu(n,ng,arrplan,normfft,lambdaxy,a,b,c,bc,c_or_f,p) !$acc end kernels case(2) block - integer :: i,j,k + !!!PC integer :: i,j,k ! ! transpose p -> py to axis-contiguous layout ! @@ -78,36 +86,81 @@ subroutine solver_gpu(n,ng,arrplan,normfft,lambdaxy,a,b,c,bc,c_or_f,p) end do end do end block - !$acc host_data use_device(py,px,work) - istat = cudecompTransposeYtoX(ch,gd,py,px,work,dtype_rp,stream=istream) - !$acc end host_data + !!!PC !$acc host_data use_device(py,px,work) + !!!PC istat = cudecompTransposeYtoX(ch,gd,py,px,work,dtype_rp,stream=istream) + !!!PC !$acc end host_data + !$acc parallel loop collapse(3) default(present) async(1) !!!PC + !$OMP PARALLEL DO COLLAPSE(3) DEFAULT(shared) !!!PC + do k=1,n_x_3 !!!PC + do j=1,n_x_2 !!!PC + do i=1,n_x_1 !!!PC + px(i,j,k) = py(j,k,i) !!!PC + end do !!!PC + end do !!!PC + end do !!!PC case(3) !$acc kernels default(present) async(1) !$OMP PARALLEL WORKSHARE pz(1:n(1),1:n(2),1:n(3)) = p(1:n(1),1:n(2),1:n(3)) !$OMP END PARALLEL WORKSHARE !$acc end kernels - !$acc host_data use_device(pz,py,px,work) - istat = cudecompTransposeZtoY(ch,gd,pz,py,work,dtype_rp,stream=istream) - istat = cudecompTransposeYtoX(ch,gd,py,px,work,dtype_rp,stream=istream) - !$acc end host_data + !!!PC !$acc host_data use_device(pz,py,px,work) + !!!PC istat = cudecompTransposeZtoY(ch,gd,pz,py,work,dtype_rp,stream=istream) + !!!PC istat = cudecompTransposeYtoX(ch,gd,py,px,work,dtype_rp,stream=istream) + !!!PC !$acc end host_data + !$acc parallel loop collapse(3) default(present) async(1) !!!PC + !$OMP PARALLEL DO COLLAPSE(3) DEFAULT(shared) !!!PC + do k=1,n_x_3 !!!PC + do j=1,n_x_2 !!!PC + do i=1,n_x_1 !!!PC + py(j,k,i) = pz(i,j,k) !!!PC + end do !!!PC + end do !!!PC + end do !!!PC + !$acc parallel loop collapse(3) default(present) async(1) !!!PC + !$OMP PARALLEL DO COLLAPSE(3) DEFAULT(shared) !!!PC + do k=1,n_x_3 !!!PC + do j=1,n_x_2 !!!PC + do i=1,n_x_1 !!!PC + px(i,j,k) = py(j,k,i) !!!PC + end do !!!PC + end do !!!PC + end do !!!PC end select ! call signal_processing(0,'F',bc(0,1)//bc(1,1),c_or_f(1),ng(1),n_x,1,px) call fftf_gpu(arrplan(1,1),px) call signal_processing(1,'F',bc(0,1)//bc(1,1),c_or_f(1),ng(1),n_x,1,px) ! - !$acc host_data use_device(px,py,work) - istat = cudecompTransposeXtoY(ch,gd,px,py,work,dtype_rp,stream=istream) - !$acc end host_data + !!!PC !$acc host_data use_device(px,py,work) + !!!PC istat = cudecompTransposeXtoY(ch,gd,px,py,work,dtype_rp,stream=istream) + !!!PC !$acc end host_data + !$acc parallel loop collapse(3) default(present) async(1) !!!PC + !$OMP PARALLEL DO COLLAPSE(3) DEFAULT(shared) !!!PC + do k=1,n_x_3 !!!PC + do j=1,n_x_2 !!!PC + do i=1,n_x_1 !!!PC + py(j,k,i) = px(i,j,k) !!!PC + end do !!!PC + end do !!!PC + end do !!!PC ! call signal_processing(0,'F',bc(0,2)//bc(1,2),c_or_f(2),ng(2),n_y,1,py) call fftf_gpu(arrplan(1,2),py) call signal_processing(1,'F',bc(0,2)//bc(1,2),c_or_f(2),ng(2),n_y,1,py) ! - !$acc host_data use_device(py,pz,work) - istat = cudecompTransposeYtoZ(ch,gd,py,pz,work,dtype_rp,stream=istream) - !$acc end host_data + !!!PC !$acc host_data use_device(py,pz,work) + !!!PC istat = cudecompTransposeYtoZ(ch,gd,py,pz,work,dtype_rp,stream=istream) + !!!PC !$acc end host_data + !$acc parallel loop collapse(3) default(present) async(1) !!!PC + !$OMP PARALLEL DO COLLAPSE(3) DEFAULT(shared) !!!PC + do k=1,n_x_3 !!!PC + do j=1,n_x_2 !!!PC + do i=1,n_x_1 !!!PC + pz(i,j,k) = py(j,k,i) !!!PC + end do !!!PC + end do !!!PC + end do !!!PC ! q = 0 if(c_or_f(3) == 'f'.and.bc(1,3) == 'D') q = 1 @@ -117,17 +170,35 @@ subroutine solver_gpu(n,ng,arrplan,normfft,lambdaxy,a,b,c,bc,c_or_f,p) call gaussel_gpu( n_z_0(1),n_z_0(2),n_z_0(3)-q,0,a,b,c,pz,work,lambdaxy) end if ! - !$acc host_data use_device(pz,py,work) - istat = cudecompTransposeZtoY(ch,gd,pz,py,work,dtype_rp,stream=istream) - !$acc end host_data + !!!PC !$acc host_data use_device(pz,py,work) + !!!PC istat = cudecompTransposeZtoY(ch,gd,pz,py,work,dtype_rp,stream=istream) + !!!PC !$acc end host_data + !$acc parallel loop collapse(3) default(present) async(1) !!!PC + !$OMP PARALLEL DO COLLAPSE(3) DEFAULT(shared) !!!PC + do k=1,n_x_3 !!!PC + do j=1,n_x_2 !!!PC + do i=1,n_x_1 !!!PC + py(j,k,i) = pz(i,j,k) !!!PC + end do !!!PC + end do !!!PC + end do !!!PC ! call signal_processing(0,'B',bc(0,2)//bc(1,2),c_or_f(2),ng(2),n_y,1,py) call fftb_gpu(arrplan(2,2),py) call signal_processing(1,'B',bc(0,2)//bc(1,2),c_or_f(2),ng(2),n_y,1,py) ! - !$acc host_data use_device(py,px,work) - istat = cudecompTransposeYtoX(ch,gd,py,px,work,dtype_rp,stream=istream) - !$acc end host_data + !!!PC !$acc host_data use_device(py,px,work) + !!!PC istat = cudecompTransposeYtoX(ch,gd,py,px,work,dtype_rp,stream=istream) + !!!PC !$acc end host_data + !$acc parallel loop collapse(3) default(present) async(1) !!!PC + !$OMP PARALLEL DO COLLAPSE(3) DEFAULT(shared) !!!PC + do k=1,n_x_3 !!!PC + do j=1,n_x_2 !!!PC + do i=1,n_x_1 !!!PC + px(i,j,k) = py(j,k,i) !!!PC + end do !!!PC + end do !!!PC + end do !!!PC ! call signal_processing(0,'B',bc(0,1)//bc(1,1),c_or_f(1),ng(1),n_x,1,px) call fftb_gpu(arrplan(2,1),px) @@ -141,11 +212,20 @@ subroutine solver_gpu(n,ng,arrplan,normfft,lambdaxy,a,b,c,bc,c_or_f,p) !$OMP END PARALLEL WORKSHARE !$acc end kernels case(2) - !$acc host_data use_device(px,py,work) - istat = cudecompTransposeXtoY(ch,gd,px,py,work,dtype_rp,stream=istream) - !$acc end host_data + !!!PC !$acc host_data use_device(px,py,work) + !!!PC istat = cudecompTransposeXtoY(ch,gd,px,py,work,dtype_rp,stream=istream) + !!!PC !$acc end host_data + !$acc parallel loop collapse(3) default(present) async(1) !!!PC + !$OMP PARALLEL DO COLLAPSE(3) DEFAULT(shared) !!!PC + do k=1,n_x_3 !!!PC + do j=1,n_x_2 !!!PC + do i=1,n_x_1 !!!PC + py(j,k,i) = px(i,j,k) !!!PC + end do !!!PC + end do !!!PC + end do !!!PC block - integer :: i,j,k + !!!PC integer :: i,j,k ! ! transpose py -> p to default layout ! @@ -160,10 +240,28 @@ subroutine solver_gpu(n,ng,arrplan,normfft,lambdaxy,a,b,c,bc,c_or_f,p) end do end block case(3) - !$acc host_data use_device(px,py,pz,work) - istat = cudecompTransposeXtoY(ch,gd,px,py,work,dtype_rp,stream=istream) - istat = cudecompTransposeYtoZ(ch,gd,py,pz,work,dtype_rp,stream=istream) - !$acc end host_data + !!!PC !$acc host_data use_device(px,py,pz,work) + !!!PC istat = cudecompTransposeXtoY(ch,gd,px,py,work,dtype_rp,stream=istream) + !!!PC istat = cudecompTransposeYtoZ(ch,gd,py,pz,work,dtype_rp,stream=istream) + !!!PC !$acc end host_data + !$acc parallel loop collapse(3) default(present) async(1) !!!PC + !$OMP PARALLEL DO COLLAPSE(3) DEFAULT(shared) !!!PC + do k=1,n_x_3 !!!PC + do j=1,n_x_2 !!!PC + do i=1,n_x_1 !!!PC + py(j,k,i) = px(i,j,k) !!!PC + end do !!!PC + end do !!!PC + end do !!!PC + !$acc parallel loop collapse(3) default(present) async(1) !!!PC + !$OMP PARALLEL DO COLLAPSE(3) DEFAULT(shared) !!!PC + do k=1,n_x_3 !!!PC + do j=1,n_x_2 !!!PC + do i=1,n_x_1 !!!PC + pz(i,j,k) = py(j,k,i) !!!PC + end do !!!PC + end do !!!PC + end do !!!PC !$acc kernels default(present) async(1) !$OMP PARALLEL WORKSHARE p(1:n(1),1:n(2),1:n(3)) = pz(1:n(1),1:n(2),1:n(3))*normfft diff --git a/src/utils.f90 b/src/utils.f90 index 849c1316..c634302b 100644 --- a/src/utils.f90 +++ b/src/utils.f90 @@ -116,8 +116,9 @@ function device_memory_footprint(n,n_z) result(itotal) ! taken directly from `mod_common_cudecomp` ! block - use mod_common_cudecomp, only: work,work_halo,solver_buf_0,solver_buf_1,pz_aux_1,pz_aux_2 - itemp = storage_size(work ,i8)*size(work ) + storage_size(work_halo ,i8)*size(work_halo ) + & + !!!PC use mod_common_cudecomp, only: work,work_halo,solver_buf_0,solver_buf_1,pz_aux_1,pz_aux_2 + use mod_common_cudecomp, only: work,solver_buf_0,solver_buf_1,pz_aux_1,pz_aux_2 !!!PC + itemp = storage_size(work ,i8)*size(work ) + & !!!PC storage_size(work_halo ,i8)*size(work_halo ) + & storage_size(solver_buf_0,i8)*size(solver_buf_0) + storage_size(solver_buf_1,i8)*size(solver_buf_1) + & storage_size(pz_aux_1 ,i8)*size(pz_aux_1 ) + storage_size(pz_aux_2 ,i8)*size(pz_aux_2 ) itotal = itotal + itemp/8 diff --git a/src/workspaces.f90 b/src/workspaces.f90 index e33f5603..fdf8a08c 100644 --- a/src/workspaces.f90 +++ b/src/workspaces.f90 @@ -7,62 +7,70 @@ module mod_workspaces #if defined(_OPENACC) use mod_types - use mod_common_cudecomp, only: work,work_cuda,work_halo,work_halo_cuda + use mod_common_cudecomp, only: work,work_cuda!!!PC,work_halo,work_halo_cuda use mod_utils, only: f_sizeof implicit none private public init_wspace_arrays,set_cufft_wspace contains subroutine init_wspace_arrays - use mod_common_cudecomp, only: handle,gd_halo,gd_poi , & - ap_x_poi,ap_y_poi,ap_z_poi, & + use mod_common_cudecomp, only: & !!!PC handle,gd_halo,gd_poi , & + !!!PC ap_x_poi,ap_y_poi,ap_z_poi, & solver_buf_0,solver_buf_1, & - ap_z,pz_aux_1,pz_aux_2, & + !!!PC ap_z,pz_aux_1,pz_aux_2, & + pz_aux_1,pz_aux_2, & !!!PC istream_acc_queue_1 use mod_common_mpi , only: ipencil => ipencil_axis use mod_fft , only: wsize_fft use mod_param , only: cudecomp_is_t_in_place,cbcpre - use cudecomp + use mod_param , only: ng !!!PC +!!!PC use cudecomp use openacc + !!!PC use cudafor !!!PC implicit none integer :: istat integer(i8) :: wsize,max_wsize - integer :: nh(3) +!!!PC integer :: nh(3) ! ! allocate cuDecomp workspace buffer for transposes (reused for FFTs) ! max_wsize = -1 wsize = wsize_fft max_wsize = max(max_wsize,wsize) - istat = cudecompGetTransposeWorkspaceSize(handle,gd_poi,wsize) - max_wsize = max(max_wsize,wsize) +!!!PC istat = cudecompGetTransposeWorkspaceSize(handle,gd_poi,wsize) +!!!PC max_wsize = max(max_wsize,wsize) ! ! one can append more checks here (e.g., for spectra calculation) ! allocate(work(max_wsize)) - istat = cudecompMalloc(handle,gd_poi,work_cuda,max_wsize) +!!!PC istat = cudecompMalloc(handle,gd_poi,work_cuda,max_wsize) + allocate(work_cuda,mold=work) !!!PC + !istat = cudaMalloc(c_devloc(work_cuda),max_wsize*f_sizeof(work(1))) !!!PC call acc_map_data(work,work_cuda,max_wsize*f_sizeof(work(1))) ! ! allocate cuDecomp workspace buffer for halos ! (needs to be different due to the possible need of an NVSHMEM allocator ! in one of the descriptors, rather than a simple cudaMaloc) ! - nh(:) = 1 - istat = cudecompGetHaloWorkspaceSize(handle,gd_halo,ipencil,nh,max_wsize) - allocate(work_halo(max_wsize)) +!!!PC nh(:) = 1 +!!!PC istat = cudecompGetHaloWorkspaceSize(handle,gd_halo,ipencil,nh,max_wsize) +!!!PC allocate(work_halo(max_wsize)) ! - istat = cudecompMalloc(handle,gd_halo,work_halo_cuda,max_wsize) - call acc_map_data(work_halo,work_halo_cuda,max_wsize*f_sizeof(work_halo(1))) +!!!PC istat = cudecompMalloc(handle,gd_halo,work_halo_cuda,max_wsize) +!!!PC call acc_map_data(work_halo,work_halo_cuda,max_wsize*f_sizeof(work_halo(1))) ! ! allocate transpose buffers ! - wsize = max(ap_x_poi%size,ap_y_poi%size,ap_z_poi%size) +!!!PC wsize = max(ap_x_poi%size,ap_y_poi%size,ap_z_poi%size) + wsize = product([2*(ng(1)/2+1),2*(ng(2)/2+1),ng(3)]) !!!PC allocate(solver_buf_0(wsize)) if(.not.cudecomp_is_t_in_place) allocate(solver_buf_1,mold=solver_buf_0) !$acc enter data create(solver_buf_0,solver_buf_1) if(cbcpre(0,3)//cbcpre(1,3) == 'PP') then - allocate(pz_aux_1(ap_z%shape(1),ap_z%shape(2),ap_z%shape(3)), & - pz_aux_2(ap_z%shape(1),ap_z%shape(2),ap_z%shape(3))) +!!!PC allocate(pz_aux_1(ap_z%shape(1),ap_z%shape(2),ap_z%shape(3)), & +!!!PC pz_aux_2(ap_z%shape(1),ap_z%shape(2),ap_z%shape(3))) + allocate(pz_aux_1(ng(1),ng(2),ng(3)), & !!!PC + pz_aux_2(ng(1),ng(2),ng(3))) !!!PC !$acc enter data create(pz_aux_1,pz_aux_2) end if istream_acc_queue_1 = acc_get_cuda_stream(1) ! fetch CUDA stream of OpenACC queue 1