From ef3d02c81fc3589c8648f4a1d7f77b5c30e62766 Mon Sep 17 00:00:00 2001 From: Sebastien Lemaire Date: Fri, 19 Jan 2024 16:58:29 +0000 Subject: [PATCH 01/19] feat(omp): begin implementation of transeq solver still WIP, compiles but not tested --- src/omp/backend.f90 | 89 +++++++++++++++++++++++++++++++++++-- src/omp/kernels_dist.f90 | 96 ++++++++++++++++++++++++++-------------- 2 files changed, 149 insertions(+), 36 deletions(-) diff --git a/src/omp/backend.f90 b/src/omp/backend.f90 index 645137cb..ce95c1f5 100644 --- a/src/omp/backend.f90 +++ b/src/omp/backend.f90 @@ -3,6 +3,8 @@ module m_omp_backend use m_base_backend, only: base_backend_t use m_common, only: dp, globs_t use m_tdsops, only: dirps_t, tdsops_t + use m_omp_exec_dist, only: exec_dist_tds_compact + use m_omp_sendrecv, only: sendrecv_fields use m_omp_common, only: SZ @@ -33,6 +35,7 @@ module m_omp_backend procedure :: vecadd => vecadd_omp procedure :: set_fields => set_fields_omp procedure :: get_fields => get_fields_omp + procedure :: transeq_omp_dist end type omp_backend_t interface omp_backend_t @@ -121,7 +124,7 @@ subroutine transeq_x_omp(self, du, dv, dw, u, v, w, dirps) class(field_t), intent(in) :: u, v, w type(dirps_t), intent(in) :: dirps - !call self%transeq_omp_dist(du, dv, dw, u, v, w, dirps) + call self%transeq_omp_dist(du, dv, dw, u, v, w, dirps) end subroutine transeq_x_omp @@ -134,7 +137,7 @@ subroutine transeq_y_omp(self, du, dv, dw, u, v, w, dirps) type(dirps_t), intent(in) :: dirps ! u, v, w is reordered so that we pass v, u, w - !call self%transeq_omp_dist(dv, du, dw, v, u, w, dirps) + call self%transeq_omp_dist(dv, du, dw, v, u, w, dirps) end subroutine transeq_y_omp @@ -147,10 +150,36 @@ subroutine transeq_z_omp(self, du, dv, dw, u, v, w, dirps) type(dirps_t), intent(in) :: dirps ! u, v, w is reordered so that we pass w, u, v - !call self%transeq_omp_dist(dw, du, dv, w, u, v, dirps) + call self%transeq_omp_dist(dw, du, dv, w, u, v, dirps) end subroutine transeq_z_omp + subroutine transeq_omp_dist(self, du, dv, dw, u, v, w, dirps) + implicit none + + class(omp_backend_t) :: self + class(field_t), intent(inout) :: du, dv, dw + class(field_t), intent(in) :: u, v, w + type(dirps_t), intent(in) :: dirps + class(field_t), pointer :: duu, d2u, uu, du_temp + + ! du + du_temp => self%allocator%get_block() + call tds_solve_omp(self, du_temp, u, dirps, dirps%der1st) + + duu => self%allocator%get_block() + uu => self%allocator%get_block() + call vecmul_omp(uu, u, u, dirps) + call tds_solve_omp(self, duu, uu, dirps, dirps%der1st_sym) + + d2u => self%allocator%get_block() + call tds_solve_omp(self, d2u, u, dirps, dirps%der2nd) + + + + + end subroutine transeq_omp_dist + subroutine tds_solve_omp(self, du, u, dirps, tdsops) implicit none @@ -160,10 +189,36 @@ subroutine tds_solve_omp(self, du, u, dirps, tdsops) type(dirps_t), intent(in) :: dirps class(tdsops_t), intent(in) :: tdsops - !call self%tds_solve_dist(self, du, u, dirps, tdsops) + call tds_solve_dist(self, du, u, dirps, tdsops) end subroutine tds_solve_omp + subroutine tds_solve_dist(self, du, u, dirps, tdsops) + implicit none + + class(omp_backend_t) :: self + class(field_t), intent(inout) :: du + class(field_t), intent(in) :: u + type(dirps_t), intent(in) :: dirps + class(tdsops_t), intent(in) :: tdsops + integer :: n_halo + + ! TODO: don't hardcode n_halo + n_halo = 4 + call copy_into_buffers(self%u_send_s, self%u_send_e, u%data, dirps%n) + + ! halo exchange + call sendrecv_fields(self%u_recv_s, self%u_recv_e, self%u_send_s, self%u_send_e, & + SZ*n_halo*dirps%n_blocks, dirps%nproc, dirps%pprev, dirps%pnext) + + + call exec_dist_tds_compact( & + du%data, u%data, self%u_recv_s, self%u_recv_e, self%du_send_s, self%du_send_e, & + self%du_recv_s, self%du_recv_e, & + tdsops, dirps%nproc, dirps%pprev, dirps%pnext, dirps%n_blocks) + + end subroutine tds_solve_dist + subroutine trans_x2y_omp(self, u_, v_, w_, u, v, w) implicit none @@ -230,6 +285,32 @@ subroutine vecadd_omp(self, a, x, b, y) end subroutine vecadd_omp + subroutine vecmul_omp(uv, u, v, dirps) + implicit none + + class(field_t), intent(inout) :: uv + class(field_t), intent(in) :: u, v + type(dirps_t), intent(in) :: dirps + integer :: i, j, k + + real(dp), pointer, dimension(:, :, :) :: u_data, v_data, uv_data + + select type(u); type is (field_t); u_data => u%data; end select + select type(v); type is (field_t); v_data => v%data; end select + select type(uv); type is (field_t); uv_data => uv%data; end select + + do k = 1, dirps%n_blocks + do j = 1, dirps%n + !$omp simd + do i = 1, SZ + uv_data(i, j, k) = u_data(i, j, k)*v_data(i, j, k) + end do + !$omp end simd + end do + end do + + end subroutine vecmul_omp + subroutine copy_into_buffers(u_send_s, u_send_e, u, n) implicit none diff --git a/src/omp/kernels_dist.f90 b/src/omp/kernels_dist.f90 index 35c883ec..bf706904 100644 --- a/src/omp/kernels_dist.f90 +++ b/src/omp/kernels_dist.f90 @@ -37,29 +37,45 @@ subroutine der_univ_dist( & !$omp simd do i = 1, SZ - du(i, 1) = coeffs_s(1, 1)*u_s(i, 1) + coeffs_s(2, 1)*u_s(i, 2) & - + coeffs_s(3, 1)*u_s(i, 3) + coeffs_s(4, 1)*u_s(i, 4) & + du(i, 1) = coeffs_s(1, 1)*u_s(i, 1) & + + coeffs_s(2, 1)*u_s(i, 2) & + + coeffs_s(3, 1)*u_s(i, 3) & + + coeffs_s(4, 1)*u_s(i, 4) & + coeffs_s(5, 1)*u(i, 1) & - + coeffs_s(6, 1)*u(i, 2) + coeffs_s(7, 1)*u(i, 3) & - + coeffs_s(8, 1)*u(i, 4) + coeffs_s(9, 1)*u(i, 5) + + coeffs_s(6, 1)*u(i, 2) & + + coeffs_s(7, 1)*u(i, 3) & + + coeffs_s(8, 1)*u(i, 4) & + + coeffs_s(9, 1)*u(i, 5) du(i, 1) = du(i, 1)*faf(1) - du(i, 2) = coeffs_s(1, 2)*u_s(i, 2) + coeffs_s(2, 2)*u_s(i, 3) & - + coeffs_s(3, 2)*u_s(i, 4) + coeffs_s(4, 2)*u(i, 1) & + du(i, 2) = coeffs_s(1, 2)*u_s(i, 2) & + + coeffs_s(2, 2)*u_s(i, 3) & + + coeffs_s(3, 2)*u_s(i, 4) & + + coeffs_s(4, 2)*u(i, 1) & + coeffs_s(5, 2)*u(i, 2) & - + coeffs_s(6, 2)*u(i, 3) + coeffs_s(7, 2)*u(i, 4) & - + coeffs_s(8, 2)*u(i, 5) + coeffs_s(9, 2)*u(i, 6) + + coeffs_s(6, 2)*u(i, 3) & + + coeffs_s(7, 2)*u(i, 4) & + + coeffs_s(8, 2)*u(i, 5) & + + coeffs_s(9, 2)*u(i, 6) du(i, 2) = du(i, 2)*faf(2) - du(i, 3) = coeffs_s(1, 3)*u_s(i, 3) + coeffs_s(2, 3)*u_s(i, 4) & - + coeffs_s(3, 3)*u(i, 1) + coeffs_s(4, 3)*u(i, 2) & + du(i, 3) = coeffs_s(1, 3)*u_s(i, 3) & + + coeffs_s(2, 3)*u_s(i, 4) & + + coeffs_s(3, 3)*u(i, 1) & + + coeffs_s(4, 3)*u(i, 2) & + coeffs_s(5, 3)*u(i, 3) & - + coeffs_s(6, 3)*u(i, 4) + coeffs_s(7, 3)*u(i, 5) & - + coeffs_s(8, 3)*u(i, 6) + coeffs_s(9, 3)*u(i, 7) + + coeffs_s(6, 3)*u(i, 4) & + + coeffs_s(7, 3)*u(i, 5) & + + coeffs_s(8, 3)*u(i, 6) & + + coeffs_s(9, 3)*u(i, 7) du(i, 3) = ffr(3)*(du(i, 3) - faf(3)*du(i, 2)) - du(i, 4) = coeffs_s(1, 4)*u_s(i, 4) + coeffs_s(2, 4)*u(i, 1) & - + coeffs_s(3, 4)*u(i, 2) + coeffs_s(4, 4)*u(i, 3) & + du(i, 4) = coeffs_s(1, 4)*u_s(i, 4) & + + coeffs_s(2, 4)*u(i, 1) & + + coeffs_s(3, 4)*u(i, 2) & + + coeffs_s(4, 4)*u(i, 3) & + coeffs_s(5, 4)*u(i, 4) & - + coeffs_s(6, 4)*u(i, 5) + coeffs_s(7, 4)*u(i, 6) & - + coeffs_s(8, 4)*u(i, 7) + coeffs_s(9, 4)*u(i, 8) + + coeffs_s(6, 4)*u(i, 5) & + + coeffs_s(7, 4)*u(i, 6) & + + coeffs_s(8, 4)*u(i, 7) & + + coeffs_s(9, 4)*u(i, 8) du(i, 4) = ffr(4)*(du(i, 4) - faf(4)*du(i, 3)) end do !$omp end simd @@ -83,32 +99,48 @@ subroutine der_univ_dist( & !$omp simd do i = 1, SZ j = n - 3 - du(i, j) = coeffs_e(1, 1)*u(i, j - 4) + coeffs_e(2, 1)*u(i, j - 3) & - + coeffs_e(3, 1)*u(i, j - 2) + coeffs_e(4, 1)*u(i, j - 1) & + du(i, j) = coeffs_e(1, 1)*u(i, j - 4) & + + coeffs_e(2, 1)*u(i, j - 3) & + + coeffs_e(3, 1)*u(i, j - 2) & + + coeffs_e(4, 1)*u(i, j - 1) & + coeffs_e(5, 1)*u(i, j) & - + coeffs_e(6, 1)*u(i, j + 1) + coeffs_e(7, 1)*u(i, j + 2) & - + coeffs_e(8, 1)*u(i, j + 3) + coeffs_e(9, 1)*u_e(i, 1) + + coeffs_e(6, 1)*u(i, j + 1) & + + coeffs_e(7, 1)*u(i, j + 2) & + + coeffs_e(8, 1)*u(i, j + 3) & + + coeffs_e(9, 1)*u_e(i, 1) du(i, j) = ffr(j)*(du(i, j) - faf(j)*du(i, j - 1)) j = n - 2 - du(i, j) = coeffs_e(1, 2)*u(i, j - 4) + coeffs_e(2, 2)*u(i, j - 3) & - + coeffs_e(3, 2)*u(i, j - 2) + coeffs_e(4, 2)*u(i, j - 1) & + du(i, j) = coeffs_e(1, 2)*u(i, j - 4) & + + coeffs_e(2, 2)*u(i, j - 3) & + + coeffs_e(3, 2)*u(i, j - 2) & + + coeffs_e(4, 2)*u(i, j - 1) & + coeffs_e(5, 2)*u(i, j) & - + coeffs_e(6, 2)*u(i, j + 1) + coeffs_e(7, 2)*u(i, j + 2) & - + coeffs_e(8, 2)*u_e(i, 1) + coeffs_e(9, 2)*u_e(i, 2) + + coeffs_e(6, 2)*u(i, j + 1) & + + coeffs_e(7, 2)*u(i, j + 2) & + + coeffs_e(8, 2)*u_e(i, 1) & + + coeffs_e(9, 2)*u_e(i, 2) du(i, j) = ffr(j)*(du(i, j) - faf(j)*du(i, j - 1)) j = n - 1 - du(i, j) = coeffs_e(1, 3)*u(i, j - 4) + coeffs_e(2, 3)*u(i, j - 3) & - + coeffs_e(3, 3)*u(i, j - 2) + coeffs_e(4, 3)*u(i, j - 1) & + du(i, j) = coeffs_e(1, 3)*u(i, j - 4) & + + coeffs_e(2, 3)*u(i, j - 3) & + + coeffs_e(3, 3)*u(i, j - 2) & + + coeffs_e(4, 3)*u(i, j - 1) & + coeffs_e(5, 3)*u(i, j) & - + coeffs_e(6, 3)*u(i, j + 1) + coeffs_e(7, 3)*u_e(i, 1) & - + coeffs_e(8, 3)*u_e(i, 2) + coeffs_e(9, 3)*u_e(i, 3) + + coeffs_e(6, 3)*u(i, j + 1) & + + coeffs_e(7, 3)*u_e(i, 1) & + + coeffs_e(8, 3)*u_e(i, 2) & + + coeffs_e(9, 3)*u_e(i, 3) du(i, j) = ffr(j)*(du(i, j) - faf(j)*du(i, j - 1)) j = n - du(i, j) = coeffs_e(1, 4)*u(i, j - 4) + coeffs_e(2, 4)*u(i, j - 3) & - + coeffs_e(3, 4)*u(i, j - 2) + coeffs_e(4, 4)*u(i, j - 1) & + du(i, j) = coeffs_e(1, 4)*u(i, j - 4) & + + coeffs_e(2, 4)*u(i, j - 3) & + + coeffs_e(3, 4)*u(i, j - 2) & + + coeffs_e(4, 4)*u(i, j - 1) & + coeffs_e(5, 4)*u(i, j) & - + coeffs_e(6, 4)*u_e(i, 1) + coeffs_e(7, 4)*u_e(i, 2) & - + coeffs_e(8, 4)*u_e(i, 3) + coeffs_e(9, 4)*u_e(i, 4) + + coeffs_e(6, 4)*u_e(i, 1) & + + coeffs_e(7, 4)*u_e(i, 2) & + + coeffs_e(8, 4)*u_e(i, 3) & + + coeffs_e(9, 4)*u_e(i, 4) du(i, j) = ffr(j)*(du(i, j) - faf(j)*du(i, j - 1)) end do !$omp end simd From 9a98a7fe781cbc78ea56b322ceef2a00b56e38a3 Mon Sep 17 00:00:00 2001 From: Sebastien Lemaire Date: Fri, 26 Jan 2024 17:12:48 +0000 Subject: [PATCH 02/19] feat(omp): initial transeq solver implementation still WIP, not tested and not cleaned up --- src/omp/backend.f90 | 62 ++++++++++++++++++++----- src/omp/exec_dist.f90 | 102 ++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 153 insertions(+), 11 deletions(-) diff --git a/src/omp/backend.f90 b/src/omp/backend.f90 index ce95c1f5..0d781fda 100644 --- a/src/omp/backend.f90 +++ b/src/omp/backend.f90 @@ -3,7 +3,7 @@ module m_omp_backend use m_base_backend, only: base_backend_t use m_common, only: dp, globs_t use m_tdsops, only: dirps_t, tdsops_t - use m_omp_exec_dist, only: exec_dist_tds_compact + use m_omp_exec_dist, only: exec_dist_tds_compact, exec_dist_transeq_compact use m_omp_sendrecv, only: sendrecv_fields use m_omp_common, only: SZ @@ -161,24 +161,64 @@ subroutine transeq_omp_dist(self, du, dv, dw, u, v, w, dirps) class(field_t), intent(inout) :: du, dv, dw class(field_t), intent(in) :: u, v, w type(dirps_t), intent(in) :: dirps - class(field_t), pointer :: duu, d2u, uu, du_temp - ! du - du_temp => self%allocator%get_block() - call tds_solve_omp(self, du_temp, u, dirps, dirps%der1st) + call transeq_dist_component(self, du, u, u, dirps%der1st, dirps%der1st_sym, dirps%der2nd, dirps) - duu => self%allocator%get_block() - uu => self%allocator%get_block() - call vecmul_omp(uu, u, u, dirps) - call tds_solve_omp(self, duu, uu, dirps, dirps%der1st_sym) + call transeq_dist_component(self, dv, v, u, dirps%der1st_sym, dirps%der1st, dirps%der2nd_sym, dirps) + call transeq_dist_component(self, dw, w, u, dirps%der1st_sym, dirps%der1st, dirps%der2nd_sym, dirps) + + end subroutine transeq_omp_dist + + subroutine transeq_dist_component(self, rhs, v, u, tdsops_du, tdsops_dud, tdsops_d2u, dirps) + + class(omp_backend_t) :: self + class(field_t), intent(inout) :: rhs + class(field_t), intent(in) :: u, v + class(tdsops_t), intent(in) :: tdsops_du + class(tdsops_t), intent(in) :: tdsops_dud + class(tdsops_t), intent(in) :: tdsops_d2u + type(dirps_t), intent(in) :: dirps + class(field_t), pointer :: du, d2u, dud, uv + + integer :: n_halo + + ! TODO: don't hardcode n_halo + n_halo = 4 + + du => self%allocator%get_block() + dud => self%allocator%get_block() + uv => self%allocator%get_block() + call vecmul_omp(uv, u, v, dirps) d2u => self%allocator%get_block() - call tds_solve_omp(self, d2u, u, dirps, dirps%der2nd) + call copy_into_buffers(self%u_send_s, self%u_send_e, u%data, dirps%n) + call copy_into_buffers(self%v_send_s, self%v_send_e, uv%data, dirps%n) + ! halo exchange + call sendrecv_fields(self%u_recv_s, self%u_recv_e, self%u_send_s, self%u_send_e, & + SZ*n_halo*dirps%n_blocks, dirps%nproc, dirps%pprev, dirps%pnext) + call sendrecv_fields(self%v_recv_s, self%v_recv_e, self%v_send_s, self%v_send_e, & + SZ*n_halo*dirps%n_blocks, dirps%nproc, dirps%pprev, dirps%pnext) - end subroutine transeq_omp_dist + + call exec_dist_transeq_compact(& + rhs%data, du%data, dud%data, d2u%data, & + self%du_send_s, self%du_send_e, self%du_recv_s, self%du_recv_e, & + self%dud_send_s, self%dud_send_e, self%dud_recv_s, self%dud_recv_e, & + self%d2u_send_s, self%d2u_send_e, self%d2u_recv_s, self%d2u_recv_e, & + u%data, self%u_recv_s, self%u_recv_e, & + uv%data, self%v_recv_s, self%v_recv_e, & + tdsops_du, tdsops_dud, tdsops_d2u, self%nu, & + dirps%nproc, dirps%pprev, dirps%pnext, dirps%n_blocks) + + call self%allocator%release_block(du) + call self%allocator%release_block(dud) + call self%allocator%release_block(uv) + call self%allocator%release_block(d2u) + + end subroutine subroutine tds_solve_omp(self, du, u, dirps, tdsops) implicit none diff --git a/src/omp/exec_dist.f90 b/src/omp/exec_dist.f90 index ac46e1a4..e36e6b63 100644 --- a/src/omp/exec_dist.f90 +++ b/src/omp/exec_dist.f90 @@ -61,5 +61,107 @@ subroutine exec_dist_tds_compact( & end subroutine exec_dist_tds_compact + + subroutine exec_dist_transeq_compact(& + rhs, du, dud, d2u, & + du_send_s, du_send_e, du_recv_s, du_recv_e, & + dud_send_s, dud_send_e, dud_recv_s, dud_recv_e, & + d2u_send_s, d2u_send_e, d2u_recv_s, d2u_recv_e, & + u, u_recv_s, u_recv_e, & + ud, ud_recv_s, ud_recv_e, & + tdsops_du, tdsops_dud, tdsops_d2u, nu, nproc, pprev, pnext, n_block) + + implicit none + + ! du = d(u) + real(dp), dimension(:, :, :), intent(out) :: rhs, du, dud, d2u + + ! The ones below are intent(out) just so that we can write data in them, + ! not because we actually need the data they store later where this + ! subroutine is called. We absolutely don't care about the data they pass back + real(dp), dimension(:, :, :), intent(out) :: & + du_send_s, du_send_e, du_recv_s, du_recv_e + real(dp), dimension(:, :, :), intent(out) :: & + dud_send_s, dud_send_e, dud_recv_s, dud_recv_e + real(dp), dimension(:, :, :), intent(out) :: & + d2u_send_s, d2u_send_e, d2u_recv_s, d2u_recv_e + + real(dp), dimension(:, :, :), intent(in) :: u, u_recv_s, u_recv_e + real(dp), dimension(:, :, :), intent(in) :: ud, ud_recv_s, ud_recv_e + + type(tdsops_t), intent(in) :: tdsops_du, tdsops_dud, tdsops_d2u + real(dp) :: nu + integer, intent(in) :: nproc, pprev, pnext + integer, intent(in) :: n_block + + integer :: n_data + integer :: k, i, j + + n_data = SZ*n_block + + !$omp parallel do + do k = 1, n_block + call der_univ_dist( & + du(:, :, k), du_send_s(:, :, k), du_send_e(:, :, k), u(:, :, k), & + u_recv_s(:, :, k), u_recv_e(:, :, k), & + tdsops_du%coeffs_s, tdsops_du%coeffs_e, tdsops_du%coeffs, tdsops_du%n, & + tdsops_du%dist_fw, tdsops_du%dist_bw, tdsops_du%dist_af & + ) + + call der_univ_dist( & + d2u(:, :, k), d2u_send_s(:, :, k), d2u_send_e(:, :, k), u(:, :, k), & + u_recv_s(:, :, k), u_recv_e(:, :, k), & + tdsops_d2u%coeffs_s, tdsops_d2u%coeffs_e, tdsops_d2u%coeffs, tdsops_d2u%n, & + tdsops_d2u%dist_fw, tdsops_d2u%dist_bw, tdsops_d2u%dist_af & + ) + + call der_univ_dist( & + dud(:, :, k), dud_send_s(:, :, k), dud_send_e(:, :, k), ud(:, :, k), & + ud_recv_s(:, :, k), ud_recv_e(:, :, k), & + tdsops_dud%coeffs_s, tdsops_dud%coeffs_e, tdsops_dud%coeffs, tdsops_dud%n, & + tdsops_dud%dist_fw, tdsops_dud%dist_bw, tdsops_dud%dist_af & + ) + + end do + !$omp end parallel do + + ! halo exchange for 2x2 systems + call sendrecv_fields(du_recv_s, du_recv_e, du_send_s, du_send_e, & + n_data, nproc, pprev, pnext) + call sendrecv_fields(dud_recv_s, dud_recv_e, dud_send_s, dud_send_e, & + n_data, nproc, pprev, pnext) + call sendrecv_fields(d2u_recv_s, d2u_recv_e, d2u_send_s, d2u_send_e, & + n_data, nproc, pprev, pnext) + + !$omp parallel do + do k = 1, n_block + call der_univ_subs(du(:, :, k), & + du_recv_s(:, :, k), du_recv_e(:, :, k), & + tdsops_du%n, tdsops_du%dist_sa, tdsops_du%dist_sc) + + call der_univ_subs(dud(:, :, k), & + dud_recv_s(:, :, k), dud_recv_e(:, :, k), & + tdsops_dud%n, tdsops_dud%dist_sa, tdsops_dud%dist_sc) + + call der_univ_subs(d2u(:, :, k), & + d2u_recv_s(:, :, k), d2u_recv_e(:, :, k), & + tdsops_d2u%n, tdsops_d2u%dist_sa, tdsops_d2u%dist_sc) + + do j = 1, tdsops_d2u%n + !$omp simd + do i = 1, SZ + rhs(i, j, k) = -0.5*(u(i, j, k)*du(i, j, k) + dud(i, j, k)) + nu*d2u(i, j, k) + end do + !$omp end simd + end do + + end do + !$omp end parallel do + + + end subroutine exec_dist_transeq_compact + + + end module m_omp_exec_dist From 03754b7e701fde20f9a108e5ac4bfc7db263e8b8 Mon Sep 17 00:00:00 2001 From: Sebastien Lemaire Date: Fri, 26 Jan 2024 18:32:27 +0000 Subject: [PATCH 03/19] feat(omp): handle multiplication inplace for convective flux also moves halo exchange to its own function --- src/omp/backend.f90 | 47 ++++++++++++++++++++++++------------------- src/omp/exec_dist.f90 | 41 ++++++++++++++++++++++++++++++------- 2 files changed, 60 insertions(+), 28 deletions(-) diff --git a/src/omp/backend.f90 b/src/omp/backend.f90 index 0d781fda..7e602d59 100644 --- a/src/omp/backend.f90 +++ b/src/omp/backend.f90 @@ -161,47 +161,53 @@ subroutine transeq_omp_dist(self, du, dv, dw, u, v, w, dirps) class(field_t), intent(inout) :: du, dv, dw class(field_t), intent(in) :: u, v, w type(dirps_t), intent(in) :: dirps + integer :: n_halo - call transeq_dist_component(self, du, u, u, dirps%der1st, dirps%der1st_sym, dirps%der2nd, dirps) + call transeq_halo_exchange(self, u, v, w, dirps) + call transeq_dist_component(self, du, u, u, dirps%der1st, dirps%der1st_sym, dirps%der2nd, dirps) call transeq_dist_component(self, dv, v, u, dirps%der1st_sym, dirps%der1st, dirps%der2nd_sym, dirps) - call transeq_dist_component(self, dw, w, u, dirps%der1st_sym, dirps%der1st, dirps%der2nd_sym, dirps) end subroutine transeq_omp_dist - subroutine transeq_dist_component(self, rhs, v, u, tdsops_du, tdsops_dud, tdsops_d2u, dirps) + subroutine transeq_halo_exchange(self, u, v, w, dirps) class(omp_backend_t) :: self - class(field_t), intent(inout) :: rhs - class(field_t), intent(in) :: u, v - class(tdsops_t), intent(in) :: tdsops_du - class(tdsops_t), intent(in) :: tdsops_dud - class(tdsops_t), intent(in) :: tdsops_d2u + class(field_t), intent(in) :: u, v, w type(dirps_t), intent(in) :: dirps - class(field_t), pointer :: du, d2u, dud, uv - integer :: n_halo ! TODO: don't hardcode n_halo n_halo = 4 - du => self%allocator%get_block() - dud => self%allocator%get_block() - uv => self%allocator%get_block() - call vecmul_omp(uv, u, v, dirps) - d2u => self%allocator%get_block() - call copy_into_buffers(self%u_send_s, self%u_send_e, u%data, dirps%n) - call copy_into_buffers(self%v_send_s, self%v_send_e, uv%data, dirps%n) + call copy_into_buffers(self%v_send_s, self%v_send_e, v%data, dirps%n) + call copy_into_buffers(self%w_send_s, self%w_send_e, w%data, dirps%n) - ! halo exchange call sendrecv_fields(self%u_recv_s, self%u_recv_e, self%u_send_s, self%u_send_e, & SZ*n_halo*dirps%n_blocks, dirps%nproc, dirps%pprev, dirps%pnext) - call sendrecv_fields(self%v_recv_s, self%v_recv_e, self%v_send_s, self%v_send_e, & SZ*n_halo*dirps%n_blocks, dirps%nproc, dirps%pprev, dirps%pnext) + call sendrecv_fields(self%w_recv_s, self%w_recv_e, self%w_send_s, self%w_send_e, & + SZ*n_halo*dirps%n_blocks, dirps%nproc, dirps%pprev, dirps%pnext) + + end subroutine + + subroutine transeq_dist_component(self, rhs, v, u, tdsops_du, tdsops_dud, tdsops_d2u, dirps) + + class(omp_backend_t) :: self + class(field_t), intent(inout) :: rhs + class(field_t), intent(in) :: u, v + class(tdsops_t), intent(in) :: tdsops_du + class(tdsops_t), intent(in) :: tdsops_dud + class(tdsops_t), intent(in) :: tdsops_d2u + type(dirps_t), intent(in) :: dirps + class(field_t), pointer :: du, d2u, dud, uv + du => self%allocator%get_block() + dud => self%allocator%get_block() + d2u => self%allocator%get_block() call exec_dist_transeq_compact(& rhs%data, du%data, dud%data, d2u%data, & @@ -209,13 +215,12 @@ subroutine transeq_dist_component(self, rhs, v, u, tdsops_du, tdsops_dud, tdsops self%dud_send_s, self%dud_send_e, self%dud_recv_s, self%dud_recv_e, & self%d2u_send_s, self%d2u_send_e, self%d2u_recv_s, self%d2u_recv_e, & u%data, self%u_recv_s, self%u_recv_e, & - uv%data, self%v_recv_s, self%v_recv_e, & + v%data, self%v_recv_s, self%v_recv_e, & tdsops_du, tdsops_dud, tdsops_d2u, self%nu, & dirps%nproc, dirps%pprev, dirps%pnext, dirps%n_blocks) call self%allocator%release_block(du) call self%allocator%release_block(dud) - call self%allocator%release_block(uv) call self%allocator%release_block(d2u) end subroutine diff --git a/src/omp/exec_dist.f90 b/src/omp/exec_dist.f90 index e36e6b63..45e4f7ca 100644 --- a/src/omp/exec_dist.f90 +++ b/src/omp/exec_dist.f90 @@ -68,7 +68,7 @@ subroutine exec_dist_transeq_compact(& dud_send_s, dud_send_e, dud_recv_s, dud_recv_e, & d2u_send_s, d2u_send_e, d2u_recv_s, d2u_recv_e, & u, u_recv_s, u_recv_e, & - ud, ud_recv_s, ud_recv_e, & + v, v_recv_s, v_recv_e, & tdsops_du, tdsops_dud, tdsops_d2u, nu, nproc, pprev, pnext, n_block) implicit none @@ -87,18 +87,27 @@ subroutine exec_dist_transeq_compact(& d2u_send_s, d2u_send_e, d2u_recv_s, d2u_recv_e real(dp), dimension(:, :, :), intent(in) :: u, u_recv_s, u_recv_e - real(dp), dimension(:, :, :), intent(in) :: ud, ud_recv_s, ud_recv_e + real(dp), dimension(:, :, :), intent(in) :: v, v_recv_s, v_recv_e type(tdsops_t), intent(in) :: tdsops_du, tdsops_dud, tdsops_d2u + + real(dp), dimension(:, :), allocatable :: ud, ud_recv_s, ud_recv_e real(dp) :: nu integer, intent(in) :: nproc, pprev, pnext integer, intent(in) :: n_block - integer :: n_data - integer :: k, i, j + integer :: n_data, n_halo + integer :: k, i, j, n + ! TODO: don't hardcode n_halo + n_halo = 4 + n = tdsops_d2u%n n_data = SZ*n_block + allocate(ud(SZ, n)) + allocate(ud_recv_e(SZ, n_halo)) + allocate(ud_recv_s(SZ, n_halo)) + !$omp parallel do do k = 1, n_block call der_univ_dist( & @@ -115,9 +124,27 @@ subroutine exec_dist_transeq_compact(& tdsops_d2u%dist_fw, tdsops_d2u%dist_bw, tdsops_d2u%dist_af & ) + ! Handle dud by locally generating u*v + do j = 1, n + !$omp simd + do i = 1, SZ + ud(i, j) = u(i, j, k) * v(i, j, k) + end do + !$omp end simd + end do + + do j = 1, n_halo + !$omp simd + do i = 1, SZ + ud_recv_s(i, j) = u_recv_s(i, j, k) * v_recv_s(i, j, k) + ud_recv_e(i, j) = u_recv_e(i, j, k) * v_recv_e(i, j, k) + end do + !$omp end simd + end do + call der_univ_dist( & - dud(:, :, k), dud_send_s(:, :, k), dud_send_e(:, :, k), ud(:, :, k), & - ud_recv_s(:, :, k), ud_recv_e(:, :, k), & + dud(:, :, k), dud_send_s(:, :, k), dud_send_e(:, :, k), ud(:, :), & + ud_recv_s(:, :), ud_recv_e(:, :), & tdsops_dud%coeffs_s, tdsops_dud%coeffs_e, tdsops_dud%coeffs, tdsops_dud%n, & tdsops_dud%dist_fw, tdsops_dud%dist_bw, tdsops_dud%dist_af & ) @@ -147,7 +174,7 @@ subroutine exec_dist_transeq_compact(& d2u_recv_s(:, :, k), d2u_recv_e(:, :, k), & tdsops_d2u%n, tdsops_d2u%dist_sa, tdsops_d2u%dist_sc) - do j = 1, tdsops_d2u%n + do j = 1, n !$omp simd do i = 1, SZ rhs(i, j, k) = -0.5*(u(i, j, k)*du(i, j, k) + dud(i, j, k)) + nu*d2u(i, j, k) From 29f9f6a2de3f4e9f93fb6f3bd1b83e2ca08a85f0 Mon Sep 17 00:00:00 2001 From: Sebastien Lemaire Date: Fri, 26 Jan 2024 18:43:47 +0000 Subject: [PATCH 04/19] docs(omp): add comment --- src/omp/backend.f90 | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/omp/backend.f90 b/src/omp/backend.f90 index 7e602d59..b3d458aa 100644 --- a/src/omp/backend.f90 +++ b/src/omp/backend.f90 @@ -194,6 +194,8 @@ subroutine transeq_halo_exchange(self, u, v, w, dirps) end subroutine + !> Computes RHS_x^v following: + ! rhs_x^v = -0.5*(u*dv/dx + duv/dx) + nu*d2v/dx2 subroutine transeq_dist_component(self, rhs, v, u, tdsops_du, tdsops_dud, tdsops_d2u, dirps) class(omp_backend_t) :: self From e805ac0092815bf59c99ffa48f457ff67017d4d1 Mon Sep 17 00:00:00 2001 From: Sebastien Lemaire Date: Mon, 5 Feb 2024 11:57:24 +0000 Subject: [PATCH 05/19] refactor(omp): remove unusef vecmul_omp function --- src/omp/backend.f90 | 26 -------------------------- 1 file changed, 26 deletions(-) diff --git a/src/omp/backend.f90 b/src/omp/backend.f90 index b3d458aa..4a8174ab 100644 --- a/src/omp/backend.f90 +++ b/src/omp/backend.f90 @@ -332,32 +332,6 @@ subroutine vecadd_omp(self, a, x, b, y) end subroutine vecadd_omp - subroutine vecmul_omp(uv, u, v, dirps) - implicit none - - class(field_t), intent(inout) :: uv - class(field_t), intent(in) :: u, v - type(dirps_t), intent(in) :: dirps - integer :: i, j, k - - real(dp), pointer, dimension(:, :, :) :: u_data, v_data, uv_data - - select type(u); type is (field_t); u_data => u%data; end select - select type(v); type is (field_t); v_data => v%data; end select - select type(uv); type is (field_t); uv_data => uv%data; end select - - do k = 1, dirps%n_blocks - do j = 1, dirps%n - !$omp simd - do i = 1, SZ - uv_data(i, j, k) = u_data(i, j, k)*v_data(i, j, k) - end do - !$omp end simd - end do - end do - - end subroutine vecmul_omp - subroutine copy_into_buffers(u_send_s, u_send_e, u, n) implicit none From 732f4da04ac9bedec639e5fd876a69e8ed2cc861 Mon Sep 17 00:00:00 2001 From: Sebastien Lemaire Date: Tue, 6 Feb 2024 15:04:31 +0000 Subject: [PATCH 06/19] refactor: refactor solver allocation of tdsops --- src/solver.f90 | 79 ++++++++++++++++++-------------------------------- 1 file changed, 29 insertions(+), 50 deletions(-) diff --git a/src/solver.f90 b/src/solver.f90 index 5eb1139c..b164de13 100644 --- a/src/solver.f90 +++ b/src/solver.f90 @@ -107,59 +107,38 @@ function init(backend, time_integrator, xdirps, ydirps, zdirps, globs) & dx = globs%dx; dy = globs%dy; dz = globs%dz ! Allocate and set the tdsops - call solver%backend%alloc_tdsops(solver%xdirps%der1st, nx, dx, & - 'first-deriv', 'compact6') - call solver%backend%alloc_tdsops(solver%ydirps%der1st, ny, dy, & - 'first-deriv', 'compact6') - call solver%backend%alloc_tdsops(solver%zdirps%der1st, nz, dz, & - 'first-deriv', 'compact6') - call solver%backend%alloc_tdsops(solver%xdirps%der1st_sym, nx, dx, & - 'first-deriv', 'compact6') - call solver%backend%alloc_tdsops(solver%ydirps%der1st_sym, ny, dy, & - 'first-deriv', 'compact6') - call solver%backend%alloc_tdsops(solver%zdirps%der1st_sym, nz, dz, & - 'first-deriv', 'compact6') - call solver%backend%alloc_tdsops(solver%xdirps%der2nd, nx, dx, & - 'second-deriv', 'compact6') - call solver%backend%alloc_tdsops(solver%ydirps%der2nd, ny, dy, & - 'second-deriv', 'compact6') - call solver%backend%alloc_tdsops(solver%zdirps%der2nd, nz, dz, & - 'second-deriv', 'compact6') - call solver%backend%alloc_tdsops(solver%xdirps%der2nd_sym, nx, dx, & - 'second-deriv', 'compact6') - call solver%backend%alloc_tdsops(solver%ydirps%der2nd_sym, ny, dy, & - 'second-deriv', 'compact6') - call solver%backend%alloc_tdsops(solver%zdirps%der2nd_sym, nz, dz, & - 'second-deriv', 'compact6') - - call solver%backend%alloc_tdsops(solver%xdirps%interpl_v2p, nx, dx, & - 'interpolate', 'classic', from_to='v2p') - call solver%backend%alloc_tdsops(solver%ydirps%interpl_v2p, ny, dy, & - 'interpolate', 'classic', from_to='v2p') - call solver%backend%alloc_tdsops(solver%zdirps%interpl_v2p, nz, dz, & - 'interpolate', 'classic', from_to='v2p') - call solver%backend%alloc_tdsops(solver%xdirps%interpl_p2v, nx, dx, & - 'interpolate', 'classic', from_to='p2v') - call solver%backend%alloc_tdsops(solver%ydirps%interpl_p2v, ny, dy, & - 'interpolate', 'classic', from_to='p2v') - call solver%backend%alloc_tdsops(solver%zdirps%interpl_p2v, nz, dz, & - 'interpolate', 'classic', from_to='p2v') - - call solver%backend%alloc_tdsops(solver%xdirps%stagder_v2p, nx, dx, & - 'stag-deriv', 'compact6', from_to='v2p') - call solver%backend%alloc_tdsops(solver%ydirps%stagder_v2p, ny, dy, & - 'stag-deriv', 'compact6', from_to='v2p') - call solver%backend%alloc_tdsops(solver%zdirps%stagder_v2p, nz, dz, & - 'stag-deriv', 'compact6', from_to='v2p') - call solver%backend%alloc_tdsops(solver%xdirps%stagder_p2v, nx, dx, & - 'stag-deriv', 'compact6', from_to='p2v') - call solver%backend%alloc_tdsops(solver%ydirps%stagder_p2v, ny, dy, & - 'stag-deriv', 'compact6', from_to='p2v') - call solver%backend%alloc_tdsops(solver%zdirps%stagder_p2v, nz, dz, & - 'stag-deriv', 'compact6', from_to='p2v') + call allocate_tdsops(solver%xdirps, nx, dx, solver%backend) + call allocate_tdsops(solver%ydirps, ny, dy, solver%backend) + call allocate_tdsops(solver%zdirps, nz, dz, solver%backend) end function init + subroutine allocate_tdsops(dirps, nx, dx, backend) + class(dirps_t), intent(inout) :: dirps + real(dp), intent(in) :: dx + integer, intent(in) :: nx + class(base_backend_t), intent(in) :: backend + + call backend%alloc_tdsops(dirps%der1st, nx, dx, & + 'first-deriv', 'compact6') + call backend%alloc_tdsops(dirps%der1st_sym, nx, dx, & + 'first-deriv', 'compact6') + call backend%alloc_tdsops(dirps%der2nd, nx, dx, & + 'second-deriv', 'compact6') + call backend%alloc_tdsops(dirps%der2nd_sym, nx, dx, & + 'second-deriv', 'compact6') + call backend%alloc_tdsops(dirps%interpl_v2p, nx, dx, & + 'interpolate', 'classic', from_to='v2p') + call backend%alloc_tdsops(dirps%interpl_p2v, nx, dx, & + 'interpolate', 'classic', from_to='p2v') + call backend%alloc_tdsops(dirps%stagder_v2p, nx, dx, & + 'stag-deriv', 'compact6', from_to='v2p') + call backend%alloc_tdsops(dirps%stagder_p2v, nx, dx, & + 'stag-deriv', 'compact6', from_to='p2v') + + + end subroutine + subroutine transeq(self, du, dv, dw, u, v, w) implicit none From 237c61c4cd73eeacc015ac251aae5b09c442ba7e Mon Sep 17 00:00:00 2001 From: Sebastien Lemaire Date: Tue, 6 Feb 2024 15:10:32 +0000 Subject: [PATCH 07/19] test: add test for omp transeq test fails for now, either bug in implementation or in the test itself --- tests/CMakeLists.txt | 1 + tests/omp/test_omp_transeq.f90 | 150 +++++++++++++++++++++++++++++++++ 2 files changed, 151 insertions(+) create mode 100644 tests/omp/test_omp_transeq.f90 diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 75c9ca88..0a3e11e0 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -1,6 +1,7 @@ set(TESTSRC test_allocator.f90 omp/test_omp_tridiag.f90 + omp/test_omp_transeq.f90 ) set(CUDATESTSRC cuda/test_cuda_allocator.f90 diff --git a/tests/omp/test_omp_transeq.f90 b/tests/omp/test_omp_transeq.f90 new file mode 100644 index 00000000..569ba60e --- /dev/null +++ b/tests/omp/test_omp_transeq.f90 @@ -0,0 +1,150 @@ +program test_omp_transeq + use iso_fortran_env, only: stderr => error_unit + use mpi + + use m_allocator, only: allocator_t, field_t + use m_common, only: dp, pi, globs_t, set_pprev_pnext + use m_omp_common, only: SZ + use m_omp_sendrecv, only: sendrecv_fields + use m_omp_backend, only: omp_backend_t, transeq_x_omp, base_backend_t + use m_tdsops, only: dirps_t, tdsops_t + use m_solver, only: allocate_tdsops + + implicit none + + logical :: allpass = .true. + class(field_t), pointer :: u, v, w + class(field_t), pointer :: du, dv, dw + real(dp), dimension(:, :, :), allocatable :: r_u + + integer :: n, n_block, i, j, k + integer :: n_glob + integer :: nrank, nproc, pprev, pnext + integer :: ierr + + real(dp) :: dx, dx_per, nu, norm_du, tol = 1d-8, tstart, tend + + type(globs_t) :: globs + class(base_backend_t), pointer :: backend + class(allocator_t), pointer :: allocator + + type(omp_backend_t), target :: omp_backend + type(allocator_t), target :: omp_allocator + type(dirps_t) :: xdirps, ydirps, zdirps + + ! Initialise variables and arrays + call MPI_Init(ierr) + call MPI_Comm_rank(MPI_COMM_WORLD, nrank, ierr) + call MPI_Comm_size(MPI_COMM_WORLD, nproc, ierr) + + globs%nx = 64 + globs%ny = 64 + globs%nz = 64 + + globs%nx_loc = globs%nx/nproc + globs%ny_loc = globs%ny/nproc + globs%nz_loc = globs%nz/nproc + + globs%n_groups_x = globs%ny_loc*globs%nz_loc/SZ + globs%n_groups_y = globs%nx_loc*globs%nz_loc/SZ + globs%n_groups_z = globs%nx_loc*globs%ny_loc/SZ + + xdirps%nproc = nproc + ydirps%nproc = nproc + zdirps%nproc = nproc + + call set_pprev_pnext( & + xdirps%pprev, xdirps%pnext, & + ydirps%pprev, ydirps%pnext, & + zdirps%pprev, zdirps%pnext, & + xdirps%nproc, ydirps%nproc, zdirps%nproc, nrank & + ) + + xdirps%n = globs%nx_loc + ydirps%n = globs%ny_loc + zdirps%n = globs%nz_loc + + xdirps%n_blocks = globs%n_groups_x + ydirps%n_blocks = globs%n_groups_y + zdirps%n_blocks = globs%n_groups_z + + omp_allocator = allocator_t([SZ, globs%nx_loc, globs%n_groups_x]) + allocator => omp_allocator + print*, 'OpenMP allocator instantiated' + + omp_backend = omp_backend_t(globs, allocator) + backend => omp_backend + print*, 'OpenMP backend instantiated' + + + if (nrank == 0) print*, 'Parallel run with', nproc, 'ranks' + + n_glob = globs%nx_loc + n = n_glob/nproc + n_block = xdirps%n_blocks + + nu = 1._dp + omp_backend%nu = nu + + + u => allocator%get_block() + v => allocator%get_block() + w => allocator%get_block() + + du => allocator%get_block() + dv => allocator%get_block() + dw => allocator%get_block() + + dx_per = 2*pi/n_glob + dx = 2*pi/(n_glob - 1) + globs%dx = dx + + + do k = 1, n_block + do j = 1, n + do i = 1, SZ + u%data(i, j, k) = sin((j - 1 + nrank*n)*dx_per) + v%data(i, j, k) = cos((j - 1 + nrank*n)*dx_per) + end do + end do + end do + w%data(:, :, :) = 0.d0 + + call allocate_tdsops(xdirps, globs%nx_loc, globs%dx, omp_backend) + + call cpu_time(tstart) + call transeq_x_omp(omp_backend, du, dv, dw, u, v, w, xdirps) + call cpu_time(tend) + + if (nrank == 0) print*, 'Total time', tend - tstart + + allocate(r_u(SZ, n, n_block)) + + ! check error + r_u = du%data - (-v%data*v%data + 0.5_dp*u%data*u%data - nu*u%data) + norm_du = norm2(r_u) + norm_du = norm_du*norm_du/n_glob/n_block/SZ + call MPI_Allreduce(MPI_IN_PLACE, norm_du, 1, MPI_DOUBLE_PRECISION, & + MPI_SUM, MPI_COMM_WORLD, ierr) + norm_du = sqrt(norm_du) + + if (nrank == 0) print*, 'error norm', norm_du + if (nrank == 0) then + if ( norm_du > tol ) then + allpass = .false. + write(stderr, '(a)') 'Check second derivatives... failed' + else + write(stderr, '(a)') 'Check second derivatives... passed' + end if + end if + + if (allpass) then + if (nrank == 0) write(stderr, '(a)') 'ALL TESTS PASSED SUCCESSFULLY.' + else + error stop 'SOME TESTS FAILED.' + end if + + call MPI_Finalize(ierr) + +end program test_omp_transeq + From b30271fc3227199e29e2c4a985568946ae295d68 Mon Sep 17 00:00:00 2001 From: Sebastien Lemaire Date: Fri, 16 Feb 2024 12:19:14 +0000 Subject: [PATCH 08/19] Fix typo in transeq computation --- src/omp/exec_dist.f90 | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/omp/exec_dist.f90 b/src/omp/exec_dist.f90 index 45e4f7ca..867686ac 100644 --- a/src/omp/exec_dist.f90 +++ b/src/omp/exec_dist.f90 @@ -177,7 +177,7 @@ subroutine exec_dist_transeq_compact(& do j = 1, n !$omp simd do i = 1, SZ - rhs(i, j, k) = -0.5*(u(i, j, k)*du(i, j, k) + dud(i, j, k)) + nu*d2u(i, j, k) + rhs(i, j, k) = -0.5*(v(i, j, k)*du(i, j, k) + dud(i, j, k)) + nu*d2u(i, j, k) end do !$omp end simd end do From 813804c18e5d832145fabdf1ffbe4f0ac31ad58b Mon Sep 17 00:00:00 2001 From: Sebastien Lemaire Date: Fri, 16 Feb 2024 12:20:01 +0000 Subject: [PATCH 09/19] add dist_transeq test --- tests/CMakeLists.txt | 1 + tests/omp/test_omp_dist_transeq.f90 | 151 ++++++++++++++++++++++++++++ tests/omp/test_omp_transeq.f90 | 6 +- tests/omp/test_omp_tridiag.f90 | 2 +- 4 files changed, 156 insertions(+), 4 deletions(-) create mode 100644 tests/omp/test_omp_dist_transeq.f90 diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 0a3e11e0..b9dfaca5 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -2,6 +2,7 @@ set(TESTSRC test_allocator.f90 omp/test_omp_tridiag.f90 omp/test_omp_transeq.f90 + omp/test_omp_dist_transeq.f90 ) set(CUDATESTSRC cuda/test_cuda_allocator.f90 diff --git a/tests/omp/test_omp_dist_transeq.f90 b/tests/omp/test_omp_dist_transeq.f90 new file mode 100644 index 00000000..1a46df80 --- /dev/null +++ b/tests/omp/test_omp_dist_transeq.f90 @@ -0,0 +1,151 @@ +program test_transeq + use iso_fortran_env, only: stderr => error_unit + use mpi + + use m_common, only: dp, pi + use m_omp_common, only: SZ + use m_omp_exec_dist, only: exec_dist_transeq_compact + use m_omp_sendrecv, only: sendrecv_fields + use m_tdsops, only: tdsops_t + + implicit none + + logical :: allpass = .true. + real(dp), allocatable, dimension(:, :, :) :: u, v, r_u + real(dp), allocatable, dimension(:, :, :) :: du, dud, d2u ! intermediate solution arrays + real(dp), allocatable, dimension(:, :, :) :: & + du_recv_s, du_recv_e, du_send_s, du_send_e, & + dud_recv_s, dud_recv_e, dud_send_s, dud_send_e, & + d2u_recv_s, d2u_recv_e, d2u_send_s, d2u_send_e + + real(dp), allocatable, dimension(:, :, :) :: & + u_send_s, u_send_e, u_recv_s, u_recv_e, & + v_send_s, v_send_e, v_recv_s, v_recv_e + + type(tdsops_t) :: der1st, der2nd + + integer :: n, n_block, i, j, k, n_halo, n_iters + integer :: n_glob + integer :: nrank, nproc, pprev, pnext + integer :: ierr + + real(dp) :: dx, dx_per, nu, norm_du, tol = 1d-8 + + call MPI_Init(ierr) + call MPI_Comm_rank(MPI_COMM_WORLD, nrank, ierr) + call MPI_Comm_size(MPI_COMM_WORLD, nproc, ierr) + + if (nrank == 0) print*, 'Parallel run with', nproc, 'ranks' + + pnext = modulo(nrank - nproc + 1, nproc) + pprev = modulo(nrank - 1, nproc) + + n_glob = 32*4 + n = n_glob/nproc + n_block = 32*32/SZ + n_iters = 1 + + nu = 1._dp + + allocate(u(SZ, n, n_block), v(SZ, n, n_block), r_u(SZ, n, n_block)) + + ! main input fields + ! field for storing the result + ! intermediate solution fields + allocate(du(SZ, n, n_block)) + allocate(dud(SZ, n, n_block)) + allocate(d2u(SZ, n, n_block)) + + dx_per = 2*pi/n_glob + dx = 2*pi/(n_glob - 1) + + do k = 1, n_block + do j = 1, n + do i = 1, SZ + u(i, j, k) = sin((j - 1 + nrank*n)*dx_per) + v(i, j, k) = cos((j - 1 + nrank*n)*dx_per) + end do + end do + end do + + n_halo = 4 + + ! arrays for exchanging data between ranks + allocate(u_send_s(SZ, n_halo, n_block)) + allocate(u_send_e(SZ, n_halo, n_block)) + allocate(u_recv_s(SZ, n_halo, n_block)) + allocate(u_recv_e(SZ, n_halo, n_block)) + allocate(v_send_s(SZ, n_halo, n_block)) + allocate(v_send_e(SZ, n_halo, n_block)) + allocate(v_recv_s(SZ, n_halo, n_block)) + allocate(v_recv_e(SZ, n_halo, n_block)) + + allocate(du_send_s(SZ, 1, n_block), du_send_e(SZ, 1, n_block)) + allocate(du_recv_s(SZ, 1, n_block), du_recv_e(SZ, 1, n_block)) + allocate(dud_send_s(SZ, 1, n_block), dud_send_e(SZ, 1, n_block)) + allocate(dud_recv_s(SZ, 1, n_block), dud_recv_e(SZ, 1, n_block)) + allocate(d2u_send_s(SZ, 1, n_block), d2u_send_e(SZ, 1, n_block)) + allocate(d2u_recv_s(SZ, 1, n_block), d2u_recv_e(SZ, 1, n_block)) + + ! preprocess the operator and coefficient arrays + der1st = tdsops_t(n, dx_per, operation='first-deriv', & + scheme='compact6') + der2nd = tdsops_t(n, dx_per, operation='second-deriv', & + scheme='compact6') + + + u_send_s(:, :, :) = u(:, 1:4, :) + u_send_e(:, :, :) = u(:, n - n_halo + 1:n, :) + v_send_s(:, :, :) = v(:, 1:4, :) + v_send_e(:, :, :) = v(:, n - n_halo + 1:n, :) + + + ! halo exchange + call sendrecv_fields(u_recv_s, u_recv_e, & + u_send_s, u_send_e, & + SZ*4*n_block, nproc, pprev, pnext) + + call sendrecv_fields(v_recv_s, v_recv_e, & + v_send_s, v_send_e, & + SZ*4*n_block, nproc, pprev, pnext) + + call exec_dist_transeq_compact( & + r_u, & + du, dud, d2u, & + du_send_s, du_send_e, du_recv_s, du_recv_e, & + dud_send_s, dud_send_e, dud_recv_s, dud_recv_e, & + d2u_send_s, d2u_send_e, d2u_recv_s, d2u_recv_e, & + u, u_recv_s, u_recv_e, & + v, v_recv_s, v_recv_e, & + der1st, der1st, der2nd, nu, nproc, pprev, pnext, n_block & + ) + + ! check error + r_u = r_u - (-v*v + 0.5_dp*u*u - nu*u) + norm_du = norm2(r_u) + norm_du = norm_du*norm_du/n_glob/n_block/SZ + call MPI_Allreduce(MPI_IN_PLACE, norm_du, 1, MPI_DOUBLE_PRECISION, & + MPI_SUM, MPI_COMM_WORLD, ierr) + norm_du = sqrt(norm_du) + + if (nrank == 0) print*, 'error norm', norm_du + + if (nrank == 0) then + if ( norm_du > tol ) then + allpass = .false. + write(stderr, '(a)') 'Check second derivatives... failed' + else + write(stderr, '(a)') 'Check second derivatives... passed' + end if + end if + + if (allpass) then + if (nrank == 0) write(stderr, '(a)') 'ALL TESTS PASSED SUCCESSFULLY.' + else + error stop 'SOME TESTS FAILED.' + end if + + call MPI_Finalize(ierr) + +end program test_transeq + diff --git a/tests/omp/test_omp_transeq.f90 b/tests/omp/test_omp_transeq.f90 index 569ba60e..a05745ea 100644 --- a/tests/omp/test_omp_transeq.f90 +++ b/tests/omp/test_omp_transeq.f90 @@ -19,7 +19,7 @@ program test_omp_transeq integer :: n, n_block, i, j, k integer :: n_glob - integer :: nrank, nproc, pprev, pnext + integer :: nrank, nproc integer :: ierr real(dp) :: dx, dx_per, nu, norm_du, tol = 1d-8, tstart, tend @@ -110,7 +110,7 @@ program test_omp_transeq end do w%data(:, :, :) = 0.d0 - call allocate_tdsops(xdirps, globs%nx_loc, globs%dx, omp_backend) + call allocate_tdsops(xdirps, globs%nx_loc, dx_per, omp_backend) call cpu_time(tstart) call transeq_x_omp(omp_backend, du, dv, dw, u, v, w, xdirps) @@ -121,7 +121,7 @@ program test_omp_transeq allocate(r_u(SZ, n, n_block)) ! check error - r_u = du%data - (-v%data*v%data + 0.5_dp*u%data*u%data - nu*u%data) + r_u = dv%data - (-v%data*v%data + 0.5_dp*u%data*u%data - nu*u%data) norm_du = norm2(r_u) norm_du = norm_du*norm_du/n_glob/n_block/SZ call MPI_Allreduce(MPI_IN_PLACE, norm_du, 1, MPI_DOUBLE_PRECISION, & diff --git a/tests/omp/test_omp_tridiag.f90 b/tests/omp/test_omp_tridiag.f90 index da747250..41b706e9 100644 --- a/tests/omp/test_omp_tridiag.f90 +++ b/tests/omp/test_omp_tridiag.f90 @@ -48,7 +48,7 @@ program test_omp_tridiag n_glob = 1024 n = n_glob/nproc - n_block = 512*512/SZ + n_block = 64*64/SZ n_iters = 1 allocate (u(SZ, n, n_block), du(SZ, n, n_block)) From 33a24aa2e0a9e5eba20d41458e3a091e069ffd2c Mon Sep 17 00:00:00 2001 From: Sebastien Lemaire Date: Fri, 16 Feb 2024 12:20:24 +0000 Subject: [PATCH 10/19] remove unused variable --- src/omp/backend.f90 | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/omp/backend.f90 b/src/omp/backend.f90 index 4a8174ab..d98a2260 100644 --- a/src/omp/backend.f90 +++ b/src/omp/backend.f90 @@ -205,7 +205,7 @@ subroutine transeq_dist_component(self, rhs, v, u, tdsops_du, tdsops_dud, tdsops class(tdsops_t), intent(in) :: tdsops_dud class(tdsops_t), intent(in) :: tdsops_d2u type(dirps_t), intent(in) :: dirps - class(field_t), pointer :: du, d2u, dud, uv + class(field_t), pointer :: du, d2u, dud du => self%allocator%get_block() dud => self%allocator%get_block() From f7bc7e3d532192a6ea024e5d789d1a3efec4e757 Mon Sep 17 00:00:00 2001 From: Sebastien Lemaire Date: Fri, 16 Feb 2024 12:26:18 +0000 Subject: [PATCH 11/19] reformat long line --- src/omp/backend.f90 | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/src/omp/backend.f90 b/src/omp/backend.f90 index d98a2260..c4c7b44d 100644 --- a/src/omp/backend.f90 +++ b/src/omp/backend.f90 @@ -165,9 +165,12 @@ subroutine transeq_omp_dist(self, du, dv, dw, u, v, w, dirps) call transeq_halo_exchange(self, u, v, w, dirps) - call transeq_dist_component(self, du, u, u, dirps%der1st, dirps%der1st_sym, dirps%der2nd, dirps) - call transeq_dist_component(self, dv, v, u, dirps%der1st_sym, dirps%der1st, dirps%der2nd_sym, dirps) - call transeq_dist_component(self, dw, w, u, dirps%der1st_sym, dirps%der1st, dirps%der2nd_sym, dirps) + call transeq_dist_component(self, du, u, u, & + dirps%der1st, dirps%der1st_sym, dirps%der2nd, dirps) + call transeq_dist_component(self, dv, v, u, & + dirps%der1st_sym, dirps%der1st, dirps%der2nd_sym, dirps) + call transeq_dist_component(self, dw, w, u, & + dirps%der1st_sym, dirps%der1st, dirps%der2nd_sym, dirps) end subroutine transeq_omp_dist From 3a327b9da057d2c4df9a45d097d933fb98d34630 Mon Sep 17 00:00:00 2001 From: Nanoseb Date: Fri, 16 Feb 2024 12:52:11 +0000 Subject: [PATCH 12/19] add subroutine name after end subroutine for consistency with the rest of the codebase Co-authored-by: Jamie J Quinn --- src/omp/backend.f90 | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/omp/backend.f90 b/src/omp/backend.f90 index 29c2f0e2..d12de4fc 100644 --- a/src/omp/backend.f90 +++ b/src/omp/backend.f90 @@ -225,7 +225,7 @@ subroutine transeq_dist_component(self, rhs, v, u, tdsops_du, tdsops_dud, tdsops call self%allocator%release_block(dud) call self%allocator%release_block(d2u) - end subroutine + end subroutine transeq_dist_component subroutine tds_solve_omp(self, du, u, dirps, tdsops) implicit none From 1646e53690ee371e0e4e051f10965a16a1212ae8 Mon Sep 17 00:00:00 2001 From: Nanoseb Date: Fri, 16 Feb 2024 12:52:40 +0000 Subject: [PATCH 13/19] add subroutine name after end subroutine for consistency with the rest of the codebase Co-authored-by: Jamie J Quinn --- src/omp/backend.f90 | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/omp/backend.f90 b/src/omp/backend.f90 index d12de4fc..6d737325 100644 --- a/src/omp/backend.f90 +++ b/src/omp/backend.f90 @@ -192,7 +192,7 @@ subroutine transeq_halo_exchange(self, u, v, w, dirps) call sendrecv_fields(self%w_recv_s, self%w_recv_e, self%w_send_s, self%w_send_e, & SZ*n_halo*dirps%n_blocks, dirps%nproc, dirps%pprev, dirps%pnext) - end subroutine + end subroutine transeq_halo_exchange !> Computes RHS_x^v following: ! rhs_x^v = -0.5*(u*dv/dx + duv/dx) + nu*d2v/dx2 From b20b503e2b86db4fec064c4293f7007dce7af1e5 Mon Sep 17 00:00:00 2001 From: Sebastien Lemaire Date: Fri, 16 Feb 2024 14:04:43 +0000 Subject: [PATCH 14/19] improve performance of copy_into_buffers --- src/omp/backend.f90 | 29 +++++++++++++++++++++-------- 1 file changed, 21 insertions(+), 8 deletions(-) diff --git a/src/omp/backend.f90 b/src/omp/backend.f90 index 29c2f0e2..1d9a3f25 100644 --- a/src/omp/backend.f90 +++ b/src/omp/backend.f90 @@ -181,9 +181,9 @@ subroutine transeq_halo_exchange(self, u, v, w, dirps) ! TODO: don't hardcode n_halo n_halo = 4 - call copy_into_buffers(self%u_send_s, self%u_send_e, u%data, dirps%n) - call copy_into_buffers(self%v_send_s, self%v_send_e, v%data, dirps%n) - call copy_into_buffers(self%w_send_s, self%w_send_e, w%data, dirps%n) + call copy_into_buffers(self%u_send_s, self%u_send_e, u%data, dirps%n, dirps%n_blocks) + call copy_into_buffers(self%v_send_s, self%v_send_e, v%data, dirps%n, dirps%n_blocks) + call copy_into_buffers(self%w_send_s, self%w_send_e, w%data, dirps%n, dirps%n_blocks) call sendrecv_fields(self%u_recv_s, self%u_recv_e, self%u_send_s, self%u_send_e, & SZ*n_halo*dirps%n_blocks, dirps%nproc, dirps%pprev, dirps%pnext) @@ -252,7 +252,7 @@ subroutine tds_solve_dist(self, du, u, dirps, tdsops) ! TODO: don't hardcode n_halo n_halo = 4 - call copy_into_buffers(self%u_send_s, self%u_send_e, u%data, dirps%n) + call copy_into_buffers(self%u_send_s, self%u_send_e, u%data, dirps%n, dirps%n_blocks) ! halo exchange call sendrecv_fields(self%u_recv_s, self%u_recv_e, self%u_send_s, self%u_send_e, & @@ -305,15 +305,28 @@ subroutine vecadd_omp(self, a, x, b, y) end subroutine vecadd_omp - subroutine copy_into_buffers(u_send_s, u_send_e, u, n) + subroutine copy_into_buffers(u_send_s, u_send_e, u, n, n_blocks) implicit none real(dp), dimension(:, :, :), intent(out) :: u_send_s, u_send_e real(dp), dimension(:, :, :), intent(in) :: u integer, intent(in) :: n - - u_send_s(:, :, :) = u(:, 1:4, :) - u_send_e(:, :, :) = u(:, n - 3:n, :) + integer, intent(in) :: n_blocks + integer :: i, j, k + integer :: n_halo = 4 + + !$omp parallel do + do k=1, n_blocks + do j=1, n_halo + !$omp simd + do i=1, SZ + u_send_s(i, j, k) = u(i, j, k) + u_send_e(i, j, k) = u(i, n - n_halo + j, k) + end do + !$omp end simd + end do + end do + !$omp end parallel do end subroutine copy_into_buffers From ddaaf7d0f554cbaefa8b0d08f36242a5150b33c2 Mon Sep 17 00:00:00 2001 From: Sebastien Lemaire Date: Fri, 16 Feb 2024 15:07:40 +0000 Subject: [PATCH 15/19] fix test_omp_transeq for parallel runs --- tests/CMakeLists.txt | 2 +- tests/omp/test_omp_transeq.f90 | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index cbd27246..f029a33f 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -43,5 +43,5 @@ foreach(testfile IN LISTS TESTSRC) find_package(OpenMP REQUIRED) target_link_libraries(${test_name} PRIVATE OpenMP::OpenMP_Fortran) - add_test(NAME ${test_name} COMMAND sh -c "mpirun -np 1 ${test_name}") + add_test(NAME ${test_name} COMMAND sh -c "mpirun -np 4 ${test_name}") endforeach() diff --git a/tests/omp/test_omp_transeq.f90 b/tests/omp/test_omp_transeq.f90 index a05745ea..b793c52a 100644 --- a/tests/omp/test_omp_transeq.f90 +++ b/tests/omp/test_omp_transeq.f90 @@ -37,9 +37,9 @@ program test_omp_transeq call MPI_Comm_rank(MPI_COMM_WORLD, nrank, ierr) call MPI_Comm_size(MPI_COMM_WORLD, nproc, ierr) - globs%nx = 64 - globs%ny = 64 - globs%nz = 64 + globs%nx = 96 + globs%ny = 96 + globs%nz = 96 globs%nx_loc = globs%nx/nproc globs%ny_loc = globs%ny/nproc @@ -79,7 +79,7 @@ program test_omp_transeq if (nrank == 0) print*, 'Parallel run with', nproc, 'ranks' - n_glob = globs%nx_loc + n_glob = globs%nx n = n_glob/nproc n_block = xdirps%n_blocks From aba7b1a7e8e86b17e9ec025ad9460268d44e8f2f Mon Sep 17 00:00:00 2001 From: Sebastien Lemaire Date: Fri, 16 Feb 2024 15:09:45 +0000 Subject: [PATCH 16/19] revert back to running tests in serial --- tests/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index f029a33f..cbd27246 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -43,5 +43,5 @@ foreach(testfile IN LISTS TESTSRC) find_package(OpenMP REQUIRED) target_link_libraries(${test_name} PRIVATE OpenMP::OpenMP_Fortran) - add_test(NAME ${test_name} COMMAND sh -c "mpirun -np 4 ${test_name}") + add_test(NAME ${test_name} COMMAND sh -c "mpirun -np 1 ${test_name}") endforeach() From 6b63417763788cdb59e4a36924ca69d024430647 Mon Sep 17 00:00:00 2001 From: Sebastien Lemaire Date: Fri, 16 Feb 2024 16:49:03 +0000 Subject: [PATCH 17/19] Explicitely set _dp in for transeq computation --- src/omp/exec_dist.f90 | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/omp/exec_dist.f90 b/src/omp/exec_dist.f90 index 867686ac..0b6b8644 100644 --- a/src/omp/exec_dist.f90 +++ b/src/omp/exec_dist.f90 @@ -177,7 +177,7 @@ subroutine exec_dist_transeq_compact(& do j = 1, n !$omp simd do i = 1, SZ - rhs(i, j, k) = -0.5*(v(i, j, k)*du(i, j, k) + dud(i, j, k)) + nu*d2u(i, j, k) + rhs(i, j, k) = -0.5_dp*(v(i, j, k)*du(i, j, k) + dud(i, j, k)) + nu*d2u(i, j, k) end do !$omp end simd end do From e97c2877cc356d00a9bf0e327d4f13e770b65447 Mon Sep 17 00:00:00 2001 From: Sebastien Lemaire Date: Mon, 19 Feb 2024 14:16:11 +0000 Subject: [PATCH 18/19] fix identation --- src/omp/backend.f90 | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/omp/backend.f90 b/src/omp/backend.f90 index 331f3872..c1d775c2 100644 --- a/src/omp/backend.f90 +++ b/src/omp/backend.f90 @@ -260,9 +260,9 @@ subroutine tds_solve_dist(self, du, u, dirps, tdsops) call exec_dist_tds_compact( & - du%data, u%data, self%u_recv_s, self%u_recv_e, self%du_send_s, self%du_send_e, & - self%du_recv_s, self%du_recv_e, & - tdsops, dirps%nproc, dirps%pprev, dirps%pnext, dirps%n_blocks) + du%data, u%data, self%u_recv_s, self%u_recv_e, self%du_send_s, self%du_send_e, & + self%du_recv_s, self%du_recv_e, & + tdsops, dirps%nproc, dirps%pprev, dirps%pnext, dirps%n_blocks) end subroutine tds_solve_dist From 33345c0de1711773ec7889e2d80bc8b2d130203a Mon Sep 17 00:00:00 2001 From: Sebastien Lemaire Date: Mon, 19 Feb 2024 14:22:07 +0000 Subject: [PATCH 19/19] fix openmp race condition --- src/omp/exec_dist.f90 | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/omp/exec_dist.f90 b/src/omp/exec_dist.f90 index 0b6b8644..e713c08a 100644 --- a/src/omp/exec_dist.f90 +++ b/src/omp/exec_dist.f90 @@ -108,7 +108,7 @@ subroutine exec_dist_transeq_compact(& allocate(ud_recv_e(SZ, n_halo)) allocate(ud_recv_s(SZ, n_halo)) - !$omp parallel do + !$omp parallel do private(ud, ud_recv_e, ud_recv_s) do k = 1, n_block call der_univ_dist( & du(:, :, k), du_send_s(:, :, k), du_send_e(:, :, k), u(:, :, k), &