From ab51139df2d7dff79289edcc05b3c6d718355cdc Mon Sep 17 00:00:00 2001 From: Hui Zhou Date: Sun, 10 Nov 2024 09:01:12 -0600 Subject: [PATCH 1/2] ch4: fix race condition in setting is_local in send requests We need set is_local in a requests inside the vci critical section or race condition may happen. Only the send requests that may go into RNDV active messages need it set. This fixes the occasional failures in am-only threads/pt2pt/multisend2 test. --- src/mpid/ch4/netmod/ofi/ofi_send.h | 4 ++++ src/mpid/ch4/netmod/ucx/ucx_send.h | 4 +++- src/mpid/ch4/src/ch4_send.h | 2 -- src/mpid/ch4/src/mpidig_send.h | 3 +++ 4 files changed, 10 insertions(+), 3 deletions(-) diff --git a/src/mpid/ch4/netmod/ofi/ofi_send.h b/src/mpid/ch4/netmod/ofi/ofi_send.h index 897f05b42a7..3d409ef5fa1 100644 --- a/src/mpid/ch4/netmod/ofi/ofi_send.h +++ b/src/mpid/ch4/netmod/ofi/ofi_send.h @@ -104,6 +104,10 @@ MPL_STATIC_INLINE_PREFIX int MPIDI_OFI_issue_ack_recv(MPIR_Request * sreq, MPIR_ ackreq->remote_addr = MPIDI_OFI_av_to_phys(addr, nic, vci_remote); ackreq->match_bits = match_bits; +#ifndef MPIDI_CH4_DIRECT_NETMOD + /* set is_local in case we go into active messages later */ + MPIDI_REQUEST(sreq, is_local) = 0; +#endif MPIDI_OFI_CALL_RETRY(fi_trecv(MPIDI_OFI_global.ctx[ackreq->ctx_idx].rx, ackreq->ack_hdr, ackreq->ack_hdr_sz, NULL, ackreq->remote_addr, ackreq->match_bits, 0ULL, (void *) &(ackreq->context)), diff --git a/src/mpid/ch4/netmod/ucx/ucx_send.h b/src/mpid/ch4/netmod/ucx/ucx_send.h index 70a39f25a7c..838f64397ba 100644 --- a/src/mpid/ch4/netmod/ucx/ucx_send.h +++ b/src/mpid/ch4/netmod/ucx/ucx_send.h @@ -161,7 +161,9 @@ MPL_STATIC_INLINE_PREFIX int MPIDI_NM_mpi_isend(const void *buf, addr, request, vci_src, vci_dst, is_sync, 0 /* is_am */); MPIDI_UCX_REQ(*request).s.am_req = NULL; MPIDI_UCX_THREAD_CS_EXIT_VCI(vci_src); - +#ifndef MPIDI_CH4_DIRECT_NETMOD + MPIDI_REQUEST(*request, is_local) = 0; +#endif MPIR_FUNC_EXIT; return mpi_errno; diff --git a/src/mpid/ch4/src/ch4_send.h b/src/mpid/ch4/src/ch4_send.h index 1e78055f081..a8be81b4b2b 100644 --- a/src/mpid/ch4/src/ch4_send.h +++ b/src/mpid/ch4/src/ch4_send.h @@ -29,8 +29,6 @@ MPL_STATIC_INLINE_PREFIX int MPIDI_isend(const void *buf, mpi_errno = MPIDI_SHM_mpi_isend(buf, count, datatype, rank, tag, comm, attr, av, req); else mpi_errno = MPIDI_NM_mpi_isend(buf, count, datatype, rank, tag, comm, attr, av, req); - if (mpi_errno == MPI_SUCCESS) - MPIDI_REQUEST(*req, is_local) = r; #endif MPIR_ERR_CHECK(mpi_errno); diff --git a/src/mpid/ch4/src/mpidig_send.h b/src/mpid/ch4/src/mpidig_send.h index 6c2a0143cc6..ba64f0831f7 100644 --- a/src/mpid/ch4/src/mpidig_send.h +++ b/src/mpid/ch4/src/mpidig_send.h @@ -99,6 +99,9 @@ MPL_STATIC_INLINE_PREFIX int MPIDIG_isend_impl(const void *buf, MPI_Aint count, src_vci, dst_vci, sreq), is_local, mpi_errno); } else { /* RNDV send */ +#ifndef MPIDI_CH4_DIRECT_NETMOD + MPIDI_REQUEST(sreq, is_local) = is_local; +#endif MPIDIG_REQUEST(sreq, buffer) = (void *) buf; MPIDIG_REQUEST(sreq, count) = count; MPIDIG_REQUEST(sreq, datatype) = datatype; From 0dff758656ef120a083e6a15b5da5651b1f9f9e6 Mon Sep 17 00:00:00 2001 From: Hui Zhou Date: Fri, 15 Nov 2024 11:13:07 -0600 Subject: [PATCH 2/2] ch4/part: set is_local field Now that the matching checks the is_local field -- ref. MPIDIG_match_request, we need make sure the is_local field in the request is set. --- src/mpid/ch4/src/mpidig_part_callbacks.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/mpid/ch4/src/mpidig_part_callbacks.c b/src/mpid/ch4/src/mpidig_part_callbacks.c index 01ac6b4a653..7a1745aea41 100644 --- a/src/mpid/ch4/src/mpidig_part_callbacks.c +++ b/src/mpid/ch4/src/mpidig_part_callbacks.c @@ -85,6 +85,9 @@ int MPIDIG_part_send_init_target_msg_cb(void *am_hdr, void *data, MPIR_ERR_CHKANDSTMT(unexp_req == NULL, mpi_errno, MPIX_ERR_NOREQ, goto fn_fail, "**nomemreq"); +#ifndef MPIDI_CH4_DIRECT_NETMOD + MPIDI_REQUEST(unexp_req, is_local) = is_local; +#endif MPIDI_PART_REQUEST(unexp_req, u.recv.context_id) = msg_hdr->context_id; part_rreq_update_sinfo(unexp_req, msg_hdr);