From f345ef8c9728feaa61ea67c7e30a34ba0787493b Mon Sep 17 00:00:00 2001 From: Hui Zhou Date: Sat, 21 Dec 2024 08:28:30 -0600 Subject: [PATCH 01/59] misc: spellchecks Miscellaneous typo fixes to appease the spellchecker. --- src/mpid/ch3/include/mpidimpl.h | 12 ++++++------ src/mpid/ch3/include/mpidpost.h | 6 +++--- src/mpid/ch3/include/mpidpre.h | 2 +- src/mpid/ch3/src/mpid_vc.c | 2 +- src/mpid/ch4/src/ch4_proc.h | 4 ++-- 5 files changed, 13 insertions(+), 13 deletions(-) diff --git a/src/mpid/ch3/include/mpidimpl.h b/src/mpid/ch3/include/mpidimpl.h index 1400271797f..e7d30ae62ad 100644 --- a/src/mpid/ch3/include/mpidimpl.h +++ b/src/mpid/ch3/include/mpidimpl.h @@ -1128,7 +1128,7 @@ int MPIDI_CH3I_Get_accumulate(const void *origin_addr, MPI_Aint origin_count, MPIDI_CH3_Progress_signal_completion - Inform the progress engine that a pending request has completed. - IMPLEMENTORS: + IMPLEMENTERS: In a single-threaded environment, this routine can be implemented by incrementing a request completion counter. In a multi-threaded environment, the request completion counter must be atomically @@ -1229,10 +1229,10 @@ int MPIDI_CH3I_VC_post_sockconnect(MPIDI_VC_t * ); all processes in comm*/ int MPID_PG_BCast( MPIR_Comm *peercomm_p, MPIR_Comm *comm_p, int root ); -/* Channel defintitions */ +/* Channel definitions */ /*@ - MPIDI_CH3_iStartMsg - A non-blocking request to send a CH3 packet. A r - equest object is allocated only if the send could not be completed + MPIDI_CH3_iStartMsg - A non-blocking request to send a CH3 packet. A + request object is allocated only if the send could not be completed immediately. Input Parameters: @@ -1282,7 +1282,7 @@ int MPIDI_CH3_iStartMsg(MPIDI_VC_t * vc, void * pkt, intptr_t pkt_sz, packet structure and the vector may be allocated on the stack. - IMPLEMENTORS: + IMPLEMENTERS: If the send can not be completed immediately, the CH3 packet structure and the vector must be stored internally until the request is complete. 
@@ -1346,7 +1346,7 @@ int MPIDI_CH3_iSend(MPIDI_VC_t * vc, MPIR_Request * sreq, void * pkt, packet structure and the vector may be allocated on the stack. - IMPLEMENTORS: + IMPLEMENTERS: If the send can not be completed immediately, the packet structure and the vector must be stored internally until the request is complete. diff --git a/src/mpid/ch3/include/mpidpost.h b/src/mpid/ch3/include/mpidpost.h index 6f76c6aedc1..2c773d97099 100644 --- a/src/mpid/ch3/include/mpidpost.h +++ b/src/mpid/ch3/include/mpidpost.h @@ -45,7 +45,7 @@ } .ve - IMPLEMENTORS: + IMPLEMENTERS: A multi-threaded implementation might save the current value of a request completion counter in the state. @*/ @@ -66,7 +66,7 @@ void MPIDI_CH3_Progress_start(MPID_Progress_state * state); NOTE: MPIDI_CH3_Progress_start/end() need to be called. - IMPLEMENTORS: + IMPLEMENTERS: A multi-threaded implementation would return immediately if the a request had been completed between the call to MPIDI_CH3_Progress_start() and MPIDI_CH3_Progress_wait(). This could be @@ -110,7 +110,7 @@ int MPIDI_CH3_Progress_test(void); Return value: An mpi error code. - IMPLEMENTORS: + IMPLEMENTERS: This routine is similar to MPIDI_CH3_Progress_test but may not be as thorough in its attempt to satisfy all outstanding communication. diff --git a/src/mpid/ch3/include/mpidpre.h b/src/mpid/ch3/include/mpidpre.h index 595434c3aff..2d181e2a7c8 100644 --- a/src/mpid/ch3/include/mpidpre.h +++ b/src/mpid/ch3/include/mpidpre.h @@ -480,7 +480,7 @@ typedef struct MPIDI_Request { * 4. The callback function can complete other requests, thus * calling those requests' callback functions. However, the * recursion depth of request completion function is limited. - * If we ever need deeper recurisve calls, we need to change + * If we ever need deeper recursive calls, we need to change * to an iterative design instead of a recursive design for * request completion. 
* diff --git a/src/mpid/ch3/src/mpid_vc.c b/src/mpid/ch3/src/mpid_vc.c index 81cb71c91e6..496fd1a25ba 100644 --- a/src/mpid/ch3/src/mpid_vc.c +++ b/src/mpid/ch3/src/mpid_vc.c @@ -580,7 +580,7 @@ int MPID_Intercomm_exchange_map(MPIR_Comm *local_comm_ptr, int local_leader, } /* Finish up by giving the device the opportunity to update - any other infomration among these processes. Note that the + any other information among these processes. Note that the new intercomm has not been set up; in fact, we haven't yet attempted to set up the connection tables. diff --git a/src/mpid/ch4/src/ch4_proc.h b/src/mpid/ch4/src/ch4_proc.h index 749afce18f5..d4ba4901f26 100644 --- a/src/mpid/ch4/src/ch4_proc.h +++ b/src/mpid/ch4/src/ch4_proc.h @@ -9,7 +9,7 @@ #include "ch4_types.h" /* There are 3 terms referencing processes: - * upid, or "unversal process id", is netmod layer address (addrname) + * upid, or "universal process id", is netmod layer address (addrname) * lpid, or "local process id", is av entry index in an ch4-layer table * gpid, or "global process id", is av table index plus av entry index * @@ -262,7 +262,7 @@ MPL_STATIC_INLINE_PREFIX int MPIDI_rank_is_local(int rank, MPIR_Comm * comm) #ifdef MPIDI_CH4_DIRECT_NETMOD /* Ask the netmod for locality information. If it decided not to build it, - * it will call back up to the MPIDIU function to get the infomration. */ + * it will call back up to the MPIDIU function to get the information. */ ret = MPIDI_NM_rank_is_local(rank, comm); #else ret = MPIDIU_av_is_local(MPIDIU_comm_rank_to_av(comm, rank)); From 843cfe5d495828ed2c747fbd3b2affc14562a253 Mon Sep 17 00:00:00 2001 From: Hui Zhou Date: Tue, 10 Dec 2024 13:28:48 -0600 Subject: [PATCH 02/59] test: remove unused test glpid This test requires to access MPICH internals, thus won't be used with the current design. 
--- test/mpi/group/Makefile.am | 4 ---- test/mpi/group/glpid.c | 44 -------------------------------------- 2 files changed, 48 deletions(-) delete mode 100644 test/mpi/group/glpid.c diff --git a/test/mpi/group/Makefile.am b/test/mpi/group/Makefile.am index d647c9d377a..993dab99371 100644 --- a/test/mpi/group/Makefile.am +++ b/test/mpi/group/Makefile.am @@ -16,7 +16,3 @@ noinst_PROGRAMS = \ groupcreate \ gtranks \ groupnullincl - -# glpid is a whitebox test that uses mpiimpl.h; it is unlikely to build with the -# current build system setup -#EXTRA_PROGRAMS = glpid diff --git a/test/mpi/group/glpid.c b/test/mpi/group/glpid.c deleted file mode 100644 index 06238aeb942..00000000000 --- a/test/mpi/group/glpid.c +++ /dev/null @@ -1,44 +0,0 @@ -/* - * Copyright (C) by Argonne National Laboratory - * See COPYRIGHT in top-level directory - */ - -#include -#include "mpi.h" -#include "mpiimpl.h" - -int main(int argc, char *argv[]) -{ - MPIR_Group group, *group_ptr = &group; - int i; - - MPI_Init(&argc, &argv); - - /* Setup a sample group */ - group.handle = 1; - group.ref_count = 1; - group.size = 4; - group.rank = 0; - group.idx_of_first_lpid = -1; - group.lrank_to_lpid = (MPII_Group_pmap_t *) - MPL_malloc(group.size * sizeof(MPII_Group_pmap_t), MPL_MEM_OTHER); - for (i = 0; i < group.size; i++) { - group.lrank_to_lpid[i].lrank = i; - group.lrank_to_lpid[i].lpid = group.size - i - 1; - group.lrank_to_lpid[i].next_lpid = -1; - group.lrank_to_lpid[i].flag = 0; - } - - /* Set up the group lpid list */ - MPII_Group_setup_lpid_list(group_ptr); - - /* Print the group structure */ - printf("Index of first lpid = %d\n", group.idx_of_first_lpid); - for (i = 0; i < group.size; i++) { - printf("lrank_to_lpid[%d].next_lpid = %d, .lpid = %d\n", - i, group.lrank_to_lpid[i].next_lpid, group.lrank_to_lpid[i].lpid); - } - - MPI_Finalize(); - return 0; -} From 89338a6e2a04efaf7ef19e6990637b45c436353a Mon Sep 17 00:00:00 2001 From: Hui Zhou Date: Tue, 10 Dec 2024 13:23:06 -0600 Subject: [PATCH 
03/59] group: remove unused groupdebug.c We no longer use this file. --- src/mpi/group/groupdebug.c | 77 -------------------------------------- 1 file changed, 77 deletions(-) delete mode 100644 src/mpi/group/groupdebug.c diff --git a/src/mpi/group/groupdebug.c b/src/mpi/group/groupdebug.c deleted file mode 100644 index a70b9592d2f..00000000000 --- a/src/mpi/group/groupdebug.c +++ /dev/null @@ -1,77 +0,0 @@ -/* - * Copyright (C) by Argonne National Laboratory - * See COPYRIGHT in top-level directory - */ - -#include "mpiimpl.h" -#include "group.h" - -/* style: allow:fprintf:2 sig:0 */ -/* style: PMPIuse:PMPI_Abort:2 sig:0 */ - -/* - * This file contains routines that are used only to perform testing - * and debugging of the group routines - */ -void MPITEST_Group_create(int, int, MPI_Group *); -void MPITEST_Group_print(MPI_Group); - -/* --BEGIN DEBUG-- */ -void MPITEST_Group_create(int nproc, int myrank, MPI_Group * new_group) -{ - MPIR_Group *new_group_ptr; - int i; - - new_group_ptr = (MPIR_Group *) MPIR_Handle_obj_alloc(&MPIR_Group_mem); - if (!new_group_ptr) { - fprintf(stderr, "Could not create a new group\n"); - PMPI_Abort(MPI_COMM_WORLD, 1); - } - MPIR_Object_set_ref(new_group_ptr, 1); - new_group_ptr->lrank_to_lpid = - (MPII_Group_pmap_t *) MPL_malloc(nproc * sizeof(MPII_Group_pmap_t), MPL_MEM_DEBUG); - if (!new_group_ptr->lrank_to_lpid) { - fprintf(stderr, "Could not create lrank map for new group\n"); - PMPI_Abort(MPI_COMM_WORLD, 1); - } - - new_group_ptr->rank = MPI_UNDEFINED; - for (i = 0; i < nproc; i++) { - new_group_ptr->lrank_to_lpid[i].lrank = i; - new_group_ptr->lrank_to_lpid[i].lpid = i; - } - new_group_ptr->size = nproc; - new_group_ptr->rank = myrank; - new_group_ptr->idx_of_first_lpid = -1; - - *new_group = new_group_ptr->handle; -} - -void MPITEST_Group_print(MPI_Group g) -{ - MPIR_Group *g_ptr; - int g_idx, size, i; - - MPIR_Group_get_ptr(g, g_ptr); - - g_idx = g_ptr->idx_of_first_lpid; - if (g_idx < 0) { - MPII_Group_setup_lpid_list(g_ptr); 
- g_idx = g_ptr->idx_of_first_lpid; - } - - /* Loop through these, printing the lpids by rank and in order */ - size = g_ptr->size; - fprintf(stdout, "Lpids in rank order\n"); - for (i = 0; i < size; i++) { - fprintf(stdout, "Rank %d has lpid %d\n", i, g_ptr->lrank_to_lpid[i].lpid); - } - - fprintf(stdout, "Ranks in lpid order\n"); - while (g_idx >= 0) { - fprintf(stdout, "Rank %d has lpid %d\n", g_idx, g_ptr->lrank_to_lpid[g_idx].lpid); - g_idx = g_ptr->lrank_to_lpid[g_idx].next_lpid; - } -} - -/* --END DEBUG-- */ From 937cc5096c9e548bcb6a4c73f3a3c6d2dd01e586 Mon Sep 17 00:00:00 2001 From: Hui Zhou Date: Tue, 10 Dec 2024 14:17:37 -0600 Subject: [PATCH 04/59] group: abstract group access and lpid integer type Hide the internal fields of MPIR_Group from unnecessary access. Outside grouputil.c and group_impl.c, code only needs to assume the MPIR_Lpid integer type, the creation routines based on an lpid map or an lpid stride description, and the access routine that looks up an lpid from a group rank. --- src/include/mpir_group.h | 19 ++++++++- src/mpi/group/grouputil.c | 84 +++++++++++++++++++++++++++++++++++++++ 2 files changed, 101 insertions(+), 2 deletions(-) diff --git a/src/include/mpir_group.h b/src/include/mpir_group.h index c40f22fe877..f46659494b5 100644 --- a/src/include/mpir_group.h +++ b/src/include/mpir_group.h @@ -11,12 +11,19 @@ * only because they are required for the group operations (e.g., * MPI_Group_intersection) and for the scalable RMA synchronization *---------------------------------------------------------------------------*/ + +/* Abstract the integer type for lpid (process id). It is possible to use 32-bit + * in principle, but 64-bit is simpler since we can trivially combine + * (world_idx, world_rank). 
+ */ +typedef uint64_t MPIR_Lpid; + /* This structure is used to implement the group operations such as MPI_Group_translate_ranks */ /* note: next_lpid (with idx_of_first_lpid in MPIR_Group) gives a linked list * in a sorted lpid ascending order */ typedef struct MPII_Group_pmap_t { - uint64_t lpid; /* local process id, from VCONN */ + MPIR_Lpid lpid; /* local process id, from VCONN */ int next_lpid; /* Index of next lpid (in lpid order) */ } MPII_Group_pmap_t; @@ -63,7 +70,7 @@ struct MPIR_Group { * process number */ int is_local_dense_monotonic; /* see NOTE-G1 */ - /* We may want some additional data for the RMA syncrhonization calls */ + /* We may want some additional data for the RMA synchronization calls */ /* Other, device-specific information */ #ifdef MPID_DEV_GROUP_DECL MPID_DEV_GROUP_DECL @@ -104,6 +111,14 @@ void MPIR_Group_setup_lpid_pairs(MPIR_Group *, MPIR_Group *); int MPIR_Group_create(int, MPIR_Group **); int MPIR_Group_release(MPIR_Group * group_ptr); +int MPIR_Group_create_map(int size, int rank, MPIR_Session * session_ptr, MPIR_Lpid * map, + MPIR_Group ** new_group_ptr); +int MPIR_Group_create_stride(int size, int rank, MPIR_Session * session_ptr, + MPIR_Lpid offset, MPIR_Lpid stride, MPIR_Lpid blocksize, + MPIR_Group ** new_group_ptr); +MPIR_Lpid MPIR_Group_rank_to_lpid(MPIR_Group * group, int rank); +int MPIR_Group_lpid_to_rank(MPIR_Group * group, MPIR_Lpid lpid); + int MPIR_Group_check_subset(MPIR_Group * group_ptr, MPIR_Comm * comm_ptr); void MPIR_Group_set_session_ptr(MPIR_Group * group_ptr, MPIR_Session * session_out); int MPIR_Group_init(void); diff --git a/src/mpi/group/grouputil.c b/src/mpi/group/grouputil.c index ac777e50305..414c562fe3c 100644 --- a/src/mpi/group/grouputil.c +++ b/src/mpi/group/grouputil.c @@ -94,6 +94,90 @@ int MPIR_Group_create(int nproc, MPIR_Group ** new_group_ptr) return mpi_errno; } +int MPIR_Group_create_map(int size, int rank, MPIR_Session * session_ptr, MPIR_Lpid * map, + MPIR_Group ** new_group_ptr) +{ + int 
mpi_errno = MPI_SUCCESS; + + if (size == 0) { + /* See 5.3.2, Group Constructors. For many group routines, + * the standard explicitly says to return MPI_GROUP_EMPTY; + * for others it is implied */ + *new_group_ptr = MPIR_Group_empty; + goto fn_exit; + } + + MPIR_Group *newgrp; + mpi_errno = MPIR_Group_create(size, &newgrp); + MPIR_ERR_CHECK(mpi_errno); + + newgrp->rank = rank; + MPIR_Group_set_session_ptr(newgrp, session_ptr); + + for (int i = 0; i < size; i++) { + newgrp->lrank_to_lpid[i].lpid = map[i]; + } + + *new_group_ptr = newgrp; + + fn_exit: + MPL_free(map); + return mpi_errno; + fn_fail: + goto fn_exit; +} + +int MPIR_Group_create_stride(int size, int rank, MPIR_Session * session_ptr, + MPIR_Lpid offset, MPIR_Lpid stride, MPIR_Lpid blocksize, + MPIR_Group ** new_group_ptr) +{ + int mpi_errno = MPI_SUCCESS; + MPIR_Group *newgrp; + + MPIR_Assert(size > 0); + + mpi_errno = MPIR_Group_create(size, &newgrp); + MPIR_ERR_CHECK(mpi_errno); + + newgrp->rank = rank; + MPIR_Group_set_session_ptr(newgrp, session_ptr); + + MPIR_Lpid lpid = offset; + int i = 0; + while (i < size) { + for (int j = 0; j < blocksize; j++) { + newgrp->lrank_to_lpid[i + j].lpid = lpid + j; + } + i += blocksize; + lpid += stride; + } + + *new_group_ptr = newgrp; + + fn_exit: + return mpi_errno; + fn_fail: + goto fn_exit; +} + +MPIR_Lpid MPIR_Group_rank_to_lpid(MPIR_Group * group, int rank) +{ + return group->lrank_to_lpid[rank].lpid; +} + +int MPIR_Group_lpid_to_rank(MPIR_Group * group, MPIR_Lpid lpid) +{ + /* Use linear search for now. 
+ * Optimization, build hash map in MPIR_Group_create_map and do O(1) hash lookup + */ + for (int i = 0; i < group->size; i++) { + if (lpid == group->lrank_to_lpid[i].lpid) { + return i; + } + } + return MPI_UNDEFINED; +} + /* * return value is the first index in the list * From b04ca339a731f38f82bd856c16d8d80a34a29426 Mon Sep 17 00:00:00 2001 From: Hui Zhou Date: Tue, 10 Dec 2024 13:41:20 -0600 Subject: [PATCH 05/59] misc: use the new group rank/lpid conversion routines For most external usages, we only need MPIR_Group_rank_to_lpid. --- src/mpi/comm/comm_impl.c | 6 +++--- src/mpid/ch3/src/ch3u_comm.c | 2 +- src/mpid/ch3/src/ch3u_handle_connection.c | 2 +- src/mpid/ch4/src/ch4_impl.h | 5 ++++- 4 files changed, 9 insertions(+), 6 deletions(-) diff --git a/src/mpi/comm/comm_impl.c b/src/mpi/comm/comm_impl.c index 9dbba6d703f..9f361f60007 100644 --- a/src/mpi/comm/comm_impl.c +++ b/src/mpi/comm/comm_impl.c @@ -224,7 +224,7 @@ int MPII_Comm_create_calculate_mapping(MPIR_Group * group_ptr, subsetOfWorld = 1; wsize = MPIR_Process.size; for (i = 0; i < n; i++) { - uint64_t g_lpid = group_ptr->lrank_to_lpid[i].lpid; + MPIR_Lpid g_lpid = MPIR_Group_rank_to_lpid(group_ptr, i); /* This mapping is relative to comm world */ MPL_DBG_MSG_FMT(MPIR_DBG_COMM, VERBOSE, @@ -261,7 +261,7 @@ int MPII_Comm_create_calculate_mapping(MPIR_Group * group_ptr, for (j = 0; j < comm_ptr->local_size; j++) { uint64_t comm_lpid; MPID_Comm_get_lpid(comm_ptr, j, &comm_lpid, FALSE); - if (comm_lpid == group_ptr->lrank_to_lpid[i].lpid) { + if (comm_lpid == MPIR_Group_rank_to_lpid(group_ptr, i)) { mapping[i] = j; break; } @@ -800,7 +800,7 @@ int MPIR_Intercomm_create_from_groups_impl(MPIR_Group * local_group_ptr, int loc int tag = get_tag_from_stringtag(stringtag); /* FIXME: ensure lpid is from comm_world */ - uint64_t remote_lpid = remote_group_ptr->lrank_to_lpid[remote_leader].lpid; + MPIR_Lpid remote_lpid = MPIR_Group_rank_to_lpid(remote_group_ptr, remote_leader); MPIR_Assert(remote_lpid < 
MPIR_Process.size); mpi_errno = MPIR_Intercomm_create_impl(local_comm, local_leader, MPIR_Process.comm_world, (int) remote_lpid, diff --git a/src/mpid/ch3/src/ch3u_comm.c b/src/mpid/ch3/src/ch3u_comm.c index b704d3042e2..ce2f495055b 100644 --- a/src/mpid/ch3/src/ch3u_comm.c +++ b/src/mpid/ch3/src/ch3u_comm.c @@ -512,7 +512,7 @@ static int nonempty_intersection(MPIR_Comm *comm, MPIR_Group *group, int *flag) for (i_g = 0; i_g < group->size; ++i_g) { /* FIXME: This won't work for dynamic procs */ - MPIDI_PG_Get_vc(MPIDI_Process.my_pg, group->lrank_to_lpid[i_g].lpid, &vc_g); + MPIDI_PG_Get_vc(MPIDI_Process.my_pg, MPIR_Group_rank_to_lpid(group, i_g), &vc_g); for (i_c = 0; i_c < comm->remote_size; ++i_c) { MPIDI_Comm_get_vc(comm, i_c, &vc_c); if (vc_g == vc_c) { diff --git a/src/mpid/ch3/src/ch3u_handle_connection.c b/src/mpid/ch3/src/ch3u_handle_connection.c index ef5819aaf3d..17ef122cb7f 100644 --- a/src/mpid/ch3/src/ch3u_handle_connection.c +++ b/src/mpid/ch3/src/ch3u_handle_connection.c @@ -372,7 +372,7 @@ static int terminate_failed_VCs(MPIR_Group *new_failed_group) MPIDI_VC_t *vc; /* terminate the VC */ /* FIXME: This won't work for dynamic procs */ - MPIDI_PG_Get_vc(MPIDI_Process.my_pg, new_failed_group->lrank_to_lpid[i].lpid, &vc); + MPIDI_PG_Get_vc(MPIDI_Process.my_pg, MPIR_Group_rank_to_lpid(new_failed_group, i), &vc); mpi_errno = MPIDI_CH3_Connection_terminate(vc); MPIR_ERR_CHECK(mpi_errno); } diff --git a/src/mpid/ch4/src/ch4_impl.h b/src/mpid/ch4/src/ch4_impl.h index 8991052f1a5..2f5a31dc767 100644 --- a/src/mpid/ch4/src/ch4_impl.h +++ b/src/mpid/ch4/src/ch4_impl.h @@ -387,7 +387,10 @@ MPL_STATIC_INLINE_PREFIX int MPIDIU_valid_group_rank(MPIR_Comm * comm, int rank, MPIDI_NM_comm_get_gpid(comm, rank, &gpid, FALSE); - for (z = 0; z < size && gpid != grp->lrank_to_lpid[z].lpid; ++z) { + for (z = 0; z < size; ++z) { + if (gpid == MPIR_Group_rank_to_lpid(grp, z)) { + break; + } } ret = (z < size); From d5ec57f0f7a46df6757fcae361b6673eb11b8922 Mon Sep 17 00:00:00 
2001 From: Hui Zhou Date: Tue, 10 Dec 2024 14:01:29 -0600 Subject: [PATCH 06/59] comm: use MPIR_Group_create_{map, stride} Avoid accessing group internal fields. --- src/mpi/comm/comm_impl.c | 55 +++++++++++----------------------------- src/mpi/comm/ulfm_impl.c | 15 ++++++----- 2 files changed, 23 insertions(+), 47 deletions(-) diff --git a/src/mpi/comm/comm_impl.c b/src/mpi/comm/comm_impl.c index 9f361f60007..46f06b89762 100644 --- a/src/mpi/comm/comm_impl.c +++ b/src/mpi/comm/comm_impl.c @@ -68,36 +68,19 @@ int MPIR_Comm_test_threadcomm_impl(MPIR_Comm * comm_ptr, int *flag) static int comm_create_local_group(MPIR_Comm * comm_ptr) { int mpi_errno = MPI_SUCCESS; - MPIR_Group *group_ptr; - int n = comm_ptr->local_size; - - mpi_errno = MPIR_Group_create(n, &group_ptr); - MPIR_ERR_CHECK(mpi_errno); - /* Group belongs to the same session as communicator */ - MPIR_Group_set_session_ptr(group_ptr, comm_ptr->session_ptr); - - group_ptr->is_local_dense_monotonic = TRUE; + int n = comm_ptr->local_size; + MPIR_Lpid *map = MPL_malloc(n * sizeof(MPIR_Lpid), MPL_MEM_GROUP); - int comm_world_size = MPIR_Process.size; for (int i = 0; i < n; i++) { uint64_t lpid; (void) MPID_Comm_get_lpid(comm_ptr, i, &lpid, FALSE); - group_ptr->lrank_to_lpid[i].lpid = lpid; - if (lpid > comm_world_size || (i > 0 && group_ptr->lrank_to_lpid[i - 1].lpid != (lpid - 1))) { - group_ptr->is_local_dense_monotonic = FALSE; - } + map[i] = lpid; } - group_ptr->size = n; - group_ptr->rank = comm_ptr->rank; - group_ptr->idx_of_first_lpid = -1; - - comm_ptr->local_group = group_ptr; - - /* FIXME : Add a sanity check that the size of the group is the same as - * the size of the communicator. 
This helps catch corrupted - * communicators */ + mpi_errno = MPIR_Group_create_map(n, comm_ptr->rank, comm_ptr->session_ptr, map, + &comm_ptr->local_group); + MPIR_ERR_CHECK(mpi_errno); fn_exit: return mpi_errno; @@ -931,31 +914,23 @@ int MPIR_Comm_idup_with_info_impl(MPIR_Comm * comm_ptr, MPIR_Info * info, int MPIR_Comm_remote_group_impl(MPIR_Comm * comm_ptr, MPIR_Group ** group_ptr) { int mpi_errno = MPI_SUCCESS; - int i, n; - MPIR_FUNC_ENTER; + /* Create a group and populate it with the local process ids */ if (!comm_ptr->remote_group) { - n = comm_ptr->remote_size; - mpi_errno = MPIR_Group_create(n, group_ptr); - MPIR_ERR_CHECK(mpi_errno); + int n = comm_ptr->remote_size; + MPIR_Lpid *map = MPL_malloc(n * sizeof(MPIR_Lpid), MPL_MEM_GROUP); - for (i = 0; i < n; i++) { + for (int i = 0; i < n; i++) { uint64_t lpid; (void) MPID_Comm_get_lpid(comm_ptr, i, &lpid, TRUE); - (*group_ptr)->lrank_to_lpid[i].lpid = lpid; - /* TODO calculate is_local_dense_monotonic */ + map[i] = lpid; } - (*group_ptr)->size = n; - (*group_ptr)->rank = MPI_UNDEFINED; - (*group_ptr)->idx_of_first_lpid = -1; - - MPIR_Group_set_session_ptr(*group_ptr, comm_ptr->session_ptr); - - comm_ptr->remote_group = *group_ptr; - } else { - *group_ptr = comm_ptr->remote_group; + mpi_errno = MPIR_Group_create_map(n, MPI_UNDEFINED, comm_ptr->session_ptr, map, + &comm_ptr->remote_group); + MPIR_ERR_CHECK(mpi_errno); } + *group_ptr = comm_ptr->remote_group; MPIR_Group_add_ref(comm_ptr->remote_group); fn_exit: diff --git a/src/mpi/comm/ulfm_impl.c b/src/mpi/comm/ulfm_impl.c index dfd4ad6bfcf..33edffa3d11 100644 --- a/src/mpi/comm/ulfm_impl.c +++ b/src/mpi/comm/ulfm_impl.c @@ -87,21 +87,22 @@ int MPIR_Comm_get_failed_impl(MPIR_Comm * comm_ptr, MPIR_Group ** failed_group_p /* create failed_group */ int n = utarray_len(failed_procs); + MPIR_Lpid *map = MPL_malloc(n * sizeof(MPIR_Lpid), MPL_MEM_GROUP); + MPIR_Group *new_group; - mpi_errno = MPIR_Group_create(n, &new_group); - MPIR_ERR_CHECK(mpi_errno); - 
new_group->rank = MPI_UNDEFINED; + int myrank = MPI_UNDEFINED; for (int i = 0; i < utarray_len(failed_procs); i++) { int *p = (int *) utarray_eltptr(failed_procs, i); - new_group->lrank_to_lpid[i].lpid = *p; + map[i] = *p; /* if calling process is part of the group, set the rank */ if (*p == MPIR_Process.rank) { - new_group->rank = i; + myrank = i; } } - new_group->size = n; - new_group->idx_of_first_lpid = -1; + + mpi_errno = MPIR_Group_create_map(n, myrank, comm_ptr->session_ptr, map, &new_group); + MPIR_ERR_CHECK(mpi_errno); MPIR_Group *comm_group; MPIR_Comm_group_impl(comm_ptr, &comm_group); From 5572c0d5c46d0f9fedc3eb3d338855c8b76418de Mon Sep 17 00:00:00 2001 From: Hui Zhou Date: Tue, 10 Dec 2024 14:58:57 -0600 Subject: [PATCH 07/59] group: rearrange functions in group_impl.c Group similar functions together to facilitate refactoring. There are no changes in this commit other than moving functions around. The 4 incl/excl functions are very similar. The 3 difference/intersection/union functions are very similar. 
--- src/mpi/group/group_impl.c | 404 ++++++++++++++++++------------------- 1 file changed, 202 insertions(+), 202 deletions(-) diff --git a/src/mpi/group/group_impl.c b/src/mpi/group/group_impl.c index dbd3cd88204..848996c7d8c 100644 --- a/src/mpi/group/group_impl.c +++ b/src/mpi/group/group_impl.c @@ -18,6 +18,22 @@ int MPIR_Group_size_impl(MPIR_Group * group_ptr, int *size) return MPI_SUCCESS; } +int MPIR_Group_free_impl(MPIR_Group * group_ptr) +{ + int mpi_errno = MPI_SUCCESS; + + /* Do not free MPI_GROUP_EMPTY */ + if (group_ptr->handle != MPI_GROUP_EMPTY) { + mpi_errno = MPIR_Group_release(group_ptr); + MPIR_ERR_CHECK(mpi_errno); + } + + fn_exit: + return mpi_errno; + fn_fail: + goto fn_exit; +} + int MPIR_Group_compare_impl(MPIR_Group * group_ptr1, MPIR_Group * group_ptr2, int *result) { int mpi_errno = MPI_SUCCESS; @@ -67,77 +83,76 @@ int MPIR_Group_compare_impl(MPIR_Group * group_ptr1, MPIR_Group * group_ptr2, in return mpi_errno; } -int MPIR_Group_difference_impl(MPIR_Group * group_ptr1, MPIR_Group * group_ptr2, - MPIR_Group ** new_group_ptr) +int MPIR_Group_translate_ranks_impl(MPIR_Group * gp1, int n, const int ranks1[], + MPIR_Group * gp2, int ranks2[]) { int mpi_errno = MPI_SUCCESS; - int size1, i, k, g1_idx, g2_idx, nnew; + int i, g2_idx; uint64_t l1_pid, l2_pid; - int *flags = NULL; - MPIR_FUNC_ENTER; - /* Return a group consisting of the members of group1 that are *not* - * in group2 */ - size1 = group_ptr1->size; - /* Insure that the lpid lists are setup */ - MPIR_Group_setup_lpid_pairs(group_ptr1, group_ptr2); + MPL_DBG_MSG_S(MPIR_DBG_OTHER, VERBOSE, "gp2->is_local_dense_monotonic=%s", + (gp2->is_local_dense_monotonic ? 
"TRUE" : "FALSE")); - flags = MPL_calloc(size1, sizeof(int), MPL_MEM_OTHER); + /* Initialize the output ranks */ + for (i = 0; i < n; i++) + ranks2[i] = MPI_UNDEFINED; - g1_idx = group_ptr1->idx_of_first_lpid; - g2_idx = group_ptr2->idx_of_first_lpid; + if (gp2->size > 0 && gp2->is_local_dense_monotonic) { + /* g2 probably == group_of(MPI_COMM_WORLD); use fast, constant-time lookup */ + uint64_t lpid_offset = gp2->lrank_to_lpid[0].lpid; - nnew = size1; - while (g1_idx >= 0 && g2_idx >= 0) { - l1_pid = group_ptr1->lrank_to_lpid[g1_idx].lpid; - l2_pid = group_ptr2->lrank_to_lpid[g2_idx].lpid; - if (l1_pid < l2_pid) { - g1_idx = group_ptr1->lrank_to_lpid[g1_idx].next_lpid; - } else if (l1_pid > l2_pid) { - g2_idx = group_ptr2->lrank_to_lpid[g2_idx].next_lpid; - } else { - /* Equal */ - flags[g1_idx] = 1; - g1_idx = group_ptr1->lrank_to_lpid[g1_idx].next_lpid; - g2_idx = group_ptr2->lrank_to_lpid[g2_idx].next_lpid; - nnew--; + for (i = 0; i < n; ++i) { + uint64_t g1_lpid; + + if (ranks1[i] == MPI_PROC_NULL) { + ranks2[i] = MPI_PROC_NULL; + continue; + } + /* "adjusted" lpid from g1 */ + g1_lpid = gp1->lrank_to_lpid[ranks1[i]].lpid - lpid_offset; + if (g1_lpid < gp2->size) { + ranks2[i] = g1_lpid; + } + /* else leave UNDEFINED */ } - } - /* Create the group */ - if (nnew == 0) { - /* See 5.3.2, Group Constructors. For many group routines, - * the standard explicitly says to return MPI_GROUP_EMPTY; - * for others it is implied */ - *new_group_ptr = MPIR_Group_empty; - goto fn_exit; } else { - mpi_errno = MPIR_Group_create(nnew, new_group_ptr); - /* --BEGIN ERROR HANDLING-- */ - if (mpi_errno) { - goto fn_fail; + /* general, slow path; lookup time is dependent on the user-provided rank values! 
*/ + g2_idx = gp2->idx_of_first_lpid; + if (g2_idx < 0) { + MPII_Group_setup_lpid_list(gp2); + g2_idx = gp2->idx_of_first_lpid; } - /* --END ERROR HANDLING-- */ - (*new_group_ptr)->rank = MPI_UNDEFINED; - k = 0; - for (i = 0; i < size1; i++) { - if (!flags[i]) { - (*new_group_ptr)->lrank_to_lpid[k].lpid = group_ptr1->lrank_to_lpid[i].lpid; - if (i == group_ptr1->rank) - (*new_group_ptr)->rank = k; - k++; + if (g2_idx >= 0) { + /* g2_idx can be < 0 if the g2 group is empty */ + l2_pid = gp2->lrank_to_lpid[g2_idx].lpid; + for (i = 0; i < n; i++) { + if (ranks1[i] == MPI_PROC_NULL) { + ranks2[i] = MPI_PROC_NULL; + continue; + } + l1_pid = gp1->lrank_to_lpid[ranks1[i]].lpid; + /* Search for this l1_pid in group2. Use the following + * optimization: start from the last position in the lpid list + * if possible. A more sophisticated version could use a + * tree based or even hashed search to speed the translation. */ + if (l1_pid < l2_pid || g2_idx < 0) { + /* Start over from the beginning */ + g2_idx = gp2->idx_of_first_lpid; + l2_pid = gp2->lrank_to_lpid[g2_idx].lpid; + } + while (g2_idx >= 0 && l1_pid > l2_pid) { + g2_idx = gp2->lrank_to_lpid[g2_idx].next_lpid; + if (g2_idx >= 0) + l2_pid = gp2->lrank_to_lpid[g2_idx].lpid; + else + l2_pid = (uint64_t) - 1; + } + if (l1_pid == l2_pid) + ranks2[i] = g2_idx; } } - /* TODO calculate is_local_dense_monotonic */ } - - MPIR_Group_set_session_ptr(*new_group_ptr, group_ptr1->session_ptr); - - fn_exit: - MPL_free(flags); - MPIR_FUNC_EXIT; return mpi_errno; - fn_fail: - goto fn_exit; } int MPIR_Group_excl_impl(MPIR_Group * group_ptr, int n, const int ranks[], @@ -188,22 +203,6 @@ int MPIR_Group_excl_impl(MPIR_Group * group_ptr, int n, const int ranks[], goto fn_exit; } -int MPIR_Group_free_impl(MPIR_Group * group_ptr) -{ - int mpi_errno = MPI_SUCCESS; - - /* Do not free MPI_GROUP_EMPTY */ - if (group_ptr->handle != MPI_GROUP_EMPTY) { - mpi_errno = MPIR_Group_release(group_ptr); - MPIR_ERR_CHECK(mpi_errno); - } - - fn_exit: - 
return mpi_errno; - fn_fail: - goto fn_exit; -} - int MPIR_Group_incl_impl(MPIR_Group * group_ptr, int n, const int ranks[], MPIR_Group ** new_group_ptr) { @@ -242,79 +241,6 @@ int MPIR_Group_incl_impl(MPIR_Group * group_ptr, int n, const int ranks[], goto fn_exit; } -int MPIR_Group_intersection_impl(MPIR_Group * group_ptr1, MPIR_Group * group_ptr2, - MPIR_Group ** new_group_ptr) -{ - int mpi_errno = MPI_SUCCESS; - int size1, i, k, g1_idx, g2_idx, nnew; - uint64_t l1_pid, l2_pid; - int *flags = NULL; - - MPIR_FUNC_ENTER; - /* Return a group consisting of the members of group1 that are - * in group2 */ - size1 = group_ptr1->size; - /* Insure that the lpid lists are setup */ - MPIR_Group_setup_lpid_pairs(group_ptr1, group_ptr2); - - flags = MPL_calloc(size1, sizeof(int), MPL_MEM_OTHER); - - g1_idx = group_ptr1->idx_of_first_lpid; - g2_idx = group_ptr2->idx_of_first_lpid; - - nnew = 0; - while (g1_idx >= 0 && g2_idx >= 0) { - l1_pid = group_ptr1->lrank_to_lpid[g1_idx].lpid; - l2_pid = group_ptr2->lrank_to_lpid[g2_idx].lpid; - if (l1_pid < l2_pid) { - g1_idx = group_ptr1->lrank_to_lpid[g1_idx].next_lpid; - } else if (l1_pid > l2_pid) { - g2_idx = group_ptr2->lrank_to_lpid[g2_idx].next_lpid; - } else { - /* Equal */ - flags[g1_idx] = 1; - g1_idx = group_ptr1->lrank_to_lpid[g1_idx].next_lpid; - g2_idx = group_ptr2->lrank_to_lpid[g2_idx].next_lpid; - nnew++; - } - } - /* Create the group. 
Handle the trivial case first */ - if (nnew == 0) { - *new_group_ptr = MPIR_Group_empty; - goto fn_exit; - } - - mpi_errno = MPIR_Group_create(nnew, new_group_ptr); - MPIR_ERR_CHECK(mpi_errno); - - (*new_group_ptr)->rank = MPI_UNDEFINED; - (*new_group_ptr)->is_local_dense_monotonic = TRUE; - k = 0; - for (i = 0; i < size1; i++) { - if (flags[i]) { - uint64_t lpid = group_ptr1->lrank_to_lpid[i].lpid; - (*new_group_ptr)->lrank_to_lpid[k].lpid = lpid; - if (i == group_ptr1->rank) - (*new_group_ptr)->rank = k; - if (lpid > MPIR_Process.size || - (k > 0 && (*new_group_ptr)->lrank_to_lpid[k - 1].lpid != (lpid - 1))) { - (*new_group_ptr)->is_local_dense_monotonic = FALSE; - } - - k++; - } - } - - MPIR_Group_set_session_ptr(*new_group_ptr, group_ptr1->session_ptr); - - fn_exit: - MPL_free(flags); - MPIR_FUNC_EXIT; - return mpi_errno; - fn_fail: - goto fn_exit; -} - int MPIR_Group_range_excl_impl(MPIR_Group * group_ptr, int n, int ranges[][3], MPIR_Group ** new_group_ptr) { @@ -464,76 +390,150 @@ int MPIR_Group_range_incl_impl(MPIR_Group * group_ptr, int n, int ranges[][3], goto fn_exit; } -int MPIR_Group_translate_ranks_impl(MPIR_Group * gp1, int n, const int ranks1[], - MPIR_Group * gp2, int ranks2[]) +int MPIR_Group_difference_impl(MPIR_Group * group_ptr1, MPIR_Group * group_ptr2, + MPIR_Group ** new_group_ptr) { int mpi_errno = MPI_SUCCESS; - int i, g2_idx; + int size1, i, k, g1_idx, g2_idx, nnew; uint64_t l1_pid, l2_pid; + int *flags = NULL; - MPL_DBG_MSG_S(MPIR_DBG_OTHER, VERBOSE, "gp2->is_local_dense_monotonic=%s", - (gp2->is_local_dense_monotonic ? 
"TRUE" : "FALSE")); - - /* Initialize the output ranks */ - for (i = 0; i < n; i++) - ranks2[i] = MPI_UNDEFINED; + MPIR_FUNC_ENTER; + /* Return a group consisting of the members of group1 that are *not* + * in group2 */ + size1 = group_ptr1->size; + /* Insure that the lpid lists are setup */ + MPIR_Group_setup_lpid_pairs(group_ptr1, group_ptr2); - if (gp2->size > 0 && gp2->is_local_dense_monotonic) { - /* g2 probably == group_of(MPI_COMM_WORLD); use fast, constant-time lookup */ - uint64_t lpid_offset = gp2->lrank_to_lpid[0].lpid; + flags = MPL_calloc(size1, sizeof(int), MPL_MEM_OTHER); - for (i = 0; i < n; ++i) { - uint64_t g1_lpid; + g1_idx = group_ptr1->idx_of_first_lpid; + g2_idx = group_ptr2->idx_of_first_lpid; - if (ranks1[i] == MPI_PROC_NULL) { - ranks2[i] = MPI_PROC_NULL; - continue; - } - /* "adjusted" lpid from g1 */ - g1_lpid = gp1->lrank_to_lpid[ranks1[i]].lpid - lpid_offset; - if (g1_lpid < gp2->size) { - ranks2[i] = g1_lpid; - } - /* else leave UNDEFINED */ + nnew = size1; + while (g1_idx >= 0 && g2_idx >= 0) { + l1_pid = group_ptr1->lrank_to_lpid[g1_idx].lpid; + l2_pid = group_ptr2->lrank_to_lpid[g2_idx].lpid; + if (l1_pid < l2_pid) { + g1_idx = group_ptr1->lrank_to_lpid[g1_idx].next_lpid; + } else if (l1_pid > l2_pid) { + g2_idx = group_ptr2->lrank_to_lpid[g2_idx].next_lpid; + } else { + /* Equal */ + flags[g1_idx] = 1; + g1_idx = group_ptr1->lrank_to_lpid[g1_idx].next_lpid; + g2_idx = group_ptr2->lrank_to_lpid[g2_idx].next_lpid; + nnew--; } + } + /* Create the group */ + if (nnew == 0) { + /* See 5.3.2, Group Constructors. For many group routines, + * the standard explicitly says to return MPI_GROUP_EMPTY; + * for others it is implied */ + *new_group_ptr = MPIR_Group_empty; + goto fn_exit; } else { - /* general, slow path; lookup time is dependent on the user-provided rank values! 
*/ - g2_idx = gp2->idx_of_first_lpid; - if (g2_idx < 0) { - MPII_Group_setup_lpid_list(gp2); - g2_idx = gp2->idx_of_first_lpid; + mpi_errno = MPIR_Group_create(nnew, new_group_ptr); + /* --BEGIN ERROR HANDLING-- */ + if (mpi_errno) { + goto fn_fail; } - if (g2_idx >= 0) { - /* g2_idx can be < 0 if the g2 group is empty */ - l2_pid = gp2->lrank_to_lpid[g2_idx].lpid; - for (i = 0; i < n; i++) { - if (ranks1[i] == MPI_PROC_NULL) { - ranks2[i] = MPI_PROC_NULL; - continue; - } - l1_pid = gp1->lrank_to_lpid[ranks1[i]].lpid; - /* Search for this l1_pid in group2. Use the following - * optimization: start from the last position in the lpid list - * if possible. A more sophisticated version could use a - * tree based or even hashed search to speed the translation. */ - if (l1_pid < l2_pid || g2_idx < 0) { - /* Start over from the beginning */ - g2_idx = gp2->idx_of_first_lpid; - l2_pid = gp2->lrank_to_lpid[g2_idx].lpid; - } - while (g2_idx >= 0 && l1_pid > l2_pid) { - g2_idx = gp2->lrank_to_lpid[g2_idx].next_lpid; - if (g2_idx >= 0) - l2_pid = gp2->lrank_to_lpid[g2_idx].lpid; - else - l2_pid = (uint64_t) - 1; - } - if (l1_pid == l2_pid) - ranks2[i] = g2_idx; + /* --END ERROR HANDLING-- */ + (*new_group_ptr)->rank = MPI_UNDEFINED; + k = 0; + for (i = 0; i < size1; i++) { + if (!flags[i]) { + (*new_group_ptr)->lrank_to_lpid[k].lpid = group_ptr1->lrank_to_lpid[i].lpid; + if (i == group_ptr1->rank) + (*new_group_ptr)->rank = k; + k++; } } + /* TODO calculate is_local_dense_monotonic */ } + + MPIR_Group_set_session_ptr(*new_group_ptr, group_ptr1->session_ptr); + + fn_exit: + MPL_free(flags); + MPIR_FUNC_EXIT; return mpi_errno; + fn_fail: + goto fn_exit; +} + +int MPIR_Group_intersection_impl(MPIR_Group * group_ptr1, MPIR_Group * group_ptr2, + MPIR_Group ** new_group_ptr) +{ + int mpi_errno = MPI_SUCCESS; + int size1, i, k, g1_idx, g2_idx, nnew; + uint64_t l1_pid, l2_pid; + int *flags = NULL; + + MPIR_FUNC_ENTER; + /* Return a group consisting of the members of group1 that are + 
* in group2 */ + size1 = group_ptr1->size; + /* Insure that the lpid lists are setup */ + MPIR_Group_setup_lpid_pairs(group_ptr1, group_ptr2); + + flags = MPL_calloc(size1, sizeof(int), MPL_MEM_OTHER); + + g1_idx = group_ptr1->idx_of_first_lpid; + g2_idx = group_ptr2->idx_of_first_lpid; + + nnew = 0; + while (g1_idx >= 0 && g2_idx >= 0) { + l1_pid = group_ptr1->lrank_to_lpid[g1_idx].lpid; + l2_pid = group_ptr2->lrank_to_lpid[g2_idx].lpid; + if (l1_pid < l2_pid) { + g1_idx = group_ptr1->lrank_to_lpid[g1_idx].next_lpid; + } else if (l1_pid > l2_pid) { + g2_idx = group_ptr2->lrank_to_lpid[g2_idx].next_lpid; + } else { + /* Equal */ + flags[g1_idx] = 1; + g1_idx = group_ptr1->lrank_to_lpid[g1_idx].next_lpid; + g2_idx = group_ptr2->lrank_to_lpid[g2_idx].next_lpid; + nnew++; + } + } + /* Create the group. Handle the trivial case first */ + if (nnew == 0) { + *new_group_ptr = MPIR_Group_empty; + goto fn_exit; + } + + mpi_errno = MPIR_Group_create(nnew, new_group_ptr); + MPIR_ERR_CHECK(mpi_errno); + + (*new_group_ptr)->rank = MPI_UNDEFINED; + (*new_group_ptr)->is_local_dense_monotonic = TRUE; + k = 0; + for (i = 0; i < size1; i++) { + if (flags[i]) { + uint64_t lpid = group_ptr1->lrank_to_lpid[i].lpid; + (*new_group_ptr)->lrank_to_lpid[k].lpid = lpid; + if (i == group_ptr1->rank) + (*new_group_ptr)->rank = k; + if (lpid > MPIR_Process.size || + (k > 0 && (*new_group_ptr)->lrank_to_lpid[k - 1].lpid != (lpid - 1))) { + (*new_group_ptr)->is_local_dense_monotonic = FALSE; + } + + k++; + } + } + + MPIR_Group_set_session_ptr(*new_group_ptr, group_ptr1->session_ptr); + + fn_exit: + MPL_free(flags); + MPIR_FUNC_EXIT; + return mpi_errno; + fn_fail: + goto fn_exit; } int MPIR_Group_union_impl(MPIR_Group * group_ptr1, MPIR_Group * group_ptr2, From 75791da52aa5eee7e792e554378b881408d8f1d3 Mon Sep 17 00:00:00 2001 From: Hui Zhou Date: Tue, 10 Dec 2024 16:11:59 -0600 Subject: [PATCH 08/59] group: refactor group_impl.c to use new group interfaces Use 
MPIR_Group_{rank_to_lpid,lpid_to_rank} to avoid directly accessing MPIR_Group internal fields. For most group creation routines, just populate an lpid lookup map and call MPIR_Group_create_map to create the group. --- src/mpi/group/group_impl.c | 575 +++++++++++-------------------------- 1 file changed, 171 insertions(+), 404 deletions(-) diff --git a/src/mpi/group/group_impl.c b/src/mpi/group/group_impl.c index 848996c7d8c..fa123a70efc 100644 --- a/src/mpi/group/group_impl.c +++ b/src/mpi/group/group_impl.c @@ -37,7 +37,6 @@ int MPIR_Group_free_impl(MPIR_Group * group_ptr) int MPIR_Group_compare_impl(MPIR_Group * group_ptr1, MPIR_Group * group_ptr2, int *result) { int mpi_errno = MPI_SUCCESS; - int g1_idx, g2_idx, size, i; /* See if their sizes are equal */ if (group_ptr1->size != group_ptr2->size) { @@ -45,39 +44,39 @@ int MPIR_Group_compare_impl(MPIR_Group * group_ptr1, MPIR_Group * group_ptr2, in goto fn_exit; } - /* Run through the lrank to lpid lists of each group in lpid order - * to see if the same processes are involved */ - g1_idx = group_ptr1->idx_of_first_lpid; - g2_idx = group_ptr2->idx_of_first_lpid; - /* If the lpid list hasn't been created, do it now */ - if (g1_idx < 0) { - MPII_Group_setup_lpid_list(group_ptr1); - g1_idx = group_ptr1->idx_of_first_lpid; - } - if (g2_idx < 0) { - MPII_Group_setup_lpid_list(group_ptr2); - g2_idx = group_ptr2->idx_of_first_lpid; - } - while (g1_idx >= 0 && g2_idx >= 0) { - if (group_ptr1->lrank_to_lpid[g1_idx].lpid != group_ptr2->lrank_to_lpid[g2_idx].lpid) { - *result = MPI_UNEQUAL; - goto fn_exit; + int size; + size = group_ptr1->size; + + /* See if they are identical */ + bool is_ident = true; + for (int i = 0; i < size; i++) { + if (MPIR_Group_rank_to_lpid(group_ptr1, i) != MPIR_Group_rank_to_lpid(group_ptr2, i)) { + is_ident = false; + break; } - g1_idx = group_ptr1->lrank_to_lpid[g1_idx].next_lpid; - g2_idx = group_ptr2->lrank_to_lpid[g2_idx].next_lpid; } - /* See if the processes are in the same order by rank */
- size = group_ptr1->size; - for (i = 0; i < size; i++) { - if (group_ptr1->lrank_to_lpid[i].lpid != group_ptr2->lrank_to_lpid[i].lpid) { - *result = MPI_SIMILAR; - goto fn_exit; + if (is_ident) { + *result = MPI_IDENT; + goto fn_exit; + } + + /* See if they are similar */ + bool is_similar = true; + for (int i = 0; i < size; i++) { + MPIR_Lpid lpid = MPIR_Group_rank_to_lpid(group_ptr1, i); + if (MPI_UNDEFINED == MPIR_Group_lpid_to_rank(group_ptr2, lpid)) { + /* not found */ + is_similar = false; + break; } } - /* If we reach here, the groups are identical */ - *result = MPI_IDENT; + if (is_similar) { + *result = MPI_SIMILAR; + } else { + *result = MPI_UNEQUAL; + } fn_exit: return mpi_errno; @@ -87,71 +86,16 @@ int MPIR_Group_translate_ranks_impl(MPIR_Group * gp1, int n, const int ranks1[], MPIR_Group * gp2, int ranks2[]) { int mpi_errno = MPI_SUCCESS; - int i, g2_idx; - uint64_t l1_pid, l2_pid; - - MPL_DBG_MSG_S(MPIR_DBG_OTHER, VERBOSE, "gp2->is_local_dense_monotonic=%s", - (gp2->is_local_dense_monotonic ? "TRUE" : "FALSE")); - - /* Initialize the output ranks */ - for (i = 0; i < n; i++) - ranks2[i] = MPI_UNDEFINED; - - if (gp2->size > 0 && gp2->is_local_dense_monotonic) { - /* g2 probably == group_of(MPI_COMM_WORLD); use fast, constant-time lookup */ - uint64_t lpid_offset = gp2->lrank_to_lpid[0].lpid; - - for (i = 0; i < n; ++i) { - uint64_t g1_lpid; - if (ranks1[i] == MPI_PROC_NULL) { - ranks2[i] = MPI_PROC_NULL; - continue; - } - /* "adjusted" lpid from g1 */ - g1_lpid = gp1->lrank_to_lpid[ranks1[i]].lpid - lpid_offset; - if (g1_lpid < gp2->size) { - ranks2[i] = g1_lpid; - } - /* else leave UNDEFINED */ - } - } else { - /* general, slow path; lookup time is dependent on the user-provided rank values! 
*/ - g2_idx = gp2->idx_of_first_lpid; - if (g2_idx < 0) { - MPII_Group_setup_lpid_list(gp2); - g2_idx = gp2->idx_of_first_lpid; - } - if (g2_idx >= 0) { - /* g2_idx can be < 0 if the g2 group is empty */ - l2_pid = gp2->lrank_to_lpid[g2_idx].lpid; - for (i = 0; i < n; i++) { - if (ranks1[i] == MPI_PROC_NULL) { - ranks2[i] = MPI_PROC_NULL; - continue; - } - l1_pid = gp1->lrank_to_lpid[ranks1[i]].lpid; - /* Search for this l1_pid in group2. Use the following - * optimization: start from the last position in the lpid list - * if possible. A more sophisticated version could use a - * tree based or even hashed search to speed the translation. */ - if (l1_pid < l2_pid || g2_idx < 0) { - /* Start over from the beginning */ - g2_idx = gp2->idx_of_first_lpid; - l2_pid = gp2->lrank_to_lpid[g2_idx].lpid; - } - while (g2_idx >= 0 && l1_pid > l2_pid) { - g2_idx = gp2->lrank_to_lpid[g2_idx].next_lpid; - if (g2_idx >= 0) - l2_pid = gp2->lrank_to_lpid[g2_idx].lpid; - else - l2_pid = (uint64_t) - 1; - } - if (l1_pid == l2_pid) - ranks2[i] = g2_idx; - } + for (int i = 0; i < n; i++) { + if (ranks1[i] == MPI_PROC_NULL) { + ranks2[i] = MPI_PROC_NULL; + continue; } + MPIR_Lpid lpid = MPIR_Group_rank_to_lpid(gp1, ranks1[i]); + ranks2[i] = MPIR_Group_lpid_to_rank(gp2, lpid); } + return mpi_errno; } @@ -159,41 +103,34 @@ int MPIR_Group_excl_impl(MPIR_Group * group_ptr, int n, const int ranks[], MPIR_Group ** new_group_ptr) { int mpi_errno = MPI_SUCCESS; - int size, i, newi; - int *flags = NULL; - MPIR_FUNC_ENTER; - size = group_ptr->size; + int size = group_ptr->size; + int nnew = size - n; - /* Allocate a new group and lrank_to_lpid array */ - mpi_errno = MPIR_Group_create(size - n, new_group_ptr); - MPIR_ERR_CHECK(mpi_errno); - - (*new_group_ptr)->rank = MPI_UNDEFINED; /* Use flag fields to mark the members to *exclude* . 
*/ - - flags = MPL_calloc(size, sizeof(int), MPL_MEM_OTHER); - - for (i = 0; i < n; i++) { + int *flags = MPL_calloc(size, sizeof(int), MPL_MEM_OTHER); + for (int i = 0; i < n; i++) { flags[ranks[i]] = 1; } - newi = 0; - for (i = 0; i < size; i++) { + MPIR_Lpid *map = MPL_malloc(nnew * sizeof(MPIR_Lpid), MPL_MEM_GROUP); + MPIR_ERR_CHKANDJUMP(!map, mpi_errno, MPI_ERR_OTHER, "**nomem"); + + int myrank = MPI_UNDEFINED; + int newi = 0; + for (int i = 0; i < size; i++) { if (flags[i] == 0) { - (*new_group_ptr)->lrank_to_lpid[newi].lpid = group_ptr->lrank_to_lpid[i].lpid; - if (group_ptr->rank == i) - (*new_group_ptr)->rank = newi; + map[newi] = MPIR_Group_rank_to_lpid(group_ptr, i); + if (group_ptr->rank == i) { + myrank = newi; + } newi++; } } - (*new_group_ptr)->size = size - n; - (*new_group_ptr)->idx_of_first_lpid = -1; - /* TODO calculate is_local_dense_monotonic */ - - MPIR_Group_set_session_ptr(*new_group_ptr, group_ptr->session_ptr); + mpi_errno = MPIR_Group_create_map(nnew, myrank, group_ptr->session_ptr, map, new_group_ptr); + MPIR_ERR_CHECK(mpi_errno); fn_exit: MPL_free(flags); @@ -207,8 +144,6 @@ int MPIR_Group_incl_impl(MPIR_Group * group_ptr, int n, const int ranks[], MPIR_Group ** new_group_ptr) { int mpi_errno = MPI_SUCCESS; - int i; - MPIR_FUNC_ENTER; if (n == 0) { @@ -216,23 +151,20 @@ int MPIR_Group_incl_impl(MPIR_Group * group_ptr, int n, const int ranks[], goto fn_exit; } - /* Allocate a new group and lrank_to_lpid array */ - mpi_errno = MPIR_Group_create(n, new_group_ptr); - if (mpi_errno) - goto fn_fail; + int nnew = n; + MPIR_Lpid *map = MPL_malloc(nnew * sizeof(MPIR_Lpid), MPL_MEM_OTHER); + MPIR_ERR_CHKANDJUMP(!map, mpi_errno, MPI_ERR_OTHER, "**nomem"); - (*new_group_ptr)->rank = MPI_UNDEFINED; - for (i = 0; i < n; i++) { - (*new_group_ptr)->lrank_to_lpid[i].lpid = group_ptr->lrank_to_lpid[ranks[i]].lpid; - if (ranks[i] == group_ptr->rank) - (*new_group_ptr)->rank = i; + int myrank = MPI_UNDEFINED; + for (int i = 0; i < n; i++) { + map[i] = 
MPIR_Group_rank_to_lpid(group_ptr, ranks[i]); + if (ranks[i] == group_ptr->rank) { + myrank = i; + } } - (*new_group_ptr)->size = n; - (*new_group_ptr)->idx_of_first_lpid = -1; - /* TODO calculate is_local_dense_monotonic */ - - MPIR_Group_set_session_ptr(*new_group_ptr, group_ptr->session_ptr); + mpi_errno = MPIR_Group_create_map(nnew, myrank, group_ptr->session_ptr, map, new_group_ptr); + MPIR_ERR_CHECK(mpi_errno); fn_exit: MPIR_FUNC_EXIT; @@ -245,17 +177,15 @@ int MPIR_Group_range_excl_impl(MPIR_Group * group_ptr, int n, int ranges[][3], MPIR_Group ** new_group_ptr) { int mpi_errno = MPI_SUCCESS; - int size, i, j, k, nnew, first, last, stride; - int *flags = NULL; - MPIR_FUNC_ENTER; + /* Compute size, assuming that included ranks are valid (and distinct) */ - size = group_ptr->size; - nnew = 0; - for (i = 0; i < n; i++) { - first = ranges[i][0]; - last = ranges[i][1]; - stride = ranges[i][2]; + int size = group_ptr->size; + int nnew = 0; + for (int i = 0; i < n; i++) { + int first = ranges[i][0]; + int last = ranges[i][1]; + int stride = ranges[i][2]; /* works for stride of either sign. Error checking above * has already guaranteed stride != 0 */ nnew += 1 + (last - first) / stride; @@ -267,15 +197,6 @@ int MPIR_Group_range_excl_impl(MPIR_Group * group_ptr, int n, int ranges[][3], goto fn_exit; } - /* Allocate a new group and lrank_to_lpid array */ - mpi_errno = MPIR_Group_create(nnew, new_group_ptr); - /* --BEGIN ERROR HANDLING-- */ - if (mpi_errno) { - goto fn_fail; - } - /* --END ERROR HANDLING-- */ - (*new_group_ptr)->rank = MPI_UNDEFINED; - /* Group members are taken in rank order from the original group, * with the specified members removed. Use the flag array for that * purpose. If this was a critical routine, we could use the @@ -283,41 +204,46 @@ int MPIR_Group_range_excl_impl(MPIR_Group * group_ptr, int n, int ranges[][3], * was enabled *and* we are not MPI_THREAD_MULTIPLE, but since this * is a low-usage routine, we haven't taken that optimization. 
*/ - flags = MPL_calloc(size, sizeof(int), MPL_MEM_OTHER); + int *flags = MPL_calloc(size, sizeof(int), MPL_MEM_OTHER); - for (i = 0; i < n; i++) { - first = ranges[i][0]; - last = ranges[i][1]; - stride = ranges[i][2]; + for (int i = 0; i < n; i++) { + int first = ranges[i][0]; + int last = ranges[i][1]; + int stride = ranges[i][2]; if (stride > 0) { - for (j = first; j <= last; j += stride) { + for (int j = first; j <= last; j += stride) { flags[j] = 1; } } else { - for (j = first; j >= last; j += stride) { + for (int j = first; j >= last; j += stride) { flags[j] = 1; } } } + /* Now, run through the group and pick up the members that were * not excluded */ - k = 0; - for (i = 0; i < size; i++) { + MPIR_Lpid *map = MPL_malloc(nnew * sizeof(MPIR_Lpid), MPL_MEM_GROUP); + MPIR_ERR_CHKANDJUMP(!map, mpi_errno, MPI_ERR_OTHER, "**nomem"); + + int myrank = MPI_UNDEFINED; + int k = 0; + for (int i = 0; i < size; i++) { if (!flags[i]) { - (*new_group_ptr)->lrank_to_lpid[k].lpid = group_ptr->lrank_to_lpid[i].lpid; + map[k] = MPIR_Group_rank_to_lpid(group_ptr, i); if (group_ptr->rank == i) { - (*new_group_ptr)->rank = k; + myrank = k; } k++; } } - /* TODO calculate is_local_dense_monotonic */ + MPL_free(flags); - MPIR_Group_set_session_ptr(*new_group_ptr, group_ptr->session_ptr); + mpi_errno = MPIR_Group_create_map(nnew, myrank, group_ptr->session_ptr, map, new_group_ptr); + MPIR_ERR_CHECK(mpi_errno); fn_exit: - MPL_free(flags); MPIR_FUNC_EXIT; return mpi_errno; fn_fail: @@ -328,16 +254,14 @@ int MPIR_Group_range_incl_impl(MPIR_Group * group_ptr, int n, int ranges[][3], MPIR_Group ** new_group_ptr) { int mpi_errno = MPI_SUCCESS; - int first, last, stride, nnew, i, j, k; - MPIR_FUNC_ENTER; /* Compute size, assuming that included ranks are valid (and distinct) */ - nnew = 0; - for (i = 0; i < n; i++) { - first = ranges[i][0]; - last = ranges[i][1]; - stride = ranges[i][2]; + int nnew = 0; + for (int i = 0; i < n; i++) { + int first = ranges[i][0]; + int last = ranges[i][1]; + 
int stride = ranges[i][2]; /* works for stride of either sign. Error checking above * has already guaranteed stride != 0 */ nnew += 1 + (last - first) / stride; @@ -348,40 +272,39 @@ int MPIR_Group_range_incl_impl(MPIR_Group * group_ptr, int n, int ranges[][3], goto fn_exit; } - /* Allocate a new group and lrank_to_lpid array */ - mpi_errno = MPIR_Group_create(nnew, new_group_ptr); - if (mpi_errno) - goto fn_fail; - (*new_group_ptr)->rank = MPI_UNDEFINED; + MPIR_Lpid *map = MPL_malloc(nnew * sizeof(MPIR_Lpid), MPL_MEM_GROUP); + MPIR_ERR_CHKANDJUMP(!map, mpi_errno, MPI_ERR_OTHER, "**nomem"); /* Group members taken in order specified by the range array */ /* This could be integrated with the error checking, but since this * is a low-usage routine, we haven't taken that optimization */ - k = 0; - for (i = 0; i < n; i++) { - first = ranges[i][0]; - last = ranges[i][1]; - stride = ranges[i][2]; + int myrank = MPI_UNDEFINED; + int k = 0; + for (int i = 0; i < n; i++) { + int first = ranges[i][0]; + int last = ranges[i][1]; + int stride = ranges[i][2]; if (stride > 0) { - for (j = first; j <= last; j += stride) { - (*new_group_ptr)->lrank_to_lpid[k].lpid = group_ptr->lrank_to_lpid[j].lpid; - if (j == group_ptr->rank) - (*new_group_ptr)->rank = k; + for (int j = first; j <= last; j += stride) { + map[k] = MPIR_Group_rank_to_lpid(group_ptr, j); + if (j == group_ptr->rank) { + myrank = k; + } k++; } } else { - for (j = first; j >= last; j += stride) { - (*new_group_ptr)->lrank_to_lpid[k].lpid = group_ptr->lrank_to_lpid[j].lpid; - if (j == group_ptr->rank) - (*new_group_ptr)->rank = k; + for (int j = first; j >= last; j += stride) { + map[k] = MPIR_Group_rank_to_lpid(group_ptr, j); + if (j == group_ptr->rank) { + myrank = k; + } k++; } } } - /* TODO calculate is_local_dense_monotonic */ - - MPIR_Group_set_session_ptr(*new_group_ptr, group_ptr->session_ptr); + mpi_errno = MPIR_Group_create_map(nnew, myrank, group_ptr->session_ptr, map, new_group_ptr); + 
MPIR_ERR_CHECK(mpi_errno); fn_exit: MPIR_FUNC_EXIT; @@ -394,69 +317,32 @@ int MPIR_Group_difference_impl(MPIR_Group * group_ptr1, MPIR_Group * group_ptr2, MPIR_Group ** new_group_ptr) { int mpi_errno = MPI_SUCCESS; - int size1, i, k, g1_idx, g2_idx, nnew; - uint64_t l1_pid, l2_pid; - int *flags = NULL; - MPIR_FUNC_ENTER; - /* Return a group consisting of the members of group1 that are *not* - * in group2 */ - size1 = group_ptr1->size; - /* Insure that the lpid lists are setup */ - MPIR_Group_setup_lpid_pairs(group_ptr1, group_ptr2); - - flags = MPL_calloc(size1, sizeof(int), MPL_MEM_OTHER); - - g1_idx = group_ptr1->idx_of_first_lpid; - g2_idx = group_ptr2->idx_of_first_lpid; - - nnew = size1; - while (g1_idx >= 0 && g2_idx >= 0) { - l1_pid = group_ptr1->lrank_to_lpid[g1_idx].lpid; - l2_pid = group_ptr2->lrank_to_lpid[g2_idx].lpid; - if (l1_pid < l2_pid) { - g1_idx = group_ptr1->lrank_to_lpid[g1_idx].next_lpid; - } else if (l1_pid > l2_pid) { - g2_idx = group_ptr2->lrank_to_lpid[g2_idx].next_lpid; - } else { - /* Equal */ - flags[g1_idx] = 1; - g1_idx = group_ptr1->lrank_to_lpid[g1_idx].next_lpid; - g2_idx = group_ptr2->lrank_to_lpid[g2_idx].next_lpid; - nnew--; - } - } - /* Create the group */ - if (nnew == 0) { - /* See 5.3.2, Group Constructors. 
For many group routines, - * the standard explicitly says to return MPI_GROUP_EMPTY; - * for others it is implied */ - *new_group_ptr = MPIR_Group_empty; - goto fn_exit; - } else { - mpi_errno = MPIR_Group_create(nnew, new_group_ptr); - /* --BEGIN ERROR HANDLING-- */ - if (mpi_errno) { - goto fn_fail; - } - /* --END ERROR HANDLING-- */ - (*new_group_ptr)->rank = MPI_UNDEFINED; - k = 0; - for (i = 0; i < size1; i++) { - if (!flags[i]) { - (*new_group_ptr)->lrank_to_lpid[k].lpid = group_ptr1->lrank_to_lpid[i].lpid; - if (i == group_ptr1->rank) - (*new_group_ptr)->rank = k; - k++; + + MPIR_Assert(group_ptr1->session_ptr == group_ptr2->session_ptr); + + MPIR_Lpid *map = MPL_malloc(group_ptr1->size * sizeof(MPIR_Lpid), MPL_MEM_GROUP); + MPIR_ERR_CHKANDJUMP(!map, mpi_errno, MPI_ERR_OTHER, "**nomem"); + + int nnew = 0; + int myrank = MPI_UNDEFINED; + /* For each rank in group1, search it in group2. */ + for (int i = 0; i < group_ptr1->size; i++) { + MPIR_Lpid lpid = MPIR_Group_rank_to_lpid(group_ptr1, i); + if (MPI_UNDEFINED == MPIR_Group_lpid_to_rank(group_ptr2, lpid)) { + /* not found */ + if (i == group_ptr1->rank) { + myrank = nnew; } + map[nnew++] = lpid; } - /* TODO calculate is_local_dense_monotonic */ } - MPIR_Group_set_session_ptr(*new_group_ptr, group_ptr1->session_ptr); + /* Create the group */ + mpi_errno = MPIR_Group_create_map(nnew, myrank, group_ptr1->session_ptr, map, new_group_ptr); + MPIR_ERR_CHECK(mpi_errno); fn_exit: - MPL_free(flags); MPIR_FUNC_EXIT; return mpi_errno; fn_fail: @@ -467,69 +353,34 @@ int MPIR_Group_intersection_impl(MPIR_Group * group_ptr1, MPIR_Group * group_ptr MPIR_Group ** new_group_ptr) { int mpi_errno = MPI_SUCCESS; - int size1, i, k, g1_idx, g2_idx, nnew; - uint64_t l1_pid, l2_pid; - int *flags = NULL; - MPIR_FUNC_ENTER; - /* Return a group consisting of the members of group1 that are - * in group2 */ - size1 = group_ptr1->size; - /* Insure that the lpid lists are setup */ - MPIR_Group_setup_lpid_pairs(group_ptr1, group_ptr2); - 
- flags = MPL_calloc(size1, sizeof(int), MPL_MEM_OTHER); - - g1_idx = group_ptr1->idx_of_first_lpid; - g2_idx = group_ptr2->idx_of_first_lpid; - - nnew = 0; - while (g1_idx >= 0 && g2_idx >= 0) { - l1_pid = group_ptr1->lrank_to_lpid[g1_idx].lpid; - l2_pid = group_ptr2->lrank_to_lpid[g2_idx].lpid; - if (l1_pid < l2_pid) { - g1_idx = group_ptr1->lrank_to_lpid[g1_idx].next_lpid; - } else if (l1_pid > l2_pid) { - g2_idx = group_ptr2->lrank_to_lpid[g2_idx].next_lpid; - } else { - /* Equal */ - flags[g1_idx] = 1; - g1_idx = group_ptr1->lrank_to_lpid[g1_idx].next_lpid; - g2_idx = group_ptr2->lrank_to_lpid[g2_idx].next_lpid; - nnew++; - } - } - /* Create the group. Handle the trivial case first */ - if (nnew == 0) { - *new_group_ptr = MPIR_Group_empty; - goto fn_exit; - } - mpi_errno = MPIR_Group_create(nnew, new_group_ptr); - MPIR_ERR_CHECK(mpi_errno); + /* Similar to MPI_Group_difference, but take the ranks that are found in group2 */ - (*new_group_ptr)->rank = MPI_UNDEFINED; - (*new_group_ptr)->is_local_dense_monotonic = TRUE; - k = 0; - for (i = 0; i < size1; i++) { - if (flags[i]) { - uint64_t lpid = group_ptr1->lrank_to_lpid[i].lpid; - (*new_group_ptr)->lrank_to_lpid[k].lpid = lpid; - if (i == group_ptr1->rank) - (*new_group_ptr)->rank = k; - if (lpid > MPIR_Process.size || - (k > 0 && (*new_group_ptr)->lrank_to_lpid[k - 1].lpid != (lpid - 1))) { - (*new_group_ptr)->is_local_dense_monotonic = FALSE; - } + MPIR_Assert(group_ptr1->session_ptr == group_ptr2->session_ptr); - k++; + MPIR_Lpid *map = MPL_malloc(group_ptr1->size * sizeof(MPIR_Lpid), MPL_MEM_GROUP); + MPIR_ERR_CHKANDJUMP(!map, mpi_errno, MPI_ERR_OTHER, "**nomem"); + + int nnew = 0; + int myrank = MPI_UNDEFINED; + /* For each rank in group1, search it in group2. 
*/ + for (int i = 0; i < group_ptr1->size; i++) { + MPIR_Lpid lpid = MPIR_Group_rank_to_lpid(group_ptr1, i); + if (MPI_UNDEFINED != MPIR_Group_lpid_to_rank(group_ptr2, lpid)) { + /* found */ + if (i == group_ptr1->rank) { + myrank = nnew; + } + map[nnew++] = lpid; } } - MPIR_Group_set_session_ptr(*new_group_ptr, group_ptr1->session_ptr); + /* Create the group */ + mpi_errno = MPIR_Group_create_map(nnew, myrank, group_ptr1->session_ptr, map, new_group_ptr); + MPIR_ERR_CHECK(mpi_errno); fn_exit: - MPL_free(flags); MPIR_FUNC_EXIT; return mpi_errno; fn_fail: @@ -540,104 +391,41 @@ int MPIR_Group_union_impl(MPIR_Group * group_ptr1, MPIR_Group * group_ptr2, MPIR_Group ** new_group_ptr) { int mpi_errno = MPI_SUCCESS; - int g1_idx, g2_idx, nnew, i, k, size1, size2; - uint64_t mylpid; - int *flags = NULL; - MPIR_FUNC_ENTER; - /* Determine the size of the new group. The new group consists of all - * members of group1 plus the members of group2 that are not in group1. - */ - g1_idx = group_ptr1->idx_of_first_lpid; - g2_idx = group_ptr2->idx_of_first_lpid; - - /* If the lpid list hasn't been created, do it now */ - if (g1_idx < 0) { - MPII_Group_setup_lpid_list(group_ptr1); - g1_idx = group_ptr1->idx_of_first_lpid; - } - if (g2_idx < 0) { - MPII_Group_setup_lpid_list(group_ptr2); - g2_idx = group_ptr2->idx_of_first_lpid; - } - nnew = group_ptr1->size; - - /* Clear the flag bits on the second group. 
The flag is set if - * a member of the second group belongs to the union */ - size2 = group_ptr2->size; - flags = MPL_calloc(size2, sizeof(int), MPL_MEM_OTHER); - - /* Loop through the lists that are ordered by lpid (local process - * id) to detect which processes in group 2 are not in group 1 - */ - while (g1_idx >= 0 && g2_idx >= 0) { - uint64_t l1_pid, l2_pid; - l1_pid = group_ptr1->lrank_to_lpid[g1_idx].lpid; - l2_pid = group_ptr2->lrank_to_lpid[g2_idx].lpid; - if (l1_pid > l2_pid) { - nnew++; - flags[g2_idx] = 1; - g2_idx = group_ptr2->lrank_to_lpid[g2_idx].next_lpid; - } else if (l1_pid == l2_pid) { - g1_idx = group_ptr1->lrank_to_lpid[g1_idx].next_lpid; - g2_idx = group_ptr2->lrank_to_lpid[g2_idx].next_lpid; - } else { - /* l1 < l2 */ - g1_idx = group_ptr1->lrank_to_lpid[g1_idx].next_lpid; - } - } - /* If we hit the end of group1, add the remaining members of group 2 */ - while (g2_idx >= 0) { - nnew++; - flags[g2_idx] = 1; - g2_idx = group_ptr2->lrank_to_lpid[g2_idx].next_lpid; - } - - if (nnew == 0) { - *new_group_ptr = MPIR_Group_empty; - goto fn_exit; - } + MPIR_Assert(group_ptr1->session_ptr == group_ptr2->session_ptr); - /* Allocate a new group and lrank_to_lpid array */ - mpi_errno = MPIR_Group_create(nnew, new_group_ptr); - MPIR_ERR_CHECK(mpi_errno); + MPIR_Lpid *map = MPL_malloc((group_ptr1->size + group_ptr2->size) * sizeof(MPIR_Lpid), + MPL_MEM_GROUP); + MPIR_ERR_CHKANDJUMP(!map, mpi_errno, MPI_ERR_OTHER, "**nomem"); /* If this process is in group1, then we can set the rank now. 
* If we are not in this group, this assignment will set the * current rank to MPI_UNDEFINED */ - (*new_group_ptr)->rank = group_ptr1->rank; + int myrank = group_ptr1->rank; /* Add group1 */ - size1 = group_ptr1->size; - for (i = 0; i < size1; i++) { - (*new_group_ptr)->lrank_to_lpid[i].lpid = group_ptr1->lrank_to_lpid[i].lpid; + for (int i = 0; i < group_ptr1->size; i++) { + map[i] = MPIR_Group_rank_to_lpid(group_ptr1, i); } /* Add members of group2 that are not in group 1 */ - - if (group_ptr1->rank == MPI_UNDEFINED && group_ptr2->rank >= 0) { - mylpid = group_ptr2->lrank_to_lpid[group_ptr2->rank].lpid; - } else { - mylpid = (uint64_t) - 2; - } - k = size1; - for (i = 0; i < size2; i++) { - if (flags[i]) { - (*new_group_ptr)->lrank_to_lpid[k].lpid = group_ptr2->lrank_to_lpid[i].lpid; - if ((*new_group_ptr)->rank == MPI_UNDEFINED && - group_ptr2->lrank_to_lpid[i].lpid == mylpid) - (*new_group_ptr)->rank = k; - k++; + int nnew = group_ptr1->size; + for (int i = 0; i < group_ptr2->size; i++) { + MPIR_Lpid lpid = MPIR_Group_rank_to_lpid(group_ptr2, i); + if (MPI_UNDEFINED == MPIR_Group_lpid_to_rank(group_ptr1, lpid)) { + /* not found */ + if (i == group_ptr2->rank) { + myrank = nnew; + } + map[nnew++] = lpid; } } - /* TODO calculate is_local_dense_monotonic */ - - MPIR_Group_set_session_ptr(*new_group_ptr, group_ptr1->session_ptr); + mpi_errno = MPIR_Group_create_map(nnew, myrank, group_ptr1->session_ptr, map, new_group_ptr); + MPIR_ERR_CHECK(mpi_errno); fn_exit: - MPL_free(flags); MPIR_FUNC_EXIT; return mpi_errno; fn_fail: @@ -648,40 +436,19 @@ int MPIR_Group_from_session_pset_impl(MPIR_Session * session_ptr, const char *ps MPIR_Group ** new_group_ptr) { int mpi_errno = MPI_SUCCESS; - MPIR_Group *group_ptr; if (MPL_stricmp(pset_name, "mpi://WORLD") == 0) { - mpi_errno = MPIR_Group_create(MPIR_Process.size, &group_ptr); + mpi_errno = MPIR_Group_create_stride(MPIR_Process.size, MPIR_Process.rank, session_ptr, + 0, 1, 1, new_group_ptr); MPIR_ERR_CHECK(mpi_errno); - - 
group_ptr->size = MPIR_Process.size; - group_ptr->rank = MPIR_Process.rank; - group_ptr->is_local_dense_monotonic = TRUE; - for (int i = 0; i < group_ptr->size; i++) { - group_ptr->lrank_to_lpid[i].lpid = i; - group_ptr->lrank_to_lpid[i].next_lpid = i + 1; - } - group_ptr->lrank_to_lpid[group_ptr->size - 1].next_lpid = -1; - group_ptr->idx_of_first_lpid = 0; } else if (MPL_stricmp(pset_name, "mpi://SELF") == 0) { - mpi_errno = MPIR_Group_create(1, &group_ptr); + mpi_errno = MPIR_Group_create_stride(1, 0, session_ptr, 0, 1, 1, new_group_ptr); MPIR_ERR_CHECK(mpi_errno); - - group_ptr->size = 1; - group_ptr->rank = 0; - group_ptr->is_local_dense_monotonic = TRUE; - group_ptr->lrank_to_lpid[0].lpid = MPIR_Process.rank; - group_ptr->lrank_to_lpid[0].next_lpid = -1; - group_ptr->idx_of_first_lpid = 0; } else { /* TODO: Implement pset struct, locate pset struct ptr */ MPIR_ERR_SETANDSTMT(mpi_errno, MPI_ERR_ARG, goto fn_fail, "**psetinvalidname"); } - MPIR_Group_set_session_ptr(group_ptr, session_ptr); - - *new_group_ptr = group_ptr; - fn_exit: return mpi_errno; fn_fail: From 7ac4e213a624b3ca207d1db6a13eec6f26a77dcb Mon Sep 17 00:00:00 2001 From: Hui Zhou Date: Tue, 10 Dec 2024 13:14:43 -0600 Subject: [PATCH 09/59] group: refactor MPIR_Group * add option to use stride to describe group composition * remove the linked list design --- src/include/mpir_group.h | 56 +++---- src/mpi/comm/comm_impl.c | 3 - src/mpi/group/grouputil.c | 309 ++++++++++++-------------------------- 3 files changed, 122 insertions(+), 246 deletions(-) diff --git a/src/include/mpir_group.h b/src/include/mpir_group.h index f46659494b5..1148a8e8006 100644 --- a/src/include/mpir_group.h +++ b/src/include/mpir_group.h @@ -12,24 +12,6 @@ * MPI_Group_intersection) and for the scalable RMA synchronization *---------------------------------------------------------------------------*/ -/* Abstract the integer type for lpid (process id). 
It is possible to use 32-bit - * in principle, but 64-bit is simpler since we can trivially combine - * (world_idx, world_rank). - */ -typedef uint64_t MPIR_Lpid; - -/* This structure is used to implement the group operations such as - MPI_Group_translate_ranks */ -/* note: next_lpid (with idx_of_first_lpid in MPIR_Group) gives a linked list - * in a sorted lpid ascending order */ -typedef struct MPII_Group_pmap_t { - MPIR_Lpid lpid; /* local process id, from VCONN */ - int next_lpid; /* Index of next lpid (in lpid order) */ -} MPII_Group_pmap_t; - -/* Any changes in the MPIR_Group structure must be made to the - predefined value in MPIR_Group_builtin for MPI_GROUP_EMPTY in - src/mpi/group/grouputil.c */ /*S MPIR_Group - Description of the Group data structure @@ -60,22 +42,35 @@ typedef struct MPII_Group_pmap_t { Group-DS S*/ + +/* Abstract the integer type for lpid (process id). It is possible to use 32-bit + * in principle, but 64-bit is simpler since we can trivially combine + * (world_idx, world_rank). 
+ */ +typedef uint64_t MPIR_Lpid; + +struct MPIR_Pmap { + int size; /* same as group->size, duplicate here so Pmap is logically complete */ + bool use_map; + union { + MPIR_Lpid *map; + struct { + MPIR_Lpid offset; + MPIR_Lpid stride; + MPIR_Lpid blocksize; + } stride; + } u; +}; + struct MPIR_Group { MPIR_OBJECT_HEADER; /* adds handle and ref_count fields */ int size; /* Size of a group */ - int rank; /* rank of this process relative to this - * group */ - int idx_of_first_lpid; - MPII_Group_pmap_t *lrank_to_lpid; /* Array mapping a local rank to local - * process number */ - int is_local_dense_monotonic; /* see NOTE-G1 */ - - /* We may want some additional data for the RMA synchronization calls */ - /* Other, device-specific information */ + int rank; /* rank of this process relative to this group */ + struct MPIR_Pmap pmap; + MPIR_Session *session_ptr; /* Pointer to session to which this group belongs */ #ifdef MPID_DEV_GROUP_DECL MPID_DEV_GROUP_DECL #endif - MPIR_Session * session_ptr; /* Pointer to session to which this group belongs */ }; /* NOTE-G1: is_local_dense_monotonic will be true iff the group meets the @@ -104,10 +99,8 @@ extern MPIR_Group *const MPIR_Group_empty; #define MPIR_Group_release_ref(_group, _inuse) \ do { MPIR_Object_release_ref(_group, _inuse); } while (0) -void MPII_Group_setup_lpid_list(MPIR_Group *); int MPIR_Group_check_valid_ranks(MPIR_Group *, const int[], int); int MPIR_Group_check_valid_ranges(MPIR_Group *, int[][3], int); -void MPIR_Group_setup_lpid_pairs(MPIR_Group *, MPIR_Group *); int MPIR_Group_create(int, MPIR_Group **); int MPIR_Group_release(MPIR_Group * group_ptr); @@ -123,7 +116,4 @@ int MPIR_Group_check_subset(MPIR_Group * group_ptr, MPIR_Comm * comm_ptr); void MPIR_Group_set_session_ptr(MPIR_Group * group_ptr, MPIR_Session * session_out); int MPIR_Group_init(void); -/* internal functions */ -void MPII_Group_setup_lpid_list(MPIR_Group *); - #endif /* MPIR_GROUP_H_INCLUDED */ diff --git a/src/mpi/comm/comm_impl.c 
b/src/mpi/comm/comm_impl.c index 46f06b89762..56db002f58c 100644 --- a/src/mpi/comm/comm_impl.c +++ b/src/mpi/comm/comm_impl.c @@ -198,9 +198,6 @@ int MPII_Comm_create_calculate_mapping(MPIR_Group * group_ptr, * exactly the same as the ranks in comm world. */ - /* we examine the group's lpids in both the intracomm and non-comm_world cases */ - MPII_Group_setup_lpid_list(group_ptr); - /* Optimize for groups contained within MPI_COMM_WORLD. */ if (comm_ptr->comm_kind == MPIR_COMM_KIND__INTRACOMM) { int wsize; diff --git a/src/mpi/group/grouputil.c b/src/mpi/group/grouputil.c index 414c562fe3c..59c45561eca 100644 --- a/src/mpi/group/grouputil.c +++ b/src/mpi/group/grouputil.c @@ -28,10 +28,9 @@ int MPIR_Group_init(void) MPIR_Object_set_ref(&MPIR_Group_builtin[0], 1); MPIR_Group_builtin[0].size = 0; MPIR_Group_builtin[0].rank = MPI_UNDEFINED; - MPIR_Group_builtin[0].idx_of_first_lpid = -1; - MPIR_Group_builtin[0].lrank_to_lpid = NULL; + MPIR_Group_builtin[0].session_ptr = NULL; + memset(&MPIR_Group_builtin[0].pmap, 0, sizeof(struct MPIR_Pmap)); - /* TODO hook for device here? */ return mpi_errno; } @@ -44,7 +43,9 @@ int MPIR_Group_release(MPIR_Group * group_ptr) MPIR_Group_release_ref(group_ptr, &inuse); if (!inuse) { /* Only if refcount is 0 do we actually free. 
*/ - MPL_free(group_ptr->lrank_to_lpid); + if (group_ptr->pmap.use_map) { + MPL_free(group_ptr->pmap.u.map); + } if (group_ptr->session_ptr != NULL) { /* Release session */ MPIR_Session_release(group_ptr->session_ptr); @@ -73,24 +74,14 @@ int MPIR_Group_create(int nproc, MPIR_Group ** new_group_ptr) } /* --END ERROR HANDLING-- */ MPIR_Object_set_ref(*new_group_ptr, 1); - (*new_group_ptr)->lrank_to_lpid = - (MPII_Group_pmap_t *) MPL_calloc(nproc, sizeof(MPII_Group_pmap_t), MPL_MEM_GROUP); - /* --BEGIN ERROR HANDLING-- */ - if (!(*new_group_ptr)->lrank_to_lpid) { - MPIR_Handle_obj_free(&MPIR_Group_mem, *new_group_ptr); - *new_group_ptr = NULL; - MPIR_CHKMEM_SETERR(mpi_errno, nproc * sizeof(MPII_Group_pmap_t), "newgroup->lrank_to_lpid"); - return mpi_errno; - } - /* --END ERROR HANDLING-- */ - (*new_group_ptr)->size = nproc; - /* Make sure that there is no question that the list of ranks sorted - * by pids is marked as uninitialized */ - (*new_group_ptr)->idx_of_first_lpid = -1; - - (*new_group_ptr)->is_local_dense_monotonic = FALSE; + /* initialize fields */ + (*new_group_ptr)->size = nproc; + (*new_group_ptr)->rank = MPI_UNDEFINED; (*new_group_ptr)->session_ptr = NULL; + memset(&(*new_group_ptr)->pmap, 0, sizeof(struct MPIR_Pmap)); + (*new_group_ptr)->pmap.size = nproc; + return mpi_errno; } @@ -103,25 +94,25 @@ int MPIR_Group_create_map(int size, int rank, MPIR_Session * session_ptr, MPIR_L /* See 5.3.2, Group Constructors. 
For many group routines, * the standard explicitly says to return MPI_GROUP_EMPTY; * for others it is implied */ + MPL_free(map); *new_group_ptr = MPIR_Group_empty; goto fn_exit; - } + } else { + MPIR_Group *newgrp; + mpi_errno = MPIR_Group_create(size, &newgrp); + MPIR_ERR_CHECK(mpi_errno); - MPIR_Group *newgrp; - mpi_errno = MPIR_Group_create(size, &newgrp); - MPIR_ERR_CHECK(mpi_errno); + newgrp->rank = rank; + MPIR_Group_set_session_ptr(newgrp, session_ptr); - newgrp->rank = rank; - MPIR_Group_set_session_ptr(newgrp, session_ptr); + newgrp->pmap.use_map = true; + newgrp->pmap.u.map = map; - for (int i = 0; i < size; i++) { - newgrp->lrank_to_lpid[i].lpid = map[i]; + /* TODO: build hash to accelerate MPIR_Group_lpid_to_rank */ + *new_group_ptr = newgrp; } - *new_group_ptr = newgrp; - fn_exit: - MPL_free(map); return mpi_errno; fn_fail: goto fn_exit; @@ -132,176 +123,88 @@ int MPIR_Group_create_stride(int size, int rank, MPIR_Session * session_ptr, MPIR_Group ** new_group_ptr) { int mpi_errno = MPI_SUCCESS; - MPIR_Group *newgrp; - MPIR_Assert(size > 0); + if (size == 0) { + /* See 5.3.2, Group Constructors. 
For many group routines, + * the standard explicitly says to return MPI_GROUP_EMPTY; + * for others it is implied */ + *new_group_ptr = MPIR_Group_empty; + goto fn_exit; + } else { + MPIR_Group *newgrp; + mpi_errno = MPIR_Group_create(size, &newgrp); + MPIR_ERR_CHECK(mpi_errno); - mpi_errno = MPIR_Group_create(size, &newgrp); - MPIR_ERR_CHECK(mpi_errno); + newgrp->rank = rank; + MPIR_Group_set_session_ptr(newgrp, session_ptr); - newgrp->rank = rank; - MPIR_Group_set_session_ptr(newgrp, session_ptr); + newgrp->pmap.use_map = false; + newgrp->pmap.u.stride.offset = offset; + newgrp->pmap.u.stride.stride = stride; + newgrp->pmap.u.stride.blocksize = blocksize; - MPIR_Lpid lpid = offset; - int i = 0; - while (i < size) { - for (int j = 0; j < blocksize; j++) { - newgrp->lrank_to_lpid[i + j].lpid = lpid + j; - } - i += blocksize; - lpid += stride; + *new_group_ptr = newgrp; } - *new_group_ptr = newgrp; - fn_exit: return mpi_errno; fn_fail: goto fn_exit; } -MPIR_Lpid MPIR_Group_rank_to_lpid(MPIR_Group * group, int rank) +static MPIR_Lpid pmap_rank_to_lpid(struct MPIR_Pmap *pmap, int rank) { - return group->lrank_to_lpid[rank].lpid; -} + if (rank < 0 || rank >= pmap->size) { + return MPI_UNDEFINED; + } -int MPIR_Group_lpid_to_rank(MPIR_Group * group, MPIR_Lpid lpid) -{ - /* Use linear search for now. - * Optimization, build hash map in MPIR_Group_create_map and do O(1) hash lookup - */ - for (int i = 0; i < group->size; i++) { - if (lpid == group->lrank_to_lpid[i].lpid) { - return i; - } + if (pmap->use_map) { + return pmap->u.map[rank]; + } else { + MPIR_Lpid i_blk = rank / pmap->u.stride.blocksize; + MPIR_Lpid r_blk = rank % pmap->u.stride.blocksize; + return pmap->u.stride.offset + i_blk * pmap->u.stride.stride + r_blk; } - return MPI_UNDEFINED; } -/* - * return value is the first index in the list - * - * This "sorts" an lpid array by lpid value, using a simple merge sort - * algorithm. 
- * - * In actuality, it does not reorder the elements of maparray (these must remain - * in group rank order). Instead it builds the traversal order (in increasing - * lpid order) through the maparray given by the "next_lpid" fields. - */ -static int mergesort_lpidarray(MPII_Group_pmap_t maparray[], int n) +static int pmap_lpid_to_rank(struct MPIR_Pmap *pmap, MPIR_Lpid lpid) { - int idx1, idx2, first_idx, cur_idx, next_lpid, idx2_offset; - - if (n == 2) { - if (maparray[0].lpid > maparray[1].lpid) { - first_idx = 1; - maparray[0].next_lpid = -1; - maparray[1].next_lpid = 0; - } else { - first_idx = 0; - maparray[0].next_lpid = 1; - maparray[1].next_lpid = -1; + if (pmap->use_map) { + /* Use linear search for now. + * Optimization: build hash map in MPIR_Group_create_map and do O(1) hash lookup + */ + for (int rank = 0; rank < pmap->size; rank++) { + if (pmap->u.map[rank] == lpid) { + return rank; + } } - return first_idx; - } - if (n == 1) { - maparray[0].next_lpid = -1; - return 0; - } - if (n == 0) - return -1; - - /* Sort each half */ - idx2_offset = n / 2; - idx1 = mergesort_lpidarray(maparray, n / 2); - idx2 = mergesort_lpidarray(maparray + idx2_offset, n - n / 2) + idx2_offset; - /* merge the results */ - /* There are three lists: - * first_idx - points to the HEAD of the sorted, merged list - * cur_idx - points to the LAST element of the sorted, merged list - * idx1 - points to the HEAD of one sorted list - * idx2 - points to the HEAD of the other sorted list - * - * We first identify the head element of the sorted list. We then - * take elements from the remaining lists. When one list is empty, - * we add the other list to the end of sorted list. - * - * The last wrinkle is that the next_lpid fields in maparray[idx2] - * are relative to n/2, not 0 (that is, a next_lpid of 1 is - * really 1 + n/2, relative to the beginning of maparray). 
- */ - /* Find the head element */ - if (maparray[idx1].lpid > maparray[idx2].lpid) { - first_idx = idx2; - idx2 = maparray[idx2].next_lpid + idx2_offset; + return MPI_UNDEFINED; } else { - first_idx = idx1; - idx1 = maparray[idx1].next_lpid; - } + lpid -= pmap->u.stride.offset; + MPIR_Lpid i_blk = lpid / pmap->u.stride.stride; + MPIR_Lpid r_blk = lpid % pmap->u.stride.stride; - /* Merge the lists until one is empty */ - cur_idx = first_idx; - while (idx1 >= 0 && idx2 >= 0) { - if (maparray[idx1].lpid > maparray[idx2].lpid) { - next_lpid = maparray[idx2].next_lpid; - if (next_lpid >= 0) - next_lpid += idx2_offset; - maparray[cur_idx].next_lpid = idx2; - cur_idx = idx2; - idx2 = next_lpid; - } else { - next_lpid = maparray[idx1].next_lpid; - maparray[cur_idx].next_lpid = idx1; - cur_idx = idx1; - idx1 = next_lpid; + if (r_blk >= pmap->u.stride.blocksize) { + return MPI_UNDEFINED; } - } - /* Add whichever list remains */ - if (idx1 >= 0) { - maparray[cur_idx].next_lpid = idx1; - } else { - maparray[cur_idx].next_lpid = idx2; - /* Convert the rest of these next_lpid values to be - * relative to the beginning of maparray */ - while (idx2 >= 0) { - next_lpid = maparray[idx2].next_lpid; - if (next_lpid >= 0) { - next_lpid += idx2_offset; - maparray[idx2].next_lpid = next_lpid; - } - idx2 = next_lpid; + + int rank = i_blk * pmap->u.stride.blocksize + r_blk; + if (rank >= 0 && rank < pmap->size) { + return rank; + } else { + return MPI_UNDEFINED; } } - - return first_idx; } -/* - * Create a list of the lpids, in lpid order. - * - * Called by group_compare, group_translate_ranks, group_union - * - * In the case of a single main thread lock, the lock must - * be held on entry to this routine. This forces some of the routines - * noted above to hold the SINGLE_CS; which would otherwise not be required. 
- */ -void MPII_Group_setup_lpid_list(MPIR_Group * group_ptr) +int MPIR_Group_lpid_to_rank(MPIR_Group * group, MPIR_Lpid lpid) { - if (group_ptr->idx_of_first_lpid == -1) { - group_ptr->idx_of_first_lpid = - mergesort_lpidarray(group_ptr->lrank_to_lpid, group_ptr->size); - } + return pmap_lpid_to_rank(&group->pmap, lpid); } -void MPIR_Group_setup_lpid_pairs(MPIR_Group * group_ptr1, MPIR_Group * group_ptr2) +MPIR_Lpid MPIR_Group_rank_to_lpid(MPIR_Group * group, int rank) { - /* If the lpid list hasn't been created, do it now */ - if (group_ptr1->idx_of_first_lpid < 0) { - MPII_Group_setup_lpid_list(group_ptr1); - } - if (group_ptr2->idx_of_first_lpid < 0) { - MPII_Group_setup_lpid_list(group_ptr2); - } + return pmap_rank_to_lpid(&group->pmap, rank); } #ifdef HAVE_ERROR_CHECKING @@ -439,54 +342,40 @@ int MPIR_Group_check_valid_ranges(MPIR_Group * group_ptr, int ranges[][3], int n int MPIR_Group_check_subset(MPIR_Group * group_ptr, MPIR_Comm * comm_ptr) { int mpi_errno = MPI_SUCCESS; - int g1_idx, g2_idx, l1_pid, l2_pid, i; - MPII_Group_pmap_t *vmap = 0; + int vsize = comm_ptr->comm_kind == MPIR_COMM_KIND__INTERCOMM ? 
comm_ptr->local_size : comm_ptr->remote_size; - MPIR_CHKLMEM_DECL(1); - - MPIR_Assert(group_ptr != NULL); - - MPIR_CHKLMEM_MALLOC(vmap, MPII_Group_pmap_t *, - vsize * sizeof(MPII_Group_pmap_t), mpi_errno, "", MPL_MEM_GROUP); /* Initialize the vmap */ - for (i = 0; i < vsize; i++) { - MPID_Comm_get_lpid(comm_ptr, i, &vmap[i].lpid, FALSE); - vmap[i].next_lpid = 0; + MPIR_Lpid *vmap = MPL_malloc(vsize * sizeof(MPIR_Lpid), MPL_MEM_GROUP); + for (int i = 0; i < vsize; i++) { + /* FIXME: MPID_Comm_get_lpid to be removed */ + uint64_t dev_lpid; + MPID_Comm_get_lpid(comm_ptr, i, &dev_lpid, FALSE); + MPIR_Assert((dev_lpid >> 32) == 0); + vmap[i] = dev_lpid; } - MPII_Group_setup_lpid_list(group_ptr); - g1_idx = group_ptr->idx_of_first_lpid; - g2_idx = mergesort_lpidarray(vmap, vsize); - MPL_DBG_MSG_FMT(MPIR_DBG_COMM, VERBOSE, (MPL_DBG_FDEST, - "initial indices: %d %d\n", g1_idx, g2_idx)); - while (g1_idx >= 0 && g2_idx >= 0) { - l1_pid = group_ptr->lrank_to_lpid[g1_idx].lpid; - l2_pid = vmap[g2_idx].lpid; - MPL_DBG_MSG_FMT(MPIR_DBG_COMM, VERBOSE, (MPL_DBG_FDEST, - "Lpids are %d, %d\n", l1_pid, l2_pid)); - if (l1_pid < l2_pid) { - /* If we have to advance g1, we didn't find a match, so - * that's an error. 
*/ - break; - } else if (l1_pid > l2_pid) { - g2_idx = vmap[g2_idx].next_lpid; - } else { - /* Equal */ - g1_idx = group_ptr->lrank_to_lpid[g1_idx].next_lpid; - g2_idx = vmap[g2_idx].next_lpid; + for (int rank = 0; rank < group_ptr->size; rank++) { + MPIR_Lpid lpid = MPIR_Group_rank_to_lpid(group_ptr, rank); + bool found = false; + for (int i = 0; i < vsize; i++) { + if (vmap[i] == lpid) { + found = true; + break; + } + } + if (!found) { + MPIR_ERR_SET1(mpi_errno, MPI_ERR_GROUP, "**groupnotincomm", + "**groupnotincomm %d", rank); + goto fn_fail; } - MPL_DBG_MSG_FMT(MPIR_DBG_COMM, VERBOSE, (MPL_DBG_FDEST, - "g1 = %d, g2 = %d\n", g1_idx, g2_idx)); - } - - if (g1_idx >= 0) { - MPIR_ERR_SET1(mpi_errno, MPI_ERR_GROUP, "**groupnotincomm", "**groupnotincomm %d", g1_idx); } - fn_fail: - MPIR_CHKLMEM_FREEALL(); + fn_exit: + MPL_free(vmap); return mpi_errno; + fn_fail: + goto fn_exit; } #endif /* HAVE_ERROR_CHECKING */ From 928feb939617e6fa6dbd78c9554f856bcb07c1b7 Mon Sep 17 00:00:00 2001 From: Hui Zhou Date: Wed, 11 Dec 2024 09:20:39 -0600 Subject: [PATCH 10/59] ---- START HERE ---- --- dummy | 1 + 1 file changed, 1 insertion(+) create mode 100644 dummy diff --git a/dummy b/dummy new file mode 100644 index 00000000000..d00491fd7e5 --- /dev/null +++ b/dummy @@ -0,0 +1 @@ +1 From 1685d2bb24cbeac778c0cb53af2e167df25edbce Mon Sep 17 00:00:00 2001 From: Hui Zhou Date: Wed, 11 Dec 2024 19:14:31 -0600 Subject: [PATCH 11/59] mpid/ch4: remove MPIDI_NM_comm_get_gpid This is the same as MPID_Comm_get_lpid. NOTE: we will remove MPID_Comm_get_lpid as well once we move the ownership of lpid to the MPIR-layer.
--- src/mpid/ch4/ch4_api.txt | 4 ---- src/mpid/ch4/netmod/ofi/ofi_proc.h | 16 ---------------- src/mpid/ch4/netmod/ucx/ucx_proc.h | 17 ----------------- src/mpid/ch4/src/ch4_impl.h | 6 +++--- src/mpid/ch4/src/ch4_init.c | 2 +- 5 files changed, 4 insertions(+), 41 deletions(-) diff --git a/src/mpid/ch4/ch4_api.txt b/src/mpid/ch4/ch4_api.txt index c1778e546ff..e9a2e2b7e5d 100644 --- a/src/mpid/ch4/ch4_api.txt +++ b/src/mpid/ch4/ch4_api.txt @@ -87,8 +87,6 @@ Non Native API: am_tag_recv : int NM*: rank, comm, handler_id, tag, buf-2, count, datatype, src_vci, dst_vci, rreq SHM*: rank, comm, handler_id, tag, buf-2, count, datatype, src_vci, dst_vci, rreq - comm_get_gpid : int - NM*: comm_ptr, idx, gpid_ptr, is_remote get_local_upids : int NM : comm, local_upid_size, local_upids upids_to_gpids : int @@ -477,8 +475,6 @@ PARAM: local_upid_size: int ** local_upids: char ** lock_type: int - gpid_ptr: uint64_t * - lpids: const int[] made_progress: int * message: MPIR_Request * message_p: MPIR_Request ** diff --git a/src/mpid/ch4/netmod/ofi/ofi_proc.h b/src/mpid/ch4/netmod/ofi/ofi_proc.h index b23e6ec531d..c7ab1f2fb7f 100644 --- a/src/mpid/ch4/netmod/ofi/ofi_proc.h +++ b/src/mpid/ch4/netmod/ofi/ofi_proc.h @@ -20,20 +20,4 @@ MPL_STATIC_INLINE_PREFIX int MPIDI_NM_rank_is_local(int rank, MPIR_Comm * comm) return ret; } -MPL_STATIC_INLINE_PREFIX int MPIDI_NM_comm_get_gpid(MPIR_Comm * comm_ptr, - int idx, uint64_t * gpid_ptr, bool is_remote) -{ - int avtid = 0, lpid = 0; - if (comm_ptr->comm_kind == MPIR_COMM_KIND__INTRACOMM) - MPIDIU_comm_rank_to_pid(comm_ptr, idx, &lpid, &avtid); - else if (is_remote) - MPIDIU_comm_rank_to_pid(comm_ptr, idx, &lpid, &avtid); - else { - MPIDIU_comm_rank_to_pid_local(comm_ptr, idx, &lpid, &avtid); - } - - *gpid_ptr = MPIDIU_GPID_CREATE(avtid, lpid); - return MPI_SUCCESS; -} - #endif /* OFI_PROC_H_INCLUDED */ diff --git a/src/mpid/ch4/netmod/ucx/ucx_proc.h b/src/mpid/ch4/netmod/ucx/ucx_proc.h index 066670c014a..b8481ffd6a6 100644 --- 
a/src/mpid/ch4/netmod/ucx/ucx_proc.h +++ b/src/mpid/ch4/netmod/ucx/ucx_proc.h @@ -19,21 +19,4 @@ MPL_STATIC_INLINE_PREFIX int MPIDI_NM_rank_is_local(int rank, MPIR_Comm * comm) return ret; } -MPL_STATIC_INLINE_PREFIX int MPIDI_NM_comm_get_gpid(MPIR_Comm * comm_ptr, - int idx, uint64_t * gpid_ptr, bool is_remote) -{ - int avtid = 0, lpid = 0; - if (comm_ptr->comm_kind == MPIR_COMM_KIND__INTRACOMM) { - MPIDIU_comm_rank_to_pid(comm_ptr, idx, &lpid, &avtid); - } else if (is_remote) { - MPIDIU_comm_rank_to_pid(comm_ptr, idx, &lpid, &avtid); - } else { - MPIDIU_comm_rank_to_pid_local(comm_ptr, idx, &lpid, &avtid); - } - - *gpid_ptr = MPIDIU_GPID_CREATE(avtid, lpid); - return MPI_SUCCESS; - -} - #endif /* UCX_PROC_H_INCLUDED */ diff --git a/src/mpid/ch4/src/ch4_impl.h b/src/mpid/ch4/src/ch4_impl.h index 2f5a31dc767..6fa918db043 100644 --- a/src/mpid/ch4/src/ch4_impl.h +++ b/src/mpid/ch4/src/ch4_impl.h @@ -378,17 +378,17 @@ MPL_STATIC_INLINE_PREFIX void MPIDIG_win_hash_clear(MPIR_Win * win) /* We assume this routine is never called with rank=MPI_PROC_NULL. 
*/ MPL_STATIC_INLINE_PREFIX int MPIDIU_valid_group_rank(MPIR_Comm * comm, int rank, MPIR_Group * grp) { - uint64_t gpid; + MPIR_Lpid lpid; int size = grp->size; int z; int ret; MPIR_FUNC_ENTER; - MPIDI_NM_comm_get_gpid(comm, rank, &gpid, FALSE); + MPID_Comm_get_lpid(comm, rank, &lpid, FALSE); for (z = 0; z < size; ++z) { - if (gpid == MPIR_Group_rank_to_lpid(grp, z)) { + if (lpid == MPIR_Group_rank_to_lpid(grp, z)) { break; } } diff --git a/src/mpid/ch4/src/ch4_init.c b/src/mpid/ch4/src/ch4_init.c index 365a12b37ad..e09357352c7 100644 --- a/src/mpid/ch4/src/ch4_init.c +++ b/src/mpid/ch4/src/ch4_init.c @@ -1073,7 +1073,7 @@ int MPID_Free_mem(void *user_buf) goto fn_exit; } -int MPID_Comm_get_lpid(MPIR_Comm * comm_ptr, int idx, uint64_t * lpid_ptr, bool is_remote) +int MPID_Comm_get_lpid(MPIR_Comm * comm_ptr, int idx, MPIR_Lpid * lpid_ptr, bool is_remote) { int mpi_errno = MPI_SUCCESS; int avtid = 0, lpid = 0; From e92b94b7d04d8b9454fca4a2a9a402df813ad2e9 Mon Sep 17 00:00:00 2001 From: Hui Zhou Date: Wed, 11 Dec 2024 18:57:37 -0600 Subject: [PATCH 12/59] mpid: replace usage of uint64_t lpid with MPIR_Lpid There is no real difference between lpid and gpid. Thus rename gpid in the device layer to lpid for clarity. Replace the usage of uint64_t as the type of lpid with MPIR_Lpid. This improves consistency.
--- src/mpid/ch3/include/mpidpost.h | 5 +-- src/mpid/ch3/include/mpidpre.h | 1 - src/mpid/ch3/src/mpid_vc.c | 16 ++++----- src/mpid/ch4/ch4_api.txt | 10 +++--- src/mpid/ch4/include/mpidch4.h | 6 ++-- src/mpid/ch4/netmod/ofi/ofi_spawn.c | 14 ++++---- src/mpid/ch4/netmod/ucx/ucx_spawn.c | 14 ++++---- src/mpid/ch4/src/ch4_comm.c | 56 ++++++++++++++++------------- src/mpid/ch4/src/ch4_impl.h | 2 +- src/mpid/ch4/src/ch4_proc.c | 6 ++-- src/mpid/ch4/src/ch4_proc.h | 4 +-- src/mpid/ch4/src/ch4_spawn.c | 16 ++++----- src/mpid/ch4/src/ch4i_comm.c | 16 ++++----- src/mpid/ch4/src/ch4i_comm.h | 2 +- 14 files changed, 88 insertions(+), 80 deletions(-) diff --git a/src/mpid/ch3/include/mpidpost.h b/src/mpid/ch3/include/mpidpost.h index 2c773d97099..e45f0fba1c2 100644 --- a/src/mpid/ch3/include/mpidpost.h +++ b/src/mpid/ch3/include/mpidpost.h @@ -188,10 +188,11 @@ int MPIDI_PG_ForwardPGInfo( MPIR_Comm *peer_ptr, MPIR_Comm *comm_ptr, int root ); int MPID_Intercomm_exchange_map( MPIR_Comm *local_comm_ptr, int local_leader, MPIR_Comm *peer_comm_ptr, int remote_leader, - int *remote_size, uint64_t **remote_lpids, + int *remote_size, MPIR_Lpid **remote_lpids, int *is_low_group); int MPID_Create_intercomm_from_lpids( MPIR_Comm *newcomm_ptr, - int size, const uint64_t lpids[] ); + int size, const MPIR_Lpid lpids[] ); +int MPID_Comm_get_lpid(MPIR_Comm *comm_ptr, int idx, MPIR_Lpid *lpid_ptr, bool is_remote); #define MPID_INTERCOMM_NO_DYNPROC(comm) (0) diff --git a/src/mpid/ch3/include/mpidpre.h b/src/mpid/ch3/include/mpidpre.h index 2d181e2a7c8..18a83a6af3f 100644 --- a/src/mpid/ch3/include/mpidpre.h +++ b/src/mpid/ch3/include/mpidpre.h @@ -829,7 +829,6 @@ int MPID_Progress_poke(void); int MPID_Get_processor_name( char *name, int namelen, int *resultlen); int MPID_Get_universe_size(int * universe_size); -int MPID_Comm_get_lpid(MPIR_Comm *comm_ptr, int idx, uint64_t *lpid_ptr, bool is_remote); #define MPID_Request_create_from_comm(kind, comm) MPIR_Request_create(kind) void 
MPID_Request_create_hook(MPIR_Request *); diff --git a/src/mpid/ch3/src/mpid_vc.c b/src/mpid/ch3/src/mpid_vc.c index 496fd1a25ba..bf92e8e330c 100644 --- a/src/mpid/ch3/src/mpid_vc.c +++ b/src/mpid/ch3/src/mpid_vc.c @@ -241,7 +241,7 @@ int MPIDI_VCR_Dup(MPIDI_VCR orig_vcr, MPIDI_VCR * new_vcr) /*@ MPID_Comm_get_lpid - Get the local process ID for a given VC reference @*/ -int MPID_Comm_get_lpid(MPIR_Comm *comm_ptr, int idx, uint64_t *lpid_ptr, bool is_remote) +int MPID_Comm_get_lpid(MPIR_Comm *comm_ptr, int idx, MPIR_Lpid *lpid_ptr, bool is_remote) { MPIR_FUNC_ENTER; @@ -383,7 +383,7 @@ static inline int MPIDI_LPID_GetAllInComm(MPIR_Comm *comm_ptr, int local_size, int mpi_errno = MPI_SUCCESS; MPIR_Assert( comm_ptr->local_size == local_size ); for (i=0; i<comm_ptr->local_size; i++) { - uint64_t tmp_lpid; + MPIR_Lpid tmp_lpid; mpi_errno |= MPID_Comm_get_lpid( comm_ptr, i, &tmp_lpid, FALSE ); local_lpids[i] = tmp_lpid; } @@ -461,13 +461,13 @@ static int check_disjoint_lpids(uint64_t lpids1[], int n1, uint64_t lpids2[], in @*/ int MPID_Intercomm_exchange_map(MPIR_Comm *local_comm_ptr, int local_leader, MPIR_Comm *peer_comm_ptr, int remote_leader, - int *remote_size, uint64_t **remote_lpids, + int *remote_size, MPIR_Lpid **remote_lpids, int *is_low_group) { int mpi_errno = MPI_SUCCESS; int singlePG; int local_size; - uint64_t *local_lpids=0; + MPIR_Lpid *local_lpids=0; MPIDI_Gpid *local_gpids=NULL, *remote_gpids=NULL; int comm_info[2]; int cts_tag; @@ -500,9 +500,9 @@ int MPID_Intercomm_exchange_map(MPIR_Comm *local_comm_ptr, int local_leader, /* With this information, we can now send and receive the global process ids from the peer.
*/ MPIR_CHKLMEM_MALLOC(remote_gpids,MPIDI_Gpid*,(*remote_size)*sizeof(MPIDI_Gpid), mpi_errno,"remote_gpids", MPL_MEM_DYNAMIC); - *remote_lpids = (uint64_t*) MPL_malloc((*remote_size)*sizeof(uint64_t), MPL_MEM_ADDRESS); + *remote_lpids = MPL_malloc((*remote_size)*sizeof(MPIR_Lpid), MPL_MEM_ADDRESS); MPIR_CHKLMEM_MALLOC(local_gpids,MPIDI_Gpid*,local_size*sizeof(MPIDI_Gpid), mpi_errno,"local_gpids", MPL_MEM_DYNAMIC); - MPIR_CHKLMEM_MALLOC(local_lpids,uint64_t*,local_size*sizeof(uint64_t), mpi_errno,"local_lpids", MPL_MEM_DYNAMIC); + MPIR_CHKLMEM_MALLOC(local_lpids,MPIR_Lpid*,local_size*sizeof(MPIR_Lpid), mpi_errno,"local_lpids", MPL_MEM_DYNAMIC); mpi_errno = MPIDI_GPID_GetAllInComm( local_comm_ptr, local_size, local_gpids, &singlePG ); MPIR_ERR_CHECK(mpi_errno); @@ -570,7 +570,7 @@ int MPID_Intercomm_exchange_map(MPIR_Comm *local_comm_ptr, int local_leader, MPIR_ERR_CHECK(mpi_errno); *remote_size = comm_info[0]; MPIR_CHKLMEM_MALLOC(remote_gpids,MPIDI_Gpid*,(*remote_size)*sizeof(MPIDI_Gpid), mpi_errno,"remote_gpids", MPL_MEM_DYNAMIC); - *remote_lpids = (uint64_t*) MPL_malloc((*remote_size)*sizeof(uint64_t), MPL_MEM_ADDRESS); + *remote_lpids = MPL_malloc((*remote_size)*sizeof(MPIR_Lpid), MPL_MEM_ADDRESS); mpi_errno = MPIR_Bcast( remote_gpids, (*remote_size)*sizeof(MPIDI_Gpid), MPI_BYTE, local_leader, local_comm_ptr, MPIR_ERR_NONE ); MPIR_ERR_CHECK(mpi_errno); @@ -621,7 +621,7 @@ int MPID_Intercomm_exchange_map(MPIR_Comm *local_comm_ptr, int local_leader, 'MPI_Comm_connect/MPI_Comm_accept'. Thus, it is only used for intercommunicators. 
@*/ int MPID_Create_intercomm_from_lpids( MPIR_Comm *newcomm_ptr, - int size, const uint64_t lpids[] ) + int size, const MPIR_Lpid lpids[] ) { int mpi_errno = MPI_SUCCESS; MPIR_Comm *commworld_ptr; diff --git a/src/mpid/ch4/ch4_api.txt b/src/mpid/ch4/ch4_api.txt index e9a2e2b7e5d..9165d4b8ed0 100644 --- a/src/mpid/ch4/ch4_api.txt +++ b/src/mpid/ch4/ch4_api.txt @@ -89,10 +89,10 @@ Non Native API: SHM*: rank, comm, handler_id, tag, buf-2, count, datatype, src_vci, dst_vci, rreq get_local_upids : int NM : comm, local_upid_size, local_upids - upids_to_gpids : int - NM : size, remote_upid_size, remote_upids, remote_gpids + upids_to_lpids : int + NM : size, remote_upid_size, remote_upids, remote_lpids dynamic_send : int - NM : remote_gpid, tag, buf, size, timeout + NM : remote_lpid, tag, buf, size, timeout dynamic_recv : int NM : tag, buf-2, size, timeout mpi_comm_commit_pre_hook : int @@ -499,8 +499,8 @@ PARAM: recvcounts: const MPI_Aint * recvtype: MPI_Datatype recvtypes: const MPI_Datatype[] - remote_gpid: uint64_t - remote_gpids: uint64_t * + remote_lpid: MPIR_Lpid + remote_lpids: MPIR_Lpid * remote_upid_size: int * remote_upids: char * req: MPIR_Request * diff --git a/src/mpid/ch4/include/mpidch4.h b/src/mpid/ch4/include/mpidch4.h index 3dd3528efbc..f3f57a722c9 100644 --- a/src/mpid/ch4/include/mpidch4.h +++ b/src/mpid/ch4/include/mpidch4.h @@ -26,7 +26,7 @@ int MPID_Comm_get_all_failed_procs(MPIR_Comm *, MPIR_Group **, int); int MPID_Comm_revoke(MPIR_Comm *, int); int MPID_Comm_failure_ack(MPIR_Comm *); MPL_STATIC_INLINE_PREFIX int MPID_Comm_AS_enabled(MPIR_Comm *) MPL_STATIC_INLINE_SUFFIX; -int MPID_Comm_get_lpid(MPIR_Comm *, int, uint64_t *, bool); +int MPID_Comm_get_lpid(MPIR_Comm *, int, MPIR_Lpid *, bool); int MPID_CS_finalize(void); int MPID_Finalize(void); int MPID_Get_universe_size(int *); @@ -167,8 +167,8 @@ int MPID_Type_commit_hook(MPIR_Datatype *); int MPID_Type_free_hook(MPIR_Datatype *); int MPID_Op_commit_hook(MPIR_Op *); int 
MPID_Op_free_hook(MPIR_Op *); -int MPID_Intercomm_exchange_map(MPIR_Comm *, int, MPIR_Comm *, int, int *, uint64_t **, int *); -int MPID_Create_intercomm_from_lpids(MPIR_Comm *, int, const uint64_t[]); +int MPID_Intercomm_exchange_map(MPIR_Comm *, int, MPIR_Comm *, int, int *, MPIR_Lpid **, int *); +int MPID_Create_intercomm_from_lpids(MPIR_Comm *, int, const MPIR_Lpid[]); int MPID_Comm_commit_pre_hook(MPIR_Comm *); int MPID_Comm_free_hook(MPIR_Comm *); int MPID_Comm_set_hints(MPIR_Comm *, MPIR_Info *); diff --git a/src/mpid/ch4/netmod/ofi/ofi_spawn.c b/src/mpid/ch4/netmod/ofi/ofi_spawn.c index 20adc54b3b1..6ccb9bd1cf2 100644 --- a/src/mpid/ch4/netmod/ofi/ofi_spawn.c +++ b/src/mpid/ch4/netmod/ofi/ofi_spawn.c @@ -7,7 +7,7 @@ #include "ofi_impl.h" #include "ofi_noinline.h" -int MPIDI_OFI_dynamic_send(uint64_t remote_gpid, int tag, const void *buf, int size, int timeout) +int MPIDI_OFI_dynamic_send(MPIR_Lpid remote_lpid, int tag, const void *buf, int size, int timeout) { int mpi_errno = MPI_SUCCESS; @@ -16,8 +16,8 @@ int MPIDI_OFI_dynamic_send(uint64_t remote_gpid, int tag, const void *buf, int s int nic = 0; /* dynamic process only use nic 0 */ int vci = 0; /* dynamic process only use vci 0 */ int ctx_idx = 0; - int avtid = MPIDIU_GPID_GET_AVTID(remote_gpid); - int lpid = MPIDIU_GPID_GET_LPID(remote_gpid); + int avtid = MPIDIU_GPID_GET_AVTID(remote_lpid); + int lpid = MPIDIU_GPID_GET_LPID(remote_lpid); fi_addr_t remote_addr = MPIDI_OFI_av_to_phys(&MPIDIU_get_av(avtid, lpid), nic, vci); MPID_THREAD_CS_ENTER(VCI, MPIDI_VCI(vci).lock); @@ -135,8 +135,8 @@ int MPIDI_OFI_dynamic_recv(int tag, void *buf, int size, int timeout) /* the following functions are "proc" functions, but because they are only used during dynamic * process spawning, having them here provides better context */ -int MPIDI_OFI_upids_to_gpids(int size, int *remote_upid_size, char *remote_upids, - uint64_t * remote_gpids) +int MPIDI_OFI_upids_to_lpids(int size, int *remote_upid_size, char *remote_upids, 
+ MPIR_Lpid * remote_lpids) { int i, mpi_errno = MPI_SUCCESS; int *new_avt_procs; @@ -178,7 +178,7 @@ int MPIDI_OFI_upids_to_gpids(int size, int *remote_upid_size, char *remote_upids MPIDI_OFI_TO_PHYS(k, j, nic), &tbladdr, &sz), 0, avlookup); if (sz == addrname_len && !memcmp(tbladdr, addrname, addrname_len)) { - remote_gpids[i] = MPIDIU_GPID_CREATE(k, j); + remote_lpids[i] = MPIDIU_GPID_CREATE(k, j); found = 1; break; } @@ -217,7 +217,7 @@ int MPIDI_OFI_upids_to_gpids(int size, int *remote_upid_size, char *remote_upids MPIR_ERR_CHECK(mpi_errno); MPIDIU_get_av(avtid, i).node_id = node_id; - remote_gpids[new_avt_procs[i]] = MPIDIU_GPID_CREATE(avtid, i); + remote_lpids[new_avt_procs[i]] = MPIDIU_GPID_CREATE(avtid, i); } } diff --git a/src/mpid/ch4/netmod/ucx/ucx_spawn.c b/src/mpid/ch4/netmod/ucx/ucx_spawn.c index 05e888d5639..e78dc2a0af3 100644 --- a/src/mpid/ch4/netmod/ucx/ucx_spawn.c +++ b/src/mpid/ch4/netmod/ucx/ucx_spawn.c @@ -20,7 +20,7 @@ static void dynamic_recv_cb(void *request, ucs_status_t status, *done = true; } -int MPIDI_UCX_dynamic_send(uint64_t remote_gpid, int tag, const void *buf, int size, int timeout) +int MPIDI_UCX_dynamic_send(MPIR_Lpid remote_lpid, int tag, const void *buf, int size, int timeout) { int mpi_errno = MPI_SUCCESS; @@ -29,8 +29,8 @@ int MPIDI_UCX_dynamic_send(uint64_t remote_gpid, int tag, const void *buf, int s MPID_THREAD_CS_ENTER(VCI, MPIDI_VCI(vci).lock); - int avtid = MPIDIU_GPID_GET_AVTID(remote_gpid); - int lpid = MPIDIU_GPID_GET_LPID(remote_gpid); + int avtid = MPIDIU_GPID_GET_AVTID(remote_lpid); + int lpid = MPIDIU_GPID_GET_LPID(remote_lpid); ucp_ep_h ep = MPIDI_UCX_AV_TO_EP(&MPIDIU_get_av(avtid, lpid), vci, vci); bool done = false; @@ -147,8 +147,8 @@ int MPIDI_UCX_get_local_upids(MPIR_Comm * comm, int **local_upid_size, char **lo goto fn_exit; } -int MPIDI_UCX_upids_to_gpids(int size, int *remote_upid_size, char *remote_upids, - uint64_t * remote_gpids) +int MPIDI_UCX_upids_to_lpids(int size, int *remote_upid_size, char 
*remote_upids, + MPIR_Lpid * remote_lpids) { int mpi_errno = MPI_SUCCESS; @@ -167,7 +167,7 @@ int MPIDI_UCX_upids_to_gpids(int size, int *remote_upid_size, char *remote_upids for (int i = 0; i < size; i++) { MPIDI_upid_hash *t = MPIDIU_upidhash_find(curr_upid, remote_upid_size[i]); if (t) { - remote_gpids[i] = MPIDIU_GPID_CREATE(t->avtid, t->lpid); + remote_lpids[i] = MPIDIU_GPID_CREATE(t->avtid, t->lpid); } else { new_avt_procs[n_new_procs] = i; new_upids[n_new_procs] = curr_upid; @@ -193,7 +193,7 @@ int MPIDI_UCX_upids_to_gpids(int size, int *remote_upid_size, char *remote_upids MPIDI_UCX_CHK_STATUS(ucx_status); MPIDIU_upidhash_add(new_upids[i], remote_upid_size[new_avt_procs[i]], avtid, i); - remote_gpids[new_avt_procs[i]] = MPIDIU_GPID_CREATE(avtid, i); + remote_lpids[new_avt_procs[i]] = MPIDIU_GPID_CREATE(avtid, i); } } diff --git a/src/mpid/ch4/src/ch4_comm.c b/src/mpid/ch4/src/ch4_comm.c index 808d6f6e21b..aa705061b22 100644 --- a/src/mpid/ch4/src/ch4_comm.c +++ b/src/mpid/ch4/src/ch4_comm.c @@ -391,7 +391,7 @@ int MPID_Comm_set_hints(MPIR_Comm * comm_ptr, MPIR_Info * info_ptr) } int MPID_Intercomm_exchange_map(MPIR_Comm * local_comm, int local_leader, MPIR_Comm * peer_comm, - int remote_leader, int *remote_size, uint64_t ** remote_gpids, + int remote_leader, int *remote_size, MPIR_Lpid ** remote_lpids, int *is_low_group) { int mpi_errno = MPI_SUCCESS; @@ -402,7 +402,7 @@ int MPID_Intercomm_exchange_map(MPIR_Comm * local_comm, int local_leader, MPIR_C int cts_tag = 0; int pure_intracomm = 1; int local_size = 0; - uint64_t *local_gpids = NULL; + MPIR_Lpid *local_lpids = NULL; int *local_upid_size = NULL, *remote_upid_size = NULL; int upid_send_size = 0, upid_recv_size = 0; char *local_upids = NULL, *remote_upids = NULL; @@ -462,13 +462,13 @@ int MPID_Intercomm_exchange_map(MPIR_Comm * local_comm, int local_leader, MPIR_C (MPL_DBG_FDEST, "local size = %d, remote size = %d, pure_intracomm = %d", local_size, *remote_size, pure_intracomm)); - 
MPIR_CHKPMEM_MALLOC((*remote_gpids), uint64_t *, (*remote_size) * sizeof(uint64_t), - mpi_errno, "remote_gpids", MPL_MEM_ADDRESS); - MPIR_CHKLMEM_MALLOC(local_gpids, uint64_t *, local_size * sizeof(uint64_t), - mpi_errno, "local_gpids", MPL_MEM_ADDRESS); + MPIR_CHKPMEM_MALLOC((*remote_lpids), MPIR_Lpid *, (*remote_size) * sizeof(MPIR_Lpid), + mpi_errno, "remote_lpids", MPL_MEM_ADDRESS); + MPIR_CHKLMEM_MALLOC(local_lpids, MPIR_Lpid *, local_size * sizeof(MPIR_Lpid), + mpi_errno, "local_lpids", MPL_MEM_ADDRESS); for (i = 0; i < local_size; i++) { MPIDIU_comm_rank_to_pid(local_comm, i, &lpid, &avtid); - local_gpids[i] = MPIDIU_GPID_CREATE(avtid, lpid); + local_lpids[i] = MPIDIU_GPID_CREATE(avtid, lpid); } /* TODO: optimizations -- @@ -506,12 +506,12 @@ int MPID_Intercomm_exchange_map(MPIR_Comm * local_comm, int local_leader, MPIR_C MPIR_ERR_CHECK(mpi_errno); /* Stage 1.2 convert remote UPID to GPID and get GPID for local group */ - MPIDIU_upids_to_gpids(*remote_size, remote_upid_size, remote_upids, *remote_gpids); + MPIDIU_upids_to_lpids(*remote_size, remote_upid_size, remote_upids, *remote_lpids); } else { /* Stage 1.1f only exchange GPIDS if no dynamic process involved */ - mpi_errno = MPIC_Sendrecv(local_gpids, local_size, MPI_UINT64_T, + mpi_errno = MPIC_Sendrecv(local_lpids, local_size, MPI_UINT64_T, remote_leader, cts_tag, - *remote_gpids, *remote_size, MPI_UINT64_T, + *remote_lpids, *remote_size, MPI_UINT64_T, remote_leader, cts_tag, peer_comm, MPI_STATUS_IGNORE, MPIR_ERR_NONE); MPIR_ERR_CHECK(mpi_errno); @@ -536,8 +536,8 @@ int MPID_Intercomm_exchange_map(MPIR_Comm * local_comm, int local_leader, MPIR_C { /* Now that we have both the local and remote processes, * check for any overlap */ - mpi_errno = MPIDI_check_disjoint_gpids(local_gpids, local_size, - *remote_gpids, *remote_size); + mpi_errno = MPIDI_check_disjoint_lpids(local_lpids, local_size, + *remote_lpids, *remote_size); MPIR_ERR_CHECK(mpi_errno); } MPID_END_ERROR_CHECKS; @@ -552,7 +552,7 @@ int 
MPID_Intercomm_exchange_map(MPIR_Comm * local_comm, int local_leader, MPIR_C * local group is always smaller than remote */ if (pure_intracomm) { - *is_low_group = local_gpids[0] < (*remote_gpids)[0]; + *is_low_group = local_lpids[0] < (*remote_lpids)[0]; } else { if (local_upid_size[0] == remote_upid_size[0]) { *is_low_group = memcmp(local_upids, remote_upids, local_upid_size[0]); @@ -568,7 +568,7 @@ int MPID_Intercomm_exchange_map(MPIR_Comm * local_comm, int local_leader, MPIR_C /* At this point, we're done with the local lpids; they'll * be freed with the other local memory on exit */ - local_gpids = NULL; + local_lpids = NULL; } /* @@ -578,7 +578,7 @@ int MPID_Intercomm_exchange_map(MPIR_Comm * local_comm, int local_leader, MPIR_C (MPL_DBG_FDEST, "Intercomm map exchange stage 2: intra-group")); mpi_errno = MPIDIU_Intercomm_map_bcast_intra(local_comm, local_leader, remote_size, is_low_group, pure_intracomm, - remote_upid_size, remote_upids, remote_gpids); + remote_upid_size, remote_upids, remote_lpids); MPIR_ERR_CHECK(mpi_errno); MPIR_CHKPMEM_COMMIT(); @@ -590,14 +590,14 @@ int MPID_Intercomm_exchange_map(MPIR_Comm * local_comm, int local_leader, MPIR_C return mpi_errno; fn_fail: MPIR_CHKPMEM_REAP(); - *remote_gpids = NULL; + *remote_lpids = NULL; goto fn_exit; } int MPIDIU_Intercomm_map_bcast_intra(MPIR_Comm * local_comm, int local_leader, int *remote_size, int *is_low_group, int pure_intracomm, int *remote_upid_size, char *remote_upids, - uint64_t ** remote_gpids) + MPIR_Lpid ** remote_lpids) { int mpi_errno = MPI_SUCCESS; int i; @@ -611,6 +611,14 @@ int MPIDIU_Intercomm_map_bcast_intra(MPIR_Comm * local_comm, int local_leader, i MPIR_FUNC_ENTER; + MPI_Datatype lpid_datatype; + if (sizeof(MPIR_Lpid) == 8) { + lpid_datatype = MPI_UINT64_T; + } else { + MPIR_Assert(sizeof(MPIR_Lpid) == 4); + lpid_datatype = MPI_UINT32_T; + } + if (local_comm->rank == local_leader) { if (!pure_intracomm) { for (i = 0; i < (*remote_size); i++) { @@ -633,7 +641,7 @@ int 
MPIDIU_Intercomm_map_bcast_intra(MPIR_Comm * local_comm, int local_leader, i local_leader, local_comm, MPIR_ERR_NONE); MPIR_ERR_CHECK(mpi_errno); } else { - mpi_errno = MPIR_Bcast_allcomm_auto(*remote_gpids, *remote_size, MPI_UINT64_T, + mpi_errno = MPIR_Bcast_allcomm_auto(*remote_lpids, *remote_size, lpid_datatype, local_leader, local_comm, MPIR_ERR_NONE); } } else { @@ -645,8 +653,8 @@ int MPIDIU_Intercomm_map_bcast_intra(MPIR_Comm * local_comm, int local_leader, i *is_low_group = map_info[2]; pure_intracomm = map_info[3]; - MPIR_CHKPMEM_MALLOC((*remote_gpids), uint64_t *, (*remote_size) * sizeof(uint64_t), - mpi_errno, "remote_gpids", MPL_MEM_COMM); + MPIR_CHKPMEM_MALLOC((*remote_lpids), MPIR_Lpid *, (*remote_size) * sizeof(MPIR_Lpid), + mpi_errno, "remote_lpids", MPL_MEM_COMM); if (!pure_intracomm) { MPIR_CHKLMEM_MALLOC(_remote_upid_size, int *, (*remote_size) * sizeof(int), mpi_errno, "_remote_upid_size", MPL_MEM_COMM); @@ -659,9 +667,9 @@ int MPIDIU_Intercomm_map_bcast_intra(MPIR_Comm * local_comm, int local_leader, i local_leader, local_comm, MPIR_ERR_NONE); MPIR_ERR_CHECK(mpi_errno); - MPIDIU_upids_to_gpids(*remote_size, _remote_upid_size, _remote_upids, *remote_gpids); + MPIDIU_upids_to_lpids(*remote_size, _remote_upid_size, _remote_upids, *remote_lpids); } else { - mpi_errno = MPIR_Bcast_allcomm_auto(*remote_gpids, *remote_size, MPI_UINT64_T, + mpi_errno = MPIR_Bcast_allcomm_auto(*remote_lpids, *remote_size, lpid_datatype, local_leader, local_comm, MPIR_ERR_NONE); } } @@ -673,11 +681,11 @@ int MPIDIU_Intercomm_map_bcast_intra(MPIR_Comm * local_comm, int local_leader, i return mpi_errno; fn_fail: MPIR_CHKPMEM_REAP(); - *remote_gpids = NULL; + *remote_lpids = NULL; goto fn_exit; } -int MPID_Create_intercomm_from_lpids(MPIR_Comm * newcomm_ptr, int size, const uint64_t lpids[]) +int MPID_Create_intercomm_from_lpids(MPIR_Comm * newcomm_ptr, int size, const MPIR_Lpid lpids[]) { int mpi_errno = MPI_SUCCESS, i; MPIR_FUNC_ENTER; diff --git 
a/src/mpid/ch4/src/ch4_impl.h b/src/mpid/ch4/src/ch4_impl.h index 6fa918db043..7726ee90992 100644 --- a/src/mpid/ch4/src/ch4_impl.h +++ b/src/mpid/ch4/src/ch4_impl.h @@ -16,7 +16,7 @@ int MPIDIU_Intercomm_map_bcast_intra(MPIR_Comm * local_comm, int local_leader, int *remote_size, int *is_low_group, int pure_intracomm, int *remote_upid_size, char *remote_upids, - uint64_t ** remote_gpids); + MPIR_Lpid ** remote_lpids); int MPIDIG_get_context_index(uint64_t context_id); uint64_t MPIDIG_generate_win_id(MPIR_Comm * comm_ptr); diff --git a/src/mpid/ch4/src/ch4_proc.c b/src/mpid/ch4/src/ch4_proc.c index 56cb70b48c1..01b182da582 100644 --- a/src/mpid/ch4/src/ch4_proc.c +++ b/src/mpid/ch4/src/ch4_proc.c @@ -249,14 +249,14 @@ void MPIDIU_upidhash_free(void) /* convert upid to gpid by netmod. * For ofi netmod, it inserts the address and fills an av entry. */ -int MPIDIU_upids_to_gpids(int size, int *remote_upid_size, char *remote_upids, - uint64_t * remote_gpids) +int MPIDIU_upids_to_lpids(int size, int *remote_upid_size, char *remote_upids, + MPIR_Lpid * remote_lpids) { int mpi_errno = MPI_SUCCESS; MPIR_FUNC_ENTER; MPID_THREAD_CS_ENTER(VCI, MPIDIU_THREAD_DYNPROC_MUTEX); - mpi_errno = MPIDI_NM_upids_to_gpids(size, remote_upid_size, remote_upids, remote_gpids); + mpi_errno = MPIDI_NM_upids_to_lpids(size, remote_upid_size, remote_upids, remote_lpids); MPIR_ERR_CHECK(mpi_errno); fn_exit: diff --git a/src/mpid/ch4/src/ch4_proc.h b/src/mpid/ch4/src/ch4_proc.h index d4ba4901f26..5462d2e407d 100644 --- a/src/mpid/ch4/src/ch4_proc.h +++ b/src/mpid/ch4/src/ch4_proc.h @@ -33,8 +33,8 @@ void MPIDIU_upidhash_add(const void *upid, int upid_len, int avtid, int lpid); MPIDI_upid_hash *MPIDIU_upidhash_find(const void *upid, int upid_len); void MPIDIU_upidhash_free(void); #endif -int MPIDIU_upids_to_gpids(int size, int *remote_upid_size, char *remote_upids, - uint64_t * remote_gpids); +int MPIDIU_upids_to_lpids(int size, int *remote_upid_size, char *remote_upids, + MPIR_Lpid * remote_lpids); 
int MPIDIU_alloc_lut(MPIDI_rank_map_lut_t ** lut, int size); int MPIDIU_release_lut(MPIDI_rank_map_lut_t * lut); int MPIDIU_alloc_mlut(MPIDI_rank_map_mlut_t ** mlut, int size); diff --git a/src/mpid/ch4/src/ch4_spawn.c b/src/mpid/ch4/src/ch4_spawn.c index 6b59d171620..10241261336 100644 --- a/src/mpid/ch4/src/ch4_spawn.c +++ b/src/mpid/ch4/src/ch4_spawn.c @@ -290,7 +290,7 @@ static int peer_intercomm_create(char *remote_addrname, int len, int tag, { int mpi_errno = MPI_SUCCESS; int context_id, recvcontext_id; - uint64_t remote_gpid; + MPIR_Lpid remote_lpid; mpi_errno = MPIR_Get_contextid_sparse(MPIR_Process.comm_self, &recvcontext_id, FALSE); MPIR_ERR_CHECK(mpi_errno); @@ -299,8 +299,8 @@ static int peer_intercomm_create(char *remote_addrname, int len, int tag, if (is_sender) { /* insert remote address */ int addrname_len = len; - uint64_t *remote_gpids = &remote_gpid; - mpi_errno = MPIDIU_upids_to_gpids(1, &addrname_len, remote_addrname, remote_gpids); + MPIR_Lpid *remote_lpids = &remote_lpid; + mpi_errno = MPIDIU_upids_to_lpids(1, &addrname_len, remote_addrname, remote_lpids); MPIR_ERR_CHECK(mpi_errno); /* fill hdr with context_id and addrname */ @@ -317,7 +317,7 @@ static int peer_intercomm_create(char *remote_addrname, int len, int tag, /* send remote context_id + addrname */ int hdr_sz = sizeof(hdr) - MPIDI_DYNPROC_NAME_MAX + hdr.addrname_len; - mpi_errno = MPIDI_NM_dynamic_send(remote_gpid, tag, &hdr, hdr_sz, timeout); + mpi_errno = MPIDI_NM_dynamic_send(remote_lpid, tag, &hdr, hdr_sz, timeout); MPL_free(addrname); MPL_free(addrname_size); MPIR_ERR_CHECK(mpi_errno); @@ -333,19 +333,19 @@ static int peer_intercomm_create(char *remote_addrname, int len, int tag, /* insert remote address */ int addrname_len = hdr.addrname_len; - uint64_t *remote_gpids = &remote_gpid; - mpi_errno = MPIDIU_upids_to_gpids(1, &addrname_len, hdr.addrname, remote_gpids); + MPIR_Lpid *remote_lpids = &remote_lpid; + mpi_errno = MPIDIU_upids_to_lpids(1, &addrname_len, hdr.addrname, 
remote_lpids); MPIR_ERR_CHECK(mpi_errno); /* send remote context_id */ hdr.context_id = recvcontext_id; - mpi_errno = MPIDI_NM_dynamic_send(remote_gpid, tag, &hdr, sizeof(hdr.context_id), timeout); + mpi_errno = MPIDI_NM_dynamic_send(remote_lpid, tag, &hdr, sizeof(hdr.context_id), timeout); MPIR_ERR_CHECK(mpi_errno); } /* create peer intercomm */ mpi_errno = MPIR_peer_intercomm_create(context_id, recvcontext_id, - remote_gpid, is_sender, newcomm); + remote_lpid, is_sender, newcomm); MPIR_ERR_CHECK(mpi_errno); fn_exit: diff --git a/src/mpid/ch4/src/ch4i_comm.c b/src/mpid/ch4/src/ch4i_comm.c index 7a8b5a97d9f..d8a3fe3e9f6 100644 --- a/src/mpid/ch4/src/ch4i_comm.c +++ b/src/mpid/ch4/src/ch4i_comm.c @@ -928,7 +928,7 @@ static uint64_t shrink(uint64_t x, int num_low_bits) return ((x >> 32) << num_low_bits) + (x & 0xffffffff); } -int MPIDI_check_disjoint_gpids(uint64_t gpids1[], int n1, uint64_t gpids2[], int n2) +int MPIDI_check_disjoint_lpids(MPIR_Lpid lpids1[], int n1, MPIR_Lpid lpids2[], int n2) { int mpi_errno = MPI_SUCCESS; uint32_t gpidmaskPrealloc[128]; @@ -944,12 +944,12 @@ int MPIDI_check_disjoint_gpids(uint64_t gpids1[], int n1, uint64_t gpids2[], int /* Find the max low-32-bit gpid */ uint64_t max_lpid = 0; for (int i = 0; i < n1; i++) { - uint64_t n = gpids1[i] & 0xffffffff; + uint64_t n = lpids1[i] & 0xffffffff; if (n > max_lpid) max_lpid = n; } for (int i = 0; i < n2; i++) { - uint64_t n = gpids2[i] & 0xffffffff; + uint64_t n = lpids2[i] & 0xffffffff; if (n > max_lpid) max_lpid = n; } @@ -958,12 +958,12 @@ int MPIDI_check_disjoint_gpids(uint64_t gpids1[], int n1, uint64_t gpids2[], int uint64_t max_gpid = 0; for (int i = 0; i < n1; i++) { - uint64_t n = shrink(gpids1[i], num_low_bits); + uint64_t n = shrink(lpids1[i], num_low_bits); if (n > max_gpid) max_gpid = n; } for (int i = 0; i < n2; i++) { - uint64_t n = shrink(gpids2[i], num_low_bits); + uint64_t n = shrink(lpids2[i], num_low_bits); if (n > max_gpid) max_gpid = n; } @@ -981,7 +981,7 @@ int 
MPIDI_check_disjoint_gpids(uint64_t gpids1[], int n1, uint64_t gpids2[], int /* Set the bits for the first array */ for (int i = 0; i < n1; i++) { - uint64_t n = shrink(gpids1[i], num_low_bits); + uint64_t n = shrink(lpids1[i], num_low_bits); int idx = n / 32; int bit = n % 32; gpidmask[idx] = gpidmask[idx] | (1 << bit); @@ -990,12 +990,12 @@ int MPIDI_check_disjoint_gpids(uint64_t gpids1[], int n1, uint64_t gpids2[], int /* Look for any duplicates in the second array */ for (int i = 0; i < n2; i++) { - uint64_t n = shrink(gpids2[i], num_low_bits); + uint64_t n = shrink(lpids2[i], num_low_bits); int idx = n / 32; int bit = n % 32; if (gpidmask[idx] & (1 << bit)) { MPIR_ERR_SET1(mpi_errno, MPI_ERR_COMM, - "**dupprocesses", "**dupprocesses %d", gpids2[i]); + "**dupprocesses", "**dupprocesses %d", (int) lpids2[i]); goto fn_fail; } /* Add a check on duplicates *within* group 2 */ diff --git a/src/mpid/ch4/src/ch4i_comm.h b/src/mpid/ch4/src/ch4i_comm.h index 823d945ded4..40032de5698 100644 --- a/src/mpid/ch4/src/ch4i_comm.h +++ b/src/mpid/ch4/src/ch4i_comm.h @@ -9,6 +9,6 @@ #include "ch4_types.h" int MPIDI_comm_create_rank_map(MPIR_Comm * comm); -int MPIDI_check_disjoint_gpids(uint64_t gpids1[], int n1, uint64_t gpids2[], int n2); +int MPIDI_check_disjoint_lpids(MPIR_Lpid lpids1[], int n1, MPIR_Lpid lpids2[], int n2); #endif /* CH4I_COMM_H_INCLUDED */ From 3f9677dc9dcc9ba5d3fdd3b8d0ead5736dbf0326 Mon Sep 17 00:00:00 2001 From: Hui Zhou Date: Wed, 11 Dec 2024 14:39:45 -0600 Subject: [PATCH 13/59] group: add MPIR_Worlds We need a device-independent way of identifying processes. One way is to use the combination of (world_idx, world_rank). Thus, we need to maintain a list of worlds so that the world_idx points to the world record. This may not fit in the concept of MPI group, but since groups need a way to identify processes, this seems the most closely related place. The first world, world_idx 0, is always initialized at init. 
Due to session re-init, we need to make sure to reset num_worlds to 0 at finalize. New worlds will be added upon spawning or connecting dynamic processes (to-be-implemented). --- src/include/mpir_group.h | 26 ++++++++++++++++++++++++++ src/mpi/group/grouputil.c | 30 ++++++++++++++++++++++++++++++ src/mpi/init/mpir_init.c | 1 + src/util/mpir_pmi.c | 3 +++ 4 files changed, 60 insertions(+) diff --git a/src/include/mpir_group.h b/src/include/mpir_group.h index 1148a8e8006..5f8c619f1e0 100644 --- a/src/include/mpir_group.h +++ b/src/include/mpir_group.h @@ -43,6 +43,31 @@ S*/ +/* Worlds - + * We need a device-independent way of identifying processes. Assuming the concept of + * "worlds", we can describe a process with (world_idx, world_rank). + * + * The world_idx is a local id because each process may not see all worlds. Thus, + * each process only can maintain a list of worlds as it encounters them. Thus, + * a process id derived from (world_idx, world_rank) is referred as LPID, or + * "local process id". + * + * Each process should maintain a table of worlds with sufficient information so + * processes can match worlds upon connection or making address exchange. + */ + +#define MPIR_NAMESPACE_MAX 128 +struct MPIR_World { + char namespace[MPIR_NAMESPACE_MAX]; + /* other useful fields */ + int num_procs; +}; + +extern struct MPIR_World MPIR_Worlds[]; + +int MPIR_add_world(const char *namespace, int num_procs); +int MPIR_find_world(const char *namespace); + /* Abstract the integer type for lpid (process id). It is possible to use 32-bit * in principle, but 64-bit is simpler since we can trivially combine * (world_idx, world_rank). 
@@ -115,5 +140,6 @@ int MPIR_Group_lpid_to_rank(MPIR_Group * group, MPIR_Lpid lpid); int MPIR_Group_check_subset(MPIR_Group * group_ptr, MPIR_Comm * comm_ptr); void MPIR_Group_set_session_ptr(MPIR_Group * group_ptr, MPIR_Session * session_out); int MPIR_Group_init(void); +void MPIR_Group_finalize(void); #endif /* MPIR_GROUP_H_INCLUDED */ diff --git a/src/mpi/group/grouputil.c b/src/mpi/group/grouputil.c index 59c45561eca..9186cdaf5e5 100644 --- a/src/mpi/group/grouputil.c +++ b/src/mpi/group/grouputil.c @@ -6,6 +6,32 @@ #include "mpiimpl.h" #include "group.h" +/* Global world list. + * world_idx, part of MPIR_Lpid, points to this array */ +#define MPIR_MAX_WORLDS 1024 +static int num_worlds = 0; +struct MPIR_World MPIR_Worlds[MPIR_MAX_WORLDS]; + +int MPIR_add_world(const char *namespace, int num_procs) +{ + int world_idx = num_worlds++; + + MPL_strncpy(MPIR_Worlds[world_idx].namespace, namespace, MPIR_NAMESPACE_MAX); + MPIR_Worlds[world_idx].num_procs = num_procs; + + return world_idx; +} + +int MPIR_find_world(const char *namespace) +{ + for (int i = 0; i < num_worlds; i++) { + if (strncmp(MPIR_Worlds[i].namespace, namespace, MPIR_NAMESPACE_MAX) == 0) { + return i; + } + } + return -1; +} + /* Preallocated group objects */ MPIR_Group MPIR_Group_builtin[MPIR_GROUP_N_BUILTIN]; MPIR_Group MPIR_Group_direct[MPIR_GROUP_PREALLOC]; @@ -34,6 +60,10 @@ int MPIR_Group_init(void) return mpi_errno; } +void MPIR_Group_finalize(void) +{ + num_worlds = 0; +} int MPIR_Group_release(MPIR_Group * group_ptr) { diff --git a/src/mpi/init/mpir_init.c b/src/mpi/init/mpir_init.c index 2f1c115aa13..6e04cabd400 100644 --- a/src/mpi/init/mpir_init.c +++ b/src/mpi/init/mpir_init.c @@ -484,6 +484,7 @@ int MPII_Finalize(MPIR_Session * session_ptr) MPII_thread_mutex_destroy(); MPIR_Typerep_finalize(); + MPIR_Group_finalize(); MPL_atomic_store_int(&MPIR_Process.mpich_state, MPICH_MPI_STATE__UNINITIALIZED); fn_exit: diff --git a/src/util/mpir_pmi.c b/src/util/mpir_pmi.c index 
9aff4e38dfa..d2b9eae8e5d 100644 --- a/src/util/mpir_pmi.c +++ b/src/util/mpir_pmi.c @@ -168,6 +168,9 @@ int MPIR_pmi_init(void) pmi_connected = true; } + int world_idx = MPIR_add_world(pmi_kvs_name, size); + MPIR_Assertp(world_idx == 0); + MPIR_Process.has_parent = has_parent; MPIR_Process.rank = rank; MPIR_Process.size = size; From 1d109bbbea797baf6819c906f4ed65e14e9e35da Mon Sep 17 00:00:00 2001 From: Hui Zhou Date: Thu, 19 Dec 2024 13:19:26 -0600 Subject: [PATCH 14/59] group: add MPIR_Group_finalize We need to reset num_worlds so that Session re-init will work. --- src/include/mpir_group.h | 2 +- src/mpi/group/grouputil.c | 3 ++- src/mpi/init/mpir_init.c | 3 ++- 3 files changed, 5 insertions(+), 3 deletions(-) diff --git a/src/include/mpir_group.h b/src/include/mpir_group.h index 5f8c619f1e0..27c4043a015 100644 --- a/src/include/mpir_group.h +++ b/src/include/mpir_group.h @@ -140,6 +140,6 @@ int MPIR_Group_lpid_to_rank(MPIR_Group * group, MPIR_Lpid lpid); int MPIR_Group_check_subset(MPIR_Group * group_ptr, MPIR_Comm * comm_ptr); void MPIR_Group_set_session_ptr(MPIR_Group * group_ptr, MPIR_Session * session_out); int MPIR_Group_init(void); -void MPIR_Group_finalize(void); +int MPIR_Group_finalize(void); #endif /* MPIR_GROUP_H_INCLUDED */ diff --git a/src/mpi/group/grouputil.c b/src/mpi/group/grouputil.c index 9186cdaf5e5..99288da1078 100644 --- a/src/mpi/group/grouputil.c +++ b/src/mpi/group/grouputil.c @@ -60,9 +60,10 @@ int MPIR_Group_init(void) return mpi_errno; } -void MPIR_Group_finalize(void) +int MPIR_Group_finalize(void) { num_worlds = 0; + return MPI_SUCCESS; } int MPIR_Group_release(MPIR_Group * group_ptr) diff --git a/src/mpi/init/mpir_init.c b/src/mpi/init/mpir_init.c index 6e04cabd400..a20038d8b40 100644 --- a/src/mpi/init/mpir_init.c +++ b/src/mpi/init/mpir_init.c @@ -479,12 +479,13 @@ int MPII_Finalize(MPIR_Session * session_ptr) MPL_free(MPIR_Process.memory_alloc_kinds); MPIR_Process.memory_alloc_kinds = NULL; + MPIR_Group_finalize(); + /* All memory 
should be freed at this point */ MPII_finalize_memory_tracing(); MPII_thread_mutex_destroy(); MPIR_Typerep_finalize(); - MPIR_Group_finalize(); MPL_atomic_store_int(&MPIR_Process.mpich_state, MPICH_MPI_STATE__UNINITIALIZED); fn_exit: From d9c96aab82fbf0aeee70bf711ee80ad00e51fe0a Mon Sep 17 00:00:00 2001 From: Hui Zhou Date: Wed, 11 Dec 2024 16:22:31 -0600 Subject: [PATCH 15/59] group: add builtin MPIR_GROUP_{WORLD,SELF} Add builtin MPIR_GROUP_WORLD and MPIR_GROUP_SELF, so we can create builtin communicators from builtin groups. --- src/include/mpir_group.h | 7 +++++++ src/include/mpir_objects.h | 2 +- src/mpi/group/group_impl.c | 9 ++++----- src/mpi/group/grouputil.c | 28 +++++++++++++++++++++++++++- 4 files changed, 39 insertions(+), 7 deletions(-) diff --git a/src/include/mpir_group.h b/src/include/mpir_group.h index 27c4043a015..1ddf9619bf8 100644 --- a/src/include/mpir_group.h +++ b/src/include/mpir_group.h @@ -43,6 +43,13 @@ S*/ +/* In addition to MPI_GROUP_EMPTY, internally we have a few more builtins */ +#define MPIR_GROUP_WORLD ((MPI_Group)0x48000001) +#define MPIR_GROUP_SELF ((MPI_Group)0x48000002) + +#define MPIR_GROUP_WORLD_PTR (MPIR_Group_builtin + 1) +#define MPIR_GROUP_SELF_PTR (MPIR_Group_builtin + 2) + /* Worlds - * We need a device-independent way of identifying processes. Assuming the concept of * "worlds", we can describe a process with (world_idx, world_rank). 
diff --git a/src/include/mpir_objects.h b/src/include/mpir_objects.h index 89e7aea8d35..2f2ffeb6dae 100644 --- a/src/include/mpir_objects.h +++ b/src/include/mpir_objects.h @@ -210,7 +210,7 @@ const char *MPIR_Handle_get_kind_str(int kind); #define MPIR_COMM_PREALLOC 8 #endif -#define MPIR_GROUP_N_BUILTIN 1 +#define MPIR_GROUP_N_BUILTIN 3 #ifdef MPID_GROUP_PREALLOC #define MPIR_GROUP_PREALLOC MPID_GROUP_PREALLOC #else diff --git a/src/mpi/group/group_impl.c b/src/mpi/group/group_impl.c index fa123a70efc..e10a2a486d1 100644 --- a/src/mpi/group/group_impl.c +++ b/src/mpi/group/group_impl.c @@ -438,12 +438,11 @@ int MPIR_Group_from_session_pset_impl(MPIR_Session * session_ptr, const char *ps int mpi_errno = MPI_SUCCESS; if (MPL_stricmp(pset_name, "mpi://WORLD") == 0) { - mpi_errno = MPIR_Group_create_stride(MPIR_Process.size, MPIR_Process.rank, session_ptr, - 0, 1, 1, new_group_ptr); - MPIR_ERR_CHECK(mpi_errno); + *new_group_ptr = MPIR_GROUP_WORLD_PTR; + MPIR_Group_add_ref(*new_group_ptr); } else if (MPL_stricmp(pset_name, "mpi://SELF") == 0) { - mpi_errno = MPIR_Group_create_stride(1, 0, session_ptr, 0, 1, 1, new_group_ptr); - MPIR_ERR_CHECK(mpi_errno); + *new_group_ptr = MPIR_GROUP_SELF_PTR; + MPIR_Group_add_ref(*new_group_ptr); } else { /* TODO: Implement pset struct, locate pset struct ptr */ MPIR_ERR_SETANDSTMT(mpi_errno, MPI_ERR_ARG, goto fn_fail, "**psetinvalidname"); diff --git a/src/mpi/group/grouputil.c b/src/mpi/group/grouputil.c index 99288da1078..c1f18661480 100644 --- a/src/mpi/group/grouputil.c +++ b/src/mpi/group/grouputil.c @@ -48,7 +48,9 @@ int MPIR_Group_init(void) { int mpi_errno = MPI_SUCCESS; - MPIR_Assert(MPIR_GROUP_N_BUILTIN == 1); /* update this func if this ever triggers */ + MPIR_Assert(MPIR_GROUP_N_BUILTIN == 3); /* update this func if this ever triggers */ + + struct MPIR_Pmap *pmap; MPIR_Group_builtin[0].handle = MPI_GROUP_EMPTY; MPIR_Object_set_ref(&MPIR_Group_builtin[0], 1); @@ -57,6 +59,30 @@ int MPIR_Group_init(void) 
MPIR_Group_builtin[0].session_ptr = NULL; memset(&MPIR_Group_builtin[0].pmap, 0, sizeof(struct MPIR_Pmap)); + MPIR_Group_builtin[1].handle = MPIR_GROUP_WORLD; + MPIR_Object_set_ref(&MPIR_Group_builtin[1], 1); + MPIR_Group_builtin[1].size = MPIR_Process.size; + MPIR_Group_builtin[1].rank = MPIR_Process.rank; + MPIR_Group_builtin[1].session_ptr = NULL; + pmap = &MPIR_Group_builtin[1].pmap; + pmap->size = MPIR_Process.size; + pmap->use_map = false; + pmap->u.stride.offset = 0; + pmap->u.stride.stride = 1; + pmap->u.stride.blocksize = 1; + + MPIR_Group_builtin[2].handle = MPIR_GROUP_SELF; + MPIR_Object_set_ref(&MPIR_Group_builtin[2], 1); + MPIR_Group_builtin[2].size = 1; + MPIR_Group_builtin[2].rank = 0; + MPIR_Group_builtin[2].session_ptr = NULL; + pmap = &MPIR_Group_builtin[2].pmap; + pmap->size = 1; + pmap->use_map = false; + pmap->u.stride.offset = MPIR_Process.rank; + pmap->u.stride.stride = 1; + pmap->u.stride.blocksize = 1; + return mpi_errno; } From 53d474250915e953f2029c953a2d6e5b64adf07e Mon Sep 17 00:00:00 2001 From: Hui Zhou Date: Fri, 13 Dec 2024 07:43:05 -0600 Subject: [PATCH 16/59] group: add MPIR_Group_dup Internally the only reason to duplicate a group is to copy from NULL session to a new session. Otherwise, we can just use the same group and increment the reference count. 
--- src/include/mpir_group.h | 1 + src/mpi/group/group_impl.c | 8 ++++---- src/mpi/group/grouputil.c | 35 +++++++++++++++++++++++++++++++++++ 3 files changed, 40 insertions(+), 4 deletions(-) diff --git a/src/include/mpir_group.h b/src/include/mpir_group.h index 1ddf9619bf8..69b665fcc87 100644 --- a/src/include/mpir_group.h +++ b/src/include/mpir_group.h @@ -136,6 +136,7 @@ int MPIR_Group_check_valid_ranges(MPIR_Group *, int[][3], int); int MPIR_Group_create(int, MPIR_Group **); int MPIR_Group_release(MPIR_Group * group_ptr); +int MPIR_Group_dup(MPIR_Group * old_group, MPIR_Session * session_ptr, MPIR_Group ** new_group_ptr); int MPIR_Group_create_map(int size, int rank, MPIR_Session * session_ptr, MPIR_Lpid * map, MPIR_Group ** new_group_ptr); int MPIR_Group_create_stride(int size, int rank, MPIR_Session * session_ptr, diff --git a/src/mpi/group/group_impl.c b/src/mpi/group/group_impl.c index e10a2a486d1..8e09e216554 100644 --- a/src/mpi/group/group_impl.c +++ b/src/mpi/group/group_impl.c @@ -438,11 +438,11 @@ int MPIR_Group_from_session_pset_impl(MPIR_Session * session_ptr, const char *ps int mpi_errno = MPI_SUCCESS; if (MPL_stricmp(pset_name, "mpi://WORLD") == 0) { - *new_group_ptr = MPIR_GROUP_WORLD_PTR; - MPIR_Group_add_ref(*new_group_ptr); + mpi_errno = MPIR_Group_dup(MPIR_GROUP_WORLD_PTR, session_ptr, new_group_ptr); + MPIR_ERR_CHECK(mpi_errno); } else if (MPL_stricmp(pset_name, "mpi://SELF") == 0) { - *new_group_ptr = MPIR_GROUP_SELF_PTR; - MPIR_Group_add_ref(*new_group_ptr); + mpi_errno = MPIR_Group_dup(MPIR_GROUP_SELF_PTR, session_ptr, new_group_ptr); + MPIR_ERR_CHECK(mpi_errno); } else { /* TODO: Implement pset struct, locate pset struct ptr */ MPIR_ERR_SETANDSTMT(mpi_errno, MPI_ERR_ARG, goto fn_fail, "**psetinvalidname"); diff --git a/src/mpi/group/grouputil.c b/src/mpi/group/grouputil.c index c1f18661480..3a709451103 100644 --- a/src/mpi/group/grouputil.c +++ b/src/mpi/group/grouputil.c @@ -142,6 +142,41 @@ int MPIR_Group_create(int nproc, MPIR_Group 
** new_group_ptr) return mpi_errno; } +/* Internally the only reason to duplicate a group is to copy from NULL session to a new session. + * Otherwise, we can just use the same group and increment the reference count. + */ +int MPIR_Group_dup(MPIR_Group * old_group, MPIR_Session * session_ptr, MPIR_Group ** new_group_ptr) +{ + int mpi_errno = MPI_SUCCESS; + MPIR_Group *new_group; + + new_group = (MPIR_Group *) MPIR_Handle_obj_alloc(&MPIR_Group_mem); + if (!new_group) { + mpi_errno = MPIR_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE, "MPIR_Group_dup", + __LINE__, MPI_ERR_OTHER, "**nomem", 0); + goto fn_fail; + } + MPIR_Object_set_ref(new_group, 1); + + /* initialize fields */ + new_group->size = old_group->size; + new_group->rank = old_group->rank; + MPIR_Group_set_session_ptr(new_group, session_ptr); + memcpy(&new_group->pmap, &old_group->pmap, sizeof(struct MPIR_Pmap)); + if (old_group->pmap.use_map) { + new_group->pmap.u.map = MPL_malloc(old_group->size * sizeof(MPIR_Lpid), MPL_MEM_GROUP); + MPIR_ERR_CHKANDJUMP(!new_group->pmap.u.map, mpi_errno, MPI_ERR_OTHER, "**nomem"); + memcpy(new_group->pmap.u.map, old_group->pmap.u.map, old_group->size * sizeof(MPIR_Lpid)); + } + + *new_group_ptr = new_group; + + fn_exit: + return mpi_errno; + fn_fail: + goto fn_exit; +} + int MPIR_Group_create_map(int size, int rank, MPIR_Session * session_ptr, MPIR_Lpid * map, MPIR_Group ** new_group_ptr) { From 8f6cddfdbca7303854bdcd2103ea624d01379450 Mon Sep 17 00:00:00 2001 From: Hui Zhou Date: Thu, 12 Dec 2024 13:45:00 -0600 Subject: [PATCH 17/59] binding/group: remove error check in MPI_Group_free Since builtin groups can be returned to users, they should be allowed to free. They are reference counted anyway. 
--- src/binding/c/group_api.txt | 12 ------------ 1 file changed, 12 deletions(-) diff --git a/src/binding/c/group_api.txt b/src/binding/c/group_api.txt index dd2074024d0..532389d61ac 100644 --- a/src/binding/c/group_api.txt +++ b/src/binding/c/group_api.txt @@ -37,18 +37,6 @@ MPI_Group_excl: MPI_Group_free: .desc: Frees a group -{ -- error_check -- - /* Cannot free the predefined groups, but allow GROUP_EMPTY - * because otherwise many tests fail */ - if ((HANDLE_IS_BUILTIN(*group)) && *group != MPI_GROUP_EMPTY) { - mpi_errno = MPIR_Err_create_code(MPI_SUCCESS, - MPIR_ERR_RECOVERABLE, __func__, __LINE__, - MPI_ERR_GROUP, "**groupperm", 0); - } - if (mpi_errno) { - goto fn_fail; - } -} MPI_Group_incl: .desc: Produces a group by reordering an existing group and taking only listed members From c52fd730ae84654ac4591def9dfbd37e3ee985fb Mon Sep 17 00:00:00 2001 From: Hui Zhou Date: Wed, 11 Dec 2024 16:45:22 -0600 Subject: [PATCH 18/59] comm: always set local_group and remote_group To make the MPI group a first-class citizen, we will always have a group before creating communicators, so that when the device layer activates communicators, e.g. in MPID_Comm_commit_pre_hook, it can rely on the group to look up the involved processes. It also removes the need to maintain any other process addressing schemes. 
--- src/include/mpir_comm.h | 6 +-- src/mpi/comm/builtin_comms.c | 9 ++++ src/mpi/comm/comm_impl.c | 93 +++++++++++++++++++++++++++++++----- src/mpi/comm/comm_split.c | 13 +++++ src/mpi/comm/commutil.c | 34 +++++++++++++ src/mpid/ch3/src/ch3u_port.c | 24 +++++++++- src/mpid/ch4/src/ch4_comm.c | 4 ++ src/mpid/ch4/src/init_comm.c | 15 +++++- 8 files changed, 179 insertions(+), 19 deletions(-) diff --git a/src/include/mpir_comm.h b/src/include/mpir_comm.h index 8af43abc6d7..af50031ebaf 100644 --- a/src/include/mpir_comm.h +++ b/src/include/mpir_comm.h @@ -166,9 +166,9 @@ struct MPIR_Comm { int rank; /* Value of MPI_Comm_rank */ MPIR_Attribute *attributes; /* List of attributes */ int local_size; /* Value of MPI_Comm_size for local group */ - MPIR_Group *local_group, /* Groups in communicator. */ - *remote_group; /* The local and remote groups are the - * same for intra communicators */ + MPIR_Group *local_group; /* Groups in communicator. */ + MPIR_Group *remote_group; /* The remote group in a inter communicator. + * Must be NULL in a intra communicator. 
*/ MPIR_Comm_kind_t comm_kind; /* MPIR_COMM_KIND__INTRACOMM or MPIR_COMM_KIND__INTERCOMM */ char name[MPI_MAX_OBJECT_NAME]; /* Required for MPI-2 */ MPIR_Errhandler *errhandler; /* Pointer to the error handler structure */ diff --git a/src/mpi/comm/builtin_comms.c b/src/mpi/comm/builtin_comms.c index 16a75588036..7e0273a677f 100644 --- a/src/mpi/comm/builtin_comms.c +++ b/src/mpi/comm/builtin_comms.c @@ -30,6 +30,9 @@ int MPIR_init_comm_world(void) MPIR_Process.comm_world->remote_size = MPIR_Process.size; MPIR_Process.comm_world->local_size = MPIR_Process.size; + MPIR_Process.comm_world->local_group = MPIR_GROUP_WORLD_PTR; + MPIR_Group_add_ref(MPIR_GROUP_WORLD_PTR); + mpi_errno = MPIR_Comm_commit(MPIR_Process.comm_world); MPIR_ERR_CHECK(mpi_errno); @@ -59,6 +62,9 @@ int MPIR_init_comm_self(void) MPIR_Process.comm_self->remote_size = 1; MPIR_Process.comm_self->local_size = 1; + MPIR_Process.comm_self->local_group = MPIR_GROUP_SELF_PTR; + MPIR_Group_add_ref(MPIR_GROUP_SELF_PTR); + mpi_errno = MPIR_Comm_commit(MPIR_Process.comm_self); MPIR_ERR_CHECK(mpi_errno); @@ -91,6 +97,9 @@ int MPIR_init_icomm_world(void) MPIR_Process.icomm_world->remote_size = MPIR_Process.size; MPIR_Process.icomm_world->local_size = MPIR_Process.size; + MPIR_Process.icomm_world->local_group = MPIR_GROUP_WORLD_PTR; + MPIR_Group_add_ref(MPIR_GROUP_WORLD_PTR); + mpi_errno = MPIR_Comm_commit(MPIR_Process.icomm_world); MPIR_ERR_CHECK(mpi_errno); diff --git a/src/mpi/comm/comm_impl.c b/src/mpi/comm/comm_impl.c index 56db002f58c..746b2825b6a 100644 --- a/src/mpi/comm/comm_impl.c +++ b/src/mpi/comm/comm_impl.c @@ -337,8 +337,7 @@ int MPIR_Comm_create_intra(MPIR_Comm * comm_ptr, MPIR_Group * group_ptr, MPIR_Co (*newcomm_ptr)->local_group = group_ptr; MPIR_Group_add_ref(group_ptr); - (*newcomm_ptr)->remote_group = group_ptr; - MPIR_Group_add_ref(group_ptr); + (*newcomm_ptr)->remote_group = NULL; (*newcomm_ptr)->context_id = (*newcomm_ptr)->recvcontext_id; (*newcomm_ptr)->remote_size = 
(*newcomm_ptr)->local_size = n; @@ -382,15 +381,12 @@ int MPIR_Comm_create_inter(MPIR_Comm * comm_ptr, MPIR_Group * group_ptr, MPIR_Co int mpi_errno = MPI_SUCCESS; int new_context_id; int *mapping = NULL; - int *remote_mapping = NULL; - MPIR_Comm *mapping_comm = NULL; - int remote_size = -1; - int rinfo[2]; MPIR_CHKLMEM_DECL(1); MPIR_FUNC_ENTER; MPIR_Assert(comm_ptr->comm_kind == MPIR_COMM_KIND__INTERCOMM); + MPIR_Session *session_ptr = comm_ptr->session_ptr; /* Create a new communicator from the specified group members */ @@ -409,6 +405,7 @@ int MPIR_Comm_create_inter(MPIR_Comm * comm_ptr, MPIR_Group * group_ptr, MPIR_Co MPIR_Assert(new_context_id != 0); MPIR_Assert(new_context_id != comm_ptr->recvcontext_id); + MPIR_Comm *mapping_comm; mpi_errno = MPII_Comm_create_calculate_mapping(group_ptr, comm_ptr, &mapping, &mapping_comm); MPIR_ERR_CHECK(mpi_errno); @@ -434,7 +431,7 @@ int MPIR_Comm_create_inter(MPIR_Comm * comm_ptr, MPIR_Group * group_ptr, MPIR_Co (*newcomm_ptr)->is_low_group = comm_ptr->is_low_group; - MPIR_Comm_set_session_ptr(*newcomm_ptr, comm_ptr->session_ptr); + MPIR_Comm_set_session_ptr(*newcomm_ptr, session_ptr); } /* There is an additional step. We must communicate the @@ -445,6 +442,11 @@ int MPIR_Comm_create_inter(MPIR_Comm * comm_ptr, MPIR_Group * group_ptr, MPIR_Co * in the remote group, from which the remote network address * mapping can be constructed. 
We need to use the "collective" * context in the original intercommunicator */ + + int remote_size = -1; + int *remote_mapping; /* a list of remote ranks */ + int rinfo[2]; + if (comm_ptr->rank == 0) { int info[2]; info[0] = new_context_id; @@ -494,6 +496,7 @@ int MPIR_Comm_create_inter(MPIR_Comm * comm_ptr, MPIR_Group * group_ptr, MPIR_Co MPIR_Assert(remote_size >= 0); + if (group_ptr->rank != MPI_UNDEFINED) { (*newcomm_ptr)->remote_size = remote_size; /* Now, everyone has the remote_mapping, and can apply that to @@ -505,6 +508,23 @@ int MPIR_Comm_create_inter(MPIR_Comm * comm_ptr, MPIR_Group * group_ptr, MPIR_Co mapping, remote_mapping, mapping_comm, *newcomm_ptr); MPIR_ERR_CHECK(mpi_errno); + /* create remote_group. + * FIXME: we can directly exchange group maps once we get rid of comm mappers */ + MPIR_Group *remote_group; + + MPIR_Lpid *remote_map; + remote_map = MPL_malloc(remote_size * sizeof(MPIR_Lpid), MPL_MEM_GROUP); + MPIR_ERR_CHKANDJUMP(!remote_map, mpi_errno, MPI_ERR_OTHER, "**nomem"); + + MPIR_Group *mapping_group = mapping_comm->remote_group; + MPIR_Assert(mapping_group); + for (int i = 0; i < remote_size; i++) { + remote_map[i] = MPIR_Group_rank_to_lpid(mapping_group, remote_mapping[i]); + } + mpi_errno = MPIR_Group_create_map(remote_size, MPI_UNDEFINED, session_ptr, remote_map, + &remote_group); + (*newcomm_ptr)->remote_group = remote_group; + (*newcomm_ptr)->tainted = comm_ptr->tainted; mpi_errno = MPIR_Comm_commit(*newcomm_ptr); MPIR_ERR_CHECK(mpi_errno); @@ -605,8 +625,7 @@ int MPIR_Comm_create_group_impl(MPIR_Comm * comm_ptr, MPIR_Group * group_ptr, in (*newcomm_ptr)->local_group = group_ptr; MPIR_Group_add_ref(group_ptr); - (*newcomm_ptr)->remote_group = group_ptr; - MPIR_Group_add_ref(group_ptr); + (*newcomm_ptr)->remote_group = NULL; (*newcomm_ptr)->context_id = (*newcomm_ptr)->recvcontext_id; (*newcomm_ptr)->remote_size = (*newcomm_ptr)->local_size = n; @@ -913,6 +932,9 @@ int MPIR_Comm_remote_group_impl(MPIR_Comm * comm_ptr, MPIR_Group ** 
group_ptr) int mpi_errno = MPI_SUCCESS; MPIR_FUNC_ENTER; + /* FIXME: remove the following remote_group creation once this assertion passes */ + MPIR_Assert(comm_ptr->comm_kind == MPIR_COMM_KIND__INTERCOMM && comm_ptr->remote_group); + /* Create a group and populate it with the local process ids */ if (!comm_ptr->remote_group) { int n = comm_ptr->remote_size; @@ -965,6 +987,7 @@ int MPIR_Intercomm_create_impl(MPIR_Comm * local_comm_ptr, int local_leader, uint64_t *remote_lpids = NULL; int comm_info[3]; int is_low_group = 0; + MPIR_Session *session_ptr = local_comm_ptr->session_ptr; MPIR_FUNC_ENTER; @@ -1042,7 +1065,14 @@ int MPIR_Intercomm_create_impl(MPIR_Comm * local_comm_ptr, int local_leader, (*new_intercomm_ptr)->local_comm = 0; (*new_intercomm_ptr)->is_low_group = is_low_group; - MPIR_Comm_set_session_ptr(*new_intercomm_ptr, local_comm_ptr->session_ptr); + (*new_intercomm_ptr)->local_group = local_comm_ptr->local_group; + MPIR_Group_add_ref(local_comm_ptr->local_group); + + /* construct remote_group */ + mpi_errno = MPIR_Group_create_map(remote_size, MPI_UNDEFINED, session_ptr, remote_lpids, + &(*new_intercomm_ptr)->remote_group); + + MPIR_Comm_set_session_ptr(*new_intercomm_ptr, session_ptr); mpi_errno = MPID_Create_intercomm_from_lpids(*new_intercomm_ptr, remote_size, remote_lpids); if (mpi_errno) @@ -1064,8 +1094,6 @@ int MPIR_Intercomm_create_impl(MPIR_Comm * local_comm_ptr, int local_leader, fn_exit: - MPL_free(remote_lpids); - remote_lpids = NULL; MPIR_FUNC_EXIT; return mpi_errno; fn_fail: @@ -1106,6 +1134,15 @@ int MPIR_peer_intercomm_create(int context_id, int recvcontext_id, } MPID_THREAD_CS_EXIT(VCI, comm_self->mutex); + MPIR_Session *session_ptr = NULL; /* Can we just use NULL session since peer_intercomm is always temporary? 
*/ + MPIR_Lpid my_lpid = MPIR_Group_rank_to_lpid(comm_self->local_group, 0); + mpi_errno = MPIR_Group_create_stride(1, 0, session_ptr, my_lpid, 1, 1, + &(*newcomm)->local_group); + MPIR_ERR_CHECK(mpi_errno); + mpi_errno = MPIR_Group_create_stride(1, 0, session_ptr, remote_lpid, 1, 1, + &(*newcomm)->remote_group); + MPIR_ERR_CHECK(mpi_errno); + (*newcomm)->tainted = 1; mpi_errno = MPIR_Comm_commit(*newcomm); MPIR_ERR_CHECK(mpi_errno); @@ -1222,6 +1259,37 @@ int MPIR_Intercomm_merge_impl(MPIR_Comm * comm_ptr, int high, MPIR_Comm ** new_i MPIR_Comm_set_session_ptr(*new_intracomm_ptr, comm_ptr->session_ptr); + /* construct local_group */ + MPIR_Group *new_local_group; + + MPIR_Lpid *map; + map = MPL_malloc(new_size * sizeof(MPIR_Lpid), MPL_MEM_GROUP); + MPIR_ERR_CHKANDJUMP(!map, mpi_errno, MPI_ERR_OTHER, "**nomem"); + + int myrank; + MPIR_Group *group1, *group2; + if (local_high) { + group1 = comm_ptr->remote_group; + group2 = comm_ptr->local_group; + myrank = group1->size + group2->rank; + } else { + group1 = comm_ptr->local_group; + group2 = comm_ptr->remote_group; + myrank = group1->rank; + } + for (int i = 0; i < group1->size; i++) { + map[i] = MPIR_Group_rank_to_lpid(group1, i); + } + for (int i = 0; i < group2->size; i++) { + map[group1->size + i] = MPIR_Group_rank_to_lpid(group2, i); + } + + mpi_errno = MPIR_Group_create_map(new_size, myrank, comm_ptr->session_ptr, map, + &new_local_group); + + (*new_intracomm_ptr)->local_group = new_local_group; + MPIR_Group_add_ref(new_local_group); + /* Now we know which group comes first. 
Build the new mapping * from the existing comm */ mpi_errno = create_and_map(comm_ptr, local_high, (*new_intracomm_ptr)); @@ -1260,6 +1328,7 @@ int MPIR_Intercomm_merge_impl(MPIR_Comm * comm_ptr, int high, MPIR_Comm ** new_i (*new_intracomm_ptr)->recvcontext_id = new_context_id; MPIR_Comm_set_session_ptr(*new_intracomm_ptr, comm_ptr->session_ptr); + (*new_intracomm_ptr)->local_group = new_local_group; mpi_errno = create_and_map(comm_ptr, local_high, (*new_intracomm_ptr)); MPIR_ERR_CHECK(mpi_errno); diff --git a/src/mpi/comm/comm_split.c b/src/mpi/comm/comm_split.c index 7c5519278e4..4c0e5a826c2 100644 --- a/src/mpi/comm/comm_split.c +++ b/src/mpi/comm/comm_split.c @@ -292,6 +292,10 @@ int MPIR_Comm_split_impl(MPIR_Comm * comm_ptr, int color, int key, MPIR_Comm ** (*newcomm_ptr)->rank = i; } + mpi_errno = MPIR_Group_incl_impl(comm_ptr->local_group, new_size, mapper->src_mapping, + &(*newcomm_ptr)->local_group); + MPIR_ERR_CHECK(mpi_errno); + /* For the remote group, the situation is more complicated. * We need to find the size of our "partner" group in the * remote comm. 
The easiest way (in terms of code) is for @@ -313,6 +317,11 @@ int MPIR_Comm_split_impl(MPIR_Comm * comm_ptr, int color, int key, MPIR_Comm ** for (i = 0; i < new_remote_size; i++) mapper->src_mapping[i] = remotekeytable[i].color; + mpi_errno = MPIR_Group_incl_impl(comm_ptr->remote_group, + new_remote_size, mapper->src_mapping, + &(*newcomm_ptr)->remote_group); + MPIR_ERR_CHECK(mpi_errno); + (*newcomm_ptr)->context_id = remote_context_id; (*newcomm_ptr)->remote_size = new_remote_size; (*newcomm_ptr)->local_comm = 0; @@ -331,6 +340,10 @@ int MPIR_Comm_split_impl(MPIR_Comm * comm_ptr, int color, int key, MPIR_Comm ** if (keytable[i].color == comm_ptr->rank) (*newcomm_ptr)->rank = i; } + + mpi_errno = MPIR_Group_incl_impl(comm_ptr->local_group, new_size, mapper->src_mapping, + &(*newcomm_ptr)->local_group); + MPIR_ERR_CHECK(mpi_errno); } /* Inherit the error handler (if any) */ diff --git a/src/mpi/comm/commutil.c b/src/mpi/comm/commutil.c index 9a51e8565ee..da824bff420 100644 --- a/src/mpi/comm/commutil.c +++ b/src/mpi/comm/commutil.c @@ -382,6 +382,10 @@ int MPII_Setup_intercomm_localcomm(MPIR_Comm * intercomm_ptr) mpi_errno = MPII_Comm_init(localcomm_ptr); MPIR_ERR_CHECK(mpi_errno); + MPIR_Assert(intercomm_ptr->local_group); + localcomm_ptr->local_group = intercomm_ptr->local_group; + MPIR_Group_add_ref(intercomm_ptr->local_group); + MPIR_Comm_set_session_ptr(localcomm_ptr, intercomm_ptr->session_ptr); /* use the parent intercomm's recv ctx as the basis for our ctx */ @@ -687,6 +691,14 @@ int MPIR_Comm_create_subcomms(MPIR_Comm * comm) /* Copy relevant hints to node_comm */ propagate_hints_to_subcomm(comm, comm->node_comm); + /* construct local_group */ + MPIR_Group *parent_group = comm->local_group; + MPIR_Assert(parent_group); + mpi_errno = MPIR_Group_incl_impl(parent_group, num_local, local_procs, + &comm->node_comm->local_group); + MPIR_ERR_CHECK(mpi_errno); + + /* mapper */ MPIR_Comm_map_irregular(comm->node_comm, comm, local_procs, num_local, 
MPIR_COMM_MAP_DIR__L2L, NULL); mpi_errno = MPIR_Comm_commit_internal(comm->node_comm); @@ -714,6 +726,14 @@ int MPIR_Comm_create_subcomms(MPIR_Comm * comm) /* Copy relevant hints to node_roots_comm */ propagate_hints_to_subcomm(comm, comm->node_roots_comm); + /* construct local_group */ + MPIR_Group *parent_group = comm->local_group; + MPIR_Assert(parent_group); + mpi_errno = MPIR_Group_incl_impl(parent_group, num_external, external_procs, + &comm->node_roots_comm->local_group); + MPIR_ERR_CHECK(mpi_errno); + + /* mapper */ MPIR_Comm_map_irregular(comm->node_roots_comm, comm, external_procs, num_external, MPIR_COMM_MAP_DIR__L2L, NULL); mpi_errno = MPIR_Comm_commit_internal(comm->node_roots_comm); @@ -961,6 +981,13 @@ int MPII_Comm_copy(MPIR_Comm * comm_ptr, int size, MPIR_Info * info, MPIR_Comm * newcomm_ptr->comm_kind = comm_ptr->comm_kind; newcomm_ptr->local_comm = 0; + newcomm_ptr->local_group = comm_ptr->local_group; + MPIR_Group_add_ref(comm_ptr->local_group); + if (comm_ptr->comm_kind == MPIR_COMM_KIND__INTERCOMM) { + newcomm_ptr->remote_group = comm_ptr->remote_group; + MPIR_Group_add_ref(comm_ptr->remote_group); + } + MPIR_Comm_set_session_ptr(newcomm_ptr, comm_ptr->session_ptr); /* There are two cases here - size is the same as the old communicator, @@ -1059,6 +1086,13 @@ int MPII_Comm_copy_data(MPIR_Comm * comm_ptr, MPIR_Info * info, MPIR_Comm ** out newcomm_ptr->comm_kind = comm_ptr->comm_kind; newcomm_ptr->local_comm = 0; + newcomm_ptr->local_group = comm_ptr->local_group; + MPIR_Group_add_ref(comm_ptr->local_group); + if (comm_ptr->comm_kind == MPIR_COMM_KIND__INTERCOMM) { + newcomm_ptr->remote_group = comm_ptr->remote_group; + MPIR_Group_add_ref(comm_ptr->remote_group); + } + if (comm_ptr->comm_kind == MPIR_COMM_KIND__INTRACOMM) MPIR_Comm_map_dup(newcomm_ptr, comm_ptr, MPIR_COMM_MAP_DIR__L2L); else diff --git a/src/mpid/ch3/src/ch3u_port.c b/src/mpid/ch3/src/ch3u_port.c index bd6c8bebfeb..39249e73035 100644 --- a/src/mpid/ch3/src/ch3u_port.c +++ 
b/src/mpid/ch3/src/ch3u_port.c @@ -544,6 +544,13 @@ static int MPIDI_CH3I_Initialize_tmp_comm(MPIR_Comm **comm_pptr, MPIR_Coll_comm_init(tmp_comm); + MPIR_Lpid local_lpid = tmp_comm->dev.local_vcrt->vcr_table[0]->lpid; + MPIR_Lpid remote_lpid = tmp_comm->dev.vcrt->vcr_table[0]->lpid; + mpi_errno = MPIR_Group_create_stride(1, 0, commself_ptr->session_ptr, local_lpid, 1, 1, + &tmp_comm->local_group); + mpi_errno = MPIR_Group_create_stride(1, 0, commself_ptr->session_ptr, remote_lpid, 1, 1, + &tmp_comm->remote_group); + /* Even though this is a tmp comm and we don't call MPI_Comm_commit, we still need to call the creation hook because the destruction hook will be called in comm_release */ @@ -1337,8 +1344,6 @@ static int SetupNewIntercomm( MPIR_Comm *comm_ptr, int remote_comm_size, intercomm->remote_size = remote_comm_size; intercomm->local_size = comm_ptr->local_size; intercomm->rank = comm_ptr->rank; - intercomm->local_group = NULL; - intercomm->remote_group = NULL; intercomm->comm_kind = MPIR_COMM_KIND__INTERCOMM; intercomm->local_comm = NULL; @@ -1356,6 +1361,21 @@ static int SetupNewIntercomm( MPIR_Comm *comm_ptr, int remote_comm_size, remote_translation[i].pg_rank, &intercomm->dev.vcrt->vcr_table[i]); } + intercomm->local_group = comm_ptr->local_group; + MPIR_Group_add_ref(comm_ptr->local_group); + + MPIR_Lpid *remote_map; + remote_map = MPL_malloc(remote_comm_size * sizeof(MPIR_Lpid), MPL_MEM_GROUP); + MPIR_ERR_CHKANDJUMP(!remote_map, mpi_errno, MPI_ERR_OTHER, "**nomem"); + for (i=0; i < intercomm->remote_size; i++) { + MPIDI_PG_t *pg = remote_pg[remote_translation[i].pg_index]; + int rank = remote_translation[i].pg_rank; + remote_map[i] = pg->vct[rank].lpid; + } + mpi_errno = MPIR_Group_create_map(remote_comm_size, MPI_UNDEFINED, comm_ptr->session_ptr, + remote_map, &intercomm->remote_group); + MPIR_ERR_CHECK(mpi_errno); + mpi_errno = MPIR_Comm_commit(intercomm); MPIR_ERR_CHECK(mpi_errno); diff --git a/src/mpid/ch4/src/ch4_comm.c b/src/mpid/ch4/src/ch4_comm.c 
index aa705061b22..2ca6c693cfa 100644 --- a/src/mpid/ch4/src/ch4_comm.c +++ b/src/mpid/ch4/src/ch4_comm.c @@ -795,6 +795,10 @@ int MPIDI_Comm_create_multi_leaders(MPIR_Comm * comm) MPIR_Comm_map_irregular(MPIDI_COMM(comm, multi_leads_comm), comm, external_procs, num_external, MPIR_COMM_MAP_DIR__L2L, NULL); + mpi_errno = MPIR_Group_incl_impl(comm->local_group, num_external, external_procs, + &MPIDI_COMM(comm, multi_leads_comm)->local_group); + MPIR_ERR_CHECK(mpi_errno); + /* Notify device of communicator creation */ mpi_errno = MPID_Comm_commit_pre_hook(MPIDI_COMM(comm, multi_leads_comm)); if (mpi_errno) diff --git a/src/mpid/ch4/src/init_comm.c b/src/mpid/ch4/src/init_comm.c index e546337bd6f..249d7700324 100644 --- a/src/mpid/ch4/src/init_comm.c +++ b/src/mpid/ch4/src/init_comm.c @@ -33,6 +33,17 @@ int MPIDI_create_init_comm(MPIR_Comm ** comm) init_comm->remote_size = node_roots_comm_size; init_comm->local_size = node_roots_comm_size; init_comm->coll.pof2 = MPL_pof2(node_roots_comm_size); + + MPIR_Lpid *map; + map = MPL_malloc(node_roots_comm_size * sizeof(MPIR_Lpid), MPL_MEM_GROUP); + MPIR_ERR_CHKANDJUMP(!map, mpi_errno, MPI_ERR_OTHER, "**nomem"); + for (i = 0; i < node_roots_comm_size; ++i) { + map[i] = MPIR_Process.node_root_map[i]; + } + mpi_errno = MPIR_Group_create_map(node_roots_comm_size, node_roots_comm_rank, NULL, + map, &init_comm->local_group); + MPIR_ERR_CHECK(mpi_errno); + MPIDI_COMM(init_comm, map).mode = MPIDI_RANK_MAP_LUT_INTRA; mpi_errno = MPIDIU_alloc_lut(&lut, node_roots_comm_size); MPIR_ERR_CHECK(mpi_errno); @@ -47,8 +58,8 @@ int MPIDI_create_init_comm(MPIR_Comm ** comm) mpi_errno = MPIDIG_init_comm(init_comm); MPIR_ERR_CHECK(mpi_errno); /* hacky, consider a separate MPIDI_{NM,SHM}_init_comm_hook - * to initialize the init_comm, e.g. to eliminate potential - * runtime features for stability during init */ + * to initialize the init_comm, e.g. 
to eliminate potential + * runtime features for stability during init */ mpi_errno = MPIDI_NM_mpi_comm_commit_pre_hook(init_comm); MPIR_ERR_CHECK(mpi_errno); From 3371f7913e52039df66d64e1981969c308ab4123 Mon Sep 17 00:00:00 2001 From: Hui Zhou Date: Thu, 12 Dec 2024 22:53:50 -0600 Subject: [PATCH 19/59] group: avoid freeing MPIR_Group_empty In many places we just return MPIR_Group_empty without incrementing the ref_count. This is fixable. But for now, let's avoid freeing it. --- src/mpi/group/grouputil.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/src/mpi/group/grouputil.c b/src/mpi/group/grouputil.c index 3a709451103..f3bdefc4b42 100644 --- a/src/mpi/group/grouputil.c +++ b/src/mpi/group/grouputil.c @@ -95,10 +95,16 @@ int MPIR_Group_finalize(void) int MPIR_Group_release(MPIR_Group * group_ptr) { int mpi_errno = MPI_SUCCESS; - int inuse; + /* MPIR_Group_empty was not properly reference counted - FIXME */ + if (group_ptr == MPIR_Group_empty) { + goto fn_exit; + } + + int inuse; MPIR_Group_release_ref(group_ptr, &inuse); if (!inuse) { + MPIR_Assert(!HANDLE_IS_BUILTIN(group_ptr->handle)); /* Only if refcount is 0 do we actually free. */ if (group_ptr->pmap.use_map) { MPL_free(group_ptr->pmap.u.map); @@ -109,6 +115,8 @@ int MPIR_Group_release(MPIR_Group * group_ptr) } MPIR_Handle_obj_free(&MPIR_Group_mem, group_ptr); } + + fn_exit: return mpi_errno; } From 3d9a110f73f82b37df51126d6b69fa6991fbfc17 Mon Sep 17 00:00:00 2001 From: Hui Zhou Date: Thu, 12 Dec 2024 23:39:02 -0600 Subject: [PATCH 20/59] ch4: release init_comm->local_group The init_comm does the release manually.
--- src/mpid/ch4/src/init_comm.c | 1 + 1 file changed, 1 insertion(+) diff --git a/src/mpid/ch4/src/init_comm.c b/src/mpid/ch4/src/init_comm.c index 249d7700324..17915496417 100644 --- a/src/mpid/ch4/src/init_comm.c +++ b/src/mpid/ch4/src/init_comm.c @@ -78,6 +78,7 @@ void MPIDI_destroy_init_comm(MPIR_Comm ** comm_ptr) if (*comm_ptr != NULL) { comm = *comm_ptr; MPIDIU_release_lut(MPIDI_COMM(comm, map).irreg.lut.t); + MPIR_Group_release(comm->local_group); MPIDIG_destroy_comm(comm); MPIR_Object_release_ref(comm, &in_use); MPIR_Assertp(in_use == 0); From 0242bbc3d46039ab5b52d9679374fe756ac135ea Mon Sep 17 00:00:00 2001 From: Hui Zhou Date: Thu, 12 Dec 2024 11:06:53 -0600 Subject: [PATCH 21/59] ch4: assert group before communicator commit Add assertions to make sure the local_group and remote_group (for inter communicators) are always set before MPID_Comm_commit_pre_hook. --- src/mpid/ch4/src/ch4_comm.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/mpid/ch4/src/ch4_comm.c b/src/mpid/ch4/src/ch4_comm.c index 2ca6c693cfa..8429acb8290 100644 --- a/src/mpid/ch4/src/ch4_comm.c +++ b/src/mpid/ch4/src/ch4_comm.c @@ -140,6 +140,9 @@ int MPID_Comm_commit_pre_hook(MPIR_Comm * comm) int mpi_errno; MPIR_FUNC_ENTER; + MPIR_Assert(comm->local_group); + MPIR_Assert(comm->comm_kind == MPIR_COMM_KIND__INTRACOMM || comm->remote_group); + if (comm == MPIR_Process.comm_world) { MPIDI_COMM(comm, map).mode = MPIDI_RANK_MAP_DIRECT_INTRA; MPIDI_COMM(comm, map).avtid = 0; From fcce168f98009a88be080687f819e4ee5b5241eb Mon Sep 17 00:00:00 2001 From: Hui Zhou Date: Thu, 12 Dec 2024 12:14:25 -0600 Subject: [PATCH 22/59] ---- START HERE ---- --- dummy | 1 + 1 file changed, 1 insertion(+) diff --git a/dummy b/dummy index d00491fd7e5..6ed281c757a 100644 --- a/dummy +++ b/dummy @@ -1 +1,2 @@ 1 +1 From e598d92af4a6d892dfdc301cab645a63ffc60f6f Mon Sep 17 00:00:00 2001 From: Hui Zhou Date: Sat, 21 Dec 2024 20:46:31 -0600 Subject: [PATCH 23/59] abi: run ABI_init_builtins in MPI_T_init_thread 
Otherwise, the MPI_T functions may not be able to convert builtin datatypes. --- maint/local_python/binding_c.py | 2 +- src/binding/abi/mpi_abi_util.h | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/maint/local_python/binding_c.py b/maint/local_python/binding_c.py index f219ee4e194..2654a58cc75 100644 --- a/maint/local_python/binding_c.py +++ b/maint/local_python/binding_c.py @@ -1106,7 +1106,7 @@ def out_can_be_undefined(p): G.out.append("int ret = " + static_call + ";") for l in post_filters: G.out.append(l) - if re.match(r'MPI_(Init|Init_thread|Session_init)$', func_name, re.IGNORECASE): + if re.match(r'MPI_(Init|Init_thread|Session_init)|MPI_T_init_thread$', func_name, re.IGNORECASE): G.out.append("ABI_init_builtins();") G.out.append("return ret;") G.out.append("DEDENT") diff --git a/src/binding/abi/mpi_abi_util.h b/src/binding/abi/mpi_abi_util.h index 5793b044c87..07be78989d8 100644 --- a/src/binding/abi/mpi_abi_util.h +++ b/src/binding/abi/mpi_abi_util.h @@ -137,6 +137,7 @@ static inline ABI_Datatype ABI_Datatype_from_mpi(MPI_Datatype in) return (ABI_Datatype) ((intptr_t) ABI_DATATYPE_NULL + i); } } + MPIR_Assert(0); } MPIR_Datatype *ptr; MPIR_Datatype_get_ptr(in, ptr); From 27fbc603af93adbd52bb593cb68397cd6bcebcb1 Mon Sep 17 00:00:00 2001 From: Hui Zhou Date: Fri, 20 Dec 2024 23:04:09 -0600 Subject: [PATCH 24/59] test/runtests: capture stray output in run_mpitests When we run tests as functions, the stray output in MPI_Finalize, such as the debug messages in debug builds, was not captured previously. This patch makes sure we report such stray output as failures.
--- test/mpi/include/multi_tests.c | 2 +- test/mpi/runtests | 16 ++++++++++++++++ 2 files changed, 17 insertions(+), 1 deletion(-) diff --git a/test/mpi/include/multi_tests.c b/test/mpi/include/multi_tests.c index 96341d20abf..d7ccfc0c66b 100644 --- a/test/mpi/include/multi_tests.c +++ b/test/mpi/include/multi_tests.c @@ -204,7 +204,7 @@ static void cleanup_cvars(void) { for (int i = 0; i < num_cvars; i++) { if (cvar_list[i].num_enums > 0) { - for (int j = 0; j < num_cvars; j++) { + for (int j = 0; j < cvar_list[i].num_enums; j++) { free(cvar_list[i].enum_list[j]); } } diff --git a/test/mpi/runtests b/test/mpi/runtests index d39f72a1a79..45fc3aaf894 100755 --- a/test/mpi/runtests +++ b/test/mpi/runtests @@ -765,6 +765,22 @@ sub run_mpitests { } } close($in); + { + my @inline; + while (<$out>) { + print " $_" if $g_opt{verbose}; + push @inline, $_; + } + if (@inline) { + my $runtime = 0; + my $test_opt = {name=>"run_mpitests", np=>$np, dir=>".", args=>[], envs=>[] }; + RunPreMsg($test_opt); + print "run_mpitests: stray output in finalize\n"; + show_failed_test_detail($test_opt, \@inline); + RunTestFailed($test_opt, join('', @inline), $runtime); + RunPostMsg($test_opt); + } + } close($out); waitpid($pid, 0); # TODO: check $? if ($flag_aborted) { From c85c02c27574998bf2fdfe5395b5cb38efe910ad Mon Sep 17 00:00:00 2001 From: Hui Zhou Date: Tue, 17 Dec 2024 22:11:07 -0600 Subject: [PATCH 25/59] comm: add MPIR_comm_rank_to_lpid and make it inline Now that we always have group inside a communicator, we can simply return the lpid from the group. Because this will be used in the hot path, make it inline. 
--- src/include/mpir_comm.h | 9 +++++++++ src/include/mpir_group.h | 16 +++++++++++++++- src/mpi/group/grouputil.c | 20 -------------------- 3 files changed, 24 insertions(+), 21 deletions(-) diff --git a/src/include/mpir_comm.h b/src/include/mpir_comm.h index af50031ebaf..16c90bfe5b4 100644 --- a/src/include/mpir_comm.h +++ b/src/include/mpir_comm.h @@ -296,6 +296,15 @@ void MPIR_stream_comm_free(MPIR_Comm * comm_ptr); int MPIR_Comm_copy_stream(MPIR_Comm * oldcomm, MPIR_Comm * newcomm); int MPIR_get_local_gpu_stream(MPIR_Comm * comm_ptr, MPL_gpu_stream_t * gpu_stream); +MPL_STATIC_INLINE_PREFIX MPIR_Lpid MPIR_comm_rank_to_lpid(MPIR_Comm * comm_ptr, int rank) +{ + if (comm_ptr->comm_kind == MPIR_COMM_KIND__INTRACOMM) { + return MPIR_Group_rank_to_lpid(comm_ptr->local_group, rank); + } else { + return MPIR_Group_rank_to_lpid(comm_ptr->remote_group, rank); + } +} + MPL_STATIC_INLINE_PREFIX MPIR_Stream *MPIR_stream_comm_get_local_stream(MPIR_Comm * comm_ptr) { if (comm_ptr->stream_comm_type == MPIR_STREAM_COMM_SINGLE) { diff --git a/src/include/mpir_group.h b/src/include/mpir_group.h index 69b665fcc87..e9e704e56b1 100644 --- a/src/include/mpir_group.h +++ b/src/include/mpir_group.h @@ -142,7 +142,6 @@ int MPIR_Group_create_map(int size, int rank, MPIR_Session * session_ptr, MPIR_L int MPIR_Group_create_stride(int size, int rank, MPIR_Session * session_ptr, MPIR_Lpid offset, MPIR_Lpid stride, MPIR_Lpid blocksize, MPIR_Group ** new_group_ptr); -MPIR_Lpid MPIR_Group_rank_to_lpid(MPIR_Group * group, int rank); int MPIR_Group_lpid_to_rank(MPIR_Group * group, MPIR_Lpid lpid); int MPIR_Group_check_subset(MPIR_Group * group_ptr, MPIR_Comm * comm_ptr); @@ -150,4 +149,19 @@ void MPIR_Group_set_session_ptr(MPIR_Group * group_ptr, MPIR_Session * session_o int MPIR_Group_init(void); int MPIR_Group_finalize(void); +MPL_STATIC_INLINE_PREFIX MPIR_Lpid MPIR_Group_rank_to_lpid(MPIR_Group * group, int rank) +{ + if (rank < 0 || rank >= group->pmap.size) { + return MPI_UNDEFINED; + } + + 
if (group->pmap.use_map) { + return group->pmap.u.map[rank]; + } else { + MPIR_Lpid i_blk = rank / group->pmap.u.stride.blocksize; + MPIR_Lpid r_blk = rank % group->pmap.u.stride.blocksize; + return group->pmap.u.stride.offset + i_blk * group->pmap.u.stride.stride + r_blk; + } +} + #endif /* MPIR_GROUP_H_INCLUDED */ diff --git a/src/mpi/group/grouputil.c b/src/mpi/group/grouputil.c index f3bdefc4b42..1086a486122 100644 --- a/src/mpi/group/grouputil.c +++ b/src/mpi/group/grouputil.c @@ -252,21 +252,6 @@ int MPIR_Group_create_stride(int size, int rank, MPIR_Session * session_ptr, goto fn_exit; } -static MPIR_Lpid pmap_rank_to_lpid(struct MPIR_Pmap *pmap, int rank) -{ - if (rank < 0 || rank >= pmap->size) { - return MPI_UNDEFINED; - } - - if (pmap->use_map) { - return pmap->u.map[rank]; - } else { - MPIR_Lpid i_blk = rank / pmap->u.stride.blocksize; - MPIR_Lpid r_blk = rank % pmap->u.stride.blocksize; - return pmap->u.stride.offset + i_blk * pmap->u.stride.stride + r_blk; - } -} - static int pmap_lpid_to_rank(struct MPIR_Pmap *pmap, MPIR_Lpid lpid) { if (pmap->use_map) { @@ -302,11 +287,6 @@ int MPIR_Group_lpid_to_rank(MPIR_Group * group, MPIR_Lpid lpid) return pmap_lpid_to_rank(&group->pmap, lpid); } -MPIR_Lpid MPIR_Group_rank_to_lpid(MPIR_Group * group, int rank) -{ - return pmap_rank_to_lpid(&group->pmap, rank); -} - #ifdef HAVE_ERROR_CHECKING /* * The following routines are needed only for error checking From 2e78cff25991e7c2fe97bead6557d53b597c25eb Mon Sep 17 00:00:00 2001 From: Hui Zhou Date: Mon, 16 Dec 2024 21:06:32 -0600 Subject: [PATCH 26/59] group: add LPID macros Add the following macros: MPIR_LPID_WORLD_INDEX MPIR_LPID_WORLD_RANK MPIR_LPID_FROM --- src/include/mpir_group.h | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/include/mpir_group.h b/src/include/mpir_group.h index e9e704e56b1..7397aa14f6a 100644 --- a/src/include/mpir_group.h +++ b/src/include/mpir_group.h @@ -81,6 +81,11 @@ int MPIR_find_world(const char *namespace); */ typedef 
uint64_t MPIR_Lpid; +#define MPIR_LPID_WORLD_INDEX(lpid) ((lpid) >> 32) +#define MPIR_LPID_WORLD_RANK(lpid) ((lpid) & 0xffffffff) +#define MPIR_LPID_FROM(world_idx, world_rank) (((uint64_t)(world_idx) << 32) | (world_rank)) +#define MPIR_LPID_DYNAMIC_MASK ((MPIR_Lpid)0x1 << 63) + struct MPIR_Pmap { int size; /* same as group->size, duplicate here so Pmap is logically complete */ bool use_map; From ca0ed496d32d9ce73529a128f6b127fb96411756 Mon Sep 17 00:00:00 2001 From: Hui Zhou Date: Wed, 18 Dec 2024 22:37:53 -0600 Subject: [PATCH 27/59] group: misc fixups for grouputil Fix a typo in setting the size of MPIR_GROUP_SELF. Add ref_count if we return MPIR_GROUP_EMPTY to prevent freeing the builtin when it is released internally. Unfortunately, since users can directly use MPI_GROUP_EMPTY, we can't keep ref_count accurate. But at least we can keep it positive to prevent an actual free. --- src/mpi/group/grouputil.c | 1 + 1 file changed, 1 insertion(+) diff --git a/src/mpi/group/grouputil.c b/src/mpi/group/grouputil.c index 1086a486122..0f21402e7ca 100644 --- a/src/mpi/group/grouputil.c +++ b/src/mpi/group/grouputil.c @@ -196,6 +196,7 @@ int MPIR_Group_create_map(int size, int rank, MPIR_Session * session_ptr, MPIR_L * for others it is implied */ MPL_free(map); *new_group_ptr = MPIR_Group_empty; + MPIR_Group_add_ref(*new_group_ptr); goto fn_exit; } else { MPIR_Group *newgrp; From e139b152793060df6ec7fc5ecaa914f39f5419cd Mon Sep 17 00:00:00 2001 From: Hui Zhou Date: Thu, 19 Dec 2024 15:56:20 -0600 Subject: [PATCH 28/59] group: add MPIR_Group_dup duplicating builtin groups The builtin groups are in session NULL. We need to duplicate the groups in MPIR_Group_from_session_pset_impl to return a group in the correct session.
--- src/mpi/group/grouputil.c | 37 +++++++++++++++++++------------------ 1 file changed, 19 insertions(+), 18 deletions(-) diff --git a/src/mpi/group/grouputil.c b/src/mpi/group/grouputil.c index 0f21402e7ca..c4ffac4a8d5 100644 --- a/src/mpi/group/grouputil.c +++ b/src/mpi/group/grouputil.c @@ -156,28 +156,29 @@ int MPIR_Group_create(int nproc, MPIR_Group ** new_group_ptr) int MPIR_Group_dup(MPIR_Group * old_group, MPIR_Session * session_ptr, MPIR_Group ** new_group_ptr) { int mpi_errno = MPI_SUCCESS; - MPIR_Group *new_group; - new_group = (MPIR_Group *) MPIR_Handle_obj_alloc(&MPIR_Group_mem); - if (!new_group) { - mpi_errno = MPIR_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE, "MPIR_Group_dup", - __LINE__, MPI_ERR_OTHER, "**nomem", 0); - goto fn_fail; - } - MPIR_Object_set_ref(new_group, 1); + *new_group_ptr = (MPIR_Group *) MPIR_Handle_obj_alloc(&MPIR_Group_mem); + MPIR_ERR_CHKANDJUMP(!*new_group_ptr, mpi_errno, MPI_ERR_OTHER, "**nomem"); + MPIR_Object_set_ref(*new_group_ptr, 1); + + (*new_group_ptr)->size = old_group->size; + (*new_group_ptr)->rank = old_group->rank; + MPIR_Group_set_session_ptr(*new_group_ptr, session_ptr); + memcpy(&(*new_group_ptr)->pmap, &old_group->pmap, sizeof(struct MPIR_Pmap)); - /* initialize fields */ - new_group->size = old_group->size; - new_group->rank = old_group->rank; - MPIR_Group_set_session_ptr(new_group, session_ptr); - memcpy(&new_group->pmap, &old_group->pmap, sizeof(struct MPIR_Pmap)); if (old_group->pmap.use_map) { - new_group->pmap.u.map = MPL_malloc(old_group->size * sizeof(MPIR_Lpid), MPL_MEM_GROUP); - MPIR_ERR_CHKANDJUMP(!new_group->pmap.u.map, mpi_errno, MPI_ERR_OTHER, "**nomem"); - memcpy(new_group->pmap.u.map, old_group->pmap.u.map, old_group->size * sizeof(MPIR_Lpid)); - } + int size = old_group->size; + MPIR_Lpid *map = MPL_malloc(size * sizeof(MPIR_Lpid), MPL_MEM_GROUP); + MPIR_ERR_CHKANDJUMP(!map, mpi_errno, MPI_ERR_OTHER, "**nomem"); + for (int i = 0; i < size; i++) { + map[i] = old_group->pmap.u.map[i]; + 
} - *new_group_ptr = new_group; + (*new_group_ptr)->pmap.u.map = map; + } +#ifdef MPID_DEV_GROUP_DECL + mpi_errno = MPID_Group_init_hook(*new_group_ptr); +#endif fn_exit: return mpi_errno; From 8a999c6ab674014d2181a408861add4b89efd0e3 Mon Sep 17 00:00:00 2001 From: Hui Zhou Date: Wed, 18 Dec 2024 11:45:29 -0600 Subject: [PATCH 29/59] ch3: add ch3_vcrt to MPIR_Group Groups are a natural place to host vcrt (virtual connection reference table). When communicators are duplicated, groups are simply inherited and reference counted. Thus we won't end up with duplication of vcrt. --- src/mpi/group/grouputil.c | 7 +++++++ src/mpid/ch3/include/mpidpre.h | 5 +++++ src/mpid/ch3/src/ch3u_comm.c | 19 +++++++++++++++++++ src/mpid/ch3/src/mpidi_pg.c | 13 +++++++++++++ 4 files changed, 44 insertions(+) diff --git a/src/mpi/group/grouputil.c b/src/mpi/group/grouputil.c index c4ffac4a8d5..5d0c66847f0 100644 --- a/src/mpi/group/grouputil.c +++ b/src/mpi/group/grouputil.c @@ -89,6 +89,7 @@ int MPIR_Group_init(void) int MPIR_Group_finalize(void) { num_worlds = 0; + return MPI_SUCCESS; } @@ -113,6 +114,9 @@ int MPIR_Group_release(MPIR_Group * group_ptr) /* Release session */ MPIR_Session_release(group_ptr->session_ptr); } +#ifdef MPID_DEV_GROUP_DECL + mpi_errno = MPID_Group_free_hook(group_ptr); +#endif MPIR_Handle_obj_free(&MPIR_Group_mem, group_ptr); } @@ -146,6 +150,9 @@ int MPIR_Group_create(int nproc, MPIR_Group ** new_group_ptr) (*new_group_ptr)->session_ptr = NULL; memset(&(*new_group_ptr)->pmap, 0, sizeof(struct MPIR_Pmap)); (*new_group_ptr)->pmap.size = nproc; +#ifdef MPID_DEV_GROUP_DECL + mpi_errno = MPID_Group_init_hook(*new_group_ptr); +#endif return mpi_errno; } diff --git a/src/mpid/ch3/include/mpidpre.h b/src/mpid/ch3/include/mpidpre.h index 18a83a6af3f..67176a7b232 100644 --- a/src/mpid/ch3/include/mpidpre.h +++ b/src/mpid/ch3/include/mpidpre.h @@ -195,6 +195,11 @@ typedef struct MPIDI_CH3I_comm } MPIDI_CH3I_comm_t; +/* add vcrt to MPIR_Group so we can inherit it whenever
possible */ +#define MPID_DEV_GROUP_DECL struct MPIDI_VCRT *ch3_vcrt; +int MPID_Group_init_hook(MPIR_Group * group_ptr); +int MPID_Group_free_hook(MPIR_Group * group_ptr); + #define MPID_DEV_COMM_DECL MPIDI_CH3I_comm_t dev; #ifndef DEFINED_REQ diff --git a/src/mpid/ch3/src/ch3u_comm.c b/src/mpid/ch3/src/ch3u_comm.c index ce2f495055b..95271c052cf 100644 --- a/src/mpid/ch3/src/ch3u_comm.c +++ b/src/mpid/ch3/src/ch3u_comm.c @@ -581,3 +581,22 @@ void MPIDI_CH3I_Comm_find(int context_id, MPIR_Comm **comm) MPIR_FUNC_EXIT; } + +int MPID_Group_init_hook(MPIR_Group * group_ptr) +{ + group_ptr->ch3_vcrt = NULL; + return MPI_SUCCESS; +} + +int MPID_Group_free_hook(MPIR_Group * group_ptr) +{ + int mpi_errno = MPI_SUCCESS; + + if (group_ptr->ch3_vcrt) { + /* FIXME: setting TRUE so vc entries may get released. + * Is there a case we don't want that? + */ + mpi_errno = MPIDI_VCRT_Release(group_ptr->ch3_vcrt, TRUE); + } + return mpi_errno; +} diff --git a/src/mpid/ch3/src/mpidi_pg.c b/src/mpid/ch3/src/mpidi_pg.c index 5db84999bb9..364ec260a34 100644 --- a/src/mpid/ch3/src/mpidi_pg.c +++ b/src/mpid/ch3/src/mpidi_pg.c @@ -44,6 +44,12 @@ int MPIDI_PG_Init(MPIDI_PG_Compare_ids_fn_t compare_ids_fn, MPIDI_PG_Compare_ids_fn = compare_ids_fn; MPIDI_PG_Destroy_fn = destroy_fn; + /* initialize the device fields in builtin groups */ +#ifdef MPID_DEV_GROUP_DECL + for (int i = 0; i < MPIR_GROUP_N_BUILTIN; i++) { + MPID_Group_init_hook(MPIR_Group_builtin + i); + } +#endif return mpi_errno; } @@ -64,6 +70,13 @@ int MPIDI_PG_Finalize(void) MPIU_PG_Printall( stdout ); } + /* release the vcrt in builtin groups, since they don't really get freed */ +#ifdef MPID_DEV_GROUP_DECL + for (int i = 0; i < MPIR_GROUP_N_BUILTIN; i++) { + MPID_Group_free_hook(MPIR_Group_builtin + i); + } +#endif + /* Free the storage associated with the process groups */ pg = MPIDI_PG_list; while (pg) { From 5306acbda9a21e41c670ab9d11221e7c18f6ac91 Mon Sep 17 00:00:00 2001 From: Hui Zhou Date: Thu, 19 Dec 2024 17:02:15 -0600 
Subject: [PATCH 30/59] ch3: shortcut tmp_comm used in MPI_Comm_accept/connect Because the tmp_comm uses a temporary vc that doesn't belong to any pg, it is incompatible with the new comm init process (which relies on lpid lookup to construct vcrt tables). It turns out we only need tmp_comm to perform basic send/recv (MPIC_Sendrecv) and we don't need most of the facilities of a normal communicator. Shortcutting the tmp_comm construction and destruction greatly simplifies the code. --- src/mpid/ch3/src/ch3u_port.c | 44 ++++++++++++++++-------------------- 1 file changed, 19 insertions(+), 25 deletions(-) diff --git a/src/mpid/ch3/src/ch3u_port.c b/src/mpid/ch3/src/ch3u_port.c index 39249e73035..91aaf1f11d5 100644 --- a/src/mpid/ch3/src/ch3u_port.c +++ b/src/mpid/ch3/src/ch3u_port.c @@ -487,12 +487,10 @@ static int MPIDI_CH3I_Initialize_tmp_comm(MPIR_Comm **comm_pptr, MPIDI_VC_t *vc_ptr, int is_low_group, int context_id_offset) { int mpi_errno = MPI_SUCCESS; - MPIR_Comm *tmp_comm, *commself_ptr; + MPIR_Comm *tmp_comm; MPIR_FUNC_ENTER; - MPIR_Comm_get_ptr( MPI_COMM_SELF, commself_ptr ); - /* WDG-old code allocated a context id that was then discarded */ mpi_errno = MPIR_Comm_create(&tmp_comm); MPIR_ERR_CHECK(mpi_errno); @@ -524,11 +522,6 @@ static int MPIDI_CH3I_Initialize_tmp_comm(MPIR_Comm **comm_pptr, /* No pg structure needed since vc has already been set up (connection has been established). */ - /* Point local vcrt at those of commself_ptr */ - /* FIXME: Explain why */ - tmp_comm->dev.local_vcrt = commself_ptr->dev.vcrt; - MPIDI_VCRT_Add_ref(commself_ptr->dev.vcrt); - /* No pg needed since connection has already been formed. FIXME - ensure that the comm_release code does not try to free an unallocated pg */ @@ -542,21 +535,6 @@ static int MPIDI_CH3I_Initialize_tmp_comm(MPIR_Comm **comm_pptr, /* FIXME: Why do we do a dup here?
*/ MPIDI_VCR_Dup(vc_ptr, &tmp_comm->dev.vcrt->vcr_table[0]); - MPIR_Coll_comm_init(tmp_comm); - - MPIR_Lpid local_lpid = tmp_comm->dev.local_vcrt->vcr_table[0]->lpid; - MPIR_Lpid remote_lpid = tmp_comm->dev.vcrt->vcr_table[0]->lpid; - mpi_errno = MPIR_Group_create_stride(1, 0, commself_ptr->session_ptr, local_lpid, 1, 1, - &tmp_comm->local_group); - mpi_errno = MPIR_Group_create_stride(1, 0, commself_ptr->session_ptr, remote_lpid, 1, 1, - &tmp_comm->remote_group); - - /* Even though this is a tmp comm and we don't call - MPI_Comm_commit, we still need to call the creation hook - because the destruction hook will be called in comm_release */ - mpi_errno = MPID_Comm_commit_pre_hook(tmp_comm); - MPIR_ERR_CHECK(mpi_errno); - *comm_pptr = tmp_comm; fn_exit: @@ -566,6 +544,22 @@ static int MPIDI_CH3I_Initialize_tmp_comm(MPIR_Comm **comm_pptr, goto fn_exit; } +static int MPIDI_CH3I_Release_tmp_comm(MPIR_Comm *tmp_comm) +{ + int mpi_errno = MPI_SUCCESS; + + mpi_errno = MPIDI_VCRT_Release(tmp_comm->dev.vcrt, FALSE); + MPIR_ERR_CHECK(mpi_errno); + + MPIR_Free_contextid(tmp_comm->recvcontext_id); + MPIR_Handle_obj_free(&MPIR_Comm_mem, tmp_comm); + + fn_exit: + return mpi_errno; + fn_fail: + goto fn_exit; +} + /* ------------------------------------------------------------------------- */ /* MPIDI_Comm_connect() @@ -752,7 +746,7 @@ int MPIDI_Comm_connect(const char *port_name, MPIR_Info *info, int root, MPIR_ERR_CHECK(mpi_errno); /* All communication with remote root done. Release the communicator. */ - MPIR_Comm_release(tmp_comm); + MPIDI_CH3I_Release_tmp_comm(tmp_comm); } /*printf("connect:barrier\n");fflush(stdout);*/ @@ -1283,7 +1277,7 @@ int MPIDI_Comm_accept(const char *port_name, MPIR_Info *info, int root, MPIR_ERR_CHECK(mpi_errno); /* All communication with remote root done. Release the communicator. 
*/ - MPIR_Comm_release(tmp_comm); + MPIDI_CH3I_Release_tmp_comm(tmp_comm); } MPL_DBG_MSG(MPIDI_CH3_DBG_CONNECT,VERBOSE,"Barrier"); From 2b5ee690a4e6cf90ad4ab854bdf9f5f6819a0665 Mon Sep 17 00:00:00 2001 From: Hui Zhou Date: Wed, 18 Dec 2024 13:01:37 -0600 Subject: [PATCH 31/59] ch3: use group to build vcrt instead of mapper Replace the usage of mapper with comm->local_group and comm->remote_group in MPIDI_CH3I_Comm_commit_pre_hook. --- src/mpid/ch3/src/ch3u_comm.c | 223 ++++++++++++----------------------- src/mpid/ch3/src/ch3u_port.c | 14 --- src/mpid/ch3/src/mpid_vc.c | 56 --------- 3 files changed, 76 insertions(+), 217 deletions(-) diff --git a/src/mpid/ch3/src/ch3u_comm.c b/src/mpid/ch3/src/ch3u_comm.c index 95271c052cf..c2c87442e66 100644 --- a/src/mpid/ch3/src/ch3u_comm.c +++ b/src/mpid/ch3/src/ch3u_comm.c @@ -111,77 +111,72 @@ int MPIDI_CH3I_Comm_init(void) goto fn_exit; } - -static void dup_vcrt(struct MPIDI_VCRT *src_vcrt, struct MPIDI_VCRT **dest_vcrt, - MPIR_Comm_map_t *mapper, int src_comm_size, int vcrt_size, - int vcrt_offset) +static int create_vcrt_from_group(MPIR_Group *group, struct MPIDI_VCRT **vcrt_out) { - int flag, i; - - /* try to find the simple case where the new comm is a simple - * duplicate of the previous comm. in that case, we simply add a - * reference to the previous VCRT instead of recreating it. */ - if (mapper->type == MPIR_COMM_MAP_TYPE__DUP && src_comm_size == vcrt_size) { - *dest_vcrt = src_vcrt; - MPIDI_VCRT_Add_ref(src_vcrt); - return; - } - else if (mapper->type == MPIR_COMM_MAP_TYPE__IRREGULAR && - mapper->src_mapping_size == vcrt_size) { - /* if the mapping array is exactly the same as the original - * comm's VC list, there is no need to create a new VCRT. 
- * instead simply point to the original comm's VCRT and bump - * up it's reference count */ - flag = 1; - for (i = 0; i < mapper->src_mapping_size; i++) - if (mapper->src_mapping[i] != i) - flag = 0; + int mpi_errno = MPI_SUCCESS; - if (flag) { - *dest_vcrt = src_vcrt; - MPIDI_VCRT_Add_ref(src_vcrt); - return; - } + if (group->ch3_vcrt) { + MPIDI_VCRT_Add_ref(group->ch3_vcrt); + *vcrt_out = group->ch3_vcrt; + goto fn_exit; } - /* we are in the more complex case where we need to allocate a new - * VCRT */ + struct MPIDI_VCRT *vcrt; + mpi_errno = MPIDI_VCRT_Create(group->size, &vcrt); + MPIR_ERR_CHECK(mpi_errno); - if (!vcrt_offset) - MPIDI_VCRT_Create(vcrt_size, dest_vcrt); + *vcrt_out = vcrt; - if (mapper->type == MPIR_COMM_MAP_TYPE__DUP) { - for (i = 0; i < src_comm_size; i++) - MPIDI_VCR_Dup(src_vcrt->vcr_table[i], - &((*dest_vcrt)->vcr_table[i + vcrt_offset])); - } - else { - for (i = 0; i < mapper->src_mapping_size; i++) - MPIDI_VCR_Dup(src_vcrt->vcr_table[mapper->src_mapping[i]], - &((*dest_vcrt)->vcr_table[i + vcrt_offset])); + for (int i = 0; i < group->size; i++) { + MPIR_Lpid lpid = MPIR_Group_rank_to_lpid(group, i); + /* Currently ch3 does not synchronize pg with MPIR_worlds. All lpid are contiguous + * with world_idx = 0. We can tell whether it is a spawned process by checking whether + * it is >= world size. + */ + if (lpid < MPIR_Process.size) { + MPIDI_VCR_Dup(&MPIDI_Process.my_pg->vct[lpid], &vcrt->vcr_table[i]); + } else { + /* search PGs to find the vc. 
Not particularly efficient, but likely not critical */ + /* TODO: Build a vc hash for dynamic processes */ + MPIDI_PG_iterator iter; + MPIDI_PG_Get_iterator(&iter); + bool found_it = false; + while (MPIDI_PG_Has_next(&iter)) { + MPIDI_PG_t *pg; + MPIDI_PG_Get_next(&iter, &pg); + for (int j = 0; j < pg->size; j++) { + if (pg->vct[j].lpid == lpid) { + MPIDI_VCR_Dup(&pg->vct[j], &vcrt->vcr_table[i]); + found_it = true; + break; + } + } + if (found_it) { + break; + } + pg = pg->next; + } + MPIR_Assert(found_it); + } } -} -static inline int map_size(MPIR_Comm_map_t map) -{ - if (map.type == MPIR_COMM_MAP_TYPE__IRREGULAR) - return map.src_mapping_size; - else if (map.dir == MPIR_COMM_MAP_DIR__L2L || map.dir == MPIR_COMM_MAP_DIR__L2R) - return map.src_comm->local_size; - else - return map.src_comm->remote_size; + fn_exit: + return mpi_errno; + fn_fail: + goto fn_exit; + } int MPIDI_CH3I_Comm_commit_pre_hook(MPIR_Comm *comm) { int mpi_errno = MPI_SUCCESS; - hook_elt *elt; - MPIR_Comm_map_t *mapper; - MPIR_Comm *src_comm; - int vcrt_size, vcrt_offset; - MPIR_FUNC_ENTER; + /* initialize the is_disconnected variable to FALSE. this will be + * set to TRUE if the communicator is freed by an + * MPI_COMM_DISCONNECT call. 
*/ + comm->dev.is_disconnected = 0; + if (comm == MPIR_Process.comm_world) { comm->rank = MPIR_Process.rank; comm->remote_size = MPIR_Process.size; @@ -198,6 +193,7 @@ int MPIDI_CH3I_Comm_commit_pre_hook(MPIR_Comm *comm) for (int p = 0; p < MPIR_Process.size; p++) { MPIDI_VCR_Dup(&MPIDI_Process.my_pg->vct[p], &comm->dev.vcrt->vcr_table[p]); } + goto done_vcrt; } else if (comm == MPIR_Process.comm_self) { comm->rank = 0; comm->remote_size = 1; @@ -211,6 +207,7 @@ int MPIDI_CH3I_Comm_commit_pre_hook(MPIR_Comm *comm) } MPIDI_VCR_Dup(&MPIDI_Process.my_pg->vct[MPIR_Process.rank], &comm->dev.vcrt->vcr_table[0]); + goto done_vcrt; } else if (comm == MPIR_Process.icomm_world) { comm->rank = MPIR_Process.rank; comm->remote_size = MPIR_Process.size; @@ -218,104 +215,35 @@ int MPIDI_CH3I_Comm_commit_pre_hook(MPIR_Comm *comm) MPIDI_VCRT_Add_ref(MPIR_Process.comm_world->dev.vcrt ); comm->dev.vcrt = MPIR_Process.comm_world->dev.vcrt; + goto done_vcrt; } - /* initialize the is_disconnected variable to FALSE. this will be - * set to TRUE if the communicator is freed by an - * MPI_COMM_DISCONNECT call. 
*/ - comm->dev.is_disconnected = 0; - - /* do some sanity checks */ - LL_FOREACH(comm->mapper_head, mapper) { - if (mapper->src_comm->comm_kind == MPIR_COMM_KIND__INTRACOMM) - MPIR_Assertp(mapper->dir == MPIR_COMM_MAP_DIR__L2L || - mapper->dir == MPIR_COMM_MAP_DIR__L2R); - if (comm->comm_kind == MPIR_COMM_KIND__INTRACOMM) - MPIR_Assertp(mapper->dir == MPIR_COMM_MAP_DIR__L2L || - mapper->dir == MPIR_COMM_MAP_DIR__R2L); - } - - /* First, handle all the mappers that contribute to the local part - * of the comm */ - vcrt_size = 0; - LL_FOREACH(comm->mapper_head, mapper) { - if (mapper->dir == MPIR_COMM_MAP_DIR__L2R || - mapper->dir == MPIR_COMM_MAP_DIR__R2R) - continue; - - vcrt_size += map_size(*mapper); + if (comm->comm_kind == MPIR_COMM_KIND__INTRACOMM) { + mpi_errno = create_vcrt_from_group(comm->local_group, &comm->dev.vcrt); + MPIR_ERR_CHECK(mpi_errno); + } else { + mpi_errno = create_vcrt_from_group(comm->local_group, &comm->dev.local_vcrt); + MPIR_ERR_CHECK(mpi_errno); + mpi_errno = create_vcrt_from_group(comm->remote_group, &comm->dev.vcrt); + MPIR_ERR_CHECK(mpi_errno); } - vcrt_offset = 0; - LL_FOREACH(comm->mapper_head, mapper) { - src_comm = mapper->src_comm; - - if (mapper->dir == MPIR_COMM_MAP_DIR__L2R || - mapper->dir == MPIR_COMM_MAP_DIR__R2R) - continue; - if (mapper->dir == MPIR_COMM_MAP_DIR__L2L) { - if (src_comm->comm_kind == MPIR_COMM_KIND__INTRACOMM && comm->comm_kind == MPIR_COMM_KIND__INTRACOMM) { - dup_vcrt(src_comm->dev.vcrt, &comm->dev.vcrt, mapper, mapper->src_comm->local_size, - vcrt_size, vcrt_offset); - } - else if (src_comm->comm_kind == MPIR_COMM_KIND__INTRACOMM && comm->comm_kind == MPIR_COMM_KIND__INTERCOMM) - dup_vcrt(src_comm->dev.vcrt, &comm->dev.local_vcrt, mapper, mapper->src_comm->local_size, - vcrt_size, vcrt_offset); - else if (src_comm->comm_kind == MPIR_COMM_KIND__INTERCOMM && comm->comm_kind == MPIR_COMM_KIND__INTRACOMM) { - dup_vcrt(src_comm->dev.local_vcrt, &comm->dev.vcrt, mapper, mapper->src_comm->local_size, - 
vcrt_size, vcrt_offset); - } - else - dup_vcrt(src_comm->dev.local_vcrt, &comm->dev.local_vcrt, mapper, - mapper->src_comm->local_size, vcrt_size, vcrt_offset); + done_vcrt: + /* add vcrt to the comm groups if they are not there */ + if (comm->comm_kind == MPIR_COMM_KIND__INTRACOMM) { + if (comm->local_group->ch3_vcrt == NULL) { + MPIDI_VCRT_Add_ref(comm->dev.vcrt); + comm->local_group->ch3_vcrt = comm->dev.vcrt; } - else { /* mapper->dir == MPIR_COMM_MAP_DIR__R2L */ - MPIR_Assert(src_comm->comm_kind == MPIR_COMM_KIND__INTERCOMM); - if (comm->comm_kind == MPIR_COMM_KIND__INTRACOMM) { - dup_vcrt(src_comm->dev.vcrt, &comm->dev.vcrt, mapper, mapper->src_comm->remote_size, - vcrt_size, vcrt_offset); - } - else - dup_vcrt(src_comm->dev.vcrt, &comm->dev.local_vcrt, mapper, mapper->src_comm->remote_size, - vcrt_size, vcrt_offset); - } - vcrt_offset += map_size(*mapper); - } - - /* Next, handle all the mappers that contribute to the remote part - * of the comm (only valid for intercomms) */ - vcrt_size = 0; - LL_FOREACH(comm->mapper_head, mapper) { - if (mapper->dir == MPIR_COMM_MAP_DIR__L2L || - mapper->dir == MPIR_COMM_MAP_DIR__R2L) - continue; - - vcrt_size += map_size(*mapper); - } - vcrt_offset = 0; - LL_FOREACH(comm->mapper_head, mapper) { - src_comm = mapper->src_comm; - - if (mapper->dir == MPIR_COMM_MAP_DIR__L2L || - mapper->dir == MPIR_COMM_MAP_DIR__R2L) - continue; - - MPIR_Assert(comm->comm_kind == MPIR_COMM_KIND__INTERCOMM); - - if (mapper->dir == MPIR_COMM_MAP_DIR__L2R) { - if (src_comm->comm_kind == MPIR_COMM_KIND__INTRACOMM) - dup_vcrt(src_comm->dev.vcrt, &comm->dev.vcrt, mapper, mapper->src_comm->local_size, - vcrt_size, vcrt_offset); - else - dup_vcrt(src_comm->dev.local_vcrt, &comm->dev.vcrt, mapper, - mapper->src_comm->local_size, vcrt_size, vcrt_offset); + } else { + if (comm->local_group->ch3_vcrt == NULL) { + MPIDI_VCRT_Add_ref(comm->dev.local_vcrt); + comm->local_group->ch3_vcrt = comm->dev.local_vcrt; } - else { /* mapper->dir == 
MPIR_COMM_MAP_DIR__R2R */ - MPIR_Assert(src_comm->comm_kind == MPIR_COMM_KIND__INTERCOMM); - dup_vcrt(src_comm->dev.vcrt, &comm->dev.vcrt, mapper, mapper->src_comm->remote_size, - vcrt_size, vcrt_offset); + if (comm->remote_group->ch3_vcrt == NULL) { + MPIDI_VCRT_Add_ref(comm->dev.vcrt); + comm->remote_group->ch3_vcrt = comm->dev.vcrt; } - vcrt_offset += map_size(*mapper); } if (comm->comm_kind == MPIR_COMM_KIND__INTERCOMM) { @@ -326,6 +254,7 @@ int MPIDI_CH3I_Comm_commit_pre_hook(MPIR_Comm *comm) } } + hook_elt *elt; LL_FOREACH(create_hooks_head, elt) { mpi_errno = elt->hook_fn(comm, elt->param); if (mpi_errno) MPIR_ERR_POP(mpi_errno);; diff --git a/src/mpid/ch3/src/ch3u_port.c b/src/mpid/ch3/src/ch3u_port.c index 91aaf1f11d5..40a71e20da1 100644 --- a/src/mpid/ch3/src/ch3u_port.c +++ b/src/mpid/ch3/src/ch3u_port.c @@ -1341,20 +1341,6 @@ static int SetupNewIntercomm( MPIR_Comm *comm_ptr, int remote_comm_size, intercomm->comm_kind = MPIR_COMM_KIND__INTERCOMM; intercomm->local_comm = NULL; - /* Point local vcrt at those of incoming intracommunicator */ - intercomm->dev.local_vcrt = comm_ptr->dev.vcrt; - MPIDI_VCRT_Add_ref(comm_ptr->dev.vcrt); - - /* Set up VC reference table */ - mpi_errno = MPIDI_VCRT_Create(intercomm->remote_size, &intercomm->dev.vcrt); - if (mpi_errno != MPI_SUCCESS) { - MPIR_ERR_SETANDJUMP(mpi_errno,MPI_ERR_OTHER, "**init_vcrt"); - } - for (i=0; i < intercomm->remote_size; i++) { - MPIDI_PG_Dup_vcr(remote_pg[remote_translation[i].pg_index], - remote_translation[i].pg_rank, &intercomm->dev.vcrt->vcr_table[i]); - } - intercomm->local_group = comm_ptr->local_group; MPIR_Group_add_ref(comm_ptr->local_group); diff --git a/src/mpid/ch3/src/mpid_vc.c b/src/mpid/ch3/src/mpid_vc.c index bf92e8e330c..64553fbc7f3 100644 --- a/src/mpid/ch3/src/mpid_vc.c +++ b/src/mpid/ch3/src/mpid_vc.c @@ -624,64 +624,8 @@ int MPID_Create_intercomm_from_lpids( MPIR_Comm *newcomm_ptr, int size, const MPIR_Lpid lpids[] ) { int mpi_errno = MPI_SUCCESS; - MPIR_Comm 
*commworld_ptr; - int i; - MPIDI_PG_iterator iter; - commworld_ptr = MPIR_Process.comm_world; - /* Setup the communicator's vc table: remote group */ - MPIDI_VCRT_Create( size, &newcomm_ptr->dev.vcrt ); - for (i=0; irank, i, lpids[i] ); */ - if (lpids[i] < commworld_ptr->remote_size) { - vc = commworld_ptr->dev.vcrt->vcr_table[lpids[i]]; - } - else { - /* We must find the corresponding vcr for a given lpid */ - /* For now, this means iterating through the process groups */ - MPIDI_PG_t *pg = 0; - int j; - - MPIDI_PG_Get_iterator(&iter); - /* Skip comm_world */ - MPIDI_PG_Get_next( &iter, &pg ); - do { - MPIDI_PG_Get_next( &iter, &pg ); - MPIR_ERR_CHKINTERNAL(!pg, mpi_errno, "no pg"); - /* FIXME: a quick check on the min/max values of the lpid - for this process group could help speed this search */ - for (j=0; jsize; j++) { - /*printf( "Checking lpid %d against %d in pg %s\n", - lpids[i], pg->vct[j].lpid, (char *)pg->id ); - fflush(stdout); */ - if (pg->vct[j].lpid == lpids[i]) { - vc = &pg->vct[j]; - /*printf( "found vc %x for lpid = %d in another pg\n", - (int)vc, lpids[i] );*/ - break; - } - } - } while (!vc); - } - - /* printf( "about to dup vc %x for lpid = %d in another pg\n", - (int)vc, lpids[i] ); */ - /* Note that his will increment the ref count for the associate - PG if necessary. */ - MPIDI_VCR_Dup( vc, &newcomm_ptr->dev.vcrt->vcr_table[i] ); - } -fn_exit: return mpi_errno; -fn_fail: - goto fn_exit; } /* The following is a temporary hook to ensure that all processes in From 0ab1e5509edb176ba25b021699d3657ff7f98285 Mon Sep 17 00:00:00 2001 From: Hui Zhou Date: Fri, 20 Dec 2024 16:57:28 -0600 Subject: [PATCH 32/59] ch3: remove the isDisconnected arg from MPIDI_VCRT_Release The only logic for whether to release a vc is whether this vc is for a dynamic process. It has nothing to do with the whether MPI_Comm_disconnect is called. The semantics of MPI_Comm_disconnect is just to wait for all communication complete. 
It is orthogonal to how the comm is destroyed. --- src/mpid/ch3/include/mpidimpl.h | 2 +- src/mpid/ch3/include/mpidpre.h | 4 ---- src/mpid/ch3/src/ch3u_comm.c | 14 +++----------- src/mpid/ch3/src/ch3u_port.c | 2 +- src/mpid/ch3/src/mpid_comm_disconnect.c | 4 ---- src/mpid/ch3/src/mpid_vc.c | 22 +++------------------- 6 files changed, 8 insertions(+), 40 deletions(-) diff --git a/src/mpid/ch3/include/mpidimpl.h b/src/mpid/ch3/include/mpidimpl.h index e7d30ae62ad..1234fbb6539 100644 --- a/src/mpid/ch3/include/mpidimpl.h +++ b/src/mpid/ch3/include/mpidimpl.h @@ -484,7 +484,7 @@ typedef int (*MPIDI_PG_Destroy_fn_t)(MPIDI_PG_t * pg); int MPIDI_VCRT_Create(int size, struct MPIDI_VCRT **vcrt_ptr); int MPIDI_VCRT_Add_ref(struct MPIDI_VCRT *vcrt); -int MPIDI_VCRT_Release(struct MPIDI_VCRT *vcrt, int isDisconnect); +int MPIDI_VCRT_Release(struct MPIDI_VCRT *vcrt); int MPIDI_VCR_Dup(MPIDI_VCR orig_vcr, MPIDI_VCR * new_vcr); int MPIDI_PG_Init(MPIDI_PG_Compare_ids_fn_t, MPIDI_PG_Destroy_fn_t); diff --git a/src/mpid/ch3/include/mpidpre.h b/src/mpid/ch3/include/mpidpre.h index 67176a7b232..4177ca80a8e 100644 --- a/src/mpid/ch3/include/mpidpre.h +++ b/src/mpid/ch3/include/mpidpre.h @@ -182,10 +182,6 @@ typedef struct MPIDI_CH3I_comm * waiting for a revoke message before we can release * the context id */ - int is_disconnected; /* set to TRUE if this communicator was - * disconnected as a part of - * MPI_COMM_DISCONNECT; FALSE otherwise. */ - struct MPIDI_VCRT *vcrt; /* virtual connection reference table */ struct MPIDI_VCRT *local_vcrt; /* local virtual connection reference table */ diff --git a/src/mpid/ch3/src/ch3u_comm.c b/src/mpid/ch3/src/ch3u_comm.c index c2c87442e66..52c6d97210b 100644 --- a/src/mpid/ch3/src/ch3u_comm.c +++ b/src/mpid/ch3/src/ch3u_comm.c @@ -172,11 +172,6 @@ int MPIDI_CH3I_Comm_commit_pre_hook(MPIR_Comm *comm) int mpi_errno = MPI_SUCCESS; MPIR_FUNC_ENTER; - /* initialize the is_disconnected variable to FALSE. 
this will be - * set to TRUE if the communicator is freed by an - * MPI_COMM_DISCONNECT call. */ - comm->dev.is_disconnected = 0; - if (comm == MPIR_Process.comm_world) { comm->rank = MPIR_Process.rank; comm->remote_size = MPIR_Process.size; @@ -288,11 +283,11 @@ int MPIDI_CH3I_Comm_destroy_hook(MPIR_Comm *comm) MPIR_ERR_CHECK(mpi_errno); } - mpi_errno = MPIDI_VCRT_Release(comm->dev.vcrt, comm->dev.is_disconnected); + mpi_errno = MPIDI_VCRT_Release(comm->dev.vcrt); MPIR_ERR_CHECK(mpi_errno); if (comm->comm_kind == MPIR_COMM_KIND__INTERCOMM) { - mpi_errno = MPIDI_VCRT_Release(comm->dev.local_vcrt, comm->dev.is_disconnected); + mpi_errno = MPIDI_VCRT_Release(comm->dev.local_vcrt); MPIR_ERR_CHECK(mpi_errno); } @@ -522,10 +517,7 @@ int MPID_Group_free_hook(MPIR_Group * group_ptr) int mpi_errno = MPI_SUCCESS; if (group_ptr->ch3_vcrt) { - /* FIXME: setting TRUE so vc entries may get released. - * Is there a case we don't want that? - */ - mpi_errno = MPIDI_VCRT_Release(group_ptr->ch3_vcrt, TRUE); + mpi_errno = MPIDI_VCRT_Release(group_ptr->ch3_vcrt); } return mpi_errno; } diff --git a/src/mpid/ch3/src/ch3u_port.c b/src/mpid/ch3/src/ch3u_port.c index 40a71e20da1..fa1d29bf069 100644 --- a/src/mpid/ch3/src/ch3u_port.c +++ b/src/mpid/ch3/src/ch3u_port.c @@ -548,7 +548,7 @@ static int MPIDI_CH3I_Release_tmp_comm(MPIR_Comm *tmp_comm) { int mpi_errno = MPI_SUCCESS; - mpi_errno = MPIDI_VCRT_Release(tmp_comm->dev.vcrt, FALSE); + mpi_errno = MPIDI_VCRT_Release(tmp_comm->dev.vcrt); MPIR_ERR_CHECK(mpi_errno); MPIR_Free_contextid(tmp_comm->recvcontext_id); diff --git a/src/mpid/ch3/src/mpid_comm_disconnect.c b/src/mpid/ch3/src/mpid_comm_disconnect.c index cec8a198e2f..2809dd51ff3 100644 --- a/src/mpid/ch3/src/mpid_comm_disconnect.c +++ b/src/mpid/ch3/src/mpid_comm_disconnect.c @@ -27,10 +27,6 @@ int MPID_Comm_disconnect(MPIR_Comm *comm_ptr) MPIR_ERR_SETANDJUMP(mpi_errno,MPIX_ERR_REVOKED,"**revoked"); } - /* it's more than a comm_release, but ok for now */ - /* FIXME: Describe what 
more might be required */ - /* MPIU_PG_Printall( stdout ); */ - comm_ptr->dev.is_disconnected = 1; mpi_errno = MPIR_Comm_release(comm_ptr); MPIR_ERR_CHECK(mpi_errno); /* If any of the VCs were released by this Comm_release, wait diff --git a/src/mpid/ch3/src/mpid_vc.c b/src/mpid/ch3/src/mpid_vc.c index 64553fbc7f3..68c9e8fae8a 100644 --- a/src/mpid/ch3/src/mpid_vc.c +++ b/src/mpid/ch3/src/mpid_vc.c @@ -106,7 +106,7 @@ int MPIDI_VCRT_Add_ref(struct MPIDI_VCRT *vcrt) Notes: @*/ -int MPIDI_VCRT_Release(struct MPIDI_VCRT *vcrt, int isDisconnect ) +int MPIDI_VCRT_Release(struct MPIDI_VCRT *vcrt) { int in_use; int mpi_errno = MPI_SUCCESS; @@ -130,24 +130,8 @@ int MPIDI_VCRT_Release(struct MPIDI_VCRT *vcrt, int isDisconnect ) MPIDI_VC_release_ref(vc, &in_use); - /* Dynamic connections start with a refcount of 2 instead of 1. - * That way we can distinguish between an MPI_Free and an - * MPI_Comm_disconnect. */ - /* XXX DJG FIXME-MT should we be checking this? */ - /* probably not, need to do something like the following instead: */ -#if 0 - if (isDisconnect) { - MPIR_Assert(in_use); - /* FIXME this is still bogus, the VCRT may contain a mix of - * dynamic and non-dynamic VCs, so the ref_count isn't - * guaranteed to have started at 2. The best thing to do might - * be to avoid overloading the reference counting this way and - * use a separate check for dynamic VCs (another flag? compare - * PGs?) 
*/ - MPIR_Object_release_ref(vc, &in_use); - } -#endif - if (isDisconnect && MPIR_Object_get_ref(vc) == 1) { + if (vc->lpid >= MPIR_Process.size && MPIR_Object_get_ref(vc) == 1) { + /* release vc from dynamic process */ MPIDI_VC_release_ref(vc, &in_use); } From ca79ab7c7ba3df349483960f72c5cac1055ae8af Mon Sep 17 00:00:00 2001 From: Hui Zhou Date: Fri, 20 Dec 2024 23:46:40 -0600 Subject: [PATCH 33/59] comm: avoid creating invalid intercomm then destroy it In MPIR_Comm_create_inter, we know whether the remote group is empty after the exchange, thus it is unnecessary to create and commit the intercomm then delete it later. Simply don't create it in the first place. The device layer is not necessarily equipped to handle intercomm commit with empty groups. --- src/mpi/comm/comm_impl.c | 86 +++++++++++++++++----------------------- 1 file changed, 37 insertions(+), 49 deletions(-) diff --git a/src/mpi/comm/comm_impl.c b/src/mpi/comm/comm_impl.c index 746b2825b6a..8c21a760e7b 100644 --- a/src/mpi/comm/comm_impl.c +++ b/src/mpi/comm/comm_impl.c @@ -409,31 +409,6 @@ int MPIR_Comm_create_inter(MPIR_Comm * comm_ptr, MPIR_Group * group_ptr, MPIR_Co mpi_errno = MPII_Comm_create_calculate_mapping(group_ptr, comm_ptr, &mapping, &mapping_comm); MPIR_ERR_CHECK(mpi_errno); - *newcomm_ptr = NULL; - - if (group_ptr->rank != MPI_UNDEFINED) { - /* Get the new communicator structure and context id */ - mpi_errno = MPIR_Comm_create(newcomm_ptr); - if (mpi_errno) - goto fn_fail; - - (*newcomm_ptr)->recvcontext_id = new_context_id; - (*newcomm_ptr)->rank = group_ptr->rank; - (*newcomm_ptr)->comm_kind = comm_ptr->comm_kind; - /* Since the group has been provided, let the new communicator know - * about the group */ - (*newcomm_ptr)->local_comm = 0; - (*newcomm_ptr)->local_group = group_ptr; - MPIR_Group_add_ref(group_ptr); - - (*newcomm_ptr)->local_size = group_ptr->size; - (*newcomm_ptr)->remote_group = 0; - - (*newcomm_ptr)->is_low_group = comm_ptr->is_low_group; - - 
MPIR_Comm_set_session_ptr(*newcomm_ptr, session_ptr); - } - /* There is an additional step. We must communicate the * information on the local context id and the group members, * given by the ranks so that the remote process can construct the @@ -456,9 +431,6 @@ int MPIR_Comm_create_inter(MPIR_Comm * comm_ptr, MPIR_Group * group_ptr, MPIR_Co rinfo, 2, MPI_INT, 0, 0, comm_ptr, MPI_STATUS_IGNORE, MPIR_ERR_NONE); MPIR_ERR_CHECK(mpi_errno); - if (*newcomm_ptr != NULL) { - (*newcomm_ptr)->context_id = rinfo[0]; - } remote_size = rinfo[1]; MPIR_CHKLMEM_MALLOC(remote_mapping, int *, @@ -482,9 +454,7 @@ int MPIR_Comm_create_inter(MPIR_Comm * comm_ptr, MPIR_Group * group_ptr, MPIR_Co /* Broadcast to the other members of the local group */ mpi_errno = MPIR_Bcast(rinfo, 2, MPI_INT, 0, comm_ptr->local_comm, MPIR_ERR_NONE); MPIR_ERR_CHECK(mpi_errno); - if (*newcomm_ptr != NULL) { - (*newcomm_ptr)->context_id = rinfo[0]; - } + remote_size = rinfo[1]; MPIR_CHKLMEM_MALLOC(remote_mapping, int *, remote_size * sizeof(int), @@ -495,10 +465,45 @@ int MPIR_Comm_create_inter(MPIR_Comm * comm_ptr, MPIR_Group * group_ptr, MPIR_Co } MPIR_Assert(remote_size >= 0); + if (group_ptr->rank == MPI_UNDEFINED || remote_size <= 0) { + /* If we are not part of the group, or - + * It's possible that no members of the other side of comm were + * members of the group that they passed, which we only know after + * receiving/bcasting the remote_size above. We must return + * MPI_COMM_NULL in this case. + */ + MPIR_Free_contextid(new_context_id); + *newcomm_ptr = NULL; + goto fn_exit; + } + + /* FIXME: the branch was kept to minimize line changes. Remove the if-check. 
*/ + if (group_ptr->rank != MPI_UNDEFINED) { + /* Get the new communicator structure and context id */ + mpi_errno = MPIR_Comm_create(newcomm_ptr); + if (mpi_errno) + goto fn_fail; + (*newcomm_ptr)->context_id = rinfo[0]; + (*newcomm_ptr)->remote_size = rinfo[1]; + (*newcomm_ptr)->recvcontext_id = new_context_id; + (*newcomm_ptr)->rank = group_ptr->rank; + (*newcomm_ptr)->comm_kind = comm_ptr->comm_kind; + /* Since the group has been provided, let the new communicator know + * about the group */ + (*newcomm_ptr)->local_comm = 0; + (*newcomm_ptr)->local_group = group_ptr; + MPIR_Group_add_ref(group_ptr); + + (*newcomm_ptr)->local_size = group_ptr->size; + (*newcomm_ptr)->remote_group = 0; + + (*newcomm_ptr)->is_low_group = comm_ptr->is_low_group; + + MPIR_Comm_set_session_ptr(*newcomm_ptr, session_ptr); + } if (group_ptr->rank != MPI_UNDEFINED) { - (*newcomm_ptr)->remote_size = remote_size; /* Now, everyone has the remote_mapping, and can apply that to * the network address mapping. */ @@ -528,23 +533,6 @@ int MPIR_Comm_create_inter(MPIR_Comm * comm_ptr, MPIR_Group * group_ptr, MPIR_Co (*newcomm_ptr)->tainted = comm_ptr->tainted; mpi_errno = MPIR_Comm_commit(*newcomm_ptr); MPIR_ERR_CHECK(mpi_errno); - - if (remote_size <= 0) { - /* It's possible that no members of the other side of comm were - * members of the group that they passed, which we only know after - * receiving/bcasting the remote_size above. We must return - * MPI_COMM_NULL in this case, but we can't free the newcomm_ptr - * immediately after the communication above because - * MPIR_Comm_release won't work correctly with a half-constructed - * comm. 
*/ - mpi_errno = MPIR_Comm_release(*newcomm_ptr); - MPIR_ERR_CHECK(mpi_errno); - *newcomm_ptr = NULL; - } - } else { - /* This process is not in the group */ - MPIR_Free_contextid(new_context_id); - *newcomm_ptr = NULL; } fn_exit: From d486176b1e27b8ea9fe31797fef4ca9a273161e2 Mon Sep 17 00:00:00 2001 From: Hui Zhou Date: Fri, 20 Dec 2024 10:18:12 -0600 Subject: [PATCH 34/59] ---- START HERE ---- --- dummy | 1 + 1 file changed, 1 insertion(+) diff --git a/dummy b/dummy index 6ed281c757a..e8183f05f5d 100644 --- a/dummy +++ b/dummy @@ -1,2 +1,3 @@ 1 1 +1 From 8fc2f94576c1bc55ca3452ec0f53857bd0c278cb Mon Sep 17 00:00:00 2001 From: Hui Zhou Date: Wed, 25 Dec 2024 18:40:17 -0600 Subject: [PATCH 35/59] mpir/mem: add MPIR_CHKLMEM_ADD Add a macro that tracks local memory allocation from other routines. --- src/include/mpir_mem.h | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/src/include/mpir_mem.h b/src/include/mpir_mem.h index 147e67ef7bb..4eb03415105 100644 --- a/src/include/mpir_mem.h +++ b/src/include/mpir_mem.h @@ -110,6 +110,12 @@ extern "C" { int mpiu_chklmem_stk_sp_=0; \ MPIR_AssertDeclValue(const int mpiu_chklmem_stk_sz_,n_) +#define MPIR_CHKLMEM_ADD(pointer_) \ + do { \ + MPIR_Assert(mpiu_chklmem_stk_sp_ Date: Mon, 23 Dec 2024 08:36:27 -0600 Subject: [PATCH 36/59] mpir: split lpid headers to mpir_lpid.h Because we need access MPIR_Lpid definitions in mpidpre headers, we need move worlds and lpid definitions to device-independent headers. Add macro MPIR_LPID_INVALID. Make MPIR_Lpid signed. Since we are going to perform arithmetic on MPIR_Lpid, e.g. in using strided pmap, make MPIR_Lpid int64_t instead of uint64_t to avoid accidental conversion errors. 
--- src/include/mpiimpl.h | 1 + src/include/mpir_group.h | 36 ------------------------------- src/include/mpir_lpid.h | 46 ++++++++++++++++++++++++++++++++++++++++ 3 files changed, 47 insertions(+), 36 deletions(-) create mode 100644 src/include/mpir_lpid.h diff --git a/src/include/mpiimpl.h b/src/include/mpiimpl.h index d665d3e2eff..c118802410d 100644 --- a/src/include/mpiimpl.h +++ b/src/include/mpiimpl.h @@ -169,6 +169,7 @@ typedef struct MPIR_Stream MPIR_Stream; #include "mpir_errhandler.h" #include "mpir_attr_generic.h" #include "mpir_contextid.h" +#include "mpir_lpid.h" #include "mpir_status.h" #include "mpir_debugger.h" #include "mpir_op.h" diff --git a/src/include/mpir_group.h b/src/include/mpir_group.h index 7397aa14f6a..b2b99e1a80a 100644 --- a/src/include/mpir_group.h +++ b/src/include/mpir_group.h @@ -50,42 +50,6 @@ #define MPIR_GROUP_WORLD_PTR (MPIR_Group_builtin + 1) #define MPIR_GROUP_SELF_PTR (MPIR_Group_builtin + 2) -/* Worlds - - * We need a device-independent way of identifying processes. Assuming the concept of - * "worlds", we can describe a process with (world_idx, world_rank). - * - * The world_idx is a local id because each process may not see all worlds. Thus, - * each process only can maintain a list of worlds as it encounters them. Thus, - * a process id derived from (world_idx, world_rank) is referred as LPID, or - * "local process id". - * - * Each process should maintain a table of worlds with sufficient information so - * processes can match worlds upon connection or making address exchange. - */ - -#define MPIR_NAMESPACE_MAX 128 -struct MPIR_World { - char namespace[MPIR_NAMESPACE_MAX]; - /* other useful fields */ - int num_procs; -}; - -extern struct MPIR_World MPIR_Worlds[]; - -int MPIR_add_world(const char *namespace, int num_procs); -int MPIR_find_world(const char *namespace); - -/* Abstract the integer type for lpid (process id). 
It is possible to use 32-bit - * in principle, but 64-bit is simpler since we can trivially combine - * (world_idx, world_rank). - */ -typedef uint64_t MPIR_Lpid; - -#define MPIR_LPID_WORLD_INDEX(lpid) ((lpid) >> 32) -#define MPIR_LPID_WORLD_RANK(lpid) ((lpid) & 0xffffffff) -#define MPIR_LPID_FROM(world_idx, world_rank) (((uint64_t)(world_idx) << 32) | (world_rank)) -#define MPIR_LPID_DYNAMIC_MASK ((MPIR_Lpid)0x1 << 63) - struct MPIR_Pmap { int size; /* same as group->size, duplicate here so Pmap is logically complete */ bool use_map; diff --git a/src/include/mpir_lpid.h b/src/include/mpir_lpid.h new file mode 100644 index 00000000000..dfa16ac75c3 --- /dev/null +++ b/src/include/mpir_lpid.h @@ -0,0 +1,46 @@ +/* + * Copyright (C) by Argonne National Laboratory + * See COPYRIGHT in top-level directory + */ + +#ifndef MPIR_LPID_H_INCLUDED +#define MPIR_LPID_H_INCLUDED + +/* Worlds - + * We need a device-independent way of identifying processes. Assuming the concept of + * "worlds", we can describe a process with (world_idx, world_rank). + * + * The world_idx is a local id because each process may not see all worlds. Thus, + * each process only can maintain a list of worlds as it encounters them. Thus, + * a process id derived from (world_idx, world_rank) is referred as LPID, or + * "local process id". + * + * Each process should maintain a table of worlds with sufficient information so + * processes can match worlds upon connection or making address exchange. + */ + +#define MPIR_NAMESPACE_MAX 128 +struct MPIR_World { + char namespace[MPIR_NAMESPACE_MAX]; + /* other useful fields */ + int num_procs; +}; + +extern struct MPIR_World MPIR_Worlds[]; + +int MPIR_add_world(const char *namespace, int num_procs); +int MPIR_find_world(const char *namespace); + +/* Abstract the integer type for lpid (process id). It is possible to use 32-bit + * in principle, but 64-bit is simpler since we can trivially combine + * (world_idx, world_rank). 
+ */ +typedef int64_t MPIR_Lpid; + +#define MPIR_LPID_WORLD_INDEX(lpid) ((lpid) >> 32) +#define MPIR_LPID_WORLD_RANK(lpid) ((lpid) & 0xffffffff) +#define MPIR_LPID_FROM(world_idx, world_rank) (((uint64_t)(world_idx) << 32) | (world_rank)) +#define MPIR_LPID_DYNAMIC_MASK ((MPIR_Lpid)0x1 << 62) /* MPIR_Lpid is signed, avoid using the signed bit */ +#define MPIR_LPID_INVALID 0xffffffff + +#endif /* MPIR_LPID_H_INCLUDED */ From 4f13f239b1b020221af41fcda3f401d4f2a21177 Mon Sep 17 00:00:00 2001 From: Hui Zhou Date: Tue, 24 Dec 2024 10:18:53 -0600 Subject: [PATCH 37/59] group: optimize grouputil * Add check_map_is_strided to detect strided pattern and convert a map into a strided pmap. * In MPIR_Group_check_subset, use MPIR_Group_lpid_to_rank rather than a manual linear search. * Move internal static routines to the bottom of grouputil.c. --- src/mpi/group/grouputil.c | 157 ++++++++++++++++++++++++-------------- 1 file changed, 100 insertions(+), 57 deletions(-) diff --git a/src/mpi/group/grouputil.c b/src/mpi/group/grouputil.c index 5d0c66847f0..37929d7e5e6 100644 --- a/src/mpi/group/grouputil.c +++ b/src/mpi/group/grouputil.c @@ -193,6 +193,9 @@ int MPIR_Group_dup(MPIR_Group * old_group, MPIR_Session * session_ptr, MPIR_Grou goto fn_exit; } +static bool check_map_is_strided(int size, MPIR_Lpid * map, + MPIR_Lpid * offset_out, MPIR_Lpid * stride_out, + MPIR_Lpid * blocksize_out); int MPIR_Group_create_map(int size, int rank, MPIR_Session * session_ptr, MPIR_Lpid * map, MPIR_Group ** new_group_ptr) { @@ -214,10 +217,16 @@ int MPIR_Group_create_map(int size, int rank, MPIR_Session * session_ptr, MPIR_L newgrp->rank = rank; MPIR_Group_set_session_ptr(newgrp, session_ptr); - newgrp->pmap.use_map = true; - newgrp->pmap.u.map = map; + if (check_map_is_strided(size, map, &newgrp->pmap.u.stride.offset, + &newgrp->pmap.u.stride.stride, &newgrp->pmap.u.stride.blocksize)) { + newgrp->pmap.use_map = false; + MPL_free(map); + } else { + newgrp->pmap.use_map = true; + 
newgrp->pmap.u.map = map; + /* TODO: build hash to accelerate MPIR_Group_lpid_to_rank */ + } - /* TODO: build hash to accelerate MPIR_Group_lpid_to_rank */ *new_group_ptr = newgrp; } @@ -261,36 +270,7 @@ int MPIR_Group_create_stride(int size, int rank, MPIR_Session * session_ptr, goto fn_exit; } -static int pmap_lpid_to_rank(struct MPIR_Pmap *pmap, MPIR_Lpid lpid) -{ - if (pmap->use_map) { - /* Use linear search for now. - * Optimization: build hash map in MPIR_Group_create_map and do O(1) hash lookup - */ - for (int rank = 0; rank < pmap->size; rank++) { - if (pmap->u.map[rank] == lpid) { - return rank; - } - } - return MPI_UNDEFINED; - } else { - lpid -= pmap->u.stride.offset; - MPIR_Lpid i_blk = lpid / pmap->u.stride.stride; - MPIR_Lpid r_blk = lpid % pmap->u.stride.stride; - - if (r_blk >= pmap->u.stride.blocksize) { - return MPI_UNDEFINED; - } - - int rank = i_blk * pmap->u.stride.blocksize + r_blk; - if (rank >= 0 && rank < pmap->size) { - return rank; - } else { - return MPI_UNDEFINED; - } - } -} - +static int pmap_lpid_to_rank(struct MPIR_Pmap *pmap, MPIR_Lpid lpid); int MPIR_Group_lpid_to_rank(MPIR_Group * group, MPIR_Lpid lpid) { return pmap_lpid_to_rank(&group->pmap, lpid); @@ -432,36 +412,16 @@ int MPIR_Group_check_subset(MPIR_Group * group_ptr, MPIR_Comm * comm_ptr) { int mpi_errno = MPI_SUCCESS; - int vsize = comm_ptr->comm_kind == MPIR_COMM_KIND__INTERCOMM ? 
comm_ptr->local_size : - comm_ptr->remote_size; - /* Initialize the vmap */ - MPIR_Lpid *vmap = MPL_malloc(vsize * sizeof(MPIR_Lpid), MPL_MEM_GROUP); - for (int i = 0; i < vsize; i++) { - /* FIXME: MPID_Comm_get_lpid to be removed */ - uint64_t dev_lpid; - MPID_Comm_get_lpid(comm_ptr, i, &dev_lpid, FALSE); - MPIR_Assert((dev_lpid >> 32) == 0); - vmap[i] = dev_lpid; - } - for (int rank = 0; rank < group_ptr->size; rank++) { MPIR_Lpid lpid = MPIR_Group_rank_to_lpid(group_ptr, rank); - bool found = false; - for (int i = 0; i < vsize; i++) { - if (vmap[i] == lpid) { - found = true; - break; - } - } - if (!found) { - MPIR_ERR_SET1(mpi_errno, MPI_ERR_GROUP, "**groupnotincomm", - "**groupnotincomm %d", rank); - goto fn_fail; + int r = MPIR_Group_lpid_to_rank(comm_ptr->local_group, lpid); + if (r == MPI_UNDEFINED) { + MPIR_ERR_SETANDJUMP1(mpi_errno, MPI_ERR_GROUP, "**groupnotincomm", + "**groupnotincomm %d", rank); } } fn_exit: - MPL_free(vmap); return mpi_errno; fn_fail: goto fn_exit; @@ -478,3 +438,86 @@ void MPIR_Group_set_session_ptr(MPIR_Group * group_ptr, MPIR_Session * session_p MPIR_Session_add_ref(session_ptr); } } + +/* internal static routines */ + +static bool check_map_is_strided(int size, MPIR_Lpid * map, + MPIR_Lpid * offset_out, MPIR_Lpid * stride_out, + MPIR_Lpid * blocksize_out) +{ + MPIR_Assert(size > 0); + if (size == 1) { + *offset_out = map[0]; + *stride_out = 1; + *blocksize_out = 1; + return true; + } else { + MPIR_Lpid offset, stride, blocksize; + offset = map[0]; + + blocksize = 1; + for (int i = 1; i < size; i++) { + if (map[i] - map[i - 1] == 1) { + blocksize++; + } else { + break; + } + } + if (blocksize == size) { + /* consecutive */ + *offset_out = offset; + *stride_out = 1; + *blocksize_out = 1; + return true; + } else { + stride = map[blocksize] - map[0]; + int n_strides = (size + blocksize - 1) / blocksize; + int k = 0; + for (int i = 0; i < n_strides; i++) { + for (int j = 0; j < blocksize; j++) { + if (map[k] != offset + i * stride + j) 
{ + return false; + } + k++; + if (k == size) { + break; + } + } + } + *offset_out = offset; + *stride_out = stride; + *blocksize_out = blocksize; + return true; + } + } +} + +static int pmap_lpid_to_rank(struct MPIR_Pmap *pmap, MPIR_Lpid lpid) +{ + if (pmap->use_map) { + /* Use linear search for now. + * Optimization: build hash map in MPIR_Group_create_map and do O(1) hash lookup + */ + for (int rank = 0; rank < pmap->size; rank++) { + if (pmap->u.map[rank] == lpid) { + return rank; + } + } + return MPI_UNDEFINED; + } else { + lpid -= pmap->u.stride.offset; + MPIR_Lpid i_blk = lpid / pmap->u.stride.stride; + MPIR_Lpid r_blk = lpid % pmap->u.stride.stride; + + if (r_blk >= pmap->u.stride.blocksize) { + return MPI_UNDEFINED; + } + + int rank = i_blk * pmap->u.stride.blocksize + r_blk; + if (rank >= 0 && rank < pmap->size) { + return rank; + } else { + return MPI_UNDEFINED; + } + } +} From 48a4bab1a509285761c0a48b553292b6e9adf374 Mon Sep 17 00:00:00 2001 From: Hui Zhou Date: Tue, 24 Dec 2024 10:37:19 -0600 Subject: [PATCH 38/59] group: simplify strided pmap by removing blocksize A strided group with nontrivial blocksize is rare. By removing the blocksize parameter (i.e. blocksize is always 1), we greatly simplify the code and also improve the performance of lpid lookup in a more common strided group (such as a typical comm_world group or node group). 
--- src/include/mpir_group.h | 8 ++--- src/mpi/group/grouputil.c | 71 +++++++++------------------------------ 2 files changed, 17 insertions(+), 62 deletions(-) diff --git a/src/include/mpir_group.h b/src/include/mpir_group.h index b2b99e1a80a..a2b4436d551 100644 --- a/src/include/mpir_group.h +++ b/src/include/mpir_group.h @@ -58,7 +58,6 @@ struct MPIR_Pmap { struct { MPIR_Lpid offset; MPIR_Lpid stride; - MPIR_Lpid blocksize; } stride; } u; }; @@ -109,8 +108,7 @@ int MPIR_Group_dup(MPIR_Group * old_group, MPIR_Session * session_ptr, MPIR_Grou int MPIR_Group_create_map(int size, int rank, MPIR_Session * session_ptr, MPIR_Lpid * map, MPIR_Group ** new_group_ptr); int MPIR_Group_create_stride(int size, int rank, MPIR_Session * session_ptr, - MPIR_Lpid offset, MPIR_Lpid stride, MPIR_Lpid blocksize, - MPIR_Group ** new_group_ptr); + MPIR_Lpid offset, MPIR_Lpid stride, MPIR_Group ** new_group_ptr); int MPIR_Group_lpid_to_rank(MPIR_Group * group, MPIR_Lpid lpid); int MPIR_Group_check_subset(MPIR_Group * group_ptr, MPIR_Comm * comm_ptr); @@ -127,9 +125,7 @@ MPL_STATIC_INLINE_PREFIX MPIR_Lpid MPIR_Group_rank_to_lpid(MPIR_Group * group, i if (group->pmap.use_map) { return group->pmap.u.map[rank]; } else { - MPIR_Lpid i_blk = rank / group->pmap.u.stride.blocksize; - MPIR_Lpid r_blk = rank % group->pmap.u.stride.blocksize; - return group->pmap.u.stride.offset + i_blk * group->pmap.u.stride.stride + r_blk; + return group->pmap.u.stride.offset + rank * group->pmap.u.stride.stride; } } diff --git a/src/mpi/group/grouputil.c b/src/mpi/group/grouputil.c index 37929d7e5e6..aa8fb1331dd 100644 --- a/src/mpi/group/grouputil.c +++ b/src/mpi/group/grouputil.c @@ -69,7 +69,6 @@ int MPIR_Group_init(void) pmap->use_map = false; pmap->u.stride.offset = 0; pmap->u.stride.stride = 1; - pmap->u.stride.blocksize = 1; MPIR_Group_builtin[2].handle = MPIR_GROUP_SELF; MPIR_Object_set_ref(&MPIR_Group_builtin[2], 1); @@ -81,7 +80,6 @@ int MPIR_Group_init(void) pmap->use_map = false; 
pmap->u.stride.offset = MPIR_Process.rank; pmap->u.stride.stride = 1; - pmap->u.stride.blocksize = 1; return mpi_errno; } @@ -194,8 +192,7 @@ int MPIR_Group_dup(MPIR_Group * old_group, MPIR_Session * session_ptr, MPIR_Grou } static bool check_map_is_strided(int size, MPIR_Lpid * map, - MPIR_Lpid * offset_out, MPIR_Lpid * stride_out, - MPIR_Lpid * blocksize_out); + MPIR_Lpid * offset_out, MPIR_Lpid * stride_out); int MPIR_Group_create_map(int size, int rank, MPIR_Session * session_ptr, MPIR_Lpid * map, MPIR_Group ** new_group_ptr) { @@ -218,7 +215,7 @@ int MPIR_Group_create_map(int size, int rank, MPIR_Session * session_ptr, MPIR_L MPIR_Group_set_session_ptr(newgrp, session_ptr); if (check_map_is_strided(size, map, &newgrp->pmap.u.stride.offset, - &newgrp->pmap.u.stride.stride, &newgrp->pmap.u.stride.blocksize)) { + &newgrp->pmap.u.stride.stride)) { newgrp->pmap.use_map = false; MPL_free(map); } else { @@ -237,8 +234,7 @@ int MPIR_Group_create_map(int size, int rank, MPIR_Session * session_ptr, MPIR_L } int MPIR_Group_create_stride(int size, int rank, MPIR_Session * session_ptr, - MPIR_Lpid offset, MPIR_Lpid stride, MPIR_Lpid blocksize, - MPIR_Group ** new_group_ptr) + MPIR_Lpid offset, MPIR_Lpid stride, MPIR_Group ** new_group_ptr) { int mpi_errno = MPI_SUCCESS; @@ -259,7 +255,6 @@ int MPIR_Group_create_stride(int size, int rank, MPIR_Session * session_ptr, newgrp->pmap.use_map = false; newgrp->pmap.u.stride.offset = offset; newgrp->pmap.u.stride.stride = stride; - newgrp->pmap.u.stride.blocksize = blocksize; *new_group_ptr = newgrp; } @@ -442,53 +437,25 @@ void MPIR_Group_set_session_ptr(MPIR_Group * group_ptr, MPIR_Session * session_p /* internal static routines */ static bool check_map_is_strided(int size, MPIR_Lpid * map, - MPIR_Lpid * offset_out, MPIR_Lpid * stride_out, - MPIR_Lpid * blocksize_out) + MPIR_Lpid * offset_out, MPIR_Lpid * stride_out) { MPIR_Assert(size > 0); if (size == 1) { *offset_out = map[0]; *stride_out = 1; - *blocksize_out = 1; return 
true; } else { - MPIR_Lpid offset, stride, blocksize; + MPIR_Lpid offset, stride; offset = map[0]; - - blocksize = 1; + stride = map[1] - map[0]; for (int i = 1; i < size; i++) { - if (map[i] - map[i - 1] == 1) { - blocksize++; - } else { - break; + if (map[i] - map[i - 1] != stride) { + return false; } } - if (blocksize == size) { - /* consecutive */ - *offset_out = offset; - *stride_out = 1; - *blocksize_out = 1; - return true; - } else { - stride = map[blocksize] - map[0]; - int n_strides = (size + blocksize - 1) / blocksize; - int k = 0; - for (int i = 0; i < n_strides; i++) { - for (int j = 0; j < blocksize; j++) { - if (map[k] != offset + i * stride + j) { - return false; - } - k++; - if (k == size) { - break; - } - } - } - *offset_out = offset; - *stride_out = stride; - *blocksize_out = blocksize; - return true; - } + *offset_out = offset; + *stride_out = stride; + return true; } } @@ -505,19 +472,11 @@ static int pmap_lpid_to_rank(struct MPIR_Pmap *pmap, MPIR_Lpid lpid) } return MPI_UNDEFINED; } else { - lpid -= pmap->u.stride.offset; - MPIR_Lpid i_blk = lpid / pmap->u.stride.stride; - MPIR_Lpid r_blk = lpid % pmap->u.stride.stride; - - if (r_blk >= pmap->u.stride.blocksize) { - return MPI_UNDEFINED; - } - - int rank = i_blk * pmap->u.stride.blocksize + r_blk; - if (rank >= 0 && rank < pmap->size) { - return rank; - } else { + int rank = (lpid - pmap->u.stride.offset) / pmap->u.stride.stride; + if (rank < 0 || rank >= size || + lpid != rank * pmap->u.stride.stride + pmap->u.stride.offset) { return MPI_UNDEFINED; } + return rank; } } From f53004dc6664d2d7a6a32f5da4e3f075f9cba27a Mon Sep 17 00:00:00 2001 From: Hui Zhou Date: Tue, 24 Dec 2024 11:08:10 -0600 Subject: [PATCH 39/59] group: remove pmap->size The pmap is always used inside MPIR_Group, and its size is always the same as group->size. Having a duplicated field creates more opportunities for bugs from inconsistency. 
--- src/include/mpir_group.h | 3 +-- src/mpi/group/grouputil.c | 14 +++++++------- 2 files changed, 8 insertions(+), 9 deletions(-) diff --git a/src/include/mpir_group.h b/src/include/mpir_group.h index a2b4436d551..441a542f0ce 100644 --- a/src/include/mpir_group.h +++ b/src/include/mpir_group.h @@ -51,7 +51,6 @@ #define MPIR_GROUP_SELF_PTR (MPIR_Group_builtin + 2) struct MPIR_Pmap { - int size; /* same as group->size, duplicate here so Pmap is logically complete */ bool use_map; union { MPIR_Lpid *map; @@ -118,7 +117,7 @@ int MPIR_Group_finalize(void); MPL_STATIC_INLINE_PREFIX MPIR_Lpid MPIR_Group_rank_to_lpid(MPIR_Group * group, int rank) { - if (rank < 0 || rank >= group->pmap.size) { + if (rank < 0 || rank >= group->size) { return MPI_UNDEFINED; } diff --git a/src/mpi/group/grouputil.c b/src/mpi/group/grouputil.c index aa8fb1331dd..a0c1037e003 100644 --- a/src/mpi/group/grouputil.c +++ b/src/mpi/group/grouputil.c @@ -65,7 +65,6 @@ int MPIR_Group_init(void) MPIR_Group_builtin[1].rank = MPIR_Process.rank; MPIR_Group_builtin[1].session_ptr = NULL; pmap = &MPIR_Group_builtin[1].pmap; - pmap->size = MPIR_Process.size; pmap->use_map = false; pmap->u.stride.offset = 0; pmap->u.stride.stride = 1; @@ -76,7 +75,6 @@ int MPIR_Group_init(void) MPIR_Group_builtin[2].rank = 0; MPIR_Group_builtin[2].session_ptr = NULL; pmap = &MPIR_Group_builtin[2].pmap; - pmap->size = 1; pmap->use_map = false; pmap->u.stride.offset = MPIR_Process.rank; pmap->u.stride.stride = 1; @@ -147,7 +145,6 @@ int MPIR_Group_create(int nproc, MPIR_Group ** new_group_ptr) (*new_group_ptr)->rank = MPI_UNDEFINED; (*new_group_ptr)->session_ptr = NULL; memset(&(*new_group_ptr)->pmap, 0, sizeof(struct MPIR_Pmap)); - (*new_group_ptr)->pmap.size = nproc; #ifdef MPID_DEV_GROUP_DECL mpi_errno = MPID_Group_init_hook(*new_group_ptr); #endif @@ -265,10 +262,10 @@ int MPIR_Group_create_stride(int size, int rank, MPIR_Session * session_ptr, goto fn_exit; } -static int pmap_lpid_to_rank(struct MPIR_Pmap *pmap, 
MPIR_Lpid lpid); +static int pmap_lpid_to_rank(struct MPIR_Pmap *pmap, int size, MPIR_Lpid lpid); int MPIR_Group_lpid_to_rank(MPIR_Group * group, MPIR_Lpid lpid) { - return pmap_lpid_to_rank(&group->pmap, lpid); + return pmap_lpid_to_rank(&group->pmap, group->size, lpid); } #ifdef HAVE_ERROR_CHECKING @@ -440,6 +437,9 @@ static bool check_map_is_strided(int size, MPIR_Lpid * map, MPIR_Lpid * offset_out, MPIR_Lpid * stride_out) { MPIR_Assert(size > 0); + for (int i = 0; i < size; i++) { + MPIR_Assert(map[i] != MPI_UNDEFINED); + } if (size == 1) { *offset_out = map[0]; *stride_out = 1; @@ -459,13 +459,13 @@ static bool check_map_is_strided(int size, MPIR_Lpid * map, } } -static int pmap_lpid_to_rank(struct MPIR_Pmap *pmap, MPIR_Lpid lpid) +static int pmap_lpid_to_rank(struct MPIR_Pmap *pmap, int size, MPIR_Lpid lpid) { if (pmap->use_map) { /* Use linear search for now. * Optimization: build hash map in MPIR_Group_create_map and do O(1) hash lookup */ - for (int rank = 0; rank < pmap->size; rank++) { + for (int rank = 0; rank < size; rank++) { if (pmap->u.map[rank] == lpid) { return rank; } From 21e75a58227d4554be81bd6ace9a365ba8186f58 Mon Sep 17 00:00:00 2001 From: Hui Zhou Date: Tue, 24 Dec 2024 10:16:53 -0600 Subject: [PATCH 40/59] ch3: return error code in create_vcrt_from_group Replace MPIR_Assert with better error message. --- src/mpi/errhan/errnames.txt | 2 ++ src/mpid/ch3/src/ch3u_comm.c | 4 ++-- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/src/mpi/errhan/errnames.txt b/src/mpi/errhan/errnames.txt index 9198104035d..54d8d2e49e5 100644 --- a/src/mpi/errhan/errnames.txt +++ b/src/mpi/errhan/errnames.txt @@ -902,6 +902,8 @@ is too big (> MPIU_SHMW_GHND_SZ) **iface_notfound %s:The network interface, \"%s\", specified in MPIR_CVAR_CH3_NETWORK_IFACE was not found. 
**procnamefailed:Failed to get processor name +**procnotfound:Process not found +**procnotfound %d:Process %d not found **notsuppmultithread:this functionality is not supported when the thread level is greater than MPI_THREAD_SINGLE **valuetoolarge:Value is too large to store diff --git a/src/mpid/ch3/src/ch3u_comm.c b/src/mpid/ch3/src/ch3u_comm.c index 52c6d97210b..8af02db70e3 100644 --- a/src/mpid/ch3/src/ch3u_comm.c +++ b/src/mpid/ch3/src/ch3u_comm.c @@ -156,7 +156,8 @@ static int create_vcrt_from_group(MPIR_Group *group, struct MPIDI_VCRT **vcrt_ou } pg = pg->next; } - MPIR_Assert(found_it); + MPIR_ERR_CHKANDJUMP1(!found_it, mpi_errno, MPI_ERR_OTHER, "**procnotfound", + "**procnotfound %d", i); } } @@ -164,7 +165,6 @@ static int create_vcrt_from_group(MPIR_Group *group, struct MPIDI_VCRT **vcrt_ou return mpi_errno; fn_fail: goto fn_exit; - } int MPIDI_CH3I_Comm_commit_pre_hook(MPIR_Comm *comm) From ef1a145c7e24e7ac3d0e5730b7fc3da3dc9bab2b Mon Sep 17 00:00:00 2001 From: Hui Zhou Date: Wed, 25 Dec 2024 00:07:28 -0600 Subject: [PATCH 41/59] ch3: fixup mpid_vc.c for lpid parameter type --- src/mpid/ch3/include/mpidpost.h | 2 +- src/mpid/ch3/src/mpid_vc.c | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/mpid/ch3/include/mpidpost.h b/src/mpid/ch3/include/mpidpost.h index e45f0fba1c2..b90487fcdf7 100644 --- a/src/mpid/ch3/include/mpidpost.h +++ b/src/mpid/ch3/include/mpidpost.h @@ -182,7 +182,7 @@ static inline int MPID_Progress_test(MPID_Progress_state * state) /* state is un int MPIDI_GPID_GetAllInComm( MPIR_Comm *comm_ptr, int local_size, MPIDI_Gpid local_gpids[], int *singlePG ); int MPIDI_GPID_Get( MPIR_Comm *comm_ptr, int rank, MPIDI_Gpid *gpid ); -int MPIDI_GPID_ToLpidArray( int size, MPIDI_Gpid gpid[], uint64_t lpid[] ); +int MPIDI_GPID_ToLpidArray( int size, MPIDI_Gpid gpid[], MPIR_Lpid lpid[] ); int MPIDI_PG_ForwardPGInfo( MPIR_Comm *peer_ptr, MPIR_Comm *comm_ptr, int nPGids, const MPIDI_Gpid gpids[], int root ); diff --git 
a/src/mpid/ch3/src/mpid_vc.c b/src/mpid/ch3/src/mpid_vc.c index 68c9e8fae8a..1ac50cb0851 100644 --- a/src/mpid/ch3/src/mpid_vc.c +++ b/src/mpid/ch3/src/mpid_vc.c @@ -305,7 +305,7 @@ int MPIDI_GPID_Get( MPIR_Comm *comm_ptr, int rank, MPIDI_Gpid *in_gpid ) * the GPIDs. Note that this code requires that all processes * have information on the process groups. */ -int MPIDI_GPID_ToLpidArray( int size, MPIDI_Gpid in_gpid[], uint64_t lpid[] ) +int MPIDI_GPID_ToLpidArray( int size, MPIDI_Gpid in_gpid[], MPIR_Lpid lpid[] ) { int i, mpi_errno = MPI_SUCCESS; int pgid; @@ -361,7 +361,7 @@ int MPIDI_GPID_ToLpidArray( int size, MPIDI_Gpid in_gpid[], uint64_t lpid[] ) } static inline int MPIDI_LPID_GetAllInComm(MPIR_Comm *comm_ptr, int local_size, - uint64_t local_lpids[]) + MPIR_Lpid local_lpids[]) { int i; int mpi_errno = MPI_SUCCESS; @@ -379,7 +379,7 @@ static inline int MPIDI_LPID_GetAllInComm(MPIR_Comm *comm_ptr, int local_size, /*@ check_disjoint_lpids - Exchange address mapping for intercomm creation. 
@*/ -static int check_disjoint_lpids(uint64_t lpids1[], int n1, uint64_t lpids2[], int n2) +static int check_disjoint_lpids(MPIR_Lpid lpids1[], int n1, MPIR_Lpid lpids2[], int n2) { int i, mask_size, idx, bit; uint64_t maxlpid = 0; From 82ba2c9d19b8826b34cc4ce3a40441afa596cc69 Mon Sep 17 00:00:00 2001 From: Hui Zhou Date: Sun, 22 Dec 2024 23:25:46 -0600 Subject: [PATCH 42/59] mpid: remove unused MPID_INTERCOMM_NO_DYNPROC --- src/mpid/ch3/include/mpidpost.h | 2 -- src/mpid/ch4/include/mpidpre.h | 3 --- 2 files changed, 5 deletions(-) diff --git a/src/mpid/ch3/include/mpidpost.h b/src/mpid/ch3/include/mpidpost.h index b90487fcdf7..8ce533044a4 100644 --- a/src/mpid/ch3/include/mpidpost.h +++ b/src/mpid/ch3/include/mpidpost.h @@ -194,8 +194,6 @@ int MPID_Create_intercomm_from_lpids( MPIR_Comm *newcomm_ptr, int size, const MPIR_Lpid lpids[] ); int MPID_Comm_get_lpid(MPIR_Comm *comm_ptr, int idx, MPIR_Lpid *lpid_ptr, bool is_remote); -#define MPID_INTERCOMM_NO_DYNPROC(comm) (0) - /* ULFM support */ MPL_STATIC_INLINE_PREFIX int MPID_Comm_AS_enabled(MPIR_Comm * comm_ptr) { diff --git a/src/mpid/ch4/include/mpidpre.h b/src/mpid/ch4/include/mpidpre.h index 3735200e2c8..7bea7588df9 100644 --- a/src/mpid/ch4/include/mpidpre.h +++ b/src/mpid/ch4/include/mpidpre.h @@ -714,9 +714,6 @@ typedef struct MPIDI_av_entry { #define MPIDI_DYNPROC_MASK (0x80000000U) -#define MPID_INTERCOMM_NO_DYNPROC(comm) \ - (MPIDI_COMM((comm),map).avtid == 0 && MPIDI_COMM((comm),local_map).avtid == 0) - int MPIDI_check_for_failed_procs(void); #ifdef HAVE_SIGNAL From b7622a08a7086a8b9b84a1eba59814ecbd4d12ea Mon Sep 17 00:00:00 2001 From: Hui Zhou Date: Tue, 17 Dec 2024 09:57:15 -0600 Subject: [PATCH 43/59] ch4: add MPIDIU_lpid_to_av and MPIDIU_lpid_to_av_slow We'll create av tables in ch4 according to world_idx and world_rank. MPIDIU_lpid_to_av can look up the av entry from an lpid in the communication path. 
MPIDIU_lpid_to_av_slow, used in communicator creation paths, will check and allocate the corresponding av table as needed. --- src/mpid/ch4/src/ch4_proc.c | 20 ++++++++++++++++++++ src/mpid/ch4/src/ch4_proc.h | 27 +++++++++++++++++++++++++++ 2 files changed, 47 insertions(+) diff --git a/src/mpid/ch4/src/ch4_proc.c b/src/mpid/ch4/src/ch4_proc.c index 01b182da582..0f10a760013 100644 --- a/src/mpid/ch4/src/ch4_proc.c +++ b/src/mpid/ch4/src/ch4_proc.c @@ -104,6 +104,8 @@ int MPIDIU_new_avt(int size, int *avtid) MPIR_cc_set(&MPIDI_global.avt_mgr.av_tables[*avtid]->ref_count, 0); + /* TODO: to support dynamic processes and dynamic av insertions, we need device hooks to initialize table with invalid entries */ + MPIR_FUNC_EXIT; return mpi_errno; } @@ -207,6 +209,24 @@ int MPIDIU_avt_destroy(void) return MPI_SUCCESS; } +/* used in communicator creation paths when the av entry may not exist or inserted yet */ +MPIDI_av_entry_t *MPIDIU_lpid_to_av_slow(MPIR_Lpid lpid) +{ + int world_idx = MPIR_LPID_WORLD_INDEX(lpid); + int world_rank = MPIR_LPID_WORLD_RANK(lpid); + + MPIR_Assert(world_rank < MPIR_Worlds[world_idx].num_procs); + + if (world_idx >= MPIDI_global.avt_mgr.n_avts) { + /* new world. 
Add av table for each new world */ + for (int i = MPIDI_global.avt_mgr.n_avts; i < world_idx + 1; i++) { + int avtid; + MPIDIU_new_avt(MPIR_Worlds[i].num_procs, &avtid); + MPIR_Assert(avtid == i); + } + } +} + #ifdef MPIDI_BUILD_CH4_UPID_HASH /* Store the upid, avtid, lpid in a hash to support get_local_upids and upids_to_lupids */ static MPIDI_upid_hash *upid_hash = NULL; diff --git a/src/mpid/ch4/src/ch4_proc.h b/src/mpid/ch4/src/ch4_proc.h index 5462d2e407d..dba4488408d 100644 --- a/src/mpid/ch4/src/ch4_proc.h +++ b/src/mpid/ch4/src/ch4_proc.h @@ -255,6 +255,33 @@ MPL_STATIC_INLINE_PREFIX int MPIDIU_rank_to_lpid(int rank, MPIR_Comm * comm) return ret; } +/* used in fast path where we know the lpid has a valid av, such as from a committed communicator */ +MPL_STATIC_INLINE_PREFIX MPIDI_av_entry_t *MPIDIU_lpid_to_av(MPIR_Lpid lpid) +{ + int world_idx = MPIR_LPID_WORLD_INDEX(lpid); + int world_rank = MPIR_LPID_WORLD_RANK(lpid); + return &MPIDI_global.avt_mgr.av_tables[world_idx]->table[world_rank]; +} + +/* used in communicator creation paths when the av entry may not exist or inserted yet */ +MPL_STATIC_INLINE_PREFIX MPIDI_av_entry_t *MPIDIU_lpid_to_av_slow(MPIR_Lpid lpid) +{ + int world_idx = MPIR_LPID_WORLD_INDEX(lpid); + int world_rank = MPIR_LPID_WORLD_RANK(lpid); + + MPIR_Assert(world_rank < MPIR_Worlds[world_idx].num_procs); + + if (world_idx >= MPIDI_global.avt_mgr.n_avts) { + for (int i = MPIDI_global.avt_mgr.n_avts; i < world_idx + 1; i++) { + int avtid; + MPIDIU_new_avt(MPIR_Worlds[i].num_procs, &avtid); + MPIR_Assert(avtid == i); + } + } + + return MPIDI_global.avt_mgr.av_tables[world_idx]->table[world_rank]; +} + MPL_STATIC_INLINE_PREFIX int MPIDI_rank_is_local(int rank, MPIR_Comm * comm) { int ret; From e7428e5d3cf19828e3dcffb4fedd1e5c1e142595 Mon Sep 17 00:00:00 2001 From: Hui Zhou Date: Tue, 17 Dec 2024 23:19:44 -0600 Subject: [PATCH 44/59] ch4: re-implement MPIDIU_comm_rank_to_av --- src/mpid/ch4/src/ch4_proc.h | 60 
++++--------------------------------- 1 file changed, 5 insertions(+), 55 deletions(-) diff --git a/src/mpid/ch4/src/ch4_proc.h b/src/mpid/ch4/src/ch4_proc.h index dba4488408d..ca2f96c5e4e 100644 --- a/src/mpid/ch4/src/ch4_proc.h +++ b/src/mpid/ch4/src/ch4_proc.h @@ -120,62 +120,12 @@ MPL_STATIC_INLINE_PREFIX MPIDI_av_entry_t *MPIDIU_comm_rank_to_av(MPIR_Comm * co MPIDI_av_entry_t *ret = NULL; MPIR_FUNC_ENTER; - int lpid; - switch (MPIDI_COMM(comm, map).mode) { - case MPIDI_RANK_MAP_DIRECT: - ret = &MPIDI_global.avt_mgr.av_tables[MPIDI_COMM(comm, map).avtid]->table[rank]; - break; - case MPIDI_RANK_MAP_DIRECT_INTRA: - ret = &MPIDI_global.avt_mgr.av_table0->table[rank]; - break; - case MPIDI_RANK_MAP_OFFSET: - ret = &MPIDI_global.avt_mgr.av_tables[MPIDI_COMM(comm, map).avtid] - ->table[rank + MPIDI_COMM(comm, map).reg.offset]; - break; - case MPIDI_RANK_MAP_OFFSET_INTRA: - ret = &MPIDI_global.avt_mgr.av_table0->table[rank + MPIDI_COMM(comm, map).reg.offset]; - break; - case MPIDI_RANK_MAP_STRIDE: - lpid = MPIDI_CALC_STRIDE_SIMPLE(rank, MPIDI_COMM(comm, map).reg.stride.stride, - MPIDI_COMM(comm, map).reg.stride.offset); - ret = &MPIDI_global.avt_mgr.av_tables[MPIDI_COMM(comm, map).avtid]->table[lpid]; - break; - case MPIDI_RANK_MAP_STRIDE_INTRA: - lpid = MPIDI_CALC_STRIDE_SIMPLE(rank, MPIDI_COMM(comm, map).reg.stride.stride, - MPIDI_COMM(comm, map).reg.stride.offset); - ret = &MPIDI_global.avt_mgr.av_table0->table[lpid]; - break; - case MPIDI_RANK_MAP_STRIDE_BLOCK: - lpid = MPIDI_CALC_STRIDE(rank, MPIDI_COMM(comm, map).reg.stride.stride, - MPIDI_COMM(comm, map).reg.stride.blocksize, - MPIDI_COMM(comm, map).reg.stride.offset); - ret = &MPIDI_global.avt_mgr.av_tables[MPIDI_COMM(comm, map).avtid]->table[lpid]; - break; - case MPIDI_RANK_MAP_STRIDE_BLOCK_INTRA: - lpid = MPIDI_CALC_STRIDE(rank, MPIDI_COMM(comm, map).reg.stride.stride, - MPIDI_COMM(comm, map).reg.stride.blocksize, - MPIDI_COMM(comm, map).reg.stride.offset); - ret = 
&MPIDI_global.avt_mgr.av_table0->table[lpid]; - break; - case MPIDI_RANK_MAP_LUT: - ret = &MPIDI_global.avt_mgr.av_tables[MPIDI_COMM(comm, map).avtid] - ->table[MPIDI_COMM(comm, map).irreg.lut.lpid[rank]]; - break; - case MPIDI_RANK_MAP_LUT_INTRA: - ret = - &MPIDI_global.avt_mgr.av_table0->table[MPIDI_COMM(comm, map).irreg.lut.lpid[rank]]; - break; - case MPIDI_RANK_MAP_MLUT: - ret = &MPIDI_global.avt_mgr.av_tables[MPIDI_COMM(comm, map).irreg.mlut.gpid[rank].avtid] - ->table[MPIDI_COMM(comm, map).irreg.mlut.gpid[rank].lpid]; - break; - case MPIDI_RANK_MAP_NONE: - MPIR_Assert(0); - break; - } + MPIR_Lpid lpid = MPIR_comm_rank_to_lpid(comm, rank); + int world_idx = MPIR_LPID_WORLD_INDEX(lpid); + int world_rank = MPIR_LPID_WORLD_RANK(lpid); + + ret = &MPIDI_global.avt_mgr.av_tables[world_idx]->table[world_rank]; - MPL_DBG_MSG_FMT(MPIDI_CH4_DBG_MAP, VERBOSE, - (MPL_DBG_FDEST, " comm_to_av_addr: rank=%d, av addr=%p", rank, (void *) ret)); MPIR_FUNC_EXIT; return ret; } From 315aa09d1bfc2605d470d61b6fc6be8f080d6630 Mon Sep 17 00:00:00 2001 From: Hui Zhou Date: Tue, 17 Dec 2024 11:15:01 -0600 Subject: [PATCH 45/59] ch4: add utils to support dynamic av Dynamic av will be used to support MPID_Comm_connect/accept when we need to create the leader av before we know the correct lpid entries. They are expected to be freed at the end of inter communicator creation. 
--- src/mpid/ch4/src/ch4_proc.c | 132 ++++++++++++++++++++++++++++++++--- src/mpid/ch4/src/ch4_proc.h | 25 ++----- src/mpid/ch4/src/ch4_types.h | 15 ++++ 3 files changed, 142 insertions(+), 30 deletions(-) diff --git a/src/mpid/ch4/src/ch4_proc.c b/src/mpid/ch4/src/ch4_proc.c index 0f10a760013..e94c3488f13 100644 --- a/src/mpid/ch4/src/ch4_proc.c +++ b/src/mpid/ch4/src/ch4_proc.c @@ -153,6 +153,9 @@ int MPIDIU_avt_release_ref(int avtid) return MPI_SUCCESS; } +static void init_dynamic_av_table(void); +static void destroy_dynamic_av_table(void); + int MPIDIU_avt_init(void) { int mpi_errno = MPI_SUCCESS; @@ -187,6 +190,8 @@ int MPIDIU_avt_init(void) MPIDI_global.avt_mgr.av_tables[0] = MPIDI_global.avt_mgr.av_table0; + init_dynamic_av_table(); + MPIR_FUNC_EXIT; return mpi_errno; } @@ -202,6 +207,8 @@ int MPIDIU_avt_destroy(void) } } + destroy_dynamic_av_table(); + MPL_free(MPIDI_global.avt_mgr.av_tables); memset(&MPIDI_global.avt_mgr, 0, sizeof(MPIDI_global.avt_mgr)); @@ -209,21 +216,124 @@ int MPIDIU_avt_destroy(void) return MPI_SUCCESS; } -/* used in communicator creation paths when the av entry may not exist or inserted yet */ -MPIDI_av_entry_t *MPIDIU_lpid_to_av_slow(MPIR_Lpid lpid) +#define MPIDIU_DYN_AV_TABLE MPIDI_global.avt_mgr.dynamic_av_table +#define MPIDIU_DYN_AV(idx) (MPIDI_av_entry_t *)((char *) MPIDI_global.avt_mgr.dynamic_av_table.table + (idx) * sizeof(MPIDI_av_entry_t)) + +static void init_dynamic_av_table(void) +{ + /* allocate dynamic_av_table */ + int table_size = MPIDIU_DYNAMIC_AV_MAX * sizeof(MPIDI_av_entry_t); + MPIDIU_DYN_AV_TABLE.table = MPL_malloc(table_size, MPL_MEM_ADDRESS); + MPIDIU_DYN_AV_TABLE.size = 0; +} + +static void destroy_dynamic_av_table(void) +{ + MPIR_Assert(MPIDIU_DYN_AV_TABLE.size == 0); + MPL_free(MPIDIU_DYN_AV_TABLE.table); +} + +/* NOTE: The following functions -- + * * MPIDIU_insert_dynamic_upid + * * MPIDIU_free_dynamic_lpid + * * MPIDIU_find_dynamic_av + * are thread-unsafe. Caller should enter (VCI-0) critical section. 
+ */ + +int MPIDIU_insert_dynamic_upid(MPIR_Lpid * lpid_out, const char *upid, int upid_len) +{ + int mpi_errno = MPI_SUCCESS; + + /* allocate idx from dynamic av table */ + int idx = MPIDIU_DYN_AV_TABLE.size; + for (int i = 0; i < MPIDIU_DYN_AV_TABLE.size; i++) { + if (MPIDIU_DYN_AV_TABLE.upids[i] == NULL) { + idx = i; + break; + } + } + if (idx == MPIDIU_DYN_AV_TABLE.size) { + MPIDIU_DYN_AV_TABLE.size++; + if (MPIDIU_DYN_AV_TABLE.size >= MPIDIU_DYNAMIC_AV_MAX) { + MPIR_ERR_SETANDJUMP(mpi_errno, MPI_ERR_OTHER, "**intern"); + } + } + + /* copy the upid */ + char *upid_copy = MPL_malloc(upid_len, MPL_MEM_OTHER); + MPIR_ERR_CHKANDJUMP(!upid_copy, mpi_errno, MPI_ERR_OTHER, "**nomem"); + memcpy(upid_copy, upid, upid_len); + + MPIDIU_DYN_AV_TABLE.upids[idx] = upid_copy; + MPIDIU_DYN_AV_TABLE.upid_sizes[idx] = upid_len; + + /* insert upid */ + *lpid_out = MPIR_LPID_DYNAMIC_MASK | idx; + + /* mpi_errno = MPIDI_NM_insert_upid(*lpid_out, upid, upid_len); */ + /* MPIR_ERR_CHECK(mpi_errno); */ + + fn_exit: + return MPI_SUCCESS; + fn_fail: + goto fn_exit; +} + +int MPIDIU_free_dynamic_lpid(MPIR_Lpid lpid) { - int world_idx = MPIR_LPID_WORLD_INDEX(lpid); - int world_rank = MPIR_LPID_WORLD_RANK(lpid); + MPIR_Assert(lpid & MPIR_LPID_DYNAMIC_MASK); + int idx = lpid & (~MPIR_LPID_DYNAMIC_MASK); + MPIR_Assert(idx >= 0 && idx < MPIDIU_DYN_AV_TABLE.size); + + /* free the upid buffer */ + MPL_free((char *) MPIDIU_DYN_AV_TABLE.upids[idx]); + /* mark the av as free by setting upid to NULL and upid_size to 0 */ + MPIDIU_DYN_AV_TABLE.upids[idx] = NULL; + MPIDIU_DYN_AV_TABLE.upid_sizes[idx] = 0; + + /* if the last entry is empty, reduce size */ + while (MPIDIU_DYN_AV_TABLE.size > 0 && + MPIDIU_DYN_AV_TABLE.upids[MPIDIU_DYN_AV_TABLE.size - 1] == NULL) { + MPIDIU_DYN_AV_TABLE.size--; + } + + return MPI_SUCCESS; +} - MPIR_Assert(world_rank < MPIR_Worlds[world_idx].num_procs); +MPIDI_av_entry_t *MPIDIU_find_dynamic_av(const char *upid, int upid_len) +{ + for (int i = 0; i < 
MPIDIU_DYN_AV_TABLE.size; i++) { + if (MPIDIU_DYN_AV_TABLE.upid_sizes[i] == upid_len && + memcmp(MPIDIU_DYN_AV_TABLE.upids[i], upid, upid_len) == 0) { + return MPIDIU_DYN_AV(i); + } + } + return NULL; +} - if (world_idx >= MPIDI_global.avt_mgr.n_avts) { - /* new world. Add av table for each new world */ - for (int i = MPIDI_global.avt_mgr.n_avts; i < world_idx + 1; i++) { - int avtid; - MPIDIU_new_avt(MPIR_Worlds[i].num_procs, &avtid); - MPIR_Assert(avtid == i); +/* this version handles dynamic av or av entries that are not allocated yet (e.g. new world) + */ +MPIDI_av_entry_t *MPIDIU_lpid_to_av_slow(MPIR_Lpid lpid) +{ + if (lpid & MPIR_LPID_DYNAMIC_MASK) { + int idx = lpid & (~MPIR_LPID_DYNAMIC_MASK); + MPIR_Assert(idx >= 0 && idx < MPIDIU_DYN_AV_TABLE.size); + return &MPIDIU_DYN_AV_TABLE.table[idx]; + } else { + int world_idx = MPIR_LPID_WORLD_INDEX(lpid); + int world_rank = MPIR_LPID_WORLD_RANK(lpid); + + MPIR_Assert(world_rank < MPIR_Worlds[world_idx].num_procs); + + if (world_idx >= MPIDI_global.avt_mgr.n_avts) { + for (int i = MPIDI_global.avt_mgr.n_avts; i < world_idx + 1; i++) { + int avtid; + MPIDIU_new_avt(MPIR_Worlds[i].num_procs, &avtid); + MPIR_Assert(avtid == i); + } } + + return &MPIDI_global.avt_mgr.av_tables[world_idx]->table[world_rank]; } } diff --git a/src/mpid/ch4/src/ch4_proc.h b/src/mpid/ch4/src/ch4_proc.h index ca2f96c5e4e..e338f4510e2 100644 --- a/src/mpid/ch4/src/ch4_proc.h +++ b/src/mpid/ch4/src/ch4_proc.h @@ -213,25 +213,6 @@ MPL_STATIC_INLINE_PREFIX MPIDI_av_entry_t *MPIDIU_lpid_to_av(MPIR_Lpid lpid) return &MPIDI_global.avt_mgr.av_tables[world_idx]->table[world_rank]; } -/* used in communicator creation paths when the av entry may not exist or inserted yet */ -MPL_STATIC_INLINE_PREFIX MPIDI_av_entry_t *MPIDIU_lpid_to_av_slow(MPIR_Lpid lpid) -{ - int world_idx = MPIR_LPID_WORLD_INDEX(lpid); - int world_rank = MPIR_LPID_WORLD_RANK(lpid); - - MPIR_Assert(world_rank < MPIR_Worlds[world_idx].num_procs); - - if (world_idx >= 
MPIDI_global.avt_mgr.n_avts) { - for (int i = MPIDI_global.avt_mgr.n_avts; i < world_idx + 1; i++) { - int avtid; - MPIDIU_new_avt(MPIR_Worlds[i].num_procs, &avtid); - MPIR_Assert(avtid == i); - } - } - - return MPIDI_global.avt_mgr.av_tables[world_idx]->table[world_rank]; -} - MPL_STATIC_INLINE_PREFIX int MPIDI_rank_is_local(int rank, MPIR_Comm * comm) { int ret; @@ -265,4 +246,10 @@ MPL_STATIC_INLINE_PREFIX int MPIDI_av_is_local(MPIDI_av_entry_t * av) return ret; } +int MPIDIU_insert_dynamic_upid(MPIR_Lpid * lpid_out, const char *upid, int upid_len); +int MPIDIU_free_dynamic_lpid(MPIR_Lpid lpid); +MPIDI_av_entry_t *MPIDIU_find_dynamic_av(const char *upid, int upid_len); +/* used in communicator creation paths when the av entry may not exist or inserted yet */ +MPIDI_av_entry_t *MPIDIU_lpid_to_av_slow(MPIR_Lpid lpid); + #endif /* CH4_PROC_H_INCLUDED */ diff --git a/src/mpid/ch4/src/ch4_types.h b/src/mpid/ch4/src/ch4_types.h index 88fbb5ac4ab..e618450a745 100644 --- a/src/mpid/ch4/src/ch4_types.h +++ b/src/mpid/ch4/src/ch4_types.h @@ -190,12 +190,27 @@ typedef struct { MPIDI_av_entry_t table[]; } MPIDI_av_table_t; +/* dynamic av is used for building inter communicators, such as MPID_Comm_connect/accept, + * when we need to temporarily establish communication between peer group leaders. + * Because the entries are expected to be released once the intercomm is committed, we expect + * the dynamic av table size to remain finite. + * We keep the upid along with the av entry to avoid later duplicate av insertion. 
+ * */ +#define MPIDIU_DYNAMIC_AV_MAX 100 +typedef struct { + int size; + const char *upids[MPIDIU_DYNAMIC_AV_MAX]; + int upid_sizes[MPIDIU_DYNAMIC_AV_MAX]; + MPIDI_av_entry_t *table; +} MPIDI_dyn_av_table_t; + typedef struct { int max_n_avts; int n_avts; int n_free; MPIDI_av_table_t *av_table0; MPIDI_av_table_t **av_tables; + MPIDI_dyn_av_table_t dynamic_av_table; } MPIDIU_avt_manager; #define MPIDIU_get_av_table(avtid) (MPIDI_global.avt_mgr.av_tables[(avtid)]) From b15f6991a6a9b5c721ada245394fb35cd88bf646 Mon Sep 17 00:00:00 2001 From: Hui Zhou Date: Thu, 12 Dec 2024 21:04:05 -0600 Subject: [PATCH 46/59] ch4: new netmod api to support dynamic processes Add - * MPIDI_NM_insert_upid - insert an av entry so the lpid is ready for communication. The lpid can be allocated from a dynamic av table, thus supporting temporary communications between intercomm leaders. When the upid is later inserted again into the regular av tables, the dynamic entries are checked and copied over if they already exist. * MPIDI_NM_dynamic_sendrecv - used by local group leaders to exchange data over dynamic_av. The dynamic handshakes are susceptible to concurrent interference. Thus the upper layer is assumed to hold the vci-0 critical section. 
--- src/mpid/ch4/ch4_api.txt | 11 ++ src/mpid/ch4/netmod/ofi/ofi_spawn.c | 193 +++++++++++++++++++++++----- src/mpid/ch4/netmod/ucx/ucx_spawn.c | 137 ++++++++++++++++++-- src/mpid/ch4/src/ch4_proc.c | 4 +- 4 files changed, 304 insertions(+), 41 deletions(-) diff --git a/src/mpid/ch4/ch4_api.txt b/src/mpid/ch4/ch4_api.txt index 9165d4b8ed0..1e0b5410c1c 100644 --- a/src/mpid/ch4/ch4_api.txt +++ b/src/mpid/ch4/ch4_api.txt @@ -89,12 +89,16 @@ Non Native API: SHM*: rank, comm, handler_id, tag, buf-2, count, datatype, src_vci, dst_vci, rreq get_local_upids : int NM : comm, local_upid_size, local_upids + insert_upid: int + NM : lpid, upid, upid_len upids_to_lpids : int NM : size, remote_upid_size, remote_upids, remote_lpids dynamic_send : int NM : remote_lpid, tag, buf, size, timeout dynamic_recv : int NM : tag, buf-2, size, timeout + dynamic_sendrecv : int + NM : remote_lpid, tag, send_buf, send_size, recv_buf, recv_size, timeout mpi_comm_commit_pre_hook : int NM : comm SHM : comm @@ -475,6 +479,7 @@ PARAM: local_upid_size: int ** local_upids: char ** lock_type: int + lpid: MPIR_Lpid made_progress: int * message: MPIR_Request * message_p: MPIR_Request ** @@ -499,6 +504,8 @@ PARAM: recvcounts: const MPI_Aint * recvtype: MPI_Datatype recvtypes: const MPI_Datatype[] + recv_buf: void * + recv_size: int remote_lpid: MPIR_Lpid remote_lpids: MPIR_Lpid * remote_upid_size: int * @@ -517,6 +524,8 @@ PARAM: sendcounts: const MPI_Aint * sendtype: MPI_Datatype sendtypes: const MPI_Datatype[] + send_buf: const void * + send_size: int size: int size_p: MPI_Aint * size-2: MPI_Aint @@ -534,6 +543,8 @@ PARAM: target_rank: int timeout: int type: MPIR_Datatype * + upid: const char * + upid_len: int vci: int void: win: MPIR_Win * diff --git a/src/mpid/ch4/netmod/ofi/ofi_spawn.c b/src/mpid/ch4/netmod/ofi/ofi_spawn.c index 6ccb9bd1cf2..381d1510e0b 100644 --- a/src/mpid/ch4/netmod/ofi/ofi_spawn.c +++ b/src/mpid/ch4/netmod/ofi/ofi_spawn.c @@ -7,6 +7,10 @@ #include "ofi_impl.h" #include 
"ofi_noinline.h" +/* NOTE: all these functions assume the caller to enter VCI-0 critical section */ + +static int cancel_dynamic_request(MPIDI_OFI_dynamic_process_request_t * dynamic_req, bool is_send); + int MPIDI_OFI_dynamic_send(MPIR_Lpid remote_lpid, int tag, const void *buf, int size, int timeout) { int mpi_errno = MPI_SUCCESS; @@ -20,8 +24,6 @@ int MPIDI_OFI_dynamic_send(MPIR_Lpid remote_lpid, int tag, const void *buf, int int lpid = MPIDIU_GPID_GET_LPID(remote_lpid); fi_addr_t remote_addr = MPIDI_OFI_av_to_phys(&MPIDIU_get_av(avtid, lpid), nic, vci); - MPID_THREAD_CS_ENTER(VCI, MPIDI_VCI(vci).lock); - MPIDI_OFI_dynamic_process_request_t req; req.done = 0; req.event_id = MPIDI_OFI_EVENT_DYNPROC_DONE; @@ -52,24 +54,13 @@ int MPIDI_OFI_dynamic_send(MPIR_Lpid remote_lpid, int tag, const void *buf, int if (!req.done) { /* time out, let's cancel the request */ - int rc; - rc = fi_cancel((fid_t) MPIDI_OFI_global.ctx[ctx_idx].tx, (void *) &req.context); - if (rc && rc != -FI_ENOENT) { - MPIR_ERR_CHKANDJUMP2(rc < 0, mpi_errno, MPI_ERR_OTHER, "**ofid_cancel", - "**ofid_cancel %s %s", MPIDI_OFI_DEFAULT_NIC_NAME, - fi_strerror(-rc)); - - } - while (!req.done) { - mpi_errno = MPIDI_OFI_progress_uninlined(vci); - MPIR_ERR_CHECK(mpi_errno); - } + mpi_errno = cancel_dynamic_request(&req, true); + MPIR_ERR_CHECK(mpi_errno); mpi_errno = MPIX_ERR_TIMEOUT; } fn_exit: - MPID_THREAD_CS_EXIT(VCI, MPIDI_VCI(vci).lock); return mpi_errno; fn_fail: goto fn_exit; @@ -91,8 +82,6 @@ int MPIDI_OFI_dynamic_recv(int tag, void *buf, int size, int timeout) match_bits = MPIDI_OFI_init_recvtag(&mask_bits, 0, MPI_ANY_SOURCE, tag); match_bits |= MPIDI_OFI_DYNPROC_SEND; - MPID_THREAD_CS_ENTER(VCI, MPIDI_VCI(vci).lock); - MPL_time_t time_start, time_now; double time_gap; MPL_wtime(&time_start); @@ -109,24 +98,128 @@ int MPIDI_OFI_dynamic_recv(int tag, void *buf, int size, int timeout) if (!req.done) { /* time out, let's cancel the request */ - int rc; - rc = fi_cancel((fid_t) 
MPIDI_OFI_global.ctx[ctx_idx].rx, (void *) &req.context); - if (rc && rc != -FI_ENOENT) { - MPIR_ERR_CHKANDJUMP2(rc < 0, mpi_errno, MPI_ERR_OTHER, "**ofid_cancel", - "**ofid_cancel %s %s", MPIDI_OFI_DEFAULT_NIC_NAME, - fi_strerror(-rc)); + mpi_errno = cancel_dynamic_request(&req, false); + MPIR_ERR_CHECK(mpi_errno); + + mpi_errno = MPIX_ERR_TIMEOUT; + } + + fn_exit: + return mpi_errno; + fn_fail: + goto fn_exit; +} +int MPIDI_OFI_dynamic_sendrecv(MPIR_Lpid remote_lpid, int tag, + const void *send_buf, int send_size, void *recv_buf, int recv_size, + int timeout) +{ + int mpi_errno = MPI_SUCCESS; + + /* NOTE: dynamic_sendrecv is always called inside CS of vci 0 */ + int vci = 0; + int nic = 0; + int ctx_idx = 0; +#ifdef MPICH_DEBUG_MUTEX + MPID_THREAD_ASSERT_IN_CS(VCI, (*(MPID_Thread_mutex_t *) MPIR_Request_mem[vci].lock)); +#endif + + MPIDI_av_entry_t *av = MPIDIU_lpid_to_av_slow(remote_lpid); + fi_addr_t remote_addr = MPIDI_OFI_av_to_phys(av, nic, vci); + + MPIDI_OFI_dynamic_process_request_t send_req; + send_req.done = 0; + send_req.event_id = MPIDI_OFI_EVENT_DYNPROC_DONE; + + if (send_size > 0) { + uint64_t match_bits = MPIDI_OFI_DYNPROC_SEND | tag; + if (MPIDI_OFI_ENABLE_DATA) { + MPIDI_OFI_CALL_RETRY(fi_tsenddata(MPIDI_OFI_global.ctx[ctx_idx].tx, + send_buf, send_size, NULL, 0, + remote_addr, match_bits, (void *) &send_req.context), + vci, tsenddata); + } else { + MPIDI_OFI_CALL_RETRY(fi_tsend(MPIDI_OFI_global.ctx[ctx_idx].tx, + send_buf, send_size, NULL, + remote_addr, match_bits, (void *) &send_req.context), + vci, tsend); } - while (!req.done) { - mpi_errno = MPIDI_OFI_progress_uninlined(vci); - MPIR_ERR_CHECK(mpi_errno); + } else { + send_req.done = 1; + } + + MPIDI_OFI_dynamic_process_request_t recv_req; + recv_req.done = 0; + recv_req.event_id = MPIDI_OFI_EVENT_DYNPROC_DONE; + + if (recv_size > 0) { + uint64_t mask_bits = 0; + uint64_t match_bits = MPIDI_OFI_DYNPROC_SEND | tag; + MPIDI_OFI_CALL_RETRY(fi_trecv(MPIDI_OFI_global.ctx[ctx_idx].rx, + recv_buf, 
recv_size, NULL, + remote_addr, match_bits, mask_bits, &recv_req.context), + vci, trecv); + } else { + recv_req.done = 1; + } + + MPL_time_t time_start; + MPL_wtime(&time_start); + while (!send_req.done || !recv_req.done) { + mpi_errno = MPIDI_OFI_progress_uninlined(vci); + MPIR_ERR_CHECK(mpi_errno); + + if (timeout > 0) { + MPL_time_t time_now; + double time_gap; + MPL_wtime(&time_now); + MPL_wtime_diff(&time_start, &time_now, &time_gap); + if (time_gap > (double) timeout) { + /* timed out, cancel the operations */ + if (!send_req.done) { + mpi_errno = cancel_dynamic_request(&send_req, true); + MPIR_ERR_CHECK(mpi_errno); + } + if (!recv_req.done) { + mpi_errno = cancel_dynamic_request(&recv_req, false); + MPIR_ERR_CHECK(mpi_errno); + } + + mpi_errno = MPIX_ERR_TIMEOUT; + break; + } } + } + + fn_exit: + return mpi_errno; + fn_fail: + goto fn_exit; +} + +static int cancel_dynamic_request(MPIDI_OFI_dynamic_process_request_t * dynamic_req, bool is_send) +{ + int mpi_errno = MPI_SUCCESS; + + struct fid_ep *ep; + if (is_send) { + ep = MPIDI_OFI_global.ctx[0].tx; + } else { + ep = MPIDI_OFI_global.ctx[0].rx; + } + int rc; + rc = fi_cancel((fid_t) ep, (void *) &dynamic_req->context); + if (rc && rc != -FI_ENOENT) { + MPIR_ERR_CHKANDJUMP2(rc < 0, mpi_errno, MPI_ERR_OTHER, "**ofid_cancel", + "**ofid_cancel %s %s", MPIDI_OFI_DEFAULT_NIC_NAME, fi_strerror(-rc)); - mpi_errno = MPIX_ERR_TIMEOUT; + } + while (!dynamic_req->done) { + mpi_errno = MPIDI_OFI_progress_uninlined(0); + MPIR_ERR_CHECK(mpi_errno); } fn_exit: - MPID_THREAD_CS_EXIT(VCI, MPIDI_VCI(vci).lock); return mpi_errno; fn_fail: goto fn_exit; @@ -266,8 +359,8 @@ int MPIDI_OFI_get_local_upids(MPIR_Comm * comm, int **local_upid_size, char **lo size_t sz = MPIDI_OFI_global.addrnamelen;; MPIDI_OFI_addr_t *av = &MPIDI_OFI_AV(MPIDIU_comm_rank_to_av(comm, i)); - MPIDI_OFI_VCI_CALL(fi_av_lookup(MPIDI_OFI_global.ctx[ctx_idx].av, av->dest[nic][0], - temp_buf + idx, &sz), 0, avlookup); + 
MPIDI_OFI_CALL(fi_av_lookup(MPIDI_OFI_global.ctx[ctx_idx].av, av->dest[nic][0], + temp_buf + idx, &sz), avlookup); idx += (int) sz; (*local_upid_size)[i] = upid_len; @@ -282,3 +375,43 @@ int MPIDI_OFI_get_local_upids(MPIR_Comm * comm, int **local_upid_size, char **lo MPIR_CHKPMEM_REAP(); goto fn_exit; } + +int MPIDI_OFI_insert_upid(MPIR_Lpid lpid, const char *upid, int upid_len) +{ + int mpi_errno = MPI_SUCCESS; + + const char *hostname = upid; + MPIDI_av_entry_t *av = MPIDIU_lpid_to_av_slow(lpid); + + bool do_insert = false; + if (lpid & MPIR_LPID_DYNAMIC_MASK) { + do_insert = true; + } else if (MPIDI_OFI_AV(av).dest[0][0] == FI_ADDR_NOTAVAIL) { + MPIDI_av_entry_t *dynamic_av = MPIDIU_find_dynamic_av(upid, upid_len); + if (dynamic_av) { + /* just copy it over */ + MPIDI_OFI_AV(av).dest[0][0] = MPIDI_OFI_AV(dynamic_av).dest[0][0]; + } else { + do_insert = true; + } + + /* set node_id */ + int node_id; + mpi_errno = MPIR_nodeid_lookup(hostname, &node_id); + MPIR_ERR_CHECK(mpi_errno); + av->node_id = node_id; + } + + if (do_insert) { + const char *addrname = hostname + strlen(hostname) + 1; + /* new entry */ + MPIDI_OFI_CALL(fi_av_insert(MPIDI_OFI_global.ctx[0].av, addrname, + 1, &MPIDI_OFI_AV(av).dest[0][0], 0ULL, NULL), avmap); + MPIR_Assert(MPIDI_OFI_AV(av).dest[0][0] != FI_ADDR_NOTAVAIL); + } + + fn_exit: + return mpi_errno; + fn_fail: + goto fn_exit; +} diff --git a/src/mpid/ch4/netmod/ucx/ucx_spawn.c b/src/mpid/ch4/netmod/ucx/ucx_spawn.c index e78dc2a0af3..87696e370cc 100644 --- a/src/mpid/ch4/netmod/ucx/ucx_spawn.c +++ b/src/mpid/ch4/netmod/ucx/ucx_spawn.c @@ -27,11 +27,7 @@ int MPIDI_UCX_dynamic_send(MPIR_Lpid remote_lpid, int tag, const void *buf, int uint64_t ucx_tag = MPIDI_UCX_DYNPROC_MASK + tag; int vci = 0; - MPID_THREAD_CS_ENTER(VCI, MPIDI_VCI(vci).lock); - - int avtid = MPIDIU_GPID_GET_AVTID(remote_lpid); - int lpid = MPIDIU_GPID_GET_LPID(remote_lpid); - ucp_ep_h ep = MPIDI_UCX_AV_TO_EP(&MPIDIU_get_av(avtid, lpid), vci, vci); + ucp_ep_h ep = 
MPIDI_UCX_AV_TO_EP(MPIDIU_lpid_to_av(remote_lpid), vci, vci); bool done = false; ucp_request_param_t param = { @@ -68,7 +64,6 @@ int MPIDI_UCX_dynamic_send(MPIR_Lpid remote_lpid, int tag, const void *buf, int } fn_exit: - MPID_THREAD_CS_EXIT(VCI, MPIDI_VCI(vci).lock); return mpi_errno; } @@ -80,8 +75,6 @@ int MPIDI_UCX_dynamic_recv(int tag, void *buf, int size, int timeout) uint64_t tag_mask = 0xffffffffffffffff; int vci = 0; - MPID_THREAD_CS_ENTER(VCI, MPIDI_VCI(vci).lock); - bool done = false; ucp_request_param_t param = { .op_attr_mask = UCP_OP_ATTR_FIELD_CALLBACK | UCP_OP_ATTR_FIELD_USER_DATA, @@ -117,7 +110,93 @@ int MPIDI_UCX_dynamic_recv(int tag, void *buf, int size, int timeout) } fn_exit: - MPID_THREAD_CS_EXIT(VCI, MPIDI_VCI(vci).lock); + return mpi_errno; +} + +int MPIDI_UCX_dynamic_sendrecv(MPIR_Lpid remote_lpid, int tag, + const void *send_buf, int send_size, void *recv_buf, int recv_size, + int timeout) +{ + int mpi_errno = MPI_SUCCESS; + + /* NOTE: dynamic_sendrecv is always called inside CS of vci 0 */ + int vci = 0; +#ifdef MPICH_DEBUG_MUTEX + MPID_THREAD_ASSERT_IN_CS(VCI, (*(MPID_Thread_mutex_t *) MPIR_Request_mem[vci].lock)); +#endif + + uint64_t ucx_tag = MPIDI_UCX_DYNPROC_MASK + tag; + uint64_t tag_mask = 0xffffffffffffffff; /* for recv */ + MPIDI_av_entry_t *av = MPIDIU_lpid_to_av_slow(remote_lpid); + ucp_ep_h ep = MPIDI_UCX_AV_TO_EP(av, vci, vci); + + ucs_status_ptr_t status = UCS_OK; + + /* send */ + bool send_done = false; + if (send_size > 0) { + ucp_request_param_t send_param = { + .op_attr_mask = UCP_OP_ATTR_FIELD_CALLBACK | UCP_OP_ATTR_FIELD_USER_DATA, + .cb.send = dynamic_send_cb, + .user_data = &send_done, + }; + + status = ucp_tag_send_nbx(ep, send_buf, send_size, ucx_tag, &send_param); + if (status == UCS_OK) { + send_done = true; + } else if (UCS_PTR_IS_ERR(status)) { + /* FIXME: better error */ + mpi_errno = MPI_ERR_PORT; + goto fn_exit; + } + } else { + send_done = true; + } + + /* recv */ + bool recv_done = false; + if (recv_size 
> 0) { + ucp_request_param_t recv_param = { + .op_attr_mask = UCP_OP_ATTR_FIELD_CALLBACK | UCP_OP_ATTR_FIELD_USER_DATA, + .cb.recv = dynamic_recv_cb, + .user_data = &recv_done, + }; + + status = ucp_tag_recv_nbx(MPIDI_UCX_global.ctx[vci].worker, recv_buf, recv_size, + ucx_tag, tag_mask, &recv_param); + if (status == UCS_OK) { + recv_done = true; + } else if (UCS_PTR_IS_ERR(status)) { + /* FIXME: better error */ + mpi_errno = MPI_ERR_PORT; + goto fn_exit; + } + } else { + recv_done = true; + } + + /* wait */ + MPL_time_t time_start; + MPL_wtime(&time_start); + while (!send_done || !recv_done) { + ucp_worker_progress(MPIDI_UCX_global.ctx[vci].worker); + + if (timeout > 0) { + MPL_time_t time_now; + double time_gap; + MPL_wtime(&time_now); + MPL_wtime_diff(&time_start, &time_now, &time_gap); + if (time_gap > (double) timeout) { + mpi_errno = MPIX_ERR_TIMEOUT; + break; + } + } + } + + fn_exit: + if (status != UCS_OK) { + ucp_request_release(status); + } return mpi_errno; } @@ -147,6 +226,46 @@ int MPIDI_UCX_get_local_upids(MPIR_Comm * comm, int **local_upid_size, char **lo goto fn_exit; } +int MPIDI_UCX_insert_upid(MPIR_Lpid lpid, const char *upid, int upid_len) +{ + int mpi_errno = MPI_SUCCESS; + MPIDI_av_entry_t *av = MPIDIU_lpid_to_av_slow(lpid); + + bool is_dynamic = (lpid & MPIR_LPID_DYNAMIC_MASK); + bool do_insert = false; + if (is_dynamic) { + do_insert = true; + } else if (!MPIDI_UCX_AV(av).dest[0][0]) { + MPIDI_av_entry_t *dynamic_av = MPIDIU_find_dynamic_av(upid, upid_len); + if (dynamic_av) { + /* just copy it over */ + MPIDI_UCX_AV(av).dest[0][0] = MPIDI_UCX_AV(dynamic_av).dest[0][0]; + } else { + do_insert = true; + } + } + + if (do_insert) { + /* new entry */ + ucp_ep_params_t ep_params; + ucs_status_t ucx_status; + ep_params.field_mask = UCP_EP_PARAM_FIELD_REMOTE_ADDRESS; + ep_params.address = (ucp_address_t *) upid; + ucx_status = ucp_ep_create(MPIDI_UCX_global.ctx[0].worker, &ep_params, + &MPIDI_UCX_AV(av).dest[0][0]); + 
MPIDI_UCX_CHK_STATUS(ucx_status); + } + + if (!is_dynamic) { + MPIDIU_upidhash_add(upid, upid_len, lpid); + } + + fn_exit: + return mpi_errno; + fn_fail: + goto fn_exit; +} + int MPIDI_UCX_upids_to_lpids(int size, int *remote_upid_size, char *remote_upids, MPIR_Lpid * remote_lpids) { diff --git a/src/mpid/ch4/src/ch4_proc.c b/src/mpid/ch4/src/ch4_proc.c index e94c3488f13..60ac8a52632 100644 --- a/src/mpid/ch4/src/ch4_proc.c +++ b/src/mpid/ch4/src/ch4_proc.c @@ -270,8 +270,8 @@ int MPIDIU_insert_dynamic_upid(MPIR_Lpid * lpid_out, const char *upid, int upid_ /* insert upid */ *lpid_out = MPIR_LPID_DYNAMIC_MASK | idx; - /* mpi_errno = MPIDI_NM_insert_upid(*lpid_out, upid, upid_len); */ - /* MPIR_ERR_CHECK(mpi_errno); */ + mpi_errno = MPIDI_NM_insert_upid(*lpid_out, upid, upid_len); + MPIR_ERR_CHECK(mpi_errno); fn_exit: return MPI_SUCCESS; From e37d1fce88e41e786a0b9ced8cb4d1767006b3c5 Mon Sep 17 00:00:00 2001 From: Hui Zhou Date: Mon, 16 Dec 2024 17:21:37 -0600 Subject: [PATCH 47/59] ADI/comm: exchange context_id in MPID_Intercomm_exchange We can easily exchange the context_id along with the rest of the remote info rather than do it in a separate step. We can determine is_low_group by comparing world namespace and world_rank entirely in the MPIR layer, thus no longer need it in MPID_Intercomm_exchange. Rename MPID_Intercomm_exchange_map to MPID_Intercomm_exchange to better reflect that it is not just exchanging maps. 
--- src/mpi/comm/comm_impl.c | 98 +++++++++++++++++---------------- src/mpid/ch3/include/mpidpost.h | 8 +-- src/mpid/ch3/src/mpid_vc.c | 40 ++++++-------- src/mpid/ch4/include/mpidch4.h | 5 +- src/mpid/ch4/src/ch4_comm.c | 71 +++++++++--------------- src/mpid/ch4/src/ch4_impl.h | 4 +- 6 files changed, 102 insertions(+), 124 deletions(-) diff --git a/src/mpi/comm/comm_impl.c b/src/mpi/comm/comm_impl.c index 8c21a760e7b..23d17b3d484 100644 --- a/src/mpi/comm/comm_impl.c +++ b/src/mpi/comm/comm_impl.c @@ -965,75 +965,78 @@ int MPIR_Comm_set_info_impl(MPIR_Comm * comm_ptr, MPIR_Info * info_ptr) goto fn_exit; } +/* arbitrarily determine which group is the low_group by comparing + * world namespaces and world ranks */ +static int determine_low_group(MPIR_Lpid remote_lpid, bool * is_low_group_out) +{ + int mpi_errno = MPI_SUCCESS; + + int my_world_idx = 0; + int my_world_rank = MPIR_Process.rank; + int remote_world_idx = MPIR_LPID_WORLD_INDEX(remote_lpid); + int remote_world_rank = MPIR_LPID_WORLD_RANK(remote_lpid); + + if (my_world_idx == remote_world_idx) { + /* same world, just compare world ranks */ + MPIR_Assert(my_world_idx == 0); + *is_low_group_out = (my_world_rank < remote_world_rank); + } else { + /* different world, compare namespace */ + int cmp_result = strncmp(MPIR_Worlds[my_world_idx].namespace, + MPIR_Worlds[remote_world_idx].namespace, + MPIR_MAX_WORLDS); + MPIR_Assert(cmp_result != 0); + if (cmp_result < 0) + *is_low_group_out = false; + else + *is_low_group_out = true; + } + + fn_exit: + return mpi_errno; + fn_fail: + goto fn_exit; +} + int MPIR_Intercomm_create_impl(MPIR_Comm * local_comm_ptr, int local_leader, MPIR_Comm * peer_comm_ptr, int remote_leader, int tag, MPIR_Comm ** new_intercomm_ptr) { int mpi_errno = MPI_SUCCESS; - int final_context_id, recvcontext_id; int remote_size = 0; - uint64_t *remote_lpids = NULL; + MPIR_Lpid *remote_lpids = NULL; int comm_info[3]; - int is_low_group = 0; MPIR_Session *session_ptr = local_comm_ptr->session_ptr; 
MPIR_FUNC_ENTER; - /* Shift tag into the tagged coll space */ - tag |= MPIR_TAG_COLL_BIT; - - mpi_errno = MPID_Intercomm_exchange_map(local_comm_ptr, local_leader, - peer_comm_ptr, remote_leader, - &remote_size, &remote_lpids, &is_low_group); - MPIR_ERR_CHECK(mpi_errno); - /* * Create the contexts. Each group will have a context for sending * to the other group. All processes must be involved. Because * we know that the local and remote groups are disjoint, this * step will complete */ - MPL_DBG_MSG_FMT(MPIR_DBG_COMM, VERBOSE, - (MPL_DBG_FDEST, "About to get contextid (local_size=%d) on rank %d", - local_comm_ptr->local_size, local_comm_ptr->rank)); /* In the multi-threaded case, MPIR_Get_contextid_sparse assumes that the * calling routine already holds the single critical section */ /* TODO: Make sure this is tag-safe */ + int recvcontext_id; mpi_errno = MPIR_Get_contextid_sparse(local_comm_ptr, &recvcontext_id, FALSE); MPIR_ERR_CHECK(mpi_errno); MPIR_Assert(recvcontext_id != 0); - MPL_DBG_MSG_FMT(MPIR_DBG_COMM, VERBOSE, (MPL_DBG_FDEST, "Got contextid=%d", recvcontext_id)); - /* Leaders can now swap context ids and then broadcast the value - * to the local group of processes */ - if (local_comm_ptr->rank == local_leader) { - int remote_context_id; - - mpi_errno = - MPIC_Sendrecv(&recvcontext_id, 1, MPIR_CONTEXT_ID_T_DATATYPE, remote_leader, tag, - &remote_context_id, 1, MPIR_CONTEXT_ID_T_DATATYPE, remote_leader, tag, - peer_comm_ptr, MPI_STATUS_IGNORE, MPIR_ERR_NONE); - MPIR_ERR_CHECK(mpi_errno); - - final_context_id = remote_context_id; + /* Shift tag into the tagged coll space */ + tag |= MPIR_TAG_COLL_BIT; - /* Now, send all of our local processes the remote_lpids, - * along with the final context id */ - comm_info[0] = final_context_id; - MPL_DBG_MSG(MPIR_DBG_COMM, VERBOSE, "About to bcast on local_comm"); - mpi_errno = MPIR_Bcast(comm_info, 1, MPI_INT, local_leader, local_comm_ptr, MPIR_ERR_NONE); - MPIR_ERR_CHECK(mpi_errno); - MPL_DBG_MSG_D(MPIR_DBG_COMM, 
VERBOSE, "end of bcast on local_comm of size %d", - local_comm_ptr->local_size); - } else { - /* we're the other processes */ - MPL_DBG_MSG(MPIR_DBG_COMM, VERBOSE, "About to receive bcast on local_comm"); - mpi_errno = MPIR_Bcast(comm_info, 1, MPI_INT, local_leader, local_comm_ptr, MPIR_ERR_NONE); - MPIR_ERR_CHECK(mpi_errno); + int remote_context_id; + mpi_errno = MPID_Intercomm_exchange(local_comm_ptr, local_leader, + peer_comm_ptr, remote_leader, tag, + recvcontext_id, &remote_context_id, + &remote_size, &remote_lpids); + MPIR_ERR_CHECK(mpi_errno); - /* Extract the context and group sign information */ - final_context_id = comm_info[0]; - } + bool is_low_group; + mpi_errno = determine_low_group(remote_lpids[0], &is_low_group); + MPIR_ERR_CHECK(mpi_errno); /* At last, we now have the information that we need to build the * intercommunicator */ @@ -1041,10 +1044,9 @@ int MPIR_Intercomm_create_impl(MPIR_Comm * local_comm_ptr, int local_leader, /* All processes in the local_comm now build the communicator */ mpi_errno = MPIR_Comm_create(new_intercomm_ptr); - if (mpi_errno) - goto fn_fail; + MPIR_ERR_CHECK(mpi_errno); - (*new_intercomm_ptr)->context_id = final_context_id; + (*new_intercomm_ptr)->context_id = remote_context_id; (*new_intercomm_ptr)->recvcontext_id = recvcontext_id; (*new_intercomm_ptr)->remote_size = remote_size; (*new_intercomm_ptr)->local_size = local_comm_ptr->local_size; @@ -1059,12 +1061,12 @@ int MPIR_Intercomm_create_impl(MPIR_Comm * local_comm_ptr, int local_leader, /* construct remote_group */ mpi_errno = MPIR_Group_create_map(remote_size, MPI_UNDEFINED, session_ptr, remote_lpids, &(*new_intercomm_ptr)->remote_group); + MPIR_ERR_CHECK(mpi_errno); MPIR_Comm_set_session_ptr(*new_intercomm_ptr, session_ptr); mpi_errno = MPID_Create_intercomm_from_lpids(*new_intercomm_ptr, remote_size, remote_lpids); - if (mpi_errno) - goto fn_fail; + MPIR_ERR_CHECK(mpi_errno); MPIR_Comm_map_dup(*new_intercomm_ptr, local_comm_ptr, MPIR_COMM_MAP_DIR__L2L); diff 
--git a/src/mpid/ch3/include/mpidpost.h b/src/mpid/ch3/include/mpidpost.h index 8ce533044a4..d49191b3cfd 100644 --- a/src/mpid/ch3/include/mpidpost.h +++ b/src/mpid/ch3/include/mpidpost.h @@ -186,10 +186,10 @@ int MPIDI_GPID_ToLpidArray( int size, MPIDI_Gpid gpid[], MPIR_Lpid lpid[] ); int MPIDI_PG_ForwardPGInfo( MPIR_Comm *peer_ptr, MPIR_Comm *comm_ptr, int nPGids, const MPIDI_Gpid gpids[], int root ); -int MPID_Intercomm_exchange_map( MPIR_Comm *local_comm_ptr, int local_leader, - MPIR_Comm *peer_comm_ptr, int remote_leader, - int *remote_size, MPIR_Lpid **remote_lpids, - int *is_low_group); +int MPID_Intercomm_exchange(MPIR_Comm *local_comm_ptr, int local_leader, + MPIR_Comm *peer_comm_ptr, int remote_leader, + int tag, int context_id, int *remote_context_id, + int *remote_size, MPIR_Lpid **remote_lpids, int timeout); int MPID_Create_intercomm_from_lpids( MPIR_Comm *newcomm_ptr, int size, const MPIR_Lpid lpids[] ); int MPID_Comm_get_lpid(MPIR_Comm *comm_ptr, int idx, MPIR_Lpid *lpid_ptr, bool is_remote); diff --git a/src/mpid/ch3/src/mpid_vc.c b/src/mpid/ch3/src/mpid_vc.c index 1ac50cb0851..28fb92e7cef 100644 --- a/src/mpid/ch3/src/mpid_vc.c +++ b/src/mpid/ch3/src/mpid_vc.c @@ -441,24 +441,20 @@ static int check_disjoint_lpids(MPIR_Lpid lpids1[], int n1, MPIR_Lpid lpids2[], #endif /* HAVE_ERROR_CHECKING */ /*@ - MPID_Intercomm_exchange_map - Exchange address mapping for intercomm creation. + MPID_Intercomm_exchange - Exchange remote info for intercomm creation. 
@*/ -int MPID_Intercomm_exchange_map(MPIR_Comm *local_comm_ptr, int local_leader, - MPIR_Comm *peer_comm_ptr, int remote_leader, - int *remote_size, MPIR_Lpid **remote_lpids, - int *is_low_group) +int MPID_Intercomm_exchange(MPIR_Comm *local_comm_ptr, int local_leader, + MPIR_Comm *peer_comm_ptr, int remote_leader, int tag, + int context_id, int *remote_context_id, + int *remote_size, MPIR_Lpid **remote_lpids, int timeout /* unused */) { int mpi_errno = MPI_SUCCESS; int singlePG; int local_size; MPIR_Lpid *local_lpids=0; MPIDI_Gpid *local_gpids=NULL, *remote_gpids=NULL; - int comm_info[2]; - int cts_tag; MPIR_CHKLMEM_DECL(3); - cts_tag = 0 | MPIR_TAG_COLL_BIT; - if (local_comm_ptr->rank == local_leader) { /* First, exchange the group information. If we were certain @@ -472,13 +468,15 @@ int MPID_Intercomm_exchange_map(MPIR_Comm *local_comm_ptr, int local_leader, /* printf( "About to sendrecv in intercomm_create\n" );fflush(stdout);*/ MPL_DBG_MSG_FMT(MPIDI_CH3_DBG_OTHER,VERBOSE,(MPL_DBG_FDEST,"rank %d sendrecv to rank %d", peer_comm_ptr->rank, remote_leader)); - mpi_errno = MPIC_Sendrecv( &local_size, 1, MPI_INT, - remote_leader, cts_tag, - remote_size, 1, MPI_INT, - remote_leader, cts_tag, - peer_comm_ptr, MPI_STATUS_IGNORE, MPIR_ERR_NONE ); + int local_ints[2] = {local_size, context_id}; + int remote_ints[2]; + mpi_errno = MPIC_Sendrecv(local_ints, 2, MPI_INT, remote_leader, tag, + remote_ints, 2, MPI_INT, remote_leader, tag, + peer_comm_ptr, MPI_STATUS_IGNORE, MPIR_ERR_NONE ); MPIR_ERR_CHECK(mpi_errno); + *remote_size = remote_ints[0]; + *remote_context_id = remote_ints[1]; MPL_DBG_MSG_FMT(MPIDI_CH3_DBG_OTHER,VERBOSE,(MPL_DBG_FDEST, "local size = %d, remote size = %d", local_size, *remote_size )); /* With this information, we can now send and receive the @@ -493,9 +491,9 @@ int MPID_Intercomm_exchange_map(MPIR_Comm *local_comm_ptr, int local_leader, /* Exchange the lpid arrays */ mpi_errno = MPIC_Sendrecv( local_gpids, local_size*sizeof(MPIDI_Gpid), MPI_BYTE, - 
remote_leader, cts_tag, + remote_leader, tag, remote_gpids, (*remote_size)*sizeof(MPIDI_Gpid), MPI_BYTE, - remote_leader, cts_tag, peer_comm_ptr, + remote_leader, tag, peer_comm_ptr, MPI_STATUS_IGNORE, MPIR_ERR_NONE ); MPIR_ERR_CHECK(mpi_errno); @@ -521,22 +519,18 @@ int MPID_Intercomm_exchange_map(MPIR_Comm *local_comm_ptr, int local_leader, } # endif /* HAVE_ERROR_CHECKING */ - /* Make an arbitrary decision about which group of process is - the low group. The LEADERS do this by comparing the - local process ids of the 0th member of the two groups */ - (*is_low_group) = local_lpids[0] < (*remote_lpids)[0]; - /* At this point, we're done with the local lpids; they'll be freed with the other local memory on exit */ } /* End of the first phase of the leader communication */ /* Leaders can now swap context ids and then broadcast the value to the local group of processes */ + int comm_info[3]; if (local_comm_ptr->rank == local_leader) { /* Now, send all of our local processes the remote_lpids, along with the final context id */ comm_info[0] = *remote_size; - comm_info[1] = *is_low_group; + comm_info[1] = *remote_context_id; MPL_DBG_MSG(MPIDI_CH3_DBG_OTHER,VERBOSE,"About to bcast on local_comm"); mpi_errno = MPIR_Bcast( comm_info, 2, MPI_INT, local_leader, local_comm_ptr, MPIR_ERR_NONE ); MPIR_ERR_CHECK(mpi_errno); @@ -560,7 +554,7 @@ int MPID_Intercomm_exchange_map(MPIR_Comm *local_comm_ptr, int local_leader, MPIR_ERR_CHECK(mpi_errno); /* Extract the context and group sign information */ - *is_low_group = comm_info[1]; + *remote_context_id = comm_info[1]; } /* Finish up by giving the device the opportunity to update diff --git a/src/mpid/ch4/include/mpidch4.h b/src/mpid/ch4/include/mpidch4.h index f3f57a722c9..30698dc67a9 100644 --- a/src/mpid/ch4/include/mpidch4.h +++ b/src/mpid/ch4/include/mpidch4.h @@ -167,7 +167,10 @@ int MPID_Type_commit_hook(MPIR_Datatype *); int MPID_Type_free_hook(MPIR_Datatype *); int MPID_Op_commit_hook(MPIR_Op *); int 
MPID_Op_free_hook(MPIR_Op *); -int MPID_Intercomm_exchange_map(MPIR_Comm *, int, MPIR_Comm *, int, int *, MPIR_Lpid **, int *); +int MPID_Intercomm_exchange(MPIR_Comm * local_comm, int local_leader, + MPIR_Comm * peer_comm, int remote_leader, int tag, + int context_id, int *remote_context_id_out, + int *remote_size_out, MPIR_Lpid ** remote_lpids_out, int timeout); int MPID_Create_intercomm_from_lpids(MPIR_Comm *, int, const MPIR_Lpid[]); int MPID_Comm_commit_pre_hook(MPIR_Comm *); int MPID_Comm_free_hook(MPIR_Comm *); diff --git a/src/mpid/ch4/src/ch4_comm.c b/src/mpid/ch4/src/ch4_comm.c index 8429acb8290..d7c20f615b4 100644 --- a/src/mpid/ch4/src/ch4_comm.c +++ b/src/mpid/ch4/src/ch4_comm.c @@ -393,16 +393,16 @@ int MPID_Comm_set_hints(MPIR_Comm * comm_ptr, MPIR_Info * info_ptr) goto fn_exit; } -int MPID_Intercomm_exchange_map(MPIR_Comm * local_comm, int local_leader, MPIR_Comm * peer_comm, - int remote_leader, int *remote_size, MPIR_Lpid ** remote_lpids, - int *is_low_group) +int MPID_Intercomm_exchange(MPIR_Comm * local_comm, int local_leader, + MPIR_Comm * peer_comm, int remote_leader, int tag, + int context_id, int *remote_context_id, + int *remote_size, MPIR_Lpid ** remote_lpids) { int mpi_errno = MPI_SUCCESS; int i; int avtid = 0, lpid = -1; int local_avtid = 0, remote_avtid = 0; int local_size_send = 0, remote_size_recv = 0; - int cts_tag = 0; int pure_intracomm = 1; int local_size = 0; MPIR_Lpid *local_lpids = NULL; @@ -419,7 +419,6 @@ int MPID_Intercomm_exchange_map(MPIR_Comm * local_comm, int local_leader, MPIR_C MPIR_CHKPMEM_DECL(1); MPIR_CHKLMEM_DECL(5); - cts_tag = 0 | MPIR_TAG_COLL_BIT; local_size = local_comm->local_size; /* @@ -450,13 +449,15 @@ int MPID_Intercomm_exchange_map(MPIR_Comm * local_comm, int local_leader, MPIR_C MPL_DBG_MSG_FMT(MPIDI_CH4_DBG_COMM, VERBOSE, (MPL_DBG_FDEST, "rank %d sendrecv to rank %d", peer_comm->rank, remote_leader)); - mpi_errno = MPIC_Sendrecv(&local_size_send, 1, MPI_INT, - remote_leader, cts_tag, - 
&remote_size_recv, 1, MPI_INT, - remote_leader, cts_tag, peer_comm, MPI_STATUS_IGNORE, - MPIR_ERR_NONE); + int local_ints[2] = { local_size_send, context_id }; + int remote_ints[2]; + mpi_errno = MPIC_Sendrecv(local_ints, 2, MPI_INT, remote_leader, tag, + remote_ints, 2, MPI_INT, remote_leader, tag, peer_comm, + MPI_STATUS_IGNORE, MPIR_ERR_NONE); MPIR_ERR_CHECK(mpi_errno); + remote_size_recv = remote_ints[0]; + *remote_context_id = remote_ints[1]; if (remote_size_recv & MPIDI_DYNPROC_MASK) pure_intracomm = 0; (*remote_size) = remote_size_recv & (~MPIDI_DYNPROC_MASK); @@ -488,9 +489,9 @@ int MPID_Intercomm_exchange_map(MPIR_Comm * local_comm, int local_leader, MPIR_C mpi_errno = MPIDI_NM_get_local_upids(local_comm, &local_upid_size, &local_upids); MPIR_ERR_CHECK(mpi_errno); mpi_errno = MPIC_Sendrecv(local_upid_size, local_size, MPI_INT, - remote_leader, cts_tag, + remote_leader, tag, remote_upid_size, *remote_size, MPI_INT, - remote_leader, cts_tag, + remote_leader, tag, peer_comm, MPI_STATUS_IGNORE, MPIR_ERR_NONE); MPIR_ERR_CHECK(mpi_errno); upid_send_size = 0; @@ -502,9 +503,9 @@ int MPID_Intercomm_exchange_map(MPIR_Comm * local_comm, int local_leader, MPIR_C MPIR_CHKLMEM_MALLOC(remote_upids, char *, upid_recv_size * sizeof(char), mpi_errno, "remote_upids", MPL_MEM_ADDRESS); mpi_errno = MPIC_Sendrecv(local_upids, upid_send_size, MPI_BYTE, - remote_leader, cts_tag, + remote_leader, tag, remote_upids, upid_recv_size, MPI_BYTE, - remote_leader, cts_tag, + remote_leader, tag, peer_comm, MPI_STATUS_IGNORE, MPIR_ERR_NONE); MPIR_ERR_CHECK(mpi_errno); @@ -513,9 +514,9 @@ int MPID_Intercomm_exchange_map(MPIR_Comm * local_comm, int local_leader, MPIR_C } else { /* Stage 1.1f only exchange GPIDS if no dynamic process involved */ mpi_errno = MPIC_Sendrecv(local_lpids, local_size, MPI_UINT64_T, - remote_leader, cts_tag, + remote_leader, tag, *remote_lpids, *remote_size, MPI_UINT64_T, - remote_leader, cts_tag, + remote_leader, tag, peer_comm, MPI_STATUS_IGNORE, MPIR_ERR_NONE); 
MPIR_ERR_CHECK(mpi_errno); } @@ -547,28 +548,6 @@ int MPID_Intercomm_exchange_map(MPIR_Comm * local_comm, int local_leader, MPIR_C } #endif /* HAVE_ERROR_CHECKING */ - /* - * Make an arbitrary decision about which group of process is - * the low group. The LEADERS do this by comparing the - * local process ids of the 0th member of the two groups - * GPID itself is not enough for determine is_low_group because both - * local group is always smaller than remote - */ - if (pure_intracomm) { - *is_low_group = local_lpids[0] < (*remote_lpids)[0]; - } else { - if (local_upid_size[0] == remote_upid_size[0]) { - *is_low_group = memcmp(local_upids, remote_upids, local_upid_size[0]); - MPIR_Assert(*is_low_group != 0); - if (*is_low_group < 0) - *is_low_group = 0; - else - *is_low_group = 1; - } else { - *is_low_group = local_upid_size[0] < remote_upid_size[0]; - } - } - /* At this point, we're done with the local lpids; they'll * be freed with the other local memory on exit */ local_lpids = NULL; @@ -580,7 +559,7 @@ int MPID_Intercomm_exchange_map(MPIR_Comm * local_comm, int local_leader, MPIR_C MPL_DBG_MSG_FMT(MPIDI_CH4_DBG_COMM, VERBOSE, (MPL_DBG_FDEST, "Intercomm map exchange stage 2: intra-group")); mpi_errno = MPIDIU_Intercomm_map_bcast_intra(local_comm, local_leader, - remote_size, is_low_group, pure_intracomm, + remote_size, remote_context_id, pure_intracomm, remote_upid_size, remote_upids, remote_lpids); MPIR_ERR_CHECK(mpi_errno); @@ -598,14 +577,14 @@ int MPID_Intercomm_exchange_map(MPIR_Comm * local_comm, int local_leader, MPIR_C } int MPIDIU_Intercomm_map_bcast_intra(MPIR_Comm * local_comm, int local_leader, int *remote_size, - int *is_low_group, int pure_intracomm, - int *remote_upid_size, char *remote_upids, + int *remote_context_id, + int pure_intracomm, int *remote_upid_size, char *remote_upids, MPIR_Lpid ** remote_lpids) { int mpi_errno = MPI_SUCCESS; int i; int upid_recv_size = 0; - int map_info[4]; + int map_info[5]; int *_remote_upid_size = NULL; char 
*_remote_upids = NULL; @@ -630,8 +609,8 @@ int MPIDIU_Intercomm_map_bcast_intra(MPIR_Comm * local_comm, int local_leader, i } map_info[0] = *remote_size; map_info[1] = upid_recv_size; - map_info[2] = *is_low_group; - map_info[3] = pure_intracomm; + map_info[2] = pure_intracomm; + map_info[3] = *remote_context_id; mpi_errno = MPIR_Bcast_allcomm_auto(map_info, 4, MPI_INT, local_leader, local_comm, MPIR_ERR_NONE); MPIR_ERR_CHECK(mpi_errno); @@ -653,8 +632,8 @@ int MPIDIU_Intercomm_map_bcast_intra(MPIR_Comm * local_comm, int local_leader, i MPIR_ERR_CHECK(mpi_errno); *remote_size = map_info[0]; upid_recv_size = map_info[1]; - *is_low_group = map_info[2]; - pure_intracomm = map_info[3]; + pure_intracomm = map_info[2]; + *remote_context_id = map_info[3]; MPIR_CHKPMEM_MALLOC((*remote_lpids), MPIR_Lpid *, (*remote_size) * sizeof(MPIR_Lpid), mpi_errno, "remote_lpids", MPL_MEM_COMM); diff --git a/src/mpid/ch4/src/ch4_impl.h b/src/mpid/ch4/src/ch4_impl.h index 7726ee90992..11ffe068a13 100644 --- a/src/mpid/ch4/src/ch4_impl.h +++ b/src/mpid/ch4/src/ch4_impl.h @@ -14,8 +14,8 @@ #include "ch4_vci.h" int MPIDIU_Intercomm_map_bcast_intra(MPIR_Comm * local_comm, int local_leader, int *remote_size, - int *is_low_group, int pure_intracomm, - int *remote_upid_size, char *remote_upids, + int *remote_context_id, bool * is_low_group, + int pure_intracomm, int *remote_upid_size, char *remote_upids, MPIR_Lpid ** remote_lpids); int MPIDIG_get_context_index(uint64_t context_id); uint64_t MPIDIG_generate_win_id(MPIR_Comm * comm_ptr); From e8d8e8240bbbd12dcf706f0b3d7c4446d4be2255 Mon Sep 17 00:00:00 2001 From: Hui Zhou Date: Wed, 18 Dec 2024 10:09:20 -0600 Subject: [PATCH 48/59] ADI: remove MPID_Comm_get_lpid This is fully replaced with MPIR_comm_rank_to_lpid or MPIR_Group_rank_to_lpid. 
--- src/mpi/coll/algorithms/treealgo/treeutil.c | 10 ++++---- src/mpi/comm/comm_impl.c | 11 +++------ src/mpid/ch3/include/mpidpost.h | 1 - src/mpid/ch3/src/mpid_vc.c | 26 ++------------------- src/mpid/ch4/include/mpidch4.h | 1 - src/mpid/ch4/src/ch4_impl.h | 2 +- src/mpid/ch4/src/ch4_init.c | 20 ---------------- 7 files changed, 11 insertions(+), 60 deletions(-) diff --git a/src/mpi/coll/algorithms/treealgo/treeutil.c b/src/mpi/coll/algorithms/treealgo/treeutil.c index ad9b003c369..38abfd928eb 100644 --- a/src/mpi/coll/algorithms/treealgo/treeutil.c +++ b/src/mpi/coll/algorithms/treealgo/treeutil.c @@ -504,10 +504,10 @@ static int MPII_Treeutil_hierarchy_populate(MPIR_Comm * comm, int rank, int nran MPIR_Assert(upper_level != NULL); /* Get wrank from the communicator as the coords are stored with wrank */ - uint64_t temp = 0; - MPID_Comm_get_lpid(comm, r, &temp, FALSE); - int wrank = (int) temp; - if (wrank < 0) + MPIR_Lpid temp = MPIR_comm_rank_to_lpid(comm, r); + int world_idx = MPIR_LPID_WORLD_INDEX(temp); + int wrank = MPIR_LPID_WORLD_RANK(temp); + if (world_idx != 0) goto fn_fail; MPIR_Assert(0 <= wrank && wrank < MPIR_Process.size); @@ -1129,7 +1129,7 @@ int MPII_Treeutil_tree_topology_wave_init(MPIR_Comm * comm, int k, int root, boo heap_vector minHeaps; heap_vector_init(&minHeaps); - /* To build hierarchy of ranks, swiches and groups */ + /* To build hierarchy of ranks, switches and groups */ int dim = MPIR_Process.coords_dims - 1; for (dim = MPIR_Process.coords_dims - 1; dim >= 0; --dim) tree_ut_hierarchy_init(&hierarchy[dim]); diff --git a/src/mpi/comm/comm_impl.c b/src/mpi/comm/comm_impl.c index 23d17b3d484..d1406d4a4c8 100644 --- a/src/mpi/comm/comm_impl.c +++ b/src/mpi/comm/comm_impl.c @@ -73,9 +73,7 @@ static int comm_create_local_group(MPIR_Comm * comm_ptr) MPIR_Lpid *map = MPL_malloc(n * sizeof(MPIR_Lpid), MPL_MEM_GROUP); for (int i = 0; i < n; i++) { - uint64_t lpid; - (void) MPID_Comm_get_lpid(comm_ptr, i, &lpid, FALSE); - map[i] = lpid; + 
map[i] = MPIR_Group_rank_to_lpid(comm_ptr->local_group, i); } mpi_errno = MPIR_Group_create_map(n, comm_ptr->rank, comm_ptr->session_ptr, map, @@ -239,8 +237,7 @@ int MPII_Comm_create_calculate_mapping(MPIR_Group * group_ptr, /* FIXME : BUBBLE SORT */ mapping[i] = -1; for (j = 0; j < comm_ptr->local_size; j++) { - uint64_t comm_lpid; - MPID_Comm_get_lpid(comm_ptr, j, &comm_lpid, FALSE); + MPIR_Lpid comm_lpid = MPIR_Group_rank_to_lpid(comm_ptr->local_group, j); if (comm_lpid == MPIR_Group_rank_to_lpid(group_ptr, i)) { mapping[i] = j; break; @@ -929,9 +926,7 @@ int MPIR_Comm_remote_group_impl(MPIR_Comm * comm_ptr, MPIR_Group ** group_ptr) MPIR_Lpid *map = MPL_malloc(n * sizeof(MPIR_Lpid), MPL_MEM_GROUP); for (int i = 0; i < n; i++) { - uint64_t lpid; - (void) MPID_Comm_get_lpid(comm_ptr, i, &lpid, TRUE); - map[i] = lpid; + map[i] = MPIR_Group_rank_to_lpid(comm_ptr->remote_group, i); } mpi_errno = MPIR_Group_create_map(n, MPI_UNDEFINED, comm_ptr->session_ptr, map, &comm_ptr->remote_group); diff --git a/src/mpid/ch3/include/mpidpost.h b/src/mpid/ch3/include/mpidpost.h index d49191b3cfd..231bccec1bf 100644 --- a/src/mpid/ch3/include/mpidpost.h +++ b/src/mpid/ch3/include/mpidpost.h @@ -192,7 +192,6 @@ int MPID_Intercomm_exchange(MPIR_Comm *local_comm_ptr, int local_leader, int *remote_size, MPIR_Lpid **remote_lpids, int timeout); int MPID_Create_intercomm_from_lpids( MPIR_Comm *newcomm_ptr, int size, const MPIR_Lpid lpids[] ); -int MPID_Comm_get_lpid(MPIR_Comm *comm_ptr, int idx, MPIR_Lpid *lpid_ptr, bool is_remote); /* ULFM support */ MPL_STATIC_INLINE_PREFIX int MPID_Comm_AS_enabled(MPIR_Comm * comm_ptr) diff --git a/src/mpid/ch3/src/mpid_vc.c b/src/mpid/ch3/src/mpid_vc.c index 28fb92e7cef..83e2a67c1b2 100644 --- a/src/mpid/ch3/src/mpid_vc.c +++ b/src/mpid/ch3/src/mpid_vc.c @@ -222,25 +222,6 @@ int MPIDI_VCR_Dup(MPIDI_VCR orig_vcr, MPIDI_VCR * new_vcr) return MPI_SUCCESS; } -/*@ - MPID_Comm_get_lpid - Get the local process ID for a given VC reference - @*/ -int 
MPID_Comm_get_lpid(MPIR_Comm *comm_ptr, int idx, MPIR_Lpid *lpid_ptr, bool is_remote) -{ - - MPIR_FUNC_ENTER; - - if (comm_ptr->comm_kind == MPIR_COMM_KIND__INTRACOMM) - *lpid_ptr = comm_ptr->dev.vcrt->vcr_table[idx]->lpid; - else if (is_remote) - *lpid_ptr = comm_ptr->dev.vcrt->vcr_table[idx]->lpid; - else - *lpid_ptr = comm_ptr->dev.local_vcrt->vcr_table[idx]->lpid; - - MPIR_FUNC_EXIT; - return MPI_SUCCESS; -} - /* * The following routines convert to/from the global pids, which are * represented as pairs of ints (process group id, rank in that process group) @@ -363,13 +344,10 @@ int MPIDI_GPID_ToLpidArray( int size, MPIDI_Gpid in_gpid[], MPIR_Lpid lpid[] ) static inline int MPIDI_LPID_GetAllInComm(MPIR_Comm *comm_ptr, int local_size, MPIR_Lpid local_lpids[]) { - int i; int mpi_errno = MPI_SUCCESS; MPIR_Assert( comm_ptr->local_size == local_size ); - for (i=0; i<comm_ptr->local_size; i++) { - MPIR_Lpid tmp_lpid; - mpi_errno |= MPID_Comm_get_lpid( comm_ptr, i, &tmp_lpid, FALSE ); - local_lpids[i] = tmp_lpid; + for (int i=0; i<comm_ptr->local_size; i++) { + local_lpids[i] = comm_ptr->dev.vcrt->vcr_table[i]->lpid; } return mpi_errno; } diff --git a/src/mpid/ch4/include/mpidch4.h b/src/mpid/ch4/include/mpidch4.h index 30698dc67a9..ba4b043b14a 100644 --- a/src/mpid/ch4/include/mpidch4.h +++ b/src/mpid/ch4/include/mpidch4.h @@ -26,7 +26,6 @@ int MPID_Comm_get_all_failed_procs(MPIR_Comm *, MPIR_Group **, int); int MPID_Comm_revoke(MPIR_Comm *, int); int MPID_Comm_failure_ack(MPIR_Comm *); MPL_STATIC_INLINE_PREFIX int MPID_Comm_AS_enabled(MPIR_Comm *) MPL_STATIC_INLINE_SUFFIX; -int MPID_Comm_get_lpid(MPIR_Comm *, int, MPIR_Lpid *, bool); int MPID_CS_finalize(void); int MPID_Finalize(void); int MPID_Get_universe_size(int *); diff --git a/src/mpid/ch4/src/ch4_impl.h b/src/mpid/ch4/src/ch4_impl.h index 11ffe068a13..fce50d1dd0b 100644 --- a/src/mpid/ch4/src/ch4_impl.h +++ b/src/mpid/ch4/src/ch4_impl.h @@ -385,7 +385,7 @@ MPL_STATIC_INLINE_PREFIX int MPIDIU_valid_group_rank(MPIR_Comm * comm, int 
rank, MPIR_FUNC_ENTER; - MPID_Comm_get_lpid(comm, rank, &lpid, FALSE); + lpid = MPIR_comm_rank_to_lpid(comm, rank); for (z = 0; z < size; ++z) { if (lpid == MPIR_Group_rank_to_lpid(grp, z)) { diff --git a/src/mpid/ch4/src/ch4_init.c b/src/mpid/ch4/src/ch4_init.c index e09357352c7..6842fed8b76 100644 --- a/src/mpid/ch4/src/ch4_init.c +++ b/src/mpid/ch4/src/ch4_init.c @@ -1073,26 +1073,6 @@ int MPID_Free_mem(void *user_buf) goto fn_exit; } -int MPID_Comm_get_lpid(MPIR_Comm * comm_ptr, int idx, MPIR_Lpid * lpid_ptr, bool is_remote) -{ - int mpi_errno = MPI_SUCCESS; - int avtid = 0, lpid = 0; - MPIR_FUNC_ENTER; - - if (comm_ptr->comm_kind == MPIR_COMM_KIND__INTRACOMM) - MPIDIU_comm_rank_to_pid(comm_ptr, idx, &lpid, &avtid); - else if (is_remote) - MPIDIU_comm_rank_to_pid(comm_ptr, idx, &lpid, &avtid); - else { - MPIDIU_comm_rank_to_pid_local(comm_ptr, idx, &lpid, &avtid); - } - - *lpid_ptr = MPIDIU_GPID_CREATE(avtid, lpid); - - MPIR_FUNC_EXIT; - return mpi_errno; -} - int MPID_Get_node_id(MPIR_Comm * comm, int rank, int *id_p) { int mpi_errno = MPI_SUCCESS; From c605347676bda59c5e83eef0233cb6bec2f04683 Mon Sep 17 00:00:00 2001 From: Hui Zhou Date: Mon, 16 Dec 2024 15:10:37 -0600 Subject: [PATCH 49/59] ch4: refactor MPID_Intercomm_exchange Refactor MPID_Intercomm_exchange to maximize the common parts for MPI_Intercomm_create, MPI_Comm_connect/accept, and MPI_Intercomm_create_from_group. They differ in the first step, i.e. in how leader-to-leader communication is established. In ch4, this means establishing an av for the remote leader. Once the av is established, the intercomm exchange parts are common. We no longer generate lpids in the ch4 layer. Rather, we exchange world information and convert lpids by swapping world_idx. The lpids are used directly as indices into the ch4 av tables, and upids (address names) are inserted into the av table entries. 
--- src/mpid/ch4/src/ch4_comm.c | 553 +++++++++++++++++++++--------------- src/mpid/ch4/src/ch4_comm.h | 3 + src/mpid/ch4/src/ch4_impl.h | 4 - 3 files changed, 329 insertions(+), 231 deletions(-) diff --git a/src/mpid/ch4/src/ch4_comm.c b/src/mpid/ch4/src/ch4_comm.c index d7c20f615b4..dab6c776a95 100644 --- a/src/mpid/ch4/src/ch4_comm.c +++ b/src/mpid/ch4/src/ch4_comm.c @@ -393,280 +393,379 @@ int MPID_Comm_set_hints(MPIR_Comm * comm_ptr, MPIR_Info * info_ptr) goto fn_exit; } +/* Stages of forming inter communicator: + * 0. establish leader communication - get dynamic_av via PMI, peer_comm, or connect/accept. + * 1. leader exchange data. + * 2. leader broadcast over local_comm. + */ +static int leader_exchange(MPIR_Comm * local_comm, MPIR_Lpid remote_lpid, int tag, + int context_id, int *remote_data_size_out, void **remote_data_out, + int timeout); +static int prepare_local_lpids(MPIR_Comm * local_comm, MPIR_Lpid ** lpids_out, + int *num_worlds_out, int **worlds_out); +static void convert_local_lpids(int local_size, MPIR_Lpid * lpids, int num_worlds, int *worlds); +static int prepare_local_data(int local_size, int context_id, MPIR_Lpid * lpids, + int num_worlds, int *world_idx_array, + int *upid_sizes, char *upids, int *data_size_out, void **data_out); +static int extract_remote_data(void *remote_data, int *remote_size_out, + int *remote_context_id_out, MPIR_Lpid ** remote_lpids_out, + int **remote_upid_sizes_out, char **remote_upids_out); + int MPID_Intercomm_exchange(MPIR_Comm * local_comm, int local_leader, MPIR_Comm * peer_comm, int remote_leader, int tag, - int context_id, int *remote_context_id, - int *remote_size, MPIR_Lpid ** remote_lpids) + int context_id, int *remote_context_id_out, + int *remote_size_out, MPIR_Lpid ** remote_lpids_out, int timeout) { int mpi_errno = MPI_SUCCESS; - int i; - int avtid = 0, lpid = -1; - int local_avtid = 0, remote_avtid = 0; - int local_size_send = 0, remote_size_recv = 0; - int pure_intracomm = 1; - int local_size = 0; - 
MPIR_Lpid *local_lpids = NULL; - int *local_upid_size = NULL, *remote_upid_size = NULL; - int upid_send_size = 0, upid_recv_size = 0; - char *local_upids = NULL, *remote_upids = NULL; - - /* - * CH4 only cares about GPID. UPID extraction and exchange should be done - * by netmod - */ MPIR_FUNC_ENTER; - MPIR_CHKPMEM_DECL(1); - MPIR_CHKLMEM_DECL(5); - - local_size = local_comm->local_size; + bool is_local_leader = (local_comm->rank == local_leader); + struct bcast_data_t { + int mpi_errno; + int remote_data_size; + }; + struct bcast_data_t bcast_data; + + /* Stage 1: exchange between leaders */ + int remote_data_size = 0; + void *remote_data = NULL; + if (is_local_leader) { + MPID_THREAD_CS_ENTER(VCI, MPIDI_VCI(0).lock); + MPIR_Lpid remote_lpid = MPIR_comm_rank_to_lpid(peer_comm, remote_leader); + mpi_errno = leader_exchange(local_comm, remote_lpid, tag, context_id, + &remote_data_size, &remote_data, timeout); + MPID_THREAD_CS_EXIT(VCI, MPIDI_VCI(0).lock); + } - /* - * Stage 1: UPID exchange and GPID conversion in leaders - */ - if (local_comm->rank == local_leader) { - /* We need to check all processes in local group to decide there - * is no dynamic spawned process. 
*/ - for (i = 0; i < local_size; i++) { - MPIDIU_comm_rank_to_pid(local_comm, i, &lpid, &local_avtid); - if (local_avtid > 0) { - pure_intracomm = 0; - break; - } - } - if (pure_intracomm) { - /* check if remote leader is dynamic spawned process */ - MPIDIU_comm_rank_to_pid(peer_comm, remote_leader, &lpid, &remote_avtid); - if (remote_avtid > 0) - pure_intracomm = 0; - } - local_size_send = local_size; - if (!pure_intracomm) { - /* embedded dynamic process info in size */ - local_size_send |= MPIDI_DYNPROC_MASK; - } + /* Stage 2: Broadcast inside local_group */ + if (is_local_leader) { + bcast_data.mpi_errno = mpi_errno; + bcast_data.remote_data_size = remote_data_size; + } + mpi_errno = MPIR_Bcast_impl(&bcast_data, 2, MPI_INT, local_leader, local_comm, MPIR_ERR_NONE); + MPIR_ERR_CHECK(mpi_errno); - MPL_DBG_MSG_FMT(MPIDI_CH4_DBG_COMM, VERBOSE, - (MPL_DBG_FDEST, "rank %d sendrecv to rank %d", - peer_comm->rank, remote_leader)); - int local_ints[2] = { local_size_send, context_id }; - int remote_ints[2]; - mpi_errno = MPIC_Sendrecv(local_ints, 2, MPI_INT, remote_leader, tag, - remote_ints, 2, MPI_INT, remote_leader, tag, peer_comm, - MPI_STATUS_IGNORE, MPIR_ERR_NONE); + /* error checking of previous leader exchange */ + if (is_local_leader) { + mpi_errno = bcast_data.mpi_errno; MPIR_ERR_CHECK(mpi_errno); + } else { + MPIR_ERR_CHKANDJUMP(bcast_data.mpi_errno, mpi_errno, MPI_ERR_PORT, "**spawn"); + remote_data_size = bcast_data.remote_data_size; + } - remote_size_recv = remote_ints[0]; - *remote_context_id = remote_ints[1]; - if (remote_size_recv & MPIDI_DYNPROC_MASK) - pure_intracomm = 0; - (*remote_size) = remote_size_recv & (~MPIDI_DYNPROC_MASK); - - MPL_DBG_MSG_FMT(MPIDI_CH4_DBG_COMM, VERBOSE, - (MPL_DBG_FDEST, "local size = %d, remote size = %d, pure_intracomm = %d", - local_size, *remote_size, pure_intracomm)); - - MPIR_CHKPMEM_MALLOC((*remote_lpids), MPIR_Lpid *, (*remote_size) * sizeof(MPIR_Lpid), - mpi_errno, "remote_lpids", MPL_MEM_ADDRESS); - 
MPIR_CHKLMEM_MALLOC(local_lpids, MPIR_Lpid *, local_size * sizeof(MPIR_Lpid), - mpi_errno, "local_lpids", MPL_MEM_ADDRESS); - for (i = 0; i < local_size; i++) { - MPIDIU_comm_rank_to_pid(local_comm, i, &lpid, &avtid); - local_lpids[i] = MPIDIU_GPID_CREATE(avtid, lpid); - } + /* bcast remote data */ + if (!is_local_leader) { + remote_data = MPL_malloc(remote_data_size, MPL_MEM_OTHER); + MPIR_ERR_CHKANDJUMP(!remote_data, mpi_errno, MPI_ERR_OTHER, "**nomem"); + } - /* TODO: optimizations -- - * if local_size is 1, we can skip send and local bcast; - * if remote_size is 1, we can skip recv. - */ - MPL_DBG_MSG_FMT(MPIDI_CH4_DBG_COMM, VERBOSE, - (MPL_DBG_FDEST, "Intercomm map exchange stage 1: leaders")); - if (!pure_intracomm) { - /* Stage 1.1 UPID exchange between leaders */ - MPIR_CHKLMEM_MALLOC(remote_upid_size, int *, (*remote_size) * sizeof(int), - mpi_errno, "remote_upid_size", MPL_MEM_ADDRESS); - - mpi_errno = MPIDI_NM_get_local_upids(local_comm, &local_upid_size, &local_upids); - MPIR_ERR_CHECK(mpi_errno); - mpi_errno = MPIC_Sendrecv(local_upid_size, local_size, MPI_INT, - remote_leader, tag, - remote_upid_size, *remote_size, MPI_INT, - remote_leader, tag, - peer_comm, MPI_STATUS_IGNORE, MPIR_ERR_NONE); - MPIR_ERR_CHECK(mpi_errno); - upid_send_size = 0; - for (i = 0; i < local_size; i++) - upid_send_size += local_upid_size[i]; - upid_recv_size = 0; - for (i = 0; i < *remote_size; i++) - upid_recv_size += remote_upid_size[i]; - MPIR_CHKLMEM_MALLOC(remote_upids, char *, upid_recv_size * sizeof(char), - mpi_errno, "remote_upids", MPL_MEM_ADDRESS); - mpi_errno = MPIC_Sendrecv(local_upids, upid_send_size, MPI_BYTE, - remote_leader, tag, - remote_upids, upid_recv_size, MPI_BYTE, - remote_leader, tag, - peer_comm, MPI_STATUS_IGNORE, MPIR_ERR_NONE); - MPIR_ERR_CHECK(mpi_errno); + mpi_errno = MPIR_Bcast_impl(remote_data, remote_data_size, MPI_BYTE, local_leader, local_comm, + MPIR_ERR_NONE); + MPIR_ERR_CHECK(mpi_errno); - /* Stage 1.2 convert remote UPID to GPID and get 
GPID for local group */ - MPIDIU_upids_to_lpids(*remote_size, remote_upid_size, remote_upids, *remote_lpids); - } else { - /* Stage 1.1f only exchange GPIDS if no dynamic process involved */ - mpi_errno = MPIC_Sendrecv(local_lpids, local_size, MPI_UINT64_T, - remote_leader, tag, - *remote_lpids, *remote_size, MPI_UINT64_T, - remote_leader, tag, - peer_comm, MPI_STATUS_IGNORE, MPIR_ERR_NONE); - MPIR_ERR_CHECK(mpi_errno); - } - /* Stage 1.3 check if local/remote groups are disjoint */ - - /* - * Error checking for this routine requires care. Because this - * routine is collective over two different sets of processes, - * it is relatively easy for the user to try to create an - * intercommunicator from two overlapping groups of processes. - * This is made more likely by inconsistencies in the MPI-1 - * specification (clarified in MPI-2) that seemed to allow - * the groups to overlap. Because of that, we first check that the - * groups are in fact disjoint before performing any collective - * operations. 
- */ + /* Stage 3: Each process extract data (if necessary: add worlds, convert lpids) */ + MPIR_Lpid *remote_lpids; + int *remote_upid_sizes; + char *remote_upids; + MPID_THREAD_CS_ENTER(VCI, MPIDI_VCI(0).lock); + /* need be inside CS because we are potentially introducing new worlds */ + mpi_errno = extract_remote_data(remote_data, remote_size_out, remote_context_id_out, + &remote_lpids, &remote_upid_sizes, &remote_upids); + MPID_THREAD_CS_EXIT(VCI, MPIDI_VCI(0).lock); + MPIR_ERR_CHECK(mpi_errno); #ifdef HAVE_ERROR_CHECKING - { - MPID_BEGIN_ERROR_CHECKS; - { - /* Now that we have both the local and remote processes, - * check for any overlap */ - mpi_errno = MPIDI_check_disjoint_lpids(local_lpids, local_size, - *remote_lpids, *remote_size); - MPIR_ERR_CHECK(mpi_errno); - } - MPID_END_ERROR_CHECKS; - } -#endif /* HAVE_ERROR_CHECKING */ - - /* At this point, we're done with the local lpids; they'll - * be freed with the other local memory on exit */ - local_lpids = NULL; + /* Now that we have both the local and remote processes, + * check for any overlap */ + MPIR_Lpid *local_lpids; + local_lpids = MPL_malloc(local_comm->local_size * sizeof(MPIR_Lpid), MPL_MEM_GROUP); + MPIR_ERR_CHKANDJUMP(!local_lpids, mpi_errno, MPI_ERR_OTHER, "**nomem"); + for (int i = 0; i < local_comm->local_size; i++) { + local_lpids[i] = MPIR_Group_rank_to_lpid(local_comm->local_group, i); } - /* - * Stage 2. 
Bcast UPID to non-leaders (intra-group) - */ - MPL_DBG_MSG_FMT(MPIDI_CH4_DBG_COMM, VERBOSE, - (MPL_DBG_FDEST, "Intercomm map exchange stage 2: intra-group")); - mpi_errno = MPIDIU_Intercomm_map_bcast_intra(local_comm, local_leader, - remote_size, remote_context_id, pure_intracomm, - remote_upid_size, remote_upids, remote_lpids); + mpi_errno = MPIDI_check_disjoint_lpids(local_lpids, local_comm->local_size, + remote_lpids, *remote_size_out); + MPL_free(local_lpids); MPIR_ERR_CHECK(mpi_errno); +#endif + + /* insert upids */ + char *upid = remote_upids; + MPID_THREAD_CS_ENTER(VCI, MPIDI_VCI(0).lock); + for (int i = 0; i < *remote_size_out; i++) { + mpi_errno = MPIDI_NM_insert_upid(remote_lpids[i], upid, remote_upid_sizes[i]); + if (mpi_errno) { + break; + } + upid += remote_upid_sizes[i]; + } + MPID_THREAD_CS_EXIT(VCI, MPIDI_VCI(0).lock); + MPIR_ERR_CHECK(mpi_errno); + + /* make a copy of remote_lpids (because it points into remote_data, which will be freed) */ + *remote_lpids_out = MPL_malloc((*remote_size_out) * sizeof(MPIR_Lpid), MPL_MEM_GROUP); + MPIR_ERR_CHKANDJUMP(!(*remote_lpids_out), mpi_errno, MPI_ERR_OTHER, "**nomem"); + memcpy(*remote_lpids_out, remote_lpids, (*remote_size_out) * sizeof(MPIR_Lpid)); + + MPL_free(remote_data); - MPIR_CHKPMEM_COMMIT(); fn_exit: - MPL_free(local_upid_size); - MPL_free(local_upids); - MPIR_CHKLMEM_FREEALL(); MPIR_FUNC_EXIT; return mpi_errno; fn_fail: - MPIR_CHKPMEM_REAP(); - *remote_lpids = NULL; goto fn_exit; } -int MPIDIU_Intercomm_map_bcast_intra(MPIR_Comm * local_comm, int local_leader, int *remote_size, - int *remote_context_id, - int pure_intracomm, int *remote_upid_size, char *remote_upids, - MPIR_Lpid ** remote_lpids) +/* Allocate and fill local lpids data. We assume remote will be from + * different worlds, so we need worlds info so remote can match worlds + * and convert lpids. 
+ */ +static int prepare_local_lpids(MPIR_Comm * local_comm, MPIR_Lpid ** lpids_out, + int *num_worlds_out, int **worlds_out) { int mpi_errno = MPI_SUCCESS; - int i; - int upid_recv_size = 0; - int map_info[5]; - int *_remote_upid_size = NULL; - char *_remote_upids = NULL; - MPIR_CHKPMEM_DECL(1); - MPIR_CHKLMEM_DECL(3); + int local_size = local_comm->local_size; - MPIR_FUNC_ENTER; + MPIR_Lpid *lpids; + lpids = MPL_malloc(local_size * sizeof(MPIR_Lpid), MPL_MEM_GROUP); + MPIR_ERR_CHKANDJUMP(!lpids, mpi_errno, MPI_ERR_OTHER, "**nomem"); - MPI_Datatype lpid_datatype; - if (sizeof(MPIR_Lpid) == 8) { - lpid_datatype = MPI_UINT64_T; - } else { - MPIR_Assert(sizeof(MPIR_Lpid) == 4); - lpid_datatype = MPI_UINT32_T; + /* a make-shift hash for world_idx's, consider typically only a few worlds (or just 0) + * It is OK to use static array here because the entire leader exchange will be + * under (VCI 0) critical section. + */ +#define MAX_WORLDS 100 + static int world_hash[MAX_WORLDS] = { 0 }; + int num_worlds = 0; + + for (int i = 0; i < local_size; i++) { + lpids[i] = MPIR_Group_rank_to_lpid(local_comm->local_group, i); + int world_idx = MPIR_LPID_WORLD_INDEX(lpids[i]); + + bool found = false; + for (int j = 0; j < num_worlds; j++) { + if (world_hash[j] == world_idx) { + found = true; + break; + } + } + if (!found) { + world_hash[num_worlds++] = world_idx; + MPIR_Assert(num_worlds < MAX_WORLDS); + } } - if (local_comm->rank == local_leader) { - if (!pure_intracomm) { - for (i = 0; i < (*remote_size); i++) { - upid_recv_size += remote_upid_size[i]; + fn_exit: + *lpids_out = lpids; + *num_worlds_out = num_worlds; + *worlds_out = world_hash; + return mpi_errno; + fn_fail: + goto fn_exit; +} + +static void convert_local_lpids(int local_size, MPIR_Lpid * lpids, int num_worlds, int *worlds) +{ + for (int i = 0; i < local_size; i++) { + int world_idx = MPIR_LPID_WORLD_INDEX(lpids[i]); + int world_rank = MPIR_LPID_WORLD_RANK(lpids[i]); + int transit_world_idx = -1; + for (int j = 0; 
j < num_worlds; j++) { + if (worlds[j] == world_idx) { + transit_world_idx = j; + break; } } - map_info[0] = *remote_size; - map_info[1] = upid_recv_size; - map_info[2] = pure_intracomm; - map_info[3] = *remote_context_id; - mpi_errno = - MPIR_Bcast_allcomm_auto(map_info, 4, MPI_INT, local_leader, local_comm, MPIR_ERR_NONE); - MPIR_ERR_CHECK(mpi_errno); + MPIR_Assert(transit_world_idx >= 0); + lpids[i] = MPIR_LPID_FROM(transit_world_idx, world_rank); + } +} - if (!pure_intracomm) { - mpi_errno = MPIR_Bcast_allcomm_auto(remote_upid_size, *remote_size, MPI_INT, - local_leader, local_comm, MPIR_ERR_NONE); - MPIR_ERR_CHECK(mpi_errno); - mpi_errno = MPIR_Bcast_allcomm_auto(remote_upids, upid_recv_size, MPI_BYTE, - local_leader, local_comm, MPIR_ERR_NONE); - MPIR_ERR_CHECK(mpi_errno); - } else { - mpi_errno = MPIR_Bcast_allcomm_auto(*remote_lpids, *remote_size, lpid_datatype, - local_leader, local_comm, MPIR_ERR_NONE); - } - } else { - mpi_errno = - MPIR_Bcast_allcomm_auto(map_info, 4, MPI_INT, local_leader, local_comm, MPIR_ERR_NONE); - MPIR_ERR_CHECK(mpi_errno); - *remote_size = map_info[0]; - upid_recv_size = map_info[1]; - pure_intracomm = map_info[2]; - *remote_context_id = map_info[3]; - - MPIR_CHKPMEM_MALLOC((*remote_lpids), MPIR_Lpid *, (*remote_size) * sizeof(MPIR_Lpid), - mpi_errno, "remote_lpids", MPL_MEM_COMM); - if (!pure_intracomm) { - MPIR_CHKLMEM_MALLOC(_remote_upid_size, int *, (*remote_size) * sizeof(int), - mpi_errno, "_remote_upid_size", MPL_MEM_COMM); - mpi_errno = MPIR_Bcast_allcomm_auto(_remote_upid_size, *remote_size, MPI_INT, - local_leader, local_comm, MPIR_ERR_NONE); - MPIR_ERR_CHECK(mpi_errno); - MPIR_CHKLMEM_MALLOC(_remote_upids, char *, upid_recv_size * sizeof(char), - mpi_errno, "_remote_upids", MPL_MEM_COMM); - mpi_errno = MPIR_Bcast_allcomm_auto(_remote_upids, upid_recv_size, MPI_BYTE, - local_leader, local_comm, MPIR_ERR_NONE); - MPIR_ERR_CHECK(mpi_errno); +static int prepare_local_data(int local_size, int context_id, MPIR_Lpid * lpids, 
+ int num_worlds, int *world_idx_array, + int *upid_sizes, char *upids, int *data_size_out, void **data_out) +{ + int mpi_errno = MPI_SUCCESS; + + /* layout: + * local_size + * context_id + * lpids[local_size] + * num_worlds + * namespace[num_worlds][MPIR_NAMESPACE_MAX] + * world_sizes[num_worlds] + * upid_sizes[local_size] + * upids[] + */ + int total_upid_size = 0; + for (int i = 0; i < local_size; i++) { + total_upid_size += upid_sizes[i]; + } + + int len = 0; + len += sizeof(int) * 2 + local_size * sizeof(MPIR_Lpid); + len += sizeof(int) + num_worlds * MPIR_NAMESPACE_MAX + num_worlds * sizeof(int); + len += local_size * sizeof(int); + len += total_upid_size; + + char *data = MPL_malloc(len, MPL_MEM_OTHER); + char *s = data; + + *(int *) (s) = local_size; + s += sizeof(int); + *(int *) (s) = context_id; + s += sizeof(int); + + memcpy(s, lpids, local_size * sizeof(MPIR_Lpid)); + s += local_size * sizeof(MPIR_Lpid); + + for (int i = 0; i < num_worlds; i++) { + strncpy(s, MPIR_Worlds[world_idx_array[i]].namespace, MPIR_NAMESPACE_MAX); + s += MPIR_NAMESPACE_MAX; + } + for (int i = 0; i < num_worlds; i++) { + *(int *) (s) = MPIR_Worlds[world_idx_array[i]].num_procs; + s += sizeof(int); + } + + memcpy(s, upid_sizes, local_size * sizeof(int)); + s += local_size * sizeof(int); + + memcpy(s, upids, total_upid_size); + + *data_size_out = len; + *data_out = data; + + return mpi_errno; +} + +/* NOTE: will add worlds and convert lpids if necessary */ +static int extract_remote_data(void *remote_data, int *remote_size_out, + int *remote_context_id_out, MPIR_Lpid ** remote_lpids_out, + int **remote_upid_sizes_out, char **remote_upids_out) +{ + int mpi_errno = MPI_SUCCESS; + char *s = remote_data; + + *remote_size_out = *(int *) s; + s += sizeof(int); + int remote_size = *remote_size_out; + + *remote_context_id_out = *(int *) s; + s += sizeof(int); - MPIDIU_upids_to_lpids(*remote_size, _remote_upid_size, _remote_upids, *remote_lpids); - } else { - mpi_errno = 
MPIR_Bcast_allcomm_auto(*remote_lpids, *remote_size, lpid_datatype, - local_leader, local_comm, MPIR_ERR_NONE); + *remote_lpids_out = (void *) s; + s += remote_size * sizeof(MPIR_Lpid); + + int num_worlds = *(int *) s; + s += sizeof(int); + + char *p_worlds = s; + s += num_worlds * MPIR_NAMESPACE_MAX; + + int *p_world_sizes = (void *) s; + s += num_worlds * sizeof(int); + + *remote_upid_sizes_out = (void *) s; + s += remote_size * sizeof(int); + + *remote_upids_out = s; + + /* Find or add new worlds */ + int world_hash[MAX_WORLDS]; + for (int i = 0; i < num_worlds; i++) { + char *namespace = p_worlds + i * MPIR_NAMESPACE_MAX; + world_hash[i] = MPIR_find_world(namespace); + if (world_hash[i] == -1) { + world_hash[i] = MPIR_add_world(namespace, p_world_sizes[i]); } } - MPIR_CHKPMEM_COMMIT(); + /* convert remote lpids */ + for (int i = 0; i < remote_size; i++) { + MPIR_Lpid lpid = (*remote_lpids_out)[i]; + int world_idx = MPIR_LPID_WORLD_INDEX(lpid); + int world_rank = MPIR_LPID_WORLD_RANK(lpid); + (*remote_lpids_out)[i] = MPIR_LPID_FROM(world_hash[world_idx], world_rank); + } + + return mpi_errno; +} + +/* exchange data between leaders */ +static int leader_exchange(MPIR_Comm * local_comm, MPIR_Lpid remote_lpid, int tag, int context_id, + int *remote_data_size_out, void **remote_data_out, int timeout) +{ + int mpi_errno = MPI_SUCCESS; + MPIR_CHKLMEM_DECL(4); + MPIR_FUNC_ENTER; + + /* I am the leader of local_comm, remote_lpid is the remote leader of remote_comm. + * + * 1. Send data sizes + * 2. 
Send data + * + * Future optimizations + * * Eager mode + * * Optionally skip upids exchange + */ + + /* local prepare */ + int local_size = local_comm->local_size; + MPIR_Lpid *local_lpids; + int num_local_worlds; + int *local_worlds; + mpi_errno = prepare_local_lpids(local_comm, &local_lpids, &num_local_worlds, &local_worlds); + MPIR_ERR_CHECK(mpi_errno); + MPIR_CHKLMEM_ADD(local_lpids); + + /* convert local world_idx to transit world_idx */ + convert_local_lpids(local_size, local_lpids, num_local_worlds, local_worlds); + + int *local_upid_sizes; + char *local_upids; + mpi_errno = MPIDI_NM_get_local_upids(local_comm, &local_upid_sizes, &local_upids); + MPIR_ERR_CHECK(mpi_errno); + MPIR_CHKLMEM_ADD(local_upid_sizes); + MPIR_CHKLMEM_ADD(local_upids); + + int local_data_size; + void *local_data; + mpi_errno = prepare_local_data(local_size, context_id, local_lpids, + num_local_worlds, local_worlds, local_upid_sizes, local_upids, + &local_data_size, &local_data); + MPIR_ERR_CHECK(mpi_errno); + MPIR_CHKLMEM_ADD(local_data); + + /* exchange */ + int remote_data_size; + void *remote_data; + mpi_errno = MPIDI_NM_dynamic_sendrecv(remote_lpid, tag, &local_data_size, sizeof(int), + &remote_data_size, sizeof(int), timeout); + MPIR_ERR_CHECK(mpi_errno); + + remote_data = MPL_malloc(remote_data_size, MPL_MEM_OTHER); + MPIR_ERR_CHKANDJUMP(!remote_data, mpi_errno, MPI_ERR_OTHER, "**nomem"); + + mpi_errno = MPIDI_NM_dynamic_sendrecv(remote_lpid, tag, local_data, local_data_size, + remote_data, remote_data_size, timeout); + MPIR_ERR_CHECK(mpi_errno); + + *remote_data_size_out = remote_data_size; + *remote_data_out = remote_data; + fn_exit: MPIR_CHKLMEM_FREEALL(); MPIR_FUNC_EXIT; return mpi_errno; fn_fail: - MPIR_CHKPMEM_REAP(); - *remote_lpids = NULL; goto fn_exit; } +/* ---- */ int MPID_Create_intercomm_from_lpids(MPIR_Comm * newcomm_ptr, int size, const MPIR_Lpid lpids[]) { int mpi_errno = MPI_SUCCESS, i; diff --git a/src/mpid/ch4/src/ch4_comm.h b/src/mpid/ch4/src/ch4_comm.h 
index dfc9e18967a..45cb787459c 100644 --- a/src/mpid/ch4/src/ch4_comm.h +++ b/src/mpid/ch4/src/ch4_comm.h @@ -15,6 +15,9 @@ MPL_STATIC_INLINE_PREFIX int MPIDI_set_comm_hint_sender_vci(MPIR_Comm * comm, in MPL_STATIC_INLINE_PREFIX int MPIDI_set_comm_hint_receiver_vci(MPIR_Comm * comm, int type, int value); MPL_STATIC_INLINE_PREFIX int MPIDI_set_comm_hint_vci(MPIR_Comm * comm, int type, int value); +int MPIDI_Intercomm_exchange(MPIR_Comm * local_comm, int local_leader, MPIR_Lpid remote_lpid, + int tag, int context_id, int *remote_context_id_out, + int *remote_size_out, MPIR_Lpid ** remote_lpids_out); int MPIDI_Comm_create_multi_leaders(MPIR_Comm * comm); int MPIDI_Comm_create_multi_leader_subcomms(MPIR_Comm * comm, int num_leads); diff --git a/src/mpid/ch4/src/ch4_impl.h b/src/mpid/ch4/src/ch4_impl.h index fce50d1dd0b..f8da365d187 100644 --- a/src/mpid/ch4/src/ch4_impl.h +++ b/src/mpid/ch4/src/ch4_impl.h @@ -13,10 +13,6 @@ #include "ch4_self.h" #include "ch4_vci.h" -int MPIDIU_Intercomm_map_bcast_intra(MPIR_Comm * local_comm, int local_leader, int *remote_size, - int *remote_context_id, bool * is_low_group, - int pure_intracomm, int *remote_upid_size, char *remote_upids, - MPIR_Lpid ** remote_lpids); int MPIDIG_get_context_index(uint64_t context_id); uint64_t MPIDIG_generate_win_id(MPIR_Comm * comm_ptr); From 45add2f820d914c72217a5665e778a2c48952f02 Mon Sep 17 00:00:00 2001 From: Hui Zhou Date: Tue, 17 Dec 2024 18:06:50 -0600 Subject: [PATCH 50/59] ch4: refactor ch4_spawn In MPID_Comm_connect/accept, simply establish remote_lpid and call MPIR_Intercomm_create_impl. 
--- src/include/mpir_comm.h | 8 +- src/mpi/comm/comm_impl.c | 66 +++------------- src/mpid/ch4/src/ch4_comm.c | 2 + src/mpid/ch4/src/ch4_spawn.c | 147 +++++++++++++++-------------------- 4 files changed, 81 insertions(+), 142 deletions(-) diff --git a/src/include/mpir_comm.h b/src/include/mpir_comm.h index 16c90bfe5b4..f510a6a3dc3 100644 --- a/src/include/mpir_comm.h +++ b/src/include/mpir_comm.h @@ -386,10 +386,6 @@ int MPIR_Comm_commit(MPIR_Comm *); int MPIR_Comm_is_parent_comm(MPIR_Comm *); -/* peer intercomm is an internal 1-to-1 intercomm used for connecting dynamic processes */ -int MPIR_peer_intercomm_create(int context_id, int recvcontext_id, - uint64_t remote_lpid, int is_low_group, MPIR_Comm ** newcomm); - #define MPIR_Comm_rank(comm_ptr) ((comm_ptr)->rank) #define MPIR_Comm_size(comm_ptr) ((comm_ptr)->local_size) @@ -429,6 +425,10 @@ int MPIR_Comm_split_type(MPIR_Comm * comm_ptr, int split_type, int key, MPIR_Inf int MPIR_Comm_split_type_neighborhood(MPIR_Comm * comm_ptr, int split_type, int key, MPIR_Info * info_ptr, MPIR_Comm ** newcomm_ptr); +int MPIR_Intercomm_create_timeout(MPIR_Comm * local_comm_ptr, int local_leader, + MPIR_Comm * peer_comm_ptr, int remote_leader, + int tag, int timeout, MPIR_Comm ** new_intercomm_ptr); + /* Preallocated comm objects. There are 3: comm_world, comm_self, and a private (non-user accessible) dup of comm world that is provided if needed in MPI_Finalize. 
Having a separate version of comm_world diff --git a/src/mpi/comm/comm_impl.c b/src/mpi/comm/comm_impl.c index d1406d4a4c8..7666c3fdf21 100644 --- a/src/mpi/comm/comm_impl.c +++ b/src/mpi/comm/comm_impl.c @@ -996,6 +996,14 @@ static int determine_low_group(MPIR_Lpid remote_lpid, bool * is_low_group_out) int MPIR_Intercomm_create_impl(MPIR_Comm * local_comm_ptr, int local_leader, MPIR_Comm * peer_comm_ptr, int remote_leader, int tag, MPIR_Comm ** new_intercomm_ptr) +{ + return MPIR_Intercomm_create_timeout(local_comm_ptr, local_leader, + peer_comm_ptr, remote_leader, tag, 0, new_intercomm_ptr); +} + +int MPIR_Intercomm_create_timeout(MPIR_Comm * local_comm_ptr, int local_leader, + MPIR_Comm * peer_comm_ptr, int remote_leader, + int tag, int timeout, MPIR_Comm ** new_intercomm_ptr) { int mpi_errno = MPI_SUCCESS; int remote_size = 0; @@ -1014,7 +1022,7 @@ int MPIR_Intercomm_create_impl(MPIR_Comm * local_comm_ptr, int local_leader, /* In the multi-threaded case, MPIR_Get_contextid_sparse assumes that the * calling routine already holds the single critical section */ /* TODO: Make sure this is tag-safe */ - int recvcontext_id; + int recvcontext_id = MPIR_INVALID_CONTEXT_ID; mpi_errno = MPIR_Get_contextid_sparse(local_comm_ptr, &recvcontext_id, FALSE); MPIR_ERR_CHECK(mpi_errno); MPIR_Assert(recvcontext_id != 0); @@ -1026,7 +1034,7 @@ int MPIR_Intercomm_create_impl(MPIR_Comm * local_comm_ptr, int local_leader, mpi_errno = MPID_Intercomm_exchange(local_comm_ptr, local_leader, peer_comm_ptr, remote_leader, tag, recvcontext_id, &remote_context_id, - &remote_size, &remote_lpids); + &remote_size, &remote_lpids, timeout); MPIR_ERR_CHECK(mpi_errno); bool is_low_group; @@ -1082,59 +1090,9 @@ int MPIR_Intercomm_create_impl(MPIR_Comm * local_comm_ptr, int local_leader, MPIR_FUNC_EXIT; return mpi_errno; fn_fail: - goto fn_exit; -} - -/* Peer intercomm is a 1-to-1 intercomm, internally created by device layer - * to facilitate connecting dynamic processes */ - -int 
MPIR_peer_intercomm_create(int context_id, int recvcontext_id, - uint64_t remote_lpid, int is_low_group, MPIR_Comm ** newcomm) -{ - int mpi_errno = MPI_SUCCESS; - - mpi_errno = MPIR_Comm_create(newcomm); - MPIR_ERR_CHECK(mpi_errno); - - (*newcomm)->context_id = context_id; - (*newcomm)->recvcontext_id = recvcontext_id; - (*newcomm)->remote_size = 1; - (*newcomm)->local_size = 1; - (*newcomm)->rank = 0; - (*newcomm)->comm_kind = MPIR_COMM_KIND__INTERCOMM; - (*newcomm)->local_comm = 0; - (*newcomm)->is_low_group = is_low_group; - - mpi_errno = MPID_Create_intercomm_from_lpids(*newcomm, 1, &remote_lpid); - MPIR_ERR_CHECK(mpi_errno); - - MPIR_Comm *comm_self = MPIR_Process.comm_self; - MPIR_Comm_map_dup(*newcomm, comm_self, MPIR_COMM_MAP_DIR__L2L); - - /* Inherit the error handler */ - MPID_THREAD_CS_ENTER(VCI, comm_self->mutex); - (*newcomm)->errhandler = comm_self->errhandler; - if (comm_self->errhandler) { - MPIR_Errhandler_add_ref(comm_self->errhandler); + if (recvcontext_id != MPIR_INVALID_CONTEXT_ID) { + MPIR_Free_contextid(recvcontext_id); } - MPID_THREAD_CS_EXIT(VCI, comm_self->mutex); - - MPIR_Session *session_ptr = NULL; /* Can we just use NULL session since peer_intercomm is always temporary? 
*/ - MPIR_Lpid my_lpid = MPIR_Group_rank_to_lpid(comm_self->local_group, 0); - mpi_errno = MPIR_Group_create_stride(1, 0, session_ptr, my_lpid, 1, 1, - &(*newcomm)->local_group); - MPIR_ERR_CHECK(mpi_errno); - mpi_errno = MPIR_Group_create_stride(1, 0, session_ptr, remote_lpid, 1, 1, - &(*newcomm)->remote_group); - MPIR_ERR_CHECK(mpi_errno); - - (*newcomm)->tainted = 1; - mpi_errno = MPIR_Comm_commit(*newcomm); - MPIR_ERR_CHECK(mpi_errno); - - fn_exit: - return mpi_errno; - fn_fail: goto fn_exit; } diff --git a/src/mpid/ch4/src/ch4_comm.c b/src/mpid/ch4/src/ch4_comm.c index dab6c776a95..5b5cc3e9def 100644 --- a/src/mpid/ch4/src/ch4_comm.c +++ b/src/mpid/ch4/src/ch4_comm.c @@ -622,6 +622,8 @@ static int prepare_local_data(int local_size, int context_id, MPIR_Lpid * lpids, memcpy(s, lpids, local_size * sizeof(MPIR_Lpid)); s += local_size * sizeof(MPIR_Lpid); + *(int *) (s) = num_worlds; + s += sizeof(int); for (int i = 0; i < num_worlds; i++) { strncpy(s, MPIR_Worlds[world_idx_array[i]].namespace, MPIR_NAMESPACE_MAX); s += MPIR_NAMESPACE_MAX; diff --git a/src/mpid/ch4/src/ch4_spawn.c b/src/mpid/ch4/src/ch4_spawn.c index 10241261336..ba8a4ada024 100644 --- a/src/mpid/ch4/src/ch4_spawn.c +++ b/src/mpid/ch4/src/ch4_spawn.c @@ -273,87 +273,52 @@ int MPID_Close_port(const char *port_name) /* MPID_Comm_accept, MPID_Comm_connect */ -static int peer_intercomm_create(char *remote_addrname, int len, int tag, int timeout, - bool is_sender, MPIR_Comm ** newcomm); -static int dynamic_intercomm_create(const char *port_name, MPIR_Info * info, int root, - MPIR_Comm * comm_ptr, int timeout, bool is_sender, - MPIR_Comm ** newcomm); - -struct dynproc_conn_hdr { - int context_id; - int addrname_len; - char addrname[MPIDI_DYNPROC_NAME_MAX]; -}; - -static int peer_intercomm_create(char *remote_addrname, int len, int tag, - int timeout, bool is_sender, MPIR_Comm ** newcomm) +static int establish_peer_conn(char *remote_addrname, int remote_addrname_len, int tag, + int timeout, bool 
is_sender, MPIR_Lpid * remote_lpid_out) { int mpi_errno = MPI_SUCCESS; - int context_id, recvcontext_id; - MPIR_Lpid remote_lpid; + MPIR_Lpid remote_lpid = MPIR_LPID_INVALID; - mpi_errno = MPIR_Get_contextid_sparse(MPIR_Process.comm_self, &recvcontext_id, FALSE); - MPIR_ERR_CHECK(mpi_errno); + struct dynproc_conn_hdr { + int addrname_len; + char addrname[MPIDI_DYNPROC_NAME_MAX]; + } hdr; - struct dynproc_conn_hdr hdr; if (is_sender) { /* insert remote address */ - int addrname_len = len; - MPIR_Lpid *remote_lpids = &remote_lpid; - mpi_errno = MPIDIU_upids_to_lpids(1, &addrname_len, remote_addrname, remote_lpids); + mpi_errno = MPIDIU_insert_dynamic_upid(&remote_lpid, remote_addrname, remote_addrname_len); MPIR_ERR_CHECK(mpi_errno); - /* fill hdr with context_id and addrname */ - hdr.context_id = recvcontext_id; - - char *addrname; - int *addrname_size; - mpi_errno = MPIDI_NM_get_local_upids(MPIR_Process.comm_self, &addrname_size, &addrname); + /* get my addrname and send it to remote */ + char *my_addrname; + int *my_addrname_len; + mpi_errno = MPIDI_NM_get_local_upids(MPIR_Process.comm_self, + &my_addrname_len, &my_addrname); MPIR_ERR_CHECK(mpi_errno); - MPIR_Assert(addrname_size[0] <= MPIDI_DYNPROC_NAME_MAX); - memcpy(hdr.addrname, addrname, addrname_size[0]); - hdr.addrname_len = addrname_size[0]; - - /* send remote context_id + addrname */ + MPIR_Assert(my_addrname_len[0] <= MPIDI_DYNPROC_NAME_MAX); + memcpy(hdr.addrname, my_addrname, my_addrname_len[0]); + hdr.addrname_len = my_addrname_len[0]; + /* send it to remote */ int hdr_sz = sizeof(hdr) - MPIDI_DYNPROC_NAME_MAX + hdr.addrname_len; mpi_errno = MPIDI_NM_dynamic_send(remote_lpid, tag, &hdr, hdr_sz, timeout); - MPL_free(addrname); - MPL_free(addrname_size); + MPL_free(my_addrname); + MPL_free(my_addrname_len); MPIR_ERR_CHECK(mpi_errno); - - mpi_errno = MPIDI_NM_dynamic_recv(tag, &hdr, sizeof(hdr), timeout); - MPIR_ERR_CHECK(mpi_errno); - context_id = hdr.context_id; } else { /* recv remote address */ 
mpi_errno = MPIDI_NM_dynamic_recv(tag, &hdr, sizeof(hdr), timeout); MPIR_ERR_CHECK(mpi_errno); - context_id = hdr.context_id; /* insert remote address */ - int addrname_len = hdr.addrname_len; - MPIR_Lpid *remote_lpids = &remote_lpid; - mpi_errno = MPIDIU_upids_to_lpids(1, &addrname_len, hdr.addrname, remote_lpids); - MPIR_ERR_CHECK(mpi_errno); - - /* send remote context_id */ - hdr.context_id = recvcontext_id; - mpi_errno = MPIDI_NM_dynamic_send(remote_lpid, tag, &hdr, sizeof(hdr.context_id), timeout); + mpi_errno = MPIDIU_insert_dynamic_upid(&remote_lpid, hdr.addrname, hdr.addrname_len); MPIR_ERR_CHECK(mpi_errno); } - /* create peer intercomm */ - mpi_errno = MPIR_peer_intercomm_create(context_id, recvcontext_id, - remote_lpid, is_sender, newcomm); - MPIR_ERR_CHECK(mpi_errno); - fn_exit: + *remote_lpid_out = remote_lpid; return mpi_errno; fn_fail: - if (recvcontext_id) { - MPIR_Free_contextid(recvcontext_id); - } goto fn_exit; } @@ -362,15 +327,13 @@ static int dynamic_intercomm_create(const char *port_name, MPIR_Info * info, int MPIR_Comm ** newcomm) { int mpi_errno = MPI_SUCCESS; - - MPIR_Comm *peer_intercomm = NULL; + MPIR_Lpid remote_lpid = MPIR_LPID_INVALID; + MPIR_Comm *peer_comm = NULL; int tag; - int bcast_ints[2]; /* used to bcast tag and errno */ if (comm_ptr->rank == root) { /* NOTE: do not goto fn_fail on error, or it will leave children hanging */ mpi_errno = get_tag_from_port(port_name, &tag); - if (mpi_errno) - goto bcast_tag_and_errno; + MPIR_ERR_CHECK(mpi_errno); char remote_addrname[MPIDI_DYNPROC_NAME_MAX]; char *addrname; @@ -379,43 +342,59 @@ static int dynamic_intercomm_create(const char *port_name, MPIR_Info * info, int addrname = remote_addrname; mpi_errno = get_conn_name_from_port(port_name, remote_addrname, MPIDI_DYNPROC_NAME_MAX, &len); - if (mpi_errno) - goto bcast_tag_and_errno; + MPIR_ERR_CHECK(mpi_errno); } else { - /* Use NULL for better error behavior */ addrname = NULL; len = 0; } - mpi_errno = peer_intercomm_create(addrname, len, 
tag, timeout, is_sender, &peer_intercomm); - bcast_tag_and_errno: - bcast_ints[0] = tag; - bcast_ints[1] = mpi_errno; - mpi_errno = MPIR_Bcast_allcomm_auto(bcast_ints, 2, MPI_INT, root, comm_ptr, MPIR_ERR_NONE); - MPIR_ERR_CHECK(mpi_errno); - mpi_errno = bcast_ints[1]; + MPID_THREAD_CS_ENTER(VCI, MPIDI_VCI(0).lock); + mpi_errno = establish_peer_conn(addrname, len, tag, timeout, is_sender, &remote_lpid); + MPID_THREAD_CS_EXIT(VCI, MPIDI_VCI(0).lock); MPIR_ERR_CHECK(mpi_errno); + + /* create peer intercomm - + * Since we will only use peer intercomm to call back MPID_Intercomm_exchange, which + * just need to extract remote_lpid from the peer_comm, we can cheat a bit here - just + * fill peer_comm->remote_group. + */ + peer_comm = (MPIR_Comm *) MPIR_Handle_obj_alloc(&MPIR_Comm_mem); + MPIR_ERR_CHKANDJUMP(!peer_comm, mpi_errno, MPI_ERR_OTHER, "**nomem"); + + peer_comm->comm_kind = MPIR_COMM_KIND__INTERCOMM; + peer_comm->remote_size = 1; + peer_comm->local_size = 1; + peer_comm->rank = 0; + peer_comm->local_group = NULL; + + MPIR_Group_create_stride(1, 0, NULL, remote_lpid, 1, &peer_comm->remote_group); + + fn_fail: + /* In case root fails, we bcast mpi_errno so other ranks will abort too */ + MPIR_Bcast_impl(&mpi_errno, 1, MPI_INT, root, comm_ptr, MPIR_ERR_NONE); } else { - mpi_errno = MPIR_Bcast_allcomm_auto(bcast_ints, 2, MPI_INT, root, comm_ptr, MPIR_ERR_NONE); - MPIR_ERR_CHECK(mpi_errno); - if (bcast_ints[1]) { - /* errno from root cannot be directly returned */ + int root_errno; + MPIR_Bcast_impl(&root_errno, 1, MPI_INT, root, comm_ptr, MPIR_ERR_NONE); + if (root_errno) { MPIR_ERR_SET(mpi_errno, MPI_ERR_PORT, "**comm_connect_fail"); - goto fn_fail; } - tag = bcast_ints[0]; } - mpi_errno = MPIR_Intercomm_create_impl(comm_ptr, root, peer_intercomm, 0, tag, newcomm); - MPIR_ERR_CHECK(mpi_errno); + if (mpi_errno == MPI_SUCCESS) { + mpi_errno = MPIR_Intercomm_create_timeout(comm_ptr, root, peer_comm, 0, tag, timeout, + newcomm); + } - fn_exit: - if (peer_intercomm) { 
- MPIR_Comm_free_impl(peer_intercomm); + if (comm_ptr->rank == root && peer_comm) { + /* destroy peer_comm */ + MPID_THREAD_CS_ENTER(VCI, MPIDI_VCI(0).lock); + MPIDIU_free_dynamic_lpid(remote_lpid); + MPIR_Group_release(peer_comm->remote_group); + MPIR_Handle_obj_free(&MPIR_Comm_mem, peer_comm); + MPID_THREAD_CS_EXIT(VCI, MPIDI_VCI(0).lock); } + return mpi_errno; - fn_fail: - goto fn_exit; } int MPID_Comm_accept(const char *port_name, MPIR_Info * info, int root, MPIR_Comm * comm, From e096d2b64f78a5c27e359980ac1ad7c54f7ff788 Mon Sep 17 00:00:00 2001 From: Hui Zhou Date: Wed, 18 Dec 2024 10:40:43 -0600 Subject: [PATCH 51/59] comm: remove mapper The local_group and remote_group fully captures the mapper functions. --- src/include/mpir_comm.h | 54 ------- src/mpi/comm/comm_impl.c | 286 +++++------------------------------- src/mpi/comm/comm_split.c | 34 +++-- src/mpi/comm/commutil.c | 149 ------------------- src/mpi/comm/contextid.c | 3 +- src/mpid/ch4/src/ch4_comm.c | 5 - 6 files changed, 57 insertions(+), 474 deletions(-) diff --git a/src/include/mpir_comm.h b/src/include/mpir_comm.h index f510a6a3dc3..2820359d26b 100644 --- a/src/include/mpir_comm.h +++ b/src/include/mpir_comm.h @@ -31,44 +31,6 @@ typedef enum MPIR_Comm_hierarchy_kind_t { MPIR_COMM_HIERARCHY_KIND__MULTI_LEADS = 4, /* is the multi_leaders_comm for a node */ } MPIR_Comm_hierarchy_kind_t; -typedef enum { - MPIR_COMM_MAP_TYPE__DUP, - MPIR_COMM_MAP_TYPE__IRREGULAR -} MPIR_Comm_map_type_t; - -/* direction of mapping: local to local, local to remote, remote to - * local, remote to remote */ -typedef enum { - MPIR_COMM_MAP_DIR__L2L, - MPIR_COMM_MAP_DIR__L2R, - MPIR_COMM_MAP_DIR__R2L, - MPIR_COMM_MAP_DIR__R2R -} MPIR_Comm_map_dir_t; - -typedef struct MPIR_Comm_map { - MPIR_Comm_map_type_t type; - - struct MPIR_Comm *src_comm; - - /* mapping direction for intercomms, which contain local and - * remote groups */ - MPIR_Comm_map_dir_t dir; - - /* only valid for irregular map type */ - int src_mapping_size; - 
int *src_mapping; - int free_mapping; /* we allocated the mapping */ - - struct MPIR_Comm_map *next; -} MPIR_Comm_map_t; - -int MPIR_Comm_map_irregular(struct MPIR_Comm *newcomm, struct MPIR_Comm *src_comm, - int *src_mapping, int src_mapping_size, - MPIR_Comm_map_dir_t dir, MPIR_Comm_map_t ** map); -int MPIR_Comm_map_dup(struct MPIR_Comm *newcomm, struct MPIR_Comm *src_comm, - MPIR_Comm_map_dir_t dir); -int MPIR_Comm_map_free(struct MPIR_Comm *comm); - /* Communicator info hint */ #define MPIR_COMM_HINT_TYPE_BOOL 0 #define MPIR_COMM_HINT_TYPE_INT 1 @@ -254,12 +216,6 @@ struct MPIR_Comm { hcoll_comm_priv_t hcoll_priv; #endif /* HAVE_HCOLL */ - /* the mapper is temporarily filled out in order to allow the - * device to setup its network addresses. it will be freed after - * the device has initialized the comm. */ - MPIR_Comm_map_t *mapper_head; - MPIR_Comm_map_t *mapper_tail; - enum { MPIR_STREAM_COMM_NONE, MPIR_STREAM_COMM_SINGLE, MPIR_STREAM_COMM_MULTIPLEX } stream_comm_type; union { @@ -458,16 +414,6 @@ int MPII_Comm_copy_data(MPIR_Comm * comm_ptr, MPIR_Info * info, MPIR_Comm ** out int MPII_Setup_intercomm_localcomm(MPIR_Comm *); -/* comm_create helper functions, used by both comm_create and comm_create_group */ -int MPII_Comm_create_calculate_mapping(MPIR_Group * group_ptr, - MPIR_Comm * comm_ptr, - int **mapping_out, MPIR_Comm ** mapping_comm); - -int MPII_Comm_create_map(int local_n, - int remote_n, - int *local_mapping, - int *remote_mapping, MPIR_Comm * mapping_comm, MPIR_Comm * newcomm); - int MPII_Comm_set_hints(MPIR_Comm * comm_ptr, MPIR_Info * info, bool in_comm_create); int MPII_Comm_get_hints(MPIR_Comm * comm_ptr, MPIR_Info * info); int MPII_Comm_check_hints(MPIR_Comm * comm_ptr); diff --git a/src/mpi/comm/comm_impl.c b/src/mpi/comm/comm_impl.c index 7666c3fdf21..be1e858cffa 100644 --- a/src/mpi/comm/comm_impl.c +++ b/src/mpi/comm/comm_impl.c @@ -158,133 +158,6 @@ int MPIR_Comm_compare_impl(MPIR_Comm * comm_ptr1, MPIR_Comm * comm_ptr2, int *re goto 
fn_exit; } -/* This function allocates and calculates an array (*mapping_out) such that - * (*mapping_out)[i] is the rank in (*mapping_comm) corresponding to local - * rank i in the given group_ptr. - * - * Ownership of the (*mapping_out) array is transferred to the caller who is - * responsible for freeing it. */ -int MPII_Comm_create_calculate_mapping(MPIR_Group * group_ptr, - MPIR_Comm * comm_ptr, - int **mapping_out, MPIR_Comm ** mapping_comm) -{ - int mpi_errno = MPI_SUCCESS; - int subsetOfWorld = 0; - int i, j; - int n; - int *mapping = 0; - MPIR_CHKPMEM_DECL(1); - - MPIR_FUNC_ENTER; - - *mapping_out = NULL; - *mapping_comm = comm_ptr; - - n = group_ptr->size; - MPIR_CHKPMEM_MALLOC(mapping, int *, n * sizeof(int), mpi_errno, "mapping", MPL_MEM_ADDRESS); - - /* Make sure that the processes for this group are contained within - * the input communicator. Also identify the mapping from the ranks of - * the old communicator to the new communicator. - * We do this by matching the lpids of the members of the group - * with the lpids of the members of the input communicator. - * It is an error if the group contains a reference to an lpid that - * does not exist in the communicator. - * - * An important special case is groups (and communicators) that - * are subsets of MPI_COMM_WORLD. In this case, the lpids are - * exactly the same as the ranks in comm world. - */ - - /* Optimize for groups contained within MPI_COMM_WORLD. 
*/ - if (comm_ptr->comm_kind == MPIR_COMM_KIND__INTRACOMM) { - int wsize; - subsetOfWorld = 1; - wsize = MPIR_Process.size; - for (i = 0; i < n; i++) { - MPIR_Lpid g_lpid = MPIR_Group_rank_to_lpid(group_ptr, i); - - /* This mapping is relative to comm world */ - MPL_DBG_MSG_FMT(MPIR_DBG_COMM, VERBOSE, - (MPL_DBG_FDEST, - "comm-create - mapping into world[%d] = %" PRIu64, i, g_lpid)); - if (g_lpid < wsize) { - mapping[i] = g_lpid; - } else { - subsetOfWorld = 0; - break; - } - } - } - MPL_DBG_MSG_D(MPIR_DBG_COMM, VERBOSE, "subsetOfWorld=%d", subsetOfWorld); - if (subsetOfWorld) { -#ifdef HAVE_ERROR_CHECKING - { - MPID_BEGIN_ERROR_CHECKS; - { - mpi_errno = MPIR_Group_check_subset(group_ptr, comm_ptr); - MPIR_ERR_CHECK(mpi_errno); - } - MPID_END_ERROR_CHECKS; - } -#endif - /* Override the comm to be used with the mapping array. */ - *mapping_comm = MPIR_Process.comm_world; - } else { - for (i = 0; i < n; i++) { - /* mapping[i] is the rank in the communicator of the process - * that is the ith element of the group */ - /* FIXME : BUBBLE SORT */ - mapping[i] = -1; - for (j = 0; j < comm_ptr->local_size; j++) { - MPIR_Lpid comm_lpid = MPIR_Group_rank_to_lpid(comm_ptr->local_group, j); - if (comm_lpid == MPIR_Group_rank_to_lpid(group_ptr, i)) { - mapping[i] = j; - break; - } - } - MPIR_ERR_CHKANDJUMP1(mapping[i] == -1, mpi_errno, MPI_ERR_GROUP, - "**groupnotincomm", "**groupnotincomm %d", i); - } - } - - MPIR_Assert(mapping != NULL); - *mapping_out = mapping; - MPL_VG_CHECK_MEM_IS_DEFINED(*mapping_out, n * sizeof(**mapping_out)); - - MPIR_CHKPMEM_COMMIT(); - fn_exit: - MPIR_FUNC_EXIT; - return mpi_errno; - fn_fail: - MPIR_CHKPMEM_REAP(); - goto fn_exit; -} - -/* mapping[i] is equivalent network mapping between the old - * communicator and the new communicator. Index 'i' in the old - * communicator has the same network address as 'mapping[i]' in the - * new communicator. */ -/* WARNING: local_mapping and remote_mapping are stored in this - * function. 
The caller is responsible for their storage and will - * need to retain them till Comm_commit. */ -int MPII_Comm_create_map(int local_n, - int remote_n, - int *local_mapping, - int *remote_mapping, MPIR_Comm * mapping_comm, MPIR_Comm * newcomm) -{ - int mpi_errno = MPI_SUCCESS; - - MPIR_Comm_map_irregular(newcomm, mapping_comm, local_mapping, - local_n, MPIR_COMM_MAP_DIR__L2L, NULL); - if (mapping_comm->comm_kind == MPIR_COMM_KIND__INTERCOMM) { - MPIR_Comm_map_irregular(newcomm, mapping_comm, remote_mapping, - remote_n, MPIR_COMM_MAP_DIR__R2R, NULL); - } - return mpi_errno; -} - - /* comm create impl for intracommunicators, assumes that the standard error * checking has already taken place in the calling function */ int MPIR_Comm_create_intra(MPIR_Comm * comm_ptr, MPIR_Group * group_ptr, MPIR_Comm ** newcomm_ptr) @@ -297,6 +170,10 @@ int MPIR_Comm_create_intra(MPIR_Comm * comm_ptr, MPIR_Group * group_ptr, MPIR_Co MPIR_FUNC_ENTER; MPIR_Assert(comm_ptr->comm_kind == MPIR_COMM_KIND__INTRACOMM); +#ifdef HAVE_ERROR_CHECKING + mpi_errno = MPIR_Group_check_subset(group_ptr, comm_ptr); + MPIR_ERR_CHECK(mpi_errno); +#endif n = group_ptr->size; *newcomm_ptr = NULL; @@ -314,12 +191,6 @@ int MPIR_Comm_create_intra(MPIR_Comm * comm_ptr, MPIR_Group * group_ptr, MPIR_Co MPIR_Assert(new_context_id != 0); if (group_ptr->rank != MPI_UNDEFINED) { - MPIR_Comm *mapping_comm = NULL; - - mpi_errno = MPII_Comm_create_calculate_mapping(group_ptr, comm_ptr, - &mapping, &mapping_comm); - MPIR_ERR_CHECK(mpi_errno); - /* Get the new communicator structure and context id */ mpi_errno = MPIR_Comm_create(newcomm_ptr); @@ -340,11 +211,6 @@ int MPIR_Comm_create_intra(MPIR_Comm * comm_ptr, MPIR_Group * group_ptr, MPIR_Co MPIR_Comm_set_session_ptr(*newcomm_ptr, comm_ptr->session_ptr); - /* Setup the communicator's network address mapping. 
This is for the remote group, - * which is the same as the local group for intracommunicators */ - mpi_errno = MPII_Comm_create_map(n, 0, mapping, NULL, mapping_comm, *newcomm_ptr); - MPIR_ERR_CHECK(mpi_errno); - (*newcomm_ptr)->tainted = comm_ptr->tainted; mpi_errno = MPIR_Comm_commit(*newcomm_ptr); MPIR_ERR_CHECK(mpi_errno); @@ -376,10 +242,7 @@ int MPIR_Comm_create_intra(MPIR_Comm * comm_ptr, MPIR_Group * group_ptr, MPIR_Co int MPIR_Comm_create_inter(MPIR_Comm * comm_ptr, MPIR_Group * group_ptr, MPIR_Comm ** newcomm_ptr) { int mpi_errno = MPI_SUCCESS; - int new_context_id; - int *mapping = NULL; - MPIR_CHKLMEM_DECL(1); - + MPIR_CHKLMEM_DECL(2); MPIR_FUNC_ENTER; MPIR_Assert(comm_ptr->comm_kind == MPIR_COMM_KIND__INTERCOMM); @@ -397,15 +260,12 @@ int MPIR_Comm_create_inter(MPIR_Comm * comm_ptr, MPIR_Group * group_ptr, MPIR_Co if (!comm_ptr->local_comm) { MPII_Setup_intercomm_localcomm(comm_ptr); } + int new_context_id; mpi_errno = MPIR_Get_contextid_sparse(comm_ptr->local_comm, &new_context_id, FALSE); MPIR_ERR_CHECK(mpi_errno); MPIR_Assert(new_context_id != 0); MPIR_Assert(new_context_id != comm_ptr->recvcontext_id); - MPIR_Comm *mapping_comm; - mpi_errno = MPII_Comm_create_calculate_mapping(group_ptr, comm_ptr, &mapping, &mapping_comm); - MPIR_ERR_CHECK(mpi_errno); - /* There is an additional step. 
We must communicate the * information on the local context id and the group members, * given by the ranks so that the remote process can construct the @@ -416,6 +276,7 @@ int MPIR_Comm_create_inter(MPIR_Comm * comm_ptr, MPIR_Group * group_ptr, MPIR_Co * context in the original intercommunicator */ int remote_size = -1; + int context_id; int *remote_mapping; /* a list of remote ranks */ int rinfo[2]; @@ -428,11 +289,21 @@ int MPIR_Comm_create_inter(MPIR_Comm * comm_ptr, MPIR_Group * group_ptr, MPIR_Co rinfo, 2, MPI_INT, 0, 0, comm_ptr, MPI_STATUS_IGNORE, MPIR_ERR_NONE); MPIR_ERR_CHECK(mpi_errno); + context_id = rinfo[0]; remote_size = rinfo[1]; - MPIR_CHKLMEM_MALLOC(remote_mapping, int *, - remote_size * sizeof(int), - mpi_errno, "remote_mapping", MPL_MEM_ADDRESS); + int *mapping; + MPIR_CHKLMEM_MALLOC(mapping, int *, group_ptr->size * sizeof(int), + mpi_errno, "mapping", MPL_MEM_OTHER); + + /* effectively MPIR_Group_translate_ranks_impl */ + for (int i = 0; i < group_ptr->size; i++) { + MPIR_Lpid lpid = MPIR_Group_rank_to_lpid(group_ptr, i); + mapping[i] = MPIR_Group_lpid_to_rank(comm_ptr->local_group, lpid); + } + + MPIR_CHKLMEM_MALLOC(remote_mapping, int *, remote_size * sizeof(int), + mpi_errno, "remote_mapping", MPL_MEM_OTHER); /* Populate and exchange the ranks */ mpi_errno = MPIC_Sendrecv(mapping, group_ptr->size, MPI_INT, 0, 0, @@ -452,10 +323,12 @@ int MPIR_Comm_create_inter(MPIR_Comm * comm_ptr, MPIR_Group * group_ptr, MPIR_Co mpi_errno = MPIR_Bcast(rinfo, 2, MPI_INT, 0, comm_ptr->local_comm, MPIR_ERR_NONE); MPIR_ERR_CHECK(mpi_errno); + context_id = rinfo[0]; remote_size = rinfo[1]; - MPIR_CHKLMEM_MALLOC(remote_mapping, int *, - remote_size * sizeof(int), - mpi_errno, "remote_mapping", MPL_MEM_ADDRESS); + + MPIR_CHKLMEM_MALLOC(remote_mapping, int *, remote_size * sizeof(int), + mpi_errno, "remote_mapping", MPL_MEM_OTHER); + mpi_errno = MPIR_Bcast(remote_mapping, remote_size, MPI_INT, 0, comm_ptr->local_comm, MPIR_ERR_NONE); MPIR_ERR_CHECK(mpi_errno); @@ 
-478,54 +351,26 @@ int MPIR_Comm_create_inter(MPIR_Comm * comm_ptr, MPIR_Group * group_ptr, MPIR_Co if (group_ptr->rank != MPI_UNDEFINED) { /* Get the new communicator structure and context id */ mpi_errno = MPIR_Comm_create(newcomm_ptr); - if (mpi_errno) - goto fn_fail; + MPIR_ERR_CHECK(mpi_errno); - (*newcomm_ptr)->context_id = rinfo[0]; - (*newcomm_ptr)->remote_size = rinfo[1]; + (*newcomm_ptr)->context_id = context_id; + (*newcomm_ptr)->remote_size = remote_size; (*newcomm_ptr)->recvcontext_id = new_context_id; (*newcomm_ptr)->rank = group_ptr->rank; (*newcomm_ptr)->comm_kind = comm_ptr->comm_kind; /* Since the group has been provided, let the new communicator know * about the group */ (*newcomm_ptr)->local_comm = 0; + (*newcomm_ptr)->local_size = group_ptr->size; (*newcomm_ptr)->local_group = group_ptr; MPIR_Group_add_ref(group_ptr); - (*newcomm_ptr)->local_size = group_ptr->size; - (*newcomm_ptr)->remote_group = 0; + mpi_errno = MPIR_Group_incl_impl(comm_ptr->remote_group, rinfo[1], remote_mapping, + &(*newcomm_ptr)->remote_group); (*newcomm_ptr)->is_low_group = comm_ptr->is_low_group; MPIR_Comm_set_session_ptr(*newcomm_ptr, session_ptr); - } - - if (group_ptr->rank != MPI_UNDEFINED) { - /* Now, everyone has the remote_mapping, and can apply that to - * the network address mapping. */ - - /* Setup the communicator's network addresses from the local mapping. */ - mpi_errno = MPII_Comm_create_map(group_ptr->size, - remote_size, - mapping, remote_mapping, mapping_comm, *newcomm_ptr); - MPIR_ERR_CHECK(mpi_errno); - - /* create remote_group. 
- * FIXME: we can directly exchange group maps once we get rid of comm mappers */ - MPIR_Group *remote_group; - - MPIR_Lpid *remote_map; - remote_map = MPL_malloc(remote_size * sizeof(MPIR_Lpid), MPL_MEM_GROUP); - MPIR_ERR_CHKANDJUMP(!remote_map, mpi_errno, MPI_ERR_OTHER, "**nomem"); - - MPIR_Group *mapping_group = mapping_comm->remote_group; - MPIR_Assert(mapping_group); - for (int i = 0; i < remote_size; i++) { - remote_map[i] = MPIR_Group_rank_to_lpid(mapping_group, remote_mapping[i]); - } - mpi_errno = MPIR_Group_create_map(remote_size, MPI_UNDEFINED, session_ptr, remote_map, - &remote_group); - (*newcomm_ptr)->remote_group = remote_group; (*newcomm_ptr)->tainted = comm_ptr->tainted; mpi_errno = MPIR_Comm_commit(*newcomm_ptr); @@ -534,8 +379,6 @@ int MPIR_Comm_create_inter(MPIR_Comm * comm_ptr, MPIR_Group * group_ptr, MPIR_Co fn_exit: MPIR_CHKLMEM_FREEALL(); - MPL_free(mapping); - MPIR_FUNC_EXIT; return mpi_errno; fn_fail: @@ -567,7 +410,6 @@ int MPIR_Comm_create_group_impl(MPIR_Comm * comm_ptr, MPIR_Group * group_ptr, in { int mpi_errno = MPI_SUCCESS; int new_context_id = 0; - int *mapping = NULL; int n; MPIR_FUNC_ENTER; @@ -583,8 +425,6 @@ int MPIR_Comm_create_group_impl(MPIR_Comm * comm_ptr, MPIR_Group * group_ptr, in /* Create a new communicator from the specified group members */ if (group_ptr->rank != MPI_UNDEFINED) { - MPIR_Comm *mapping_comm = NULL; - /* For this routine, creation of the id is collective over the input *group*, so processes not in the group do not participate. 
*/ @@ -592,10 +432,6 @@ int MPIR_Comm_create_group_impl(MPIR_Comm * comm_ptr, MPIR_Group * group_ptr, in MPIR_ERR_CHECK(mpi_errno); MPIR_Assert(new_context_id != 0); - mpi_errno = MPII_Comm_create_calculate_mapping(group_ptr, comm_ptr, - &mapping, &mapping_comm); - MPIR_ERR_CHECK(mpi_errno); - /* Get the new communicator structure and context id */ mpi_errno = MPIR_Comm_create(newcomm_ptr); @@ -616,11 +452,6 @@ int MPIR_Comm_create_group_impl(MPIR_Comm * comm_ptr, MPIR_Group * group_ptr, in MPIR_Comm_set_session_ptr(*newcomm_ptr, group_ptr->session_ptr); - /* Setup the communicator's vc table. This is for the remote group, - * which is the same as the local group for intracommunicators */ - mpi_errno = MPII_Comm_create_map(n, 0, mapping, NULL, mapping_comm, *newcomm_ptr); - MPIR_ERR_CHECK(mpi_errno); - (*newcomm_ptr)->tainted = comm_ptr->tainted; mpi_errno = MPIR_Comm_commit(*newcomm_ptr); MPIR_ERR_CHECK(mpi_errno); @@ -630,8 +461,6 @@ int MPIR_Comm_create_group_impl(MPIR_Comm * comm_ptr, MPIR_Group * group_ptr, in } fn_exit: - MPL_free(mapping); - MPIR_FUNC_EXIT; return mpi_errno; fn_fail: @@ -979,7 +808,7 @@ static int determine_low_group(MPIR_Lpid remote_lpid, bool * is_low_group_out) /* different world, compare namespace */ int cmp_result = strncmp(MPIR_Worlds[my_world_idx].namespace, MPIR_Worlds[remote_world_idx].namespace, - MPIR_MAX_WORLDS); + MPIR_NAMESPACE_MAX); MPIR_Assert(cmp_result != 0); if (cmp_result < 0) *is_low_group_out = false; @@ -987,10 +816,7 @@ static int determine_low_group(MPIR_Lpid remote_lpid, bool * is_low_group_out) *is_low_group_out = true; } - fn_exit: return mpi_errno; - fn_fail: - goto fn_exit; } int MPIR_Intercomm_create_impl(MPIR_Comm * local_comm_ptr, int local_leader, @@ -1008,7 +834,6 @@ int MPIR_Intercomm_create_timeout(MPIR_Comm * local_comm_ptr, int local_leader, int mpi_errno = MPI_SUCCESS; int remote_size = 0; MPIR_Lpid *remote_lpids = NULL; - int comm_info[3]; MPIR_Session *session_ptr = local_comm_ptr->session_ptr; 
MPIR_FUNC_ENTER; @@ -1071,8 +896,6 @@ int MPIR_Intercomm_create_timeout(MPIR_Comm * local_comm_ptr, int local_leader, mpi_errno = MPID_Create_intercomm_from_lpids(*new_intercomm_ptr, remote_size, remote_lpids); MPIR_ERR_CHECK(mpi_errno); - MPIR_Comm_map_dup(*new_intercomm_ptr, local_comm_ptr, MPIR_COMM_MAP_DIR__L2L); - /* Inherit the error handler (if any) */ MPID_THREAD_CS_ENTER(VCI, local_comm_ptr->mutex); (*new_intercomm_ptr)->errhandler = local_comm_ptr->errhandler; @@ -1096,38 +919,6 @@ int MPIR_Intercomm_create_timeout(MPIR_Comm * local_comm_ptr, int local_leader, goto fn_exit; } -/* This function creates mapping for new communicator - * basing on network addresses of existing communicator. - */ - -static int create_and_map(MPIR_Comm * comm_ptr, int local_high, MPIR_Comm * new_intracomm_ptr) -{ - int mpi_errno = MPI_SUCCESS; - int i; - - /* Now we know which group comes first. Build the new mapping - * from the existing comm */ - if (local_high) { - /* remote group first */ - MPIR_Comm_map_dup(new_intracomm_ptr, comm_ptr, MPIR_COMM_MAP_DIR__R2L); - - MPIR_Comm_map_dup(new_intracomm_ptr, comm_ptr, MPIR_COMM_MAP_DIR__L2L); - for (i = 0; i < comm_ptr->local_size; i++) - if (i == comm_ptr->rank) - new_intracomm_ptr->rank = comm_ptr->remote_size + i; - } else { - /* local group first */ - MPIR_Comm_map_dup(new_intracomm_ptr, comm_ptr, MPIR_COMM_MAP_DIR__L2L); - for (i = 0; i < comm_ptr->local_size; i++) - if (i == comm_ptr->rank) - new_intracomm_ptr->rank = i; - - MPIR_Comm_map_dup(new_intracomm_ptr, comm_ptr, MPIR_COMM_MAP_DIR__R2L); - } - - return mpi_errno; -} - int MPIR_Intercomm_merge_impl(MPIR_Comm * comm_ptr, int high, MPIR_Comm ** new_intracomm_ptr) { int mpi_errno = MPI_SUCCESS; @@ -1197,8 +988,8 @@ int MPIR_Intercomm_merge_impl(MPIR_Comm * comm_ptr, int high, MPIR_Comm ** new_i } (*new_intracomm_ptr)->recvcontext_id = (*new_intracomm_ptr)->context_id; (*new_intracomm_ptr)->remote_size = (*new_intracomm_ptr)->local_size = new_size; - 
(*new_intracomm_ptr)->rank = -1; (*new_intracomm_ptr)->comm_kind = MPIR_COMM_KIND__INTRACOMM; + (*new_intracomm_ptr)->remote_group = NULL; MPIR_Comm_set_session_ptr(*new_intracomm_ptr, comm_ptr->session_ptr); @@ -1233,10 +1024,7 @@ int MPIR_Intercomm_merge_impl(MPIR_Comm * comm_ptr, int high, MPIR_Comm ** new_i (*new_intracomm_ptr)->local_group = new_local_group; MPIR_Group_add_ref(new_local_group); - /* Now we know which group comes first. Build the new mapping - * from the existing comm */ - mpi_errno = create_and_map(comm_ptr, local_high, (*new_intracomm_ptr)); - MPIR_ERR_CHECK(mpi_errno); + (*new_intracomm_ptr)->rank = myrank; /* We've setup a temporary context id, based on the context id * used by the intercomm. This allows us to perform the allreduce @@ -1265,17 +1053,15 @@ int MPIR_Intercomm_merge_impl(MPIR_Comm * comm_ptr, int high, MPIR_Comm ** new_i MPIR_ERR_CHECK(mpi_errno); (*new_intracomm_ptr)->remote_size = (*new_intracomm_ptr)->local_size = new_size; - (*new_intracomm_ptr)->rank = -1; + (*new_intracomm_ptr)->rank = myrank; (*new_intracomm_ptr)->comm_kind = MPIR_COMM_KIND__INTRACOMM; (*new_intracomm_ptr)->context_id = new_context_id; (*new_intracomm_ptr)->recvcontext_id = new_context_id; + (*new_intracomm_ptr)->remote_group = NULL; MPIR_Comm_set_session_ptr(*new_intracomm_ptr, comm_ptr->session_ptr); (*new_intracomm_ptr)->local_group = new_local_group; - mpi_errno = create_and_map(comm_ptr, local_high, (*new_intracomm_ptr)); - MPIR_ERR_CHECK(mpi_errno); - (*new_intracomm_ptr)->tainted = 1; mpi_errno = MPIR_Comm_commit((*new_intracomm_ptr)); MPIR_ERR_CHECK(mpi_errno); diff --git a/src/mpi/comm/comm_split.c b/src/mpi/comm/comm_split.c index 4c0e5a826c2..3d3d95187de 100644 --- a/src/mpi/comm/comm_split.c +++ b/src/mpi/comm/comm_split.c @@ -89,7 +89,6 @@ int MPIR_Comm_split_impl(MPIR_Comm * comm_ptr, int color, int key, MPIR_Comm ** first_entry = 0, first_remote_entry = 0, *last_ptr; int in_newcomm; /* TRUE iff *newcomm should be populated */ int 
new_context_id, remote_context_id; - MPIR_Comm_map_t *mapper; MPIR_CHKLMEM_DECL(4); rank = comm_ptr->rank; @@ -283,18 +282,20 @@ int MPIR_Comm_split_impl(MPIR_Comm * comm_ptr, int color, int key, MPIR_Comm ** * corresponding process in the input communicator */ MPIU_Sort_inttable(remotekeytable, new_remote_size); - MPIR_Comm_map_irregular(*newcomm_ptr, comm_ptr, NULL, - new_size, MPIR_COMM_MAP_DIR__L2L, &mapper); + int *local_ranks; + local_ranks = MPL_malloc(new_size * sizeof(int), MPL_MEM_OTHER); + MPIR_ERR_CHKANDJUMP(!local_ranks, mpi_errno, MPI_ERR_OTHER, "**nomem"); for (i = 0; i < new_size; i++) { - mapper->src_mapping[i] = keytable[i].color; + local_ranks[i] = keytable[i].color; if (keytable[i].color == comm_ptr->rank) (*newcomm_ptr)->rank = i; } - mpi_errno = MPIR_Group_incl_impl(comm_ptr->local_group, new_size, mapper->src_mapping, + mpi_errno = MPIR_Group_incl_impl(comm_ptr->local_group, new_size, local_ranks, &(*newcomm_ptr)->local_group); MPIR_ERR_CHECK(mpi_errno); + MPL_free(local_ranks); /* For the remote group, the situation is more complicated. * We need to find the size of our "partner" group in the @@ -311,16 +312,19 @@ int MPIR_Comm_split_impl(MPIR_Comm * comm_ptr, int color, int key, MPIR_Comm ** * is required to return MPI_COMM_NULL instead of an intercomm * with an empty remote group. 
*/ - MPIR_Comm_map_irregular(*newcomm_ptr, comm_ptr, NULL, - new_remote_size, MPIR_COMM_MAP_DIR__R2R, &mapper); + int *remote_ranks; + remote_ranks = MPL_malloc(new_remote_size * sizeof(int), MPL_MEM_OTHER); + MPIR_ERR_CHKANDJUMP(!remote_ranks, mpi_errno, MPI_ERR_OTHER, "**nomem"); - for (i = 0; i < new_remote_size; i++) - mapper->src_mapping[i] = remotekeytable[i].color; + for (i = 0; i < new_remote_size; i++) { + remote_ranks[i] = remotekeytable[i].color; + } mpi_errno = MPIR_Group_incl_impl(comm_ptr->remote_group, - new_remote_size, mapper->src_mapping, + new_remote_size, remote_ranks, &(*newcomm_ptr)->remote_group); MPIR_ERR_CHECK(mpi_errno); + MPL_free(remote_ranks); (*newcomm_ptr)->context_id = remote_context_id; (*newcomm_ptr)->remote_size = new_remote_size; @@ -332,18 +336,20 @@ int MPIR_Comm_split_impl(MPIR_Comm * comm_ptr, int color, int key, MPIR_Comm ** (*newcomm_ptr)->context_id = (*newcomm_ptr)->recvcontext_id; (*newcomm_ptr)->remote_size = new_size; - MPIR_Comm_map_irregular(*newcomm_ptr, comm_ptr, NULL, - new_size, MPIR_COMM_MAP_DIR__L2L, &mapper); + int *local_ranks; + local_ranks = MPL_malloc(new_size * sizeof(int), MPL_MEM_OTHER); + MPIR_ERR_CHKANDJUMP(!local_ranks, mpi_errno, MPI_ERR_OTHER, "**nomem"); for (i = 0; i < new_size; i++) { - mapper->src_mapping[i] = keytable[i].color; + local_ranks[i] = keytable[i].color; if (keytable[i].color == comm_ptr->rank) (*newcomm_ptr)->rank = i; } - mpi_errno = MPIR_Group_incl_impl(comm_ptr->local_group, new_size, mapper->src_mapping, + mpi_errno = MPIR_Group_incl_impl(comm_ptr->local_group, new_size, local_ranks, &(*newcomm_ptr)->local_group); MPIR_ERR_CHECK(mpi_errno); + MPL_free(local_ranks); } /* Inherit the error handler (if any) */ diff --git a/src/mpi/comm/commutil.c b/src/mpi/comm/commutil.c index da824bff420..df0540087bc 100644 --- a/src/mpi/comm/commutil.c +++ b/src/mpi/comm/commutil.c @@ -309,8 +309,6 @@ int MPII_Comm_init(MPIR_Comm * comm_p) /* Initialize the revoked flag as false */ 
comm_p->revoked = 0; - comm_p->mapper_head = NULL; - comm_p->mapper_tail = NULL; comm_p->threadcomm = NULL; MPIR_stream_comm_init(comm_p); @@ -407,10 +405,6 @@ int MPII_Setup_intercomm_localcomm(MPIR_Comm * intercomm_ptr) localcomm_ptr->local_size = intercomm_ptr->local_size; localcomm_ptr->rank = intercomm_ptr->rank; - MPIR_Comm_map_dup(localcomm_ptr, intercomm_ptr, MPIR_COMM_MAP_DIR__L2L); - - /* TODO More advanced version: if the group is available, dup it by - * increasing the reference count instead of recreating it later */ /* FIXME : No local functions for the topology routines */ intercomm_ptr->local_comm = localcomm_ptr; @@ -428,99 +422,6 @@ int MPII_Setup_intercomm_localcomm(MPIR_Comm * intercomm_ptr) return mpi_errno; } -int MPIR_Comm_map_irregular(MPIR_Comm * newcomm, MPIR_Comm * src_comm, - int *src_mapping, int src_mapping_size, - MPIR_Comm_map_dir_t dir, MPIR_Comm_map_t ** map) -{ - int mpi_errno = MPI_SUCCESS; - MPIR_Comm_map_t *mapper; - MPIR_CHKPMEM_DECL(3); - - MPIR_FUNC_ENTER; - - MPIR_CHKPMEM_MALLOC(mapper, MPIR_Comm_map_t *, sizeof(MPIR_Comm_map_t), mpi_errno, "mapper", - MPL_MEM_COMM); - - mapper->type = MPIR_COMM_MAP_TYPE__IRREGULAR; - mapper->src_comm = src_comm; - mapper->dir = dir; - mapper->src_mapping_size = src_mapping_size; - - if (src_mapping) { - mapper->src_mapping = src_mapping; - mapper->free_mapping = 0; - } else { - MPIR_CHKPMEM_MALLOC(mapper->src_mapping, int *, - src_mapping_size * sizeof(int), mpi_errno, "mapper mapping", - MPL_MEM_COMM); - mapper->free_mapping = 1; - } - - mapper->next = NULL; - - LL_APPEND(newcomm->mapper_head, newcomm->mapper_tail, mapper); - - if (map) - *map = mapper; - - fn_exit: - MPIR_CHKPMEM_COMMIT(); - MPIR_FUNC_EXIT; - return mpi_errno; - fn_fail: - MPIR_CHKPMEM_REAP(); - goto fn_exit; -} - -int MPIR_Comm_map_dup(MPIR_Comm * newcomm, MPIR_Comm * src_comm, MPIR_Comm_map_dir_t dir) -{ - int mpi_errno = MPI_SUCCESS; - MPIR_Comm_map_t *mapper; - MPIR_CHKPMEM_DECL(1); - - MPIR_FUNC_ENTER; - - 
MPIR_CHKPMEM_MALLOC(mapper, MPIR_Comm_map_t *, sizeof(MPIR_Comm_map_t), mpi_errno, "mapper", - MPL_MEM_COMM); - - mapper->type = MPIR_COMM_MAP_TYPE__DUP; - mapper->src_comm = src_comm; - mapper->dir = dir; - - mapper->next = NULL; - - LL_APPEND(newcomm->mapper_head, newcomm->mapper_tail, mapper); - - fn_exit: - MPIR_CHKPMEM_COMMIT(); - MPIR_FUNC_EXIT; - return mpi_errno; - fn_fail: - MPIR_CHKPMEM_REAP(); - goto fn_exit; -} - - -int MPIR_Comm_map_free(MPIR_Comm * comm) -{ - int mpi_errno = MPI_SUCCESS; - MPIR_Comm_map_t *mapper, *tmp; - - MPIR_FUNC_ENTER; - - for (mapper = comm->mapper_head; mapper;) { - tmp = mapper->next; - if (mapper->type == MPIR_COMM_MAP_TYPE__IRREGULAR && mapper->free_mapping) - MPL_free(mapper->src_mapping); - MPL_free(mapper); - mapper = tmp; - } - comm->mapper_head = NULL; - - MPIR_FUNC_EXIT; - return mpi_errno; -} - static int get_node_count(MPIR_Comm * comm, int *node_count) { int mpi_errno = MPI_SUCCESS; @@ -585,8 +486,6 @@ static int MPIR_Comm_commit_internal(MPIR_Comm * comm) mpi_errno = get_node_count(comm, &comm->node_count); MPIR_ERR_CHECK(mpi_errno); - MPIR_Comm_map_free(comm); - fn_exit: MPIR_FUNC_EXIT; return mpi_errno; @@ -698,9 +597,6 @@ int MPIR_Comm_create_subcomms(MPIR_Comm * comm) &comm->node_comm->local_group); MPIR_ERR_CHECK(mpi_errno); - /* mapper */ - MPIR_Comm_map_irregular(comm->node_comm, comm, local_procs, num_local, - MPIR_COMM_MAP_DIR__L2L, NULL); mpi_errno = MPIR_Comm_commit_internal(comm->node_comm); MPIR_ERR_CHECK(mpi_errno); } @@ -733,9 +629,6 @@ int MPIR_Comm_create_subcomms(MPIR_Comm * comm) &comm->node_roots_comm->local_group); MPIR_ERR_CHECK(mpi_errno); - /* mapper */ - MPIR_Comm_map_irregular(comm->node_roots_comm, comm, external_procs, num_external, - MPIR_COMM_MAP_DIR__L2L, NULL); mpi_errno = MPIR_Comm_commit_internal(comm->node_roots_comm); MPIR_ERR_CHECK(mpi_errno); } @@ -939,7 +832,6 @@ int MPII_Comm_copy(MPIR_Comm * comm_ptr, int size, MPIR_Info * info, MPIR_Comm * int mpi_errno = MPI_SUCCESS; int 
new_context_id, new_recvcontext_id; MPIR_Comm *newcomm_ptr = NULL; - MPIR_Comm_map_t *map = NULL; MPIR_FUNC_ENTER; @@ -990,37 +882,6 @@ int MPII_Comm_copy(MPIR_Comm * comm_ptr, int size, MPIR_Info * info, MPIR_Comm * MPIR_Comm_set_session_ptr(newcomm_ptr, comm_ptr->session_ptr); - /* There are two cases here - size is the same as the old communicator, - * or it is smaller. If the size is the same, we can just add a reference. - * Otherwise, we need to create a new network address mapping. Note that this is the - * test that matches the test on rank above. */ - if (size == comm_ptr->local_size) { - /* Duplicate the network address mapping */ - if (comm_ptr->comm_kind == MPIR_COMM_KIND__INTRACOMM) - MPIR_Comm_map_dup(newcomm_ptr, comm_ptr, MPIR_COMM_MAP_DIR__L2L); - else - MPIR_Comm_map_dup(newcomm_ptr, comm_ptr, MPIR_COMM_MAP_DIR__R2R); - } else { - int i; - - if (comm_ptr->comm_kind == MPIR_COMM_KIND__INTRACOMM) - MPIR_Comm_map_irregular(newcomm_ptr, comm_ptr, NULL, size, MPIR_COMM_MAP_DIR__L2L, - &map); - else - MPIR_Comm_map_irregular(newcomm_ptr, comm_ptr, NULL, size, MPIR_COMM_MAP_DIR__R2R, - &map); - for (i = 0; i < size; i++) { - /* For rank i in the new communicator, find the corresponding - * rank in the input communicator */ - map->src_mapping[i] = i; - } - } - - /* If it is an intercomm, duplicate the local network address references */ - if (comm_ptr->comm_kind == MPIR_COMM_KIND__INTERCOMM) { - MPIR_Comm_map_dup(newcomm_ptr, comm_ptr, MPIR_COMM_MAP_DIR__L2L); - } - /* Set the sizes and ranks */ newcomm_ptr->rank = comm_ptr->rank; if (comm_ptr->comm_kind == MPIR_COMM_KIND__INTERCOMM) { @@ -1093,16 +954,6 @@ int MPII_Comm_copy_data(MPIR_Comm * comm_ptr, MPIR_Info * info, MPIR_Comm ** out MPIR_Group_add_ref(comm_ptr->remote_group); } - if (comm_ptr->comm_kind == MPIR_COMM_KIND__INTRACOMM) - MPIR_Comm_map_dup(newcomm_ptr, comm_ptr, MPIR_COMM_MAP_DIR__L2L); - else - MPIR_Comm_map_dup(newcomm_ptr, comm_ptr, MPIR_COMM_MAP_DIR__R2R); - - /* If it is an 
intercomm, duplicate the network address mapping */ - if (comm_ptr->comm_kind == MPIR_COMM_KIND__INTERCOMM) { - MPIR_Comm_map_dup(newcomm_ptr, comm_ptr, MPIR_COMM_MAP_DIR__L2L); - } - /* Set the sizes and ranks */ newcomm_ptr->rank = comm_ptr->rank; newcomm_ptr->local_size = comm_ptr->local_size; diff --git a/src/mpi/comm/contextid.c b/src/mpi/comm/contextid.c index 9fab45bc789..c0a72060eca 100644 --- a/src/mpi/comm/contextid.c +++ b/src/mpi/comm/contextid.c @@ -753,7 +753,7 @@ static int sched_cb_gcn_allocate_cid(MPIR_Comm * comm, int tag, void *state) * Therefore, we set tag_up as lower bound for the operation. tag_ub is used by * most of the other blocking operations, but tag is always >0, so this * should be fine. - * 2.) We need odering between multiple idup operations on the same communicator. + * 2.) We need ordering between multiple idup operations on the same communicator. * The problem here is that the iallreduce operations of the first iteration * are not necessarily completed in the same order as they are issued, also on the * same communicator. To avoid deadlocks, we cannot add the elements to the @@ -790,7 +790,6 @@ static int sched_cb_gcn_allocate_cid(MPIR_Comm * comm, int tag, void *state) /* In the case of failure, the new communicator was half created. * So we need to clean the memory allocated for it. 
*/ MPII_COMML_FORGET(st->new_comm); - MPIR_Comm_map_free(st->new_comm); MPIR_Handle_obj_free(&MPIR_Comm_mem, st->new_comm); MPL_free(st); goto fn_exit; diff --git a/src/mpid/ch4/src/ch4_comm.c b/src/mpid/ch4/src/ch4_comm.c index 5b5cc3e9def..96fa8b44e6b 100644 --- a/src/mpid/ch4/src/ch4_comm.c +++ b/src/mpid/ch4/src/ch4_comm.c @@ -875,9 +875,6 @@ int MPIDI_Comm_create_multi_leaders(MPIR_Comm * comm) MPL_pof2(MPIDI_COMM(comm, multi_leads_comm)->local_size); MPIDI_COMM(comm, multi_leads_comm)->remote_size = num_external; - MPIR_Comm_map_irregular(MPIDI_COMM(comm, multi_leads_comm), comm, - external_procs, num_external, MPIR_COMM_MAP_DIR__L2L, NULL); - mpi_errno = MPIR_Group_incl_impl(comm->local_group, num_external, external_procs, &MPIDI_COMM(comm, multi_leads_comm)->local_group); MPIR_ERR_CHECK(mpi_errno); @@ -896,8 +893,6 @@ int MPIDI_Comm_create_multi_leaders(MPIR_Comm * comm) mpi_errno = MPID_Comm_commit_post_hook(MPIDI_COMM(comm, multi_leads_comm)); if (mpi_errno) MPIR_ERR_CHECK(mpi_errno); - - MPIR_Comm_map_free(MPIDI_COMM(comm, multi_leads_comm)); } } From 29b4dfd080e8300f8fd3c3cb8727318768481638 Mon Sep 17 00:00:00 2001 From: Hui Zhou Date: Fri, 20 Dec 2024 10:37:34 -0600 Subject: [PATCH 52/59] ch4: remove MPIDI_rank_map_t We have switched to use MPIR_Lpid to address in ch4 av table manager. Both map and local_map in ch4 MPIDI_Devcomm_t no longer needed. 
--- src/mpid/ch4/include/mpidpre.h | 65 --- src/mpid/ch4/src/ch4_comm.c | 119 +---- src/mpid/ch4/src/ch4i_comm.c | 900 --------------------------------- src/mpid/ch4/src/ch4i_comm.h | 1 - src/mpid/ch4/src/init_comm.c | 11 - 5 files changed, 2 insertions(+), 1094 deletions(-) diff --git a/src/mpid/ch4/include/mpidpre.h b/src/mpid/ch4/include/mpidpre.h index 7bea7588df9..7a4010ff95c 100644 --- a/src/mpid/ch4/include/mpidpre.h +++ b/src/mpid/ch4/include/mpidpre.h @@ -561,69 +561,6 @@ typedef struct MPIDIG_comm_t { #endif } MPIDIG_comm_t; -#define MPIDI_CALC_STRIDE(rank, stride, blocksize, offset) \ - ((rank) / (blocksize) * ((stride) - (blocksize)) + (rank) + (offset)) - -#define MPIDI_CALC_STRIDE_SIMPLE(rank, stride, offset) \ - ((rank) * (stride) + (offset)) - -typedef enum { - MPIDI_RANK_MAP_DIRECT, - MPIDI_RANK_MAP_DIRECT_INTRA, - MPIDI_RANK_MAP_OFFSET, - MPIDI_RANK_MAP_OFFSET_INTRA, - MPIDI_RANK_MAP_STRIDE, - MPIDI_RANK_MAP_STRIDE_INTRA, - MPIDI_RANK_MAP_STRIDE_BLOCK, - MPIDI_RANK_MAP_STRIDE_BLOCK_INTRA, - MPIDI_RANK_MAP_LUT, - MPIDI_RANK_MAP_LUT_INTRA, - MPIDI_RANK_MAP_MLUT, - MPIDI_RANK_MAP_NONE -} MPIDI_rank_map_mode; - -typedef int MPIDI_lpid_t; -typedef struct { - int avtid; - int lpid; -} MPIDI_gpid_t; - -typedef struct { - MPIR_cc_t ref_count; - MPIDI_lpid_t lpid[]; -} MPIDI_rank_map_lut_t; - -typedef struct { - MPIR_cc_t ref_count; - MPIDI_gpid_t gpid[]; -} MPIDI_rank_map_mlut_t; - -typedef struct { - MPIDI_rank_map_mode mode; - int avtid; - int size; - - union { - int offset; - struct { - int offset; - int stride; - int blocksize; - } stride; - } reg; - - union { - struct { - MPIDI_rank_map_lut_t *t; - MPIDI_lpid_t *lpid; - } lut; - struct { - MPIDI_rank_map_mlut_t *t; - MPIDI_gpid_t *gpid; - } mlut; - } irreg; -} MPIDI_rank_map_t; - typedef struct MPIDI_Devcomm_t { struct { /* The first fields are used by the AM(MPIDIG) apis */ @@ -638,8 +575,6 @@ typedef struct MPIDI_Devcomm_t { MPIDI_SHM_COMM_DECL} shm; #endif - MPIDI_rank_map_t map; - 
MPIDI_rank_map_t local_map; struct MPIR_Comm *multi_leads_comm; /* sub communicators related for multi-leaders based implementation */ struct MPIR_Comm *inter_node_leads_comm, *sub_node_comm, *intra_node_leads_comm; diff --git a/src/mpid/ch4/src/ch4_comm.c b/src/mpid/ch4/src/ch4_comm.c index 96fa8b44e6b..c84bc1ea89f 100644 --- a/src/mpid/ch4/src/ch4_comm.c +++ b/src/mpid/ch4/src/ch4_comm.c @@ -118,23 +118,6 @@ int MPIDI_Comm_split_type(MPIR_Comm * user_comm_ptr, int split_type, int key, MP /* --END ERROR HANDLING-- */ } -static void mlut_update_avt_reference(int size, MPIDI_gpid_t * gpid, bool is_release) -{ - int n_avts = MPIDIU_get_n_avts(); - int *uniq_avtids = (int *) MPL_calloc(n_avts, sizeof(int), MPL_MEM_ADDRESS); - for (int i = 0; i < size; i++) { - if (uniq_avtids[gpid[i].avtid] == 0) { - uniq_avtids[gpid[i].avtid] = 1; - if (is_release) { - MPIDIU_avt_release_ref(gpid[i].avtid); - } else { - MPIDIU_avt_add_ref(gpid[i].avtid); - } - } - } - MPL_free(uniq_avtids); -} - int MPID_Comm_commit_pre_hook(MPIR_Comm * comm) { int mpi_errno; @@ -144,45 +127,8 @@ int MPID_Comm_commit_pre_hook(MPIR_Comm * comm) MPIR_Assert(comm->comm_kind == MPIR_COMM_KIND__INTRACOMM || comm->remote_group); if (comm == MPIR_Process.comm_world) { - MPIDI_COMM(comm, map).mode = MPIDI_RANK_MAP_DIRECT_INTRA; - MPIDI_COMM(comm, map).avtid = 0; - MPIDI_COMM(comm, map).size = MPIR_Process.size; - MPIDI_COMM(comm, local_map).mode = MPIDI_RANK_MAP_NONE; - MPIDIU_avt_add_ref(0); - mpi_errno = MPIDI_world_pre_init(); MPIR_ERR_CHECK(mpi_errno); - } else if (comm == MPIR_Process.comm_self) { - MPIDI_COMM(comm, map).mode = MPIDI_RANK_MAP_OFFSET_INTRA; - MPIDI_COMM(comm, map).avtid = 0; - MPIDI_COMM(comm, map).size = 1; - MPIDI_COMM(comm, map).reg.offset = MPIR_Process.rank; - MPIDI_COMM(comm, local_map).mode = MPIDI_RANK_MAP_NONE; - MPIDIU_avt_add_ref(0); - } else { - MPIDI_comm_create_rank_map(comm); - /* add ref to avts */ - switch (MPIDI_COMM(comm, map).mode) { - case MPIDI_RANK_MAP_NONE: - 
break; - case MPIDI_RANK_MAP_MLUT: - mlut_update_avt_reference(MPIDI_COMM(comm, map).size, - MPIDI_COMM(comm, map).irreg.mlut.gpid, false); - break; - default: - MPIDIU_avt_add_ref(MPIDI_COMM(comm, map).avtid); - } - - switch (MPIDI_COMM(comm, local_map).mode) { - case MPIDI_RANK_MAP_NONE: - break; - case MPIDI_RANK_MAP_MLUT: - mlut_update_avt_reference(MPIDI_COMM(comm, local_map).size, - MPIDI_COMM(comm, local_map).irreg.mlut.gpid, false); - break; - default: - MPIDIU_avt_add_ref(MPIDI_COMM(comm, local_map).avtid); - } } MPIDI_COMM(comm, multi_leads_comm) = NULL; @@ -309,46 +255,6 @@ int MPID_Comm_free_hook(MPIR_Comm * comm) MPL_free(MPIDI_COMM(comm, allreduce_comp_info)); } - - - /* release ref to avts */ - switch (MPIDI_COMM(comm, map).mode) { - case MPIDI_RANK_MAP_NONE: - break; - case MPIDI_RANK_MAP_MLUT: - mlut_update_avt_reference(MPIDI_COMM(comm, map).size, - MPIDI_COMM(comm, map).irreg.mlut.gpid, true); - break; - default: - MPIDIU_avt_release_ref(MPIDI_COMM(comm, map).avtid); - } - - switch (MPIDI_COMM(comm, local_map).mode) { - case MPIDI_RANK_MAP_NONE: - break; - case MPIDI_RANK_MAP_MLUT: - mlut_update_avt_reference(MPIDI_COMM(comm, local_map).size, - MPIDI_COMM(comm, local_map).irreg.mlut.gpid, true); - break; - default: - MPIDIU_avt_release_ref(MPIDI_COMM(comm, local_map).avtid); - } - - if (MPIDI_COMM(comm, map).mode == MPIDI_RANK_MAP_LUT - || MPIDI_COMM(comm, map).mode == MPIDI_RANK_MAP_LUT_INTRA) { - MPIDIU_release_lut(MPIDI_COMM(comm, map).irreg.lut.t); - } - if (MPIDI_COMM(comm, local_map).mode == MPIDI_RANK_MAP_LUT - || MPIDI_COMM(comm, local_map).mode == MPIDI_RANK_MAP_LUT_INTRA) { - MPIDIU_release_lut(MPIDI_COMM(comm, local_map).irreg.lut.t); - } - if (MPIDI_COMM(comm, map).mode == MPIDI_RANK_MAP_MLUT) { - MPIDIU_release_mlut(MPIDI_COMM(comm, map).irreg.mlut.t); - } - if (MPIDI_COMM(comm, local_map).mode == MPIDI_RANK_MAP_MLUT) { - MPIDIU_release_mlut(MPIDI_COMM(comm, local_map).irreg.mlut.t); - } - mpi_errno = 
MPIDI_NM_mpi_comm_free_hook(comm); MPIR_ERR_CHECK(mpi_errno); #ifndef MPIDI_CH4_DIRECT_NETMOD @@ -770,32 +676,11 @@ static int leader_exchange(MPIR_Comm * local_comm, MPIR_Lpid remote_lpid, int ta /* ---- */ int MPID_Create_intercomm_from_lpids(MPIR_Comm * newcomm_ptr, int size, const MPIR_Lpid lpids[]) { - int mpi_errno = MPI_SUCCESS, i; - MPIR_FUNC_ENTER; - - MPIDI_rank_map_mlut_t *mlut = NULL; - MPIDI_COMM(newcomm_ptr, map).mode = MPIDI_RANK_MAP_MLUT; - MPIDI_COMM(newcomm_ptr, map).avtid = -1; - mpi_errno = MPIDIU_alloc_mlut(&mlut, size); - MPIR_ERR_CHECK(mpi_errno); - MPIDI_COMM(newcomm_ptr, map).size = size; - MPIDI_COMM(newcomm_ptr, map).irreg.mlut.t = mlut; - MPIDI_COMM(newcomm_ptr, map).irreg.mlut.gpid = mlut->gpid; + int mpi_errno = MPI_SUCCESS; - for (i = 0; i < size; i++) { - MPIDI_COMM(newcomm_ptr, map).irreg.mlut.gpid[i].avtid = MPIDIU_GPID_GET_AVTID(lpids[i]); - MPIDI_COMM(newcomm_ptr, map).irreg.mlut.gpid[i].lpid = MPIDIU_GPID_GET_LPID(lpids[i]); - MPL_DBG_MSG_FMT(MPIDI_CH4_DBG_MAP, VERBOSE, - (MPL_DBG_FDEST, " remote rank=%d, avtid=%d, lpid=%d", i, - MPIDI_COMM(newcomm_ptr, map).irreg.mlut.gpid[i].avtid, - MPIDI_COMM(newcomm_ptr, map).irreg.mlut.gpid[i].lpid)); - } + /* Assuming MPID_Intercomm_exchange already called, nothing to do here. 
*/ - fn_exit: - MPIR_FUNC_EXIT; return mpi_errno; - fn_fail: - goto fn_exit; } /* Create multi-leaders communicator */ diff --git a/src/mpid/ch4/src/ch4i_comm.c b/src/mpid/ch4/src/ch4i_comm.c index d8a3fe3e9f6..a723dbfc858 100644 --- a/src/mpid/ch4/src/ch4i_comm.c +++ b/src/mpid/ch4/src/ch4i_comm.c @@ -7,906 +7,6 @@ #include "mpidch4r.h" #include "ch4i_comm.h" -enum MPIDI_src_mapper_models { - MPIDI_SRC_MAPPER_IRREGULAR = 0, - MPIDI_SRC_MAPPER_DIRECT = 1, - MPIDI_SRC_MAPPER_OFFSET = 2, - MPIDI_SRC_MAPPER_STRIDE = 3 -}; - -static int map_size(MPIR_Comm_map_t map); -static int detect_regular_model(int *lpid, int size, int *offset, int *blocksize, int *stride); -static int src_comm_to_mlut(MPIDI_rank_map_t * src, MPIDI_rank_map_t * dest, int size, - int total_mapper_size, int mapper_offset); -static int src_mlut_to_mlut(MPIDI_rank_map_t * src, MPIDI_rank_map_t * dest, - MPIR_Comm_map_t * mapper, int total_mapper_size, int mapper_offset); -static int src_map_to_lut(MPIDI_rank_map_t * src, MPIDI_rank_map_t * dest, MPIR_Comm_map_t * mapper, - int total_mapper_size, int mapper_offset); -static void direct_of_src_rmap(MPIDI_rank_map_t * src, MPIDI_rank_map_t * dest, - MPIR_Comm_map_t * mapper); -static void offset_of_src_rmap(MPIDI_rank_map_t * src, MPIDI_rank_map_t * dest, - MPIR_Comm_map_t * mapper, int offset); -static void stride_of_src_rmap(MPIDI_rank_map_t * src, MPIDI_rank_map_t * dest, - MPIR_Comm_map_t * mapper, int stride, int blocksize, int offset); -static int check_convert_mlut_to_lut(MPIDI_rank_map_t * src); -static int check_convert_lut_to_regular(MPIDI_rank_map_t * src); -static int set_map(MPIDI_rank_map_t * src_rmap, MPIDI_rank_map_t * dest_rmap, - MPIR_Comm_map_t * mapper, int src_comm_size, int total_mapper_size, - int mapper_offset); - -static int map_size(MPIR_Comm_map_t map) -{ - int ret = 0; - MPIR_FUNC_ENTER; - - if (map.type == MPIR_COMM_MAP_TYPE__IRREGULAR) - ret = map.src_mapping_size; - else if (map.dir == MPIR_COMM_MAP_DIR__L2L || map.dir == 
MPIR_COMM_MAP_DIR__L2R) - ret = map.src_comm->local_size; - else - ret = map.src_comm->remote_size; - - MPIR_FUNC_EXIT; - return ret; -} - -static int detect_regular_model(int *lpid, int size, int *offset, int *blocksize, int *stride) -{ - int off = 0, bs = 0, st = 0; - int i; - int ret = MPIDI_SRC_MAPPER_IRREGULAR; - - MPIR_FUNC_ENTER; - - if (size == 0) { - ret = MPIDI_SRC_MAPPER_DIRECT; - goto fn_exit; - } - - off = lpid[0]; - MPL_DBG_MSG_FMT(MPIDI_CH4_DBG_MAP, VERBOSE, (MPL_DBG_FDEST, "\tdetect model: offset %d", off)); - - for (i = 0; i < size; i++) { - if (lpid[i] != i + off) { - break; - } - bs++; - } - - MPL_DBG_MSG_FMT(MPIDI_CH4_DBG_MAP, VERBOSE, - (MPL_DBG_FDEST, "\tdetect model: blocksize %d", bs)); - if (bs == size) { - if (off == 0) { - ret = MPIDI_SRC_MAPPER_DIRECT; - goto fn_exit; - } else { - *offset = off; - ret = MPIDI_SRC_MAPPER_OFFSET; - goto fn_exit; - } - } - - /* blocksize less than total size, try if this is stride */ - st = lpid[bs] - lpid[0]; - if (st < 0 || st <= bs) { - ret = MPIDI_SRC_MAPPER_IRREGULAR; - goto fn_exit; - } - MPL_DBG_MSG_FMT(MPIDI_CH4_DBG_MAP, VERBOSE, (MPL_DBG_FDEST, "\tdetect model: stride %d", st)); - for (i = bs; i < size; i++) { - if (lpid[i] != MPIDI_CALC_STRIDE(i, st, bs, off)) { - ret = MPIDI_SRC_MAPPER_IRREGULAR; - goto fn_exit; - } - } - *offset = off; - *blocksize = bs; - *stride = st; - ret = MPIDI_SRC_MAPPER_STRIDE; - - fn_exit: - MPIR_FUNC_EXIT; - return ret; -} - -static int src_comm_to_mlut(MPIDI_rank_map_t * src, MPIDI_rank_map_t * dest, int size, - int total_mapper_size, int mapper_offset) -{ - int mpi_errno = MPI_SUCCESS, i; - MPIDI_rank_map_mlut_t *mlut = NULL; - - MPIR_FUNC_ENTER; - - if (!mapper_offset) { - mpi_errno = MPIDIU_alloc_mlut(&mlut, total_mapper_size); - MPIR_ERR_CHECK(mpi_errno); - dest->size = total_mapper_size; - dest->mode = MPIDI_RANK_MAP_MLUT; - dest->avtid = -1; - dest->irreg.mlut.t = mlut; - dest->irreg.mlut.gpid = mlut->gpid; - } - - MPL_DBG_MSG_FMT(MPIDI_CH4_DBG_MAP, VERBOSE, - 
(MPL_DBG_FDEST, " source mode %d", (int) src->mode)); - MPL_DBG_MSG_FMT(MPIDI_CH4_DBG_MAP, VERBOSE, (MPL_DBG_FDEST, " size %d", size)); - switch (src->mode) { - case MPIDI_RANK_MAP_DIRECT: - case MPIDI_RANK_MAP_DIRECT_INTRA: - for (i = 0; i < size; i++) { - dest->irreg.mlut.gpid[i + mapper_offset].lpid = i; - dest->irreg.mlut.gpid[i + mapper_offset].avtid = src->avtid; - } - break; - case MPIDI_RANK_MAP_OFFSET: - case MPIDI_RANK_MAP_OFFSET_INTRA: - for (i = 0; i < size; i++) { - dest->irreg.mlut.gpid[i + mapper_offset].lpid = i + src->reg.offset; - dest->irreg.mlut.gpid[i + mapper_offset].avtid = src->avtid; - } - MPL_DBG_MSG_FMT(MPIDI_CH4_DBG_MAP, VERBOSE, - (MPL_DBG_FDEST, " source offset %d", src->reg.offset)); - break; - case MPIDI_RANK_MAP_STRIDE: - case MPIDI_RANK_MAP_STRIDE_INTRA: - for (i = 0; i < size; i++) { - dest->irreg.mlut.gpid[i + mapper_offset].lpid = MPIDI_CALC_STRIDE_SIMPLE(i, - src->reg. - stride.stride, - src->reg. - stride.offset); - dest->irreg.mlut.gpid[i + mapper_offset].avtid = src->avtid; - } - MPL_DBG_MSG_FMT(MPIDI_CH4_DBG_MAP, VERBOSE, - (MPL_DBG_FDEST, " source stride %d blocksize %d offset %d", - src->reg.stride.stride, src->reg.stride.blocksize, - src->reg.stride.offset)); - break; - case MPIDI_RANK_MAP_STRIDE_BLOCK: - case MPIDI_RANK_MAP_STRIDE_BLOCK_INTRA: - for (i = 0; i < size; i++) { - dest->irreg.mlut.gpid[i + mapper_offset].lpid = MPIDI_CALC_STRIDE(i, - src->reg.stride. - stride, - src->reg.stride. - blocksize, - src->reg.stride. 
- offset); - dest->irreg.mlut.gpid[i + mapper_offset].avtid = src->avtid; - } - MPL_DBG_MSG_FMT(MPIDI_CH4_DBG_MAP, VERBOSE, - (MPL_DBG_FDEST, " source stride %d blocksize %d offset %d", - src->reg.stride.stride, src->reg.stride.blocksize, - src->reg.stride.offset)); - break; - case MPIDI_RANK_MAP_LUT: - case MPIDI_RANK_MAP_LUT_INTRA: - for (i = 0; i < size; i++) { - dest->irreg.mlut.gpid[i + mapper_offset].lpid = src->irreg.lut.lpid[i]; - dest->irreg.mlut.gpid[i + mapper_offset].avtid = src->avtid; - } - break; - case MPIDI_RANK_MAP_MLUT: - for (i = 0; i < size; i++) { - dest->irreg.mlut.gpid[i + mapper_offset].lpid = src->irreg.mlut.gpid[i].lpid; - dest->irreg.mlut.gpid[i + mapper_offset].avtid = src->irreg.mlut.gpid[i].avtid; - } - break; - case MPIDI_RANK_MAP_NONE: - MPIR_Assert(0); - break; - } - - fn_exit: - MPIR_FUNC_EXIT; - return mpi_errno; - fn_fail: - goto fn_exit; -} - -static int src_mlut_to_mlut(MPIDI_rank_map_t * src, MPIDI_rank_map_t * dest, - MPIR_Comm_map_t * mapper, int total_mapper_size, int mapper_offset) -{ - int mpi_errno = MPI_SUCCESS, i; - int size = map_size(*mapper); - MPIDI_rank_map_mlut_t *mlut = NULL; - - MPIR_FUNC_ENTER; - - if (!mapper_offset) { - mpi_errno = MPIDIU_alloc_mlut(&mlut, total_mapper_size); - MPIR_ERR_CHECK(mpi_errno); - dest->size = total_mapper_size; - } - - dest->mode = src->mode; - dest->irreg.mlut.t = mlut; - dest->irreg.mlut.gpid = mlut->gpid; - for (i = 0; i < size; i++) { - dest->irreg.mlut.gpid[i + mapper_offset].avtid = - src->irreg.mlut.gpid[mapper->src_mapping[i]].avtid; - dest->irreg.mlut.gpid[i + mapper_offset].lpid = - src->irreg.mlut.gpid[mapper->src_mapping[i]].lpid; - } - fn_exit: - MPL_DBG_MSG_FMT(MPIDI_CH4_DBG_MAP, VERBOSE, - (MPL_DBG_FDEST, " src mode %d, dest mode %d", - (int) src->mode, (int) dest->mode)); - MPIR_FUNC_EXIT; - return mpi_errno; - fn_fail: - goto fn_exit; -} - -static int src_map_to_lut(MPIDI_rank_map_t * src, MPIDI_rank_map_t * dest, MPIR_Comm_map_t * mapper, - int total_mapper_size, 
int mapper_offset) -{ - int mpi_errno = MPI_SUCCESS, i; - int size = map_size(*mapper); - MPIDI_rank_map_lut_t *lut = NULL; - - MPIR_FUNC_ENTER; - - if (!mapper_offset) { - mpi_errno = MPIDIU_alloc_lut(&lut, total_mapper_size); - MPIR_ERR_CHECK(mpi_errno); - dest->size = total_mapper_size; - } - - MPL_DBG_MSG_FMT(MPIDI_CH4_DBG_MAP, VERBOSE, - (MPL_DBG_FDEST, " source mode %d", (int) src->mode)); - MPL_DBG_MSG_FMT(MPIDI_CH4_DBG_MAP, VERBOSE, - (MPL_DBG_FDEST, " size %d, mapper->src_mapping_size %d", - size, mapper->src_mapping_size)); - dest->mode = MPIDI_RANK_MAP_LUT; - dest->avtid = src->avtid; - dest->irreg.lut.t = lut; - dest->irreg.lut.lpid = lut->lpid; - switch (src->mode) { - case MPIDI_RANK_MAP_DIRECT: - case MPIDI_RANK_MAP_DIRECT_INTRA: - for (i = 0; i < size; i++) { - dest->irreg.lut.lpid[i + mapper_offset] = mapper->src_mapping[i]; - } - break; - case MPIDI_RANK_MAP_OFFSET: - case MPIDI_RANK_MAP_OFFSET_INTRA: - for (i = 0; i < size; i++) { - dest->irreg.lut.lpid[i + mapper_offset] = mapper->src_mapping[i] + src->reg.offset; - } - MPL_DBG_MSG_FMT(MPIDI_CH4_DBG_MAP, VERBOSE, - (MPL_DBG_FDEST, " source offset %d", src->reg.offset)); - break; - case MPIDI_RANK_MAP_STRIDE: - case MPIDI_RANK_MAP_STRIDE_INTRA: - for (i = 0; i < size; i++) { - dest->irreg.lut.lpid[i + mapper_offset] = - MPIDI_CALC_STRIDE_SIMPLE(mapper->src_mapping[i], src->reg.stride.stride, - src->reg.stride.offset); - } - MPL_DBG_MSG_FMT(MPIDI_CH4_DBG_MAP, VERBOSE, - (MPL_DBG_FDEST, " source stride %d blocksize %d offset %d", - src->reg.stride.stride, src->reg.stride.blocksize, - src->reg.stride.offset)); - break; - case MPIDI_RANK_MAP_STRIDE_BLOCK: - case MPIDI_RANK_MAP_STRIDE_BLOCK_INTRA: - for (i = 0; i < size; i++) { - dest->irreg.lut.lpid[i + mapper_offset] = MPIDI_CALC_STRIDE(mapper->src_mapping[i], - src->reg.stride.stride, - src->reg. 
- stride.blocksize, - src->reg.stride.offset); - } - MPL_DBG_MSG_FMT(MPIDI_CH4_DBG_MAP, VERBOSE, - (MPL_DBG_FDEST, " source stride %d blocksize %d offset %d", - src->reg.stride.stride, src->reg.stride.blocksize, - src->reg.stride.offset)); - break; - case MPIDI_RANK_MAP_LUT: - case MPIDI_RANK_MAP_LUT_INTRA: - for (i = 0; i < size; i++) { - dest->irreg.lut.lpid[i + mapper_offset] = - src->irreg.lut.lpid[mapper->src_mapping[i]]; - } - break; - default: - mpi_errno = 1; - MPL_DBG_MSG_FMT(MPIDI_CH4_DBG_MAP, VERBOSE, - (MPL_DBG_FDEST, " cannot convert mode %d to lut", (int) src->mode)); - goto fn_fail; - } - fn_exit: - MPIR_FUNC_EXIT; - return mpi_errno; - fn_fail: - goto fn_exit; -} - -static void direct_of_src_rmap(MPIDI_rank_map_t * src, MPIDI_rank_map_t * dest, - MPIR_Comm_map_t * mapper) -{ - MPIR_FUNC_ENTER; - dest->mode = src->mode; - if (mapper) { - dest->size = map_size(*mapper); - } else { - dest->size = src->size; - } - dest->avtid = src->avtid; - MPL_DBG_MSG_FMT(MPIDI_CH4_DBG_MAP, VERBOSE, - (MPL_DBG_FDEST, " source mode %d", (int) src->mode)); - switch (src->mode) { - case MPIDI_RANK_MAP_DIRECT: - case MPIDI_RANK_MAP_DIRECT_INTRA: - break; - case MPIDI_RANK_MAP_OFFSET: - case MPIDI_RANK_MAP_OFFSET_INTRA: - dest->reg.offset = src->reg.offset; - break; - case MPIDI_RANK_MAP_STRIDE: - case MPIDI_RANK_MAP_STRIDE_INTRA: - case MPIDI_RANK_MAP_STRIDE_BLOCK: - case MPIDI_RANK_MAP_STRIDE_BLOCK_INTRA: - dest->reg.stride.stride = src->reg.stride.stride; - dest->reg.stride.blocksize = src->reg.stride.blocksize; - dest->reg.stride.offset = src->reg.stride.offset; - break; - case MPIDI_RANK_MAP_LUT: - case MPIDI_RANK_MAP_LUT_INTRA: - dest->irreg.lut.t = src->irreg.lut.t; - dest->irreg.lut.lpid = src->irreg.lut.lpid; - MPIDIU_lut_add_ref(src->irreg.lut.t); - break; - case MPIDI_RANK_MAP_MLUT: - dest->irreg.mlut.t = src->irreg.mlut.t; - dest->irreg.mlut.gpid = src->irreg.mlut.gpid; - MPIDIU_mlut_add_ref(src->irreg.mlut.t); - break; - case MPIDI_RANK_MAP_NONE: - 
MPIR_Assert(0); - break; - } - MPIR_FUNC_EXIT; -} - -static void offset_of_src_rmap(MPIDI_rank_map_t * src, MPIDI_rank_map_t * dest, - MPIR_Comm_map_t * mapper, int offset) -{ - MPIR_FUNC_ENTER; - dest->avtid = src->avtid; - dest->size = map_size(*mapper); - MPL_DBG_MSG_FMT(MPIDI_CH4_DBG_MAP, VERBOSE, - (MPL_DBG_FDEST, " source mode %d", (int) src->mode)); - switch (src->mode) { - case MPIDI_RANK_MAP_DIRECT_INTRA: - dest->mode = MPIDI_RANK_MAP_OFFSET_INTRA; - dest->reg.offset = offset; - break; - case MPIDI_RANK_MAP_DIRECT: - dest->mode = MPIDI_RANK_MAP_OFFSET; - dest->reg.offset = offset; - break; - case MPIDI_RANK_MAP_OFFSET: - dest->mode = MPIDI_RANK_MAP_OFFSET; - dest->reg.offset = src->reg.offset + offset; - break; - case MPIDI_RANK_MAP_OFFSET_INTRA: - dest->mode = MPIDI_RANK_MAP_OFFSET_INTRA; - dest->reg.offset = src->reg.offset + offset; - break; - case MPIDI_RANK_MAP_STRIDE: - dest->mode = MPIDI_RANK_MAP_STRIDE; - dest->reg.stride.stride = src->reg.stride.stride; - dest->reg.stride.blocksize = src->reg.stride.blocksize; - dest->reg.stride.offset = src->reg.stride.offset + offset * src->reg.stride.stride; - break; - case MPIDI_RANK_MAP_STRIDE_INTRA: - dest->mode = MPIDI_RANK_MAP_STRIDE_INTRA; - dest->reg.stride.stride = src->reg.stride.stride; - dest->reg.stride.blocksize = src->reg.stride.blocksize; - dest->reg.stride.offset = src->reg.stride.offset + offset * src->reg.stride.stride; - break; - case MPIDI_RANK_MAP_STRIDE_BLOCK: - case MPIDI_RANK_MAP_STRIDE_BLOCK_INTRA: - src_map_to_lut(src, dest, mapper, mapper->src_mapping_size, 0); - break; - case MPIDI_RANK_MAP_LUT: - case MPIDI_RANK_MAP_LUT_INTRA: - dest->mode = src->mode; - dest->irreg.lut.t = src->irreg.lut.t; - dest->irreg.lut.lpid = &src->irreg.lut.lpid[offset]; - MPIDIU_lut_add_ref(src->irreg.lut.t); - break; - case MPIDI_RANK_MAP_MLUT: - dest->mode = src->mode; - dest->irreg.mlut.t = src->irreg.mlut.t; - dest->irreg.mlut.gpid = &src->irreg.mlut.gpid[offset]; - 
MPIDIU_mlut_add_ref(src->irreg.mlut.t); - break; - case MPIDI_RANK_MAP_NONE: - MPIR_Assert(0); - break; - } - MPIR_FUNC_EXIT; -} - -static void stride_of_src_rmap(MPIDI_rank_map_t * src, MPIDI_rank_map_t * dest, - MPIR_Comm_map_t * mapper, int stride, int blocksize, int offset) -{ - MPIR_FUNC_ENTER; - dest->avtid = src->avtid; - MPL_DBG_MSG_FMT(MPIDI_CH4_DBG_MAP, VERBOSE, - (MPL_DBG_FDEST, " source mode %d", (int) src->mode)); - switch (src->mode) { - case MPIDI_RANK_MAP_DIRECT_INTRA: - if (blocksize == 1) { - dest->mode = MPIDI_RANK_MAP_STRIDE_INTRA; - } else { - dest->mode = MPIDI_RANK_MAP_STRIDE_BLOCK_INTRA; - } - dest->size = map_size(*mapper); - dest->reg.stride.stride = stride; - dest->reg.stride.blocksize = blocksize; - dest->reg.stride.offset = offset; - MPIR_Assert(stride > 0); - MPIR_Assert(blocksize > 0); - break; - case MPIDI_RANK_MAP_DIRECT: - if (blocksize == 1) { - dest->mode = MPIDI_RANK_MAP_STRIDE; - } else { - dest->mode = MPIDI_RANK_MAP_STRIDE_BLOCK; - } - dest->size = map_size(*mapper); - dest->reg.stride.stride = stride; - dest->reg.stride.blocksize = blocksize; - dest->reg.stride.offset = offset; - MPIR_Assert(stride > 0); - MPIR_Assert(blocksize > 0); - break; - case MPIDI_RANK_MAP_OFFSET: - if (blocksize == 1) { - dest->mode = MPIDI_RANK_MAP_STRIDE; - } else { - dest->mode = MPIDI_RANK_MAP_STRIDE_BLOCK; - } - dest->size = map_size(*mapper); - dest->reg.stride.stride = stride; - dest->reg.stride.blocksize = blocksize; - dest->reg.stride.offset = offset + src->reg.offset; - break; - case MPIDI_RANK_MAP_OFFSET_INTRA: - if (blocksize == 1) { - dest->mode = MPIDI_RANK_MAP_STRIDE_INTRA; - } else { - dest->mode = MPIDI_RANK_MAP_STRIDE_BLOCK_INTRA; - } - dest->size = map_size(*mapper); - dest->reg.stride.stride = stride; - dest->reg.stride.blocksize = blocksize; - dest->reg.stride.offset = offset + src->reg.offset; - break; - case MPIDI_RANK_MAP_STRIDE: - if (blocksize == 1) { - dest->mode = MPIDI_RANK_MAP_STRIDE; - dest->reg.stride.stride = 
src->reg.stride.stride * stride; - dest->reg.stride.blocksize = blocksize; - dest->reg.stride.offset = src->reg.stride.stride * offset + src->reg.stride.offset; - } else { - src_map_to_lut(src, dest, mapper, mapper->src_mapping_size, 0); - } - break; - case MPIDI_RANK_MAP_STRIDE_INTRA: - if (blocksize == 1) { - dest->mode = MPIDI_RANK_MAP_STRIDE_INTRA; - dest->reg.stride.stride = src->reg.stride.stride * stride; - dest->reg.stride.blocksize = blocksize; - dest->reg.stride.offset = src->reg.stride.stride * offset + src->reg.stride.offset; - } else { - src_map_to_lut(src, dest, mapper, mapper->src_mapping_size, 0); - } - break; - case MPIDI_RANK_MAP_STRIDE_BLOCK: - case MPIDI_RANK_MAP_STRIDE_BLOCK_INTRA: - src_map_to_lut(src, dest, mapper, mapper->src_mapping_size, 0); - break; - case MPIDI_RANK_MAP_LUT: - case MPIDI_RANK_MAP_LUT_INTRA: - src_map_to_lut(src, dest, mapper, mapper->src_mapping_size, 0); - break; - case MPIDI_RANK_MAP_MLUT: - src_mlut_to_mlut(src, dest, mapper, mapper->src_mapping_size, 0); - break; - case MPIDI_RANK_MAP_NONE: - MPIR_Assert(0); - break; - } - MPIR_FUNC_EXIT; -} - -static int check_convert_mlut_to_lut(MPIDI_rank_map_t * src) -{ - int mpi_errno = MPI_SUCCESS, i; - int flag = 1; - int avtid; - MPIDI_rank_map_lut_t *lut = NULL; - - MPIR_FUNC_ENTER; - - if (src->mode != MPIDI_RANK_MAP_MLUT) { - goto fn_exit; - } - - /* check if all mlut item has the same avtid */ - avtid = src->irreg.mlut.gpid[0].avtid; - for (i = 1; i < src->size; i++) { - if (src->irreg.mlut.gpid[i].avtid != avtid) { - flag = 0; - break; - } - } - if (!flag) { /* multiple avtid */ - goto fn_exit; - } - - src->avtid = avtid; - if (avtid == 0) { - src->mode = MPIDI_RANK_MAP_LUT_INTRA; - } else { - src->mode = MPIDI_RANK_MAP_LUT; - } - mpi_errno = MPIDIU_alloc_lut(&lut, src->size); - MPIR_ERR_CHECK(mpi_errno); - for (i = 0; i < src->size; i++) { - lut->lpid[i] = src->irreg.mlut.gpid[i].lpid; - } - MPIDIU_release_mlut(src->irreg.mlut.t); - src->irreg.lut.t = lut; - 
src->irreg.lut.lpid = src->irreg.lut.t->lpid; - MPL_DBG_MSG_FMT(MPIDI_CH4_DBG_MAP, VERBOSE, (MPL_DBG_FDEST, " avtid %d", src->avtid)); - - fn_exit: - MPIR_FUNC_EXIT; - return mpi_errno; - fn_fail: - goto fn_exit; -} - -static int check_convert_lut_to_regular(MPIDI_rank_map_t * src) -{ - int mpi_errno = MPI_SUCCESS; - int mode_detected, offset, blocksize, stride; - MPIDI_rank_map_lut_t *lut = NULL; - - MPIR_FUNC_ENTER; - - if (src->mode != MPIDI_RANK_MAP_LUT && src->mode != MPIDI_RANK_MAP_LUT_INTRA) { - goto fn_exit; - } - - lut = src->irreg.lut.t; - mode_detected = detect_regular_model(src->irreg.lut.lpid, src->size, &offset, &blocksize, - &stride); - MPL_DBG_MSG_FMT(MPIDI_CH4_DBG_MAP, VERBOSE, - (MPL_DBG_FDEST, " detected mode: %d", mode_detected)); - - - switch (mode_detected) { - case MPIDI_SRC_MAPPER_DIRECT: - src->mode = MPIDI_RANK_MAP_DIRECT; - if (src->avtid == 0) { - src->mode = MPIDI_RANK_MAP_DIRECT_INTRA; - } - src->irreg.lut.t = NULL; - src->irreg.lut.lpid = NULL; - MPIDIU_release_lut(lut); - MPL_DBG_MSG_FMT(MPIDI_CH4_DBG_MAP, VERBOSE, - (MPL_DBG_FDEST, "\tlut to mode %d", (int) src->mode)); - break; - case MPIDI_SRC_MAPPER_OFFSET: - src->mode = MPIDI_RANK_MAP_OFFSET; - if (src->avtid == 0) { - src->mode = MPIDI_RANK_MAP_OFFSET_INTRA; - } - src->reg.offset = offset; - src->irreg.lut.t = NULL; - src->irreg.lut.lpid = NULL; - MPIDIU_release_lut(lut); - MPL_DBG_MSG_FMT(MPIDI_CH4_DBG_MAP, VERBOSE, - (MPL_DBG_FDEST, " lut to mode %d", (int) src->mode)); - MPL_DBG_MSG_FMT(MPIDI_CH4_DBG_MAP, VERBOSE, - (MPL_DBG_FDEST, "\toffset: %d", src->reg.offset)); - break; - case MPIDI_SRC_MAPPER_STRIDE: - if (blocksize == 1) { - src->mode = MPIDI_RANK_MAP_STRIDE; - if (src->avtid == 0) { - src->mode = MPIDI_RANK_MAP_STRIDE_INTRA; - } - } else { - src->mode = MPIDI_RANK_MAP_STRIDE_BLOCK; - if (src->avtid == 0) { - src->mode = MPIDI_RANK_MAP_STRIDE_BLOCK_INTRA; - } - } - src->reg.stride.stride = stride; - src->reg.stride.blocksize = blocksize; - src->reg.stride.offset = 
offset; - src->irreg.lut.t = NULL; - src->irreg.lut.lpid = NULL; - MPIDIU_release_lut(lut); - MPL_DBG_MSG_FMT(MPIDI_CH4_DBG_MAP, VERBOSE, - (MPL_DBG_FDEST, " lut to mode %d", (int) src->mode)); - MPL_DBG_MSG_FMT(MPIDI_CH4_DBG_MAP, VERBOSE, - (MPL_DBG_FDEST, "\toffset: %d", src->reg.stride.offset)); - MPL_DBG_MSG_FMT(MPIDI_CH4_DBG_MAP, VERBOSE, - (MPL_DBG_FDEST, "\tblocksize: %d", src->reg.stride.blocksize)); - MPL_DBG_MSG_FMT(MPIDI_CH4_DBG_MAP, VERBOSE, - (MPL_DBG_FDEST, "\tstride: %d", src->reg.stride.stride)); - break; - } - fn_exit: - MPIR_FUNC_EXIT; - return mpi_errno; -} - -static int set_map(MPIDI_rank_map_t * src_rmap, MPIDI_rank_map_t * dest_rmap, - MPIR_Comm_map_t * mapper, int src_comm_size, int total_mapper_size, - int mapper_offset) -{ - int mpi_errno = MPI_SUCCESS; - - MPIR_FUNC_ENTER; - - /* Simplest case: MAP_DUP, exact duplication of src_comm */ - if (mapper->type == MPIR_COMM_MAP_TYPE__DUP && src_comm_size == total_mapper_size) { - direct_of_src_rmap(src_rmap, dest_rmap, mapper); - goto fn_exit; - } - /* single src_comm, newcomm is smaller than src_comm, only one mapper */ - else if (mapper->type == MPIR_COMM_MAP_TYPE__IRREGULAR && - mapper->src_mapping_size == total_mapper_size) { - /* check if new comm has the same mapping as src_comm */ - /* detect src_mapping_offset for direct_to_direct and offset_to_offset */ - int mode_detected, offset = 0, blocksize, stride; - mode_detected = detect_regular_model(mapper->src_mapping, mapper->src_mapping_size, &offset, - &blocksize, &stride); - MPL_DBG_MSG_FMT(MPIDI_CH4_DBG_MAP, VERBOSE, - (MPL_DBG_FDEST, "\tdetected mode: %d", mode_detected)); - - switch (mode_detected) { - case MPIDI_SRC_MAPPER_DIRECT: - direct_of_src_rmap(src_rmap, dest_rmap, mapper); - break; - case MPIDI_SRC_MAPPER_OFFSET: - offset_of_src_rmap(src_rmap, dest_rmap, mapper, offset); - break; - case MPIDI_SRC_MAPPER_STRIDE: - stride_of_src_rmap(src_rmap, dest_rmap, mapper, stride, blocksize, offset); - break; - default: - if (src_rmap->mode 
== MPIDI_RANK_MAP_MLUT) { - src_mlut_to_mlut(src_rmap, dest_rmap, mapper, total_mapper_size, mapper_offset); - } else { - src_map_to_lut(src_rmap, dest_rmap, mapper, mapper->src_mapping_size, - mapper_offset); - } - } - goto fn_exit; - } - - /* more complex case: multiple mappers - * We always alloc lut (or mlut is src_rmap is mlut). We will check if a - * lut mapping can be converted to something simpler after all the mapper - * are processed - */ - - MPL_DBG_MSG_FMT(MPIDI_CH4_DBG_MAP, VERBOSE, (MPL_DBG_FDEST, " multiple mapper")); - if (mapper->type == MPIR_COMM_MAP_TYPE__DUP) { - MPL_DBG_MSG_FMT(MPIDI_CH4_DBG_MAP, VERBOSE, - (MPL_DBG_FDEST, " check map_size %d, src_comm_size %d", - map_size(*mapper), src_comm_size)); - src_comm_to_mlut(src_rmap, dest_rmap, src_comm_size, total_mapper_size, mapper_offset); - } else { /* mapper->type == MPIR_COMM_MAP_TYPE__IRREGULAR */ - src_mlut_to_mlut(src_rmap, dest_rmap, mapper, total_mapper_size, mapper_offset); - } - - fn_exit: - MPIR_FUNC_EXIT; - return mpi_errno; -} - -int MPIDI_comm_create_rank_map(MPIR_Comm * comm) -{ - int mpi_errno = MPI_SUCCESS; - MPIR_Comm_map_t *mapper; - MPIR_Comm *src_comm; - int total_mapper_size, mapper_offset; - - - MPIR_FUNC_ENTER; - - /* do some sanity checks */ - LL_FOREACH(comm->mapper_head, mapper) { - if (mapper->src_comm->comm_kind == MPIR_COMM_KIND__INTRACOMM) { - MPIR_Assert(mapper->dir == MPIR_COMM_MAP_DIR__L2L || - mapper->dir == MPIR_COMM_MAP_DIR__L2R); - } - - if (comm->comm_kind == MPIR_COMM_KIND__INTRACOMM) { - MPIR_Assert(mapper->dir == MPIR_COMM_MAP_DIR__L2L || - mapper->dir == MPIR_COMM_MAP_DIR__R2L); - } - } - - /* First, handle all the mappers that contribute to the local part - * of the comm */ - total_mapper_size = 0; - LL_FOREACH(comm->mapper_head, mapper) { - if (mapper->dir == MPIR_COMM_MAP_DIR__L2R || mapper->dir == MPIR_COMM_MAP_DIR__R2R) - continue; - - total_mapper_size += map_size(*mapper); - } - mapper_offset = 0; - LL_FOREACH(comm->mapper_head, mapper) { - 
src_comm = mapper->src_comm; - - if (mapper->dir == MPIR_COMM_MAP_DIR__L2R || mapper->dir == MPIR_COMM_MAP_DIR__R2R) - continue; - - if (mapper->dir == MPIR_COMM_MAP_DIR__L2L) { - if (src_comm->comm_kind == MPIR_COMM_KIND__INTRACOMM && - comm->comm_kind == MPIR_COMM_KIND__INTRACOMM) { - MPL_DBG_MSG_FMT(MPIDI_CH4_DBG_MAP, VERBOSE, - (MPL_DBG_FDEST, - " intra->intra, L2L, size=%d, total_mapper_size=%d, mapper_offset=%d", - src_comm->local_size, total_mapper_size, mapper_offset)); - set_map(&MPIDI_COMM(src_comm, map), &MPIDI_COMM(comm, map), mapper, - src_comm->local_size, total_mapper_size, mapper_offset); - } else if (src_comm->comm_kind == MPIR_COMM_KIND__INTRACOMM && - comm->comm_kind == MPIR_COMM_KIND__INTERCOMM) { - MPL_DBG_MSG_FMT(MPIDI_CH4_DBG_MAP, VERBOSE, - (MPL_DBG_FDEST, - " intra->inter, L2L, size=%d, total_mapper_size=%d, mapper_offset=%d", - src_comm->local_size, total_mapper_size, mapper_offset)); - set_map(&MPIDI_COMM(src_comm, map), &MPIDI_COMM(comm, local_map), mapper, - src_comm->local_size, total_mapper_size, mapper_offset); - } else if (src_comm->comm_kind == MPIR_COMM_KIND__INTERCOMM && - comm->comm_kind == MPIR_COMM_KIND__INTRACOMM) { - MPL_DBG_MSG_FMT(MPIDI_CH4_DBG_MAP, VERBOSE, - (MPL_DBG_FDEST, - " inter->intra, L2L, size=%d, total_mapper_size=%d, mapper_offset=%d", - src_comm->local_size, total_mapper_size, mapper_offset)); - set_map(&MPIDI_COMM(src_comm, local_map), &MPIDI_COMM(comm, map), mapper, - src_comm->local_size, total_mapper_size, mapper_offset); - } else { /* src_comm->comm_kind == MPIR_COMM_KIND__INTERCOMM && comm->comm_kind == MPIR_COMM_KIND__INTERCOMM */ - MPL_DBG_MSG_FMT(MPIDI_CH4_DBG_MAP, VERBOSE, - (MPL_DBG_FDEST, - " inter->inter, L2L, size=%d, total_mapper_size=%d, mapper_offset=%d", - src_comm->local_size, total_mapper_size, mapper_offset)); - set_map(&MPIDI_COMM(src_comm, local_map), &MPIDI_COMM(comm, local_map), - mapper, src_comm->local_size, total_mapper_size, mapper_offset); - } - } else { /* mapper->dir == 
MPIR_COMM_MAP_DIR__R2L */ - MPIR_Assert(src_comm->comm_kind == MPIR_COMM_KIND__INTERCOMM); - - if (comm->comm_kind == MPIR_COMM_KIND__INTRACOMM) { - MPL_DBG_MSG_FMT(MPIDI_CH4_DBG_MAP, VERBOSE, - (MPL_DBG_FDEST, - " ->intra, R2L, size=%d, total_mapper_size=%d, mapper_offset=%d", - src_comm->remote_size, total_mapper_size, mapper_offset)); - set_map(&MPIDI_COMM(src_comm, map), &MPIDI_COMM(comm, map), mapper, - src_comm->remote_size, total_mapper_size, mapper_offset); - } else { - MPL_DBG_MSG_FMT(MPIDI_CH4_DBG_MAP, VERBOSE, - (MPL_DBG_FDEST, - " ->inter, R2L, size=%d, total_mapper_size=%d, mapper_offset=%d", - src_comm->remote_size, total_mapper_size, mapper_offset)); - set_map(&MPIDI_COMM(src_comm, map), &MPIDI_COMM(comm, local_map), mapper, - src_comm->remote_size, total_mapper_size, mapper_offset); - } - } - - mapper_offset += map_size(*mapper); - } - - /* Next, handle all the mappers that contribute to the remote part - * of the comm (only valid for intercomms) - */ - total_mapper_size = 0; - LL_FOREACH(comm->mapper_head, mapper) { - if (mapper->dir == MPIR_COMM_MAP_DIR__L2L || mapper->dir == MPIR_COMM_MAP_DIR__R2L) - continue; - - total_mapper_size += map_size(*mapper); - } - mapper_offset = 0; - LL_FOREACH(comm->mapper_head, mapper) { - src_comm = mapper->src_comm; - - if (mapper->dir == MPIR_COMM_MAP_DIR__L2L || mapper->dir == MPIR_COMM_MAP_DIR__R2L) - continue; - - MPIR_Assert(comm->comm_kind == MPIR_COMM_KIND__INTERCOMM); - - if (mapper->dir == MPIR_COMM_MAP_DIR__L2R) { - if (src_comm->comm_kind == MPIR_COMM_KIND__INTRACOMM) { - MPL_DBG_MSG_FMT(MPIDI_CH4_DBG_MAP, VERBOSE, - (MPL_DBG_FDEST, - " intra->, L2R, size=%d, total_mapper_size=%d, mapper_offset=%d", - src_comm->local_size, total_mapper_size, mapper_offset)); - set_map(&MPIDI_COMM(src_comm, map), &MPIDI_COMM(comm, map), mapper, - src_comm->local_size, total_mapper_size, mapper_offset); - } else { /* src_comm->comm_kind == MPIR_COMM_KIND__INTERCOMM */ - MPL_DBG_MSG_FMT(MPIDI_CH4_DBG_MAP, VERBOSE, - 
(MPL_DBG_FDEST, - " inter->, L2R, size=%d, total_mapper_size=%d, mapper_offset=%d", - src_comm->local_size, total_mapper_size, mapper_offset)); - set_map(&MPIDI_COMM(src_comm, local_map), &MPIDI_COMM(comm, map), mapper, - src_comm->local_size, total_mapper_size, mapper_offset); - } - } else { /* mapper->dir == MPIR_COMM_MAP_DIR__R2R */ - MPIR_Assert(src_comm->comm_kind == MPIR_COMM_KIND__INTERCOMM); - MPL_DBG_MSG_FMT(MPIDI_CH4_DBG_MAP, VERBOSE, - (MPL_DBG_FDEST, - " inter->, R2R, size=%d, total_mapper_size=%d, mapper_offset=%d", - src_comm->remote_size, total_mapper_size, mapper_offset)); - set_map(&MPIDI_COMM(src_comm, map), &MPIDI_COMM(comm, map), mapper, - src_comm->remote_size, total_mapper_size, mapper_offset); - } - - mapper_offset += map_size(*mapper); - } - - /* check before finishing - * 1. if mlut can be converted to lut: all avtids are the same - * 2. if lut can be converted to regular modes: direct, offset, and more - */ - check_convert_mlut_to_lut(&MPIDI_COMM(comm, map)); - check_convert_lut_to_regular(&MPIDI_COMM(comm, map)); - if (comm->comm_kind == MPIR_COMM_KIND__INTERCOMM) { - check_convert_mlut_to_lut(&MPIDI_COMM(comm, local_map)); - check_convert_lut_to_regular(&MPIDI_COMM(comm, local_map)); - } - - if (comm->comm_kind == MPIR_COMM_KIND__INTERCOMM) { - /* setup the lut for the local_comm in the intercomm */ - if (comm->local_comm) { - MPL_DBG_MSG_FMT(MPIDI_CH4_DBG_MAP, VERBOSE, - (MPL_DBG_FDEST, "\t create local_comm using src_comm")); - direct_of_src_rmap(&MPIDI_COMM(comm, local_map), - &MPIDI_COMM(comm->local_comm, map), NULL); - - MPL_DBG_MSG_FMT(MPIDI_CH4_DBG_MEMORY, VERBOSE, - (MPL_DBG_FDEST, "create local_comm using src_comm")); - } - } - - if (comm->comm_kind == MPIR_COMM_KIND__INTRACOMM) { - MPIDI_COMM(comm, local_map).mode = MPIDI_RANK_MAP_NONE; - } -#ifdef MPL_USE_DBG_LOGGING - int rank_; - int avtid_, lpid_ = -1; - if (comm->remote_size < 16) { - for (rank_ = 0; rank_ < comm->remote_size; ++rank_) { - MPIDIU_comm_rank_to_pid(comm, 
rank_, &lpid_, &avtid_); - MPIDIU_comm_rank_to_av(comm, rank_); - } - } - if (comm->comm_kind == MPIR_COMM_KIND__INTERCOMM && comm->local_size < 16) { - for (rank_ = 0; rank_ < comm->local_size; ++rank_) { - MPIDIU_comm_rank_to_pid_local(comm, rank_, &lpid_, &avtid_); - } - } -#endif - - MPIR_FUNC_EXIT; - return mpi_errno; -} - /* number of leading zeros, from Hacker's Delight */ static int nlz(uint32_t x) { diff --git a/src/mpid/ch4/src/ch4i_comm.h b/src/mpid/ch4/src/ch4i_comm.h index 40032de5698..4966798b6a3 100644 --- a/src/mpid/ch4/src/ch4i_comm.h +++ b/src/mpid/ch4/src/ch4i_comm.h @@ -8,7 +8,6 @@ #include "ch4_types.h" -int MPIDI_comm_create_rank_map(MPIR_Comm * comm); int MPIDI_check_disjoint_lpids(MPIR_Lpid lpids1[], int n1, MPIR_Lpid lpids2[], int n2); #endif /* CH4I_COMM_H_INCLUDED */ diff --git a/src/mpid/ch4/src/init_comm.c b/src/mpid/ch4/src/init_comm.c index 17915496417..4fb35b00449 100644 --- a/src/mpid/ch4/src/init_comm.c +++ b/src/mpid/ch4/src/init_comm.c @@ -44,17 +44,6 @@ int MPIDI_create_init_comm(MPIR_Comm ** comm) map, &init_comm->local_group); MPIR_ERR_CHECK(mpi_errno); - MPIDI_COMM(init_comm, map).mode = MPIDI_RANK_MAP_LUT_INTRA; - mpi_errno = MPIDIU_alloc_lut(&lut, node_roots_comm_size); - MPIR_ERR_CHECK(mpi_errno); - MPIDI_COMM(init_comm, map).size = node_roots_comm_size; - MPIDI_COMM(init_comm, map).avtid = 0; - MPIDI_COMM(init_comm, map).irreg.lut.t = lut; - MPIDI_COMM(init_comm, map).irreg.lut.lpid = lut->lpid; - MPIDI_COMM(init_comm, local_map).mode = MPIDI_RANK_MAP_NONE; - for (i = 0; i < node_roots_comm_size; ++i) { - lut->lpid[i] = MPIR_Process.node_root_map[i]; - } mpi_errno = MPIDIG_init_comm(init_comm); MPIR_ERR_CHECK(mpi_errno); /* hacky, consider a separate MPIDI_{NM,SHM}_init_comm_hook From a1fefa636307cf31b58c974c739900a1d4b5c871 Mon Sep 17 00:00:00 2001 From: Hui Zhou Date: Fri, 20 Dec 2024 10:59:46 -0600 Subject: [PATCH 53/59] ch4: rewrite MPIDIU_rank_to_lpid Rename it to MPIDIU_get_grank, remove the dependency on 
MPIDIU_comm_rank_to_pid (to be removed next) and use MPIR_comm_rank_to_lpid instead. --- src/mpid/ch4/shm/ipc/gpu/gpu_post.c | 2 +- src/mpid/ch4/shm/posix/posix_am.h | 4 ++-- src/mpid/ch4/shm/posix/posix_pre.h | 2 +- src/mpid/ch4/shm/posix/posix_send.h | 2 +- src/mpid/ch4/src/ch4_proc.h | 17 +++++------------ src/mpid/ch4/src/ch4_vci.h | 4 ++-- src/mpid/common/hcoll/hcoll_rte.c | 8 +++----- 7 files changed, 15 insertions(+), 24 deletions(-) diff --git a/src/mpid/ch4/shm/ipc/gpu/gpu_post.c b/src/mpid/ch4/shm/ipc/gpu/gpu_post.c index 55bfe3ca03e..b2ab09ec9cc 100644 --- a/src/mpid/ch4/shm/ipc/gpu/gpu_post.c +++ b/src/mpid/ch4/shm/ipc/gpu/gpu_post.c @@ -370,7 +370,7 @@ int MPIDI_GPU_get_ipc_attr(const void *buf, MPI_Aint count, MPI_Datatype datatyp ipc_attr->ipc_type = MPIDI_IPCI_TYPE__GPU; if (remote_rank != MPI_PROC_NULL) { - remote_rank = MPIDI_GPUI_global.local_ranks[MPIDIU_rank_to_lpid(remote_rank, comm)]; + remote_rank = MPIDI_GPUI_global.local_ranks[MPIDIU_get_grank(remote_rank, comm)]; } ipc_attr->u.gpu.remote_rank = remote_rank; diff --git a/src/mpid/ch4/shm/posix/posix_am.h b/src/mpid/ch4/shm/posix/posix_am.h index 5ddc2a45ec1..a0f3d10ebe6 100644 --- a/src/mpid/ch4/shm/posix/posix_am.h +++ b/src/mpid/ch4/shm/posix/posix_am.h @@ -85,7 +85,7 @@ MPL_STATIC_INLINE_PREFIX int MPIDI_POSIX_am_isend(int rank, { int mpi_errno = MPI_SUCCESS; MPIDI_POSIX_am_header_t msg_hdr; - const int grank = MPIDIU_rank_to_lpid(rank, comm); + const int grank = MPIDIU_get_grank(rank, comm); MPIR_FUNC_ENTER; @@ -180,7 +180,7 @@ MPL_STATIC_INLINE_PREFIX int MPIDI_POSIX_am_send_hdr(int rank, MPIR_Comm * comm, { int mpi_errno = MPI_SUCCESS; MPIDI_POSIX_am_header_t msg_hdr; - const int grank = MPIDIU_rank_to_lpid(rank, comm); + const int grank = MPIDIU_get_grank(rank, comm); MPIR_FUNC_ENTER; diff --git a/src/mpid/ch4/shm/posix/posix_pre.h b/src/mpid/ch4/shm/posix/posix_pre.h index 1357eed5895..98ddad5cf17 100644 --- a/src/mpid/ch4/shm/posix/posix_pre.h +++
b/src/mpid/ch4/shm/posix/posix_pre.h @@ -120,7 +120,7 @@ do { \ #define MPIDI_POSIX_EAGER_RECV_POSTED_HOOK(request,rank,communicator)\ do { \ - int grank_ = ((rank) >= 0) ? MPIDIU_rank_to_lpid((rank), (communicator)) : (rank); \ + int grank_ = ((rank) >= 0) ? MPIDIU_get_grank((rank), (communicator)) : (rank); \ (request)->dev.ch4.am.shm_am.posix.eager_recv_posted_hook_grank = grank_; \ MPIDI_POSIX_eager_recv_posted_hook(grank_); \ } while (0) diff --git a/src/mpid/ch4/shm/posix/posix_send.h b/src/mpid/ch4/shm/posix/posix_send.h index 3b66e77b716..cda37c20d07 100644 --- a/src/mpid/ch4/shm/posix/posix_send.h +++ b/src/mpid/ch4/shm/posix/posix_send.h @@ -64,7 +64,7 @@ MPL_STATIC_INLINE_PREFIX int MPIDI_POSIX_mpi_isend(const void *buf, MPI_Aint cou am_hdr.data_sz = data_sz; am_hdr.rndv_hdr_sz = 0; - int grank = MPIDIU_rank_to_lpid(rank, comm); + int grank = MPIDIU_get_grank(rank, comm); MPI_Aint bytes_sent; int rc = MPIDI_POSIX_eager_send(grank, &msg_hdr, &am_hdr, sizeof(am_hdr), buf, count, datatype, 0, vci_src, vci_dst, &bytes_sent); diff --git a/src/mpid/ch4/src/ch4_proc.h b/src/mpid/ch4/src/ch4_proc.h index e338f4510e2..ba76966c477 100644 --- a/src/mpid/ch4/src/ch4_proc.h +++ b/src/mpid/ch4/src/ch4_proc.h @@ -188,21 +188,14 @@ MPL_STATIC_INLINE_PREFIX int MPIDIU_av_is_local(MPIDI_av_entry_t * av) return ret; } -MPL_STATIC_INLINE_PREFIX int MPIDIU_rank_to_lpid(int rank, MPIR_Comm * comm) +MPL_STATIC_INLINE_PREFIX int MPIDIU_get_grank(int rank, MPIR_Comm * comm) { - int ret; - MPIR_FUNC_ENTER; - - int avtid = 0, lpid = 0; - MPIDIU_comm_rank_to_pid(comm, rank, &lpid, &avtid); - if (avtid == 0) { - ret = lpid; + MPIR_Lpid lpid = MPIR_comm_rank_to_lpid(comm, rank); + if (MPIR_LPID_WORLD_INDEX(lpid) == 0) { + return (int) lpid; } else { - ret = -1; + return -1; } - - MPIR_FUNC_EXIT; - return ret; } /* used in fast path where we know the lpid has a valid av, such as from a committed communicator */ diff --git a/src/mpid/ch4/src/ch4_vci.h b/src/mpid/ch4/src/ch4_vci.h index 
e60adac148d..aff146298ac 100644 --- a/src/mpid/ch4/src/ch4_vci.h +++ b/src/mpid/ch4/src/ch4_vci.h @@ -47,7 +47,7 @@ /* VCI hashing function (fast path) */ /* For consistent hashing, we may need differentiate between src and dst vci and whether - * it is being called from sender side or receiver side (consdier intercomm). We use an + * it is being called from sender side or receiver side (consider intercomm). We use an * integer flag to encode the information. * * The flag constants are designed as bit fields, so different hashing algorithm can easily @@ -71,7 +71,7 @@ MPL_STATIC_INLINE_PREFIX int MPIDI_hash_remote_vci(int raw_vci, MPIR_Comm * comm /* MPI_ANY_SOURCE, MPI_PROC_NULL, return a dummy, won't be used */ return 0; } else { - int grank = MPIDIU_rank_to_lpid(rank, comm_ptr); + int grank = MPIDIU_get_grank(rank, comm_ptr); MPIR_Assert(grank >= 0); return raw_vci % MPIDI_global.all_num_vcis[grank]; } diff --git a/src/mpid/common/hcoll/hcoll_rte.c b/src/mpid/common/hcoll/hcoll_rte.c index 0db52cd6226..aa055d6a022 100644 --- a/src/mpid/common/hcoll/hcoll_rte.c +++ b/src/mpid/common/hcoll/hcoll_rte.c @@ -301,11 +301,9 @@ static void coll_handle_complete(void *handle) static int world_rank(rte_grp_handle_t grp_h, rte_ec_handle_t ec) { -#ifdef MPIDCH4_H_INCLUDED - return MPIDIU_rank_to_lpid(ec.rank, (MPIR_Comm *) grp_h); -#else - return ((struct MPIDI_VC *) ec.handle)->pg_rank; -#endif + MPIR_Lpid lpid = MPIR_comm_rank_to_lpid((MPIR_Comm *) grp_h, ec.rank); + MPIR_Assert(MPIR_LPID_WORLD_INDEX(lpid) == 0); + return MPIR_LPID_WORLD_RANK(lpid); } #if HCOLL_API >= HCOLL_VERSION(3,6) From e8f96fb6015b5c17cfa748bcb60aa78d8cd6c7da Mon Sep 17 00:00:00 2001 From: Hui Zhou Date: Fri, 20 Dec 2024 11:23:15 -0600 Subject: [PATCH 54/59] ch4: remove MPIDIU_comm_rank_to_pid This is fully replaced by MPIR_comm_rank_to_lpid. 
--- src/mpid/ch4/netmod/ofi/ofi_impl.h | 2 - src/mpid/ch4/netmod/ucx/ucx_impl.h | 1 - src/mpid/ch4/src/ch4_proc.c | 89 --------------------- src/mpid/ch4/src/ch4_proc.h | 124 ----------------------------- src/mpid/ch4/src/init_comm.c | 4 +- src/mpid/ch4/src/mpidig_win.h | 13 +-- 6 files changed, 3 insertions(+), 230 deletions(-) diff --git a/src/mpid/ch4/netmod/ofi/ofi_impl.h b/src/mpid/ch4/netmod/ofi/ofi_impl.h index a3ad267eee4..ecfeb8c4c5c 100644 --- a/src/mpid/ch4/netmod/ofi/ofi_impl.h +++ b/src/mpid/ch4/netmod/ofi/ofi_impl.h @@ -31,8 +31,6 @@ ATTRIBUTE((unused)); #define MPIDI_OFI_DT(dt) ((dt)->dev.netmod.ofi) #define MPIDI_OFI_OP(op) ((op)->dev.netmod.ofi) #define MPIDI_OFI_COMM(comm) ((comm)->dev.ch4.netmod.ofi) -#define MPIDI_OFI_COMM_TO_INDEX(comm,rank) \ - MPIDIU_comm_rank_to_pid(comm, rank, NULL, NULL) #define MPIDI_OFI_TO_PHYS(avtid, lpid, _nic) \ MPIDI_OFI_AV(&MPIDIU_get_av((avtid), (lpid))).dest[_nic][0] diff --git a/src/mpid/ch4/netmod/ucx/ucx_impl.h b/src/mpid/ch4/netmod/ucx/ucx_impl.h index d204383ac5b..e5d952e9ba9 100644 --- a/src/mpid/ch4/netmod/ucx/ucx_impl.h +++ b/src/mpid/ch4/netmod/ucx/ucx_impl.h @@ -19,7 +19,6 @@ #define MPIDI_UCX_COMM(comm) ((comm)->dev.ch4.netmod.ucx) #define MPIDI_UCX_REQ(req) ((req)->dev.ch4.netmod.ucx) -#define COMM_TO_INDEX(comm,rank) MPIDIU_comm_rank_to_pid(comm, rank, NULL, NULL) #define MPIDI_UCX_COMM_TO_EP(comm,rank,vci_src,vci_dst) \ MPIDI_UCX_AV(MPIDIU_comm_rank_to_av(comm, rank)).dest[vci_src][vci_dst] #define MPIDI_UCX_AV_TO_EP(av,vci_src,vci_dst) MPIDI_UCX_AV((av)).dest[vci_src][vci_dst] diff --git a/src/mpid/ch4/src/ch4_proc.c b/src/mpid/ch4/src/ch4_proc.c index 60ac8a52632..2505410f4b0 100644 --- a/src/mpid/ch4/src/ch4_proc.c +++ b/src/mpid/ch4/src/ch4_proc.c @@ -396,92 +396,3 @@ int MPIDIU_upids_to_lpids(int size, int *remote_upid_size, char *remote_upids, fn_fail: goto fn_exit; } - -int MPIDIU_alloc_lut(MPIDI_rank_map_lut_t ** lut, int size) -{ - int mpi_errno = MPI_SUCCESS; - MPIDI_rank_map_lut_t *new_lut 
= NULL; - - MPIR_FUNC_ENTER; - - new_lut = (MPIDI_rank_map_lut_t *) MPL_malloc(sizeof(MPIDI_rank_map_lut_t) - + size * sizeof(MPIDI_lpid_t), MPL_MEM_ADDRESS); - if (new_lut == NULL) { - *lut = NULL; - MPIR_ERR_SETANDJUMP(mpi_errno, MPI_ERR_OTHER, "**nomem"); - } - - MPIR_cc_set(&new_lut->ref_count, 1); - *lut = new_lut; - - MPL_DBG_MSG_FMT(MPIDI_CH4_DBG_MEMORY, VERBOSE, - (MPL_DBG_FDEST, "alloc lut %p, size %lu, refcount=%d", - new_lut, size * sizeof(MPIDI_lpid_t), MPIR_cc_get(&new_lut->ref_count))); - fn_exit: - MPIR_FUNC_EXIT; - return mpi_errno; - fn_fail: - goto fn_exit; -} - -int MPIDIU_release_lut(MPIDI_rank_map_lut_t * lut) -{ - int mpi_errno = MPI_SUCCESS; - int in_use = 0; - - MPIR_FUNC_ENTER; - - MPIR_cc_decr(&lut->ref_count, &in_use); - MPL_DBG_MSG_FMT(MPIDI_CH4_DBG_MEMORY, VERBOSE, (MPL_DBG_FDEST, "dec ref to lut %p", lut)); - if (!in_use) { - MPL_free(lut); - MPL_DBG_MSG_FMT(MPIDI_CH4_DBG_MEMORY, VERBOSE, (MPL_DBG_FDEST, "free lut %p", lut)); - } - MPIR_FUNC_EXIT; - return mpi_errno; -} - -int MPIDIU_alloc_mlut(MPIDI_rank_map_mlut_t ** mlut, int size) -{ - int mpi_errno = MPI_SUCCESS; - MPIDI_rank_map_mlut_t *new_mlut = NULL; - - MPIR_FUNC_ENTER; - - new_mlut = (MPIDI_rank_map_mlut_t *) MPL_malloc(sizeof(MPIDI_rank_map_mlut_t) - + size * sizeof(MPIDI_gpid_t), MPL_MEM_ADDRESS); - if (new_mlut == NULL) { - *mlut = NULL; - MPIR_ERR_SETANDJUMP(mpi_errno, MPI_ERR_OTHER, "**nomem"); - } - - MPIR_cc_set(&new_mlut->ref_count, 1); - *mlut = new_mlut; - - MPL_DBG_MSG_FMT(MPIDI_CH4_DBG_MEMORY, VERBOSE, - (MPL_DBG_FDEST, "alloc mlut %p, size %lu, refcount=%d", - new_mlut, size * sizeof(MPIDI_gpid_t), MPIR_cc_get(&new_mlut->ref_count))); - fn_exit: - MPIR_FUNC_EXIT; - return mpi_errno; - fn_fail: - goto fn_exit; -} - -int MPIDIU_release_mlut(MPIDI_rank_map_mlut_t * mlut) -{ - int mpi_errno = MPI_SUCCESS; - int in_use = 0; - - MPIR_FUNC_ENTER; - - MPIR_cc_decr(&mlut->ref_count, &in_use); - MPL_DBG_MSG_FMT(MPIDI_CH4_DBG_MEMORY, VERBOSE, (MPL_DBG_FDEST, "dec ref to 
mlut %p", mlut)); - if (!in_use) { - MPL_free(mlut); - MPL_DBG_MSG_FMT(MPIDI_CH4_DBG_MEMORY, VERBOSE, (MPL_DBG_FDEST, "free mlut %p", mlut)); - } - - MPIR_FUNC_EXIT; - return mpi_errno; -} diff --git a/src/mpid/ch4/src/ch4_proc.h b/src/mpid/ch4/src/ch4_proc.h index ba76966c477..87159555e6f 100644 --- a/src/mpid/ch4/src/ch4_proc.h +++ b/src/mpid/ch4/src/ch4_proc.h @@ -35,85 +35,6 @@ void MPIDIU_upidhash_free(void); #endif int MPIDIU_upids_to_lpids(int size, int *remote_upid_size, char *remote_upids, MPIR_Lpid * remote_lpids); -int MPIDIU_alloc_lut(MPIDI_rank_map_lut_t ** lut, int size); -int MPIDIU_release_lut(MPIDI_rank_map_lut_t * lut); -int MPIDIU_alloc_mlut(MPIDI_rank_map_mlut_t ** mlut, int size); -int MPIDIU_release_mlut(MPIDI_rank_map_mlut_t * mlut); -#define MPIDIU_lut_add_ref(lut) \ - do { \ - MPL_DBG_MSG_FMT(MPIDI_CH4_DBG_MEMORY, VERBOSE, (MPL_DBG_FDEST, "inc ref to lut %p", lut)); \ - MPIR_cc_inc(&(lut)->ref_count); \ - } while (0) - -#define MPIDIU_mlut_add_ref(mlut) \ - do { \ - MPL_DBG_MSG_FMT(MPIDI_CH4_DBG_MEMORY, VERBOSE, (MPL_DBG_FDEST, "inc ref to mlut %p", mlut)); \ - MPIR_cc_inc(&(mlut)->ref_count); \ - } while (0) - -MPL_STATIC_INLINE_PREFIX int MPIDIU_comm_rank_to_pid(MPIR_Comm * comm, int rank, int *idx, - int *avtid) -{ - MPIR_FUNC_ENTER; - - *avtid = 0; - *idx = 0; - - switch (MPIDI_COMM(comm, map).mode) { - case MPIDI_RANK_MAP_DIRECT: - *avtid = MPIDI_COMM(comm, map).avtid; - *idx = rank; - break; - case MPIDI_RANK_MAP_DIRECT_INTRA: - *idx = rank; - break; - case MPIDI_RANK_MAP_OFFSET: - *avtid = MPIDI_COMM(comm, map).avtid; - *idx = rank + MPIDI_COMM(comm, map).reg.offset; - break; - case MPIDI_RANK_MAP_OFFSET_INTRA: - *idx = rank + MPIDI_COMM(comm, map).reg.offset; - break; - case MPIDI_RANK_MAP_STRIDE: - *avtid = MPIDI_COMM(comm, map).avtid; - *idx = MPIDI_CALC_STRIDE_SIMPLE(rank, MPIDI_COMM(comm, map).reg.stride.stride, - MPIDI_COMM(comm, map).reg.stride.offset); - break; - case MPIDI_RANK_MAP_STRIDE_INTRA: - *idx = 
MPIDI_CALC_STRIDE_SIMPLE(rank, MPIDI_COMM(comm, map).reg.stride.stride, - MPIDI_COMM(comm, map).reg.stride.offset); - break; - case MPIDI_RANK_MAP_STRIDE_BLOCK: - *avtid = MPIDI_COMM(comm, map).avtid; - *idx = MPIDI_CALC_STRIDE(rank, MPIDI_COMM(comm, map).reg.stride.stride, - MPIDI_COMM(comm, map).reg.stride.blocksize, - MPIDI_COMM(comm, map).reg.stride.offset); - break; - case MPIDI_RANK_MAP_STRIDE_BLOCK_INTRA: - *idx = MPIDI_CALC_STRIDE(rank, MPIDI_COMM(comm, map).reg.stride.stride, - MPIDI_COMM(comm, map).reg.stride.blocksize, - MPIDI_COMM(comm, map).reg.stride.offset); - break; - case MPIDI_RANK_MAP_LUT: - *avtid = MPIDI_COMM(comm, map).avtid; - *idx = MPIDI_COMM(comm, map).irreg.lut.lpid[rank]; - break; - case MPIDI_RANK_MAP_LUT_INTRA: - *idx = MPIDI_COMM(comm, map).irreg.lut.lpid[rank]; - break; - case MPIDI_RANK_MAP_MLUT: - *idx = MPIDI_COMM(comm, map).irreg.mlut.gpid[rank].lpid; - *avtid = MPIDI_COMM(comm, map).irreg.mlut.gpid[rank].avtid; - break; - case MPIDI_RANK_MAP_NONE: - MPIR_Assert(0); - break; - } - MPL_DBG_MSG_FMT(MPIDI_CH4_DBG_MAP, VERBOSE, - (MPL_DBG_FDEST, " comm_to_pid: rank=%d, avtid=%d idx=%d", rank, *avtid, *idx)); - MPIR_FUNC_EXIT; - return *idx; -} MPL_STATIC_INLINE_PREFIX MPIDI_av_entry_t *MPIDIU_comm_rank_to_av(MPIR_Comm * comm, int rank) { @@ -130,51 +51,6 @@ MPL_STATIC_INLINE_PREFIX MPIDI_av_entry_t *MPIDIU_comm_rank_to_av(MPIR_Comm * co return ret; } -MPL_STATIC_INLINE_PREFIX int MPIDIU_comm_rank_to_pid_local(MPIR_Comm * comm, int rank, int *idx, - int *avtid) -{ - MPIR_FUNC_ENTER; - - *avtid = MPIDI_COMM(comm, local_map).avtid; - switch (MPIDI_COMM(comm, local_map).mode) { - case MPIDI_RANK_MAP_DIRECT: - case MPIDI_RANK_MAP_DIRECT_INTRA: - *idx = rank; - break; - case MPIDI_RANK_MAP_OFFSET: - case MPIDI_RANK_MAP_OFFSET_INTRA: - *idx = rank + MPIDI_COMM(comm, local_map).reg.offset; - break; - case MPIDI_RANK_MAP_STRIDE: - case MPIDI_RANK_MAP_STRIDE_INTRA: - *idx = MPIDI_CALC_STRIDE_SIMPLE(rank, MPIDI_COMM(comm, 
local_map).reg.stride.stride, - MPIDI_COMM(comm, local_map).reg.stride.offset); - break; - case MPIDI_RANK_MAP_STRIDE_BLOCK: - case MPIDI_RANK_MAP_STRIDE_BLOCK_INTRA: - *idx = MPIDI_CALC_STRIDE(rank, MPIDI_COMM(comm, local_map).reg.stride.stride, - MPIDI_COMM(comm, local_map).reg.stride.blocksize, - MPIDI_COMM(comm, local_map).reg.stride.offset); - break; - case MPIDI_RANK_MAP_LUT: - case MPIDI_RANK_MAP_LUT_INTRA: - *idx = MPIDI_COMM(comm, local_map).irreg.lut.lpid[rank]; - break; - case MPIDI_RANK_MAP_MLUT: - *idx = MPIDI_COMM(comm, local_map).irreg.mlut.gpid[rank].lpid; - *avtid = MPIDI_COMM(comm, local_map).irreg.mlut.gpid[rank].avtid; - break; - case MPIDI_RANK_MAP_NONE: - MPIR_Assert(0); - break; - } - MPL_DBG_MSG_FMT(MPIDI_CH4_DBG_MAP, VERBOSE, - (MPL_DBG_FDEST, " comm_to_pid_local: rank=%d, avtid=%d idx=%d", - rank, *avtid, *idx)); - MPIR_FUNC_EXIT; - return *idx; -} - MPL_STATIC_INLINE_PREFIX int MPIDIU_av_is_local(MPIDI_av_entry_t * av) { int ret = 0; diff --git a/src/mpid/ch4/src/init_comm.c b/src/mpid/ch4/src/init_comm.c index 4fb35b00449..09b1729c284 100644 --- a/src/mpid/ch4/src/init_comm.c +++ b/src/mpid/ch4/src/init_comm.c @@ -21,7 +21,6 @@ int MPIDI_create_init_comm(MPIR_Comm ** comm) int node_roots_comm_size = MPIR_Process.num_nodes; int node_roots_comm_rank = MPIR_Process.node_map[world_rank]; MPIR_Comm *init_comm = NULL; - MPIDI_rank_map_lut_t *lut = NULL; mpi_errno = MPIR_Comm_create(&init_comm); MPIR_ERR_CHECK(mpi_errno); @@ -66,9 +65,8 @@ void MPIDI_destroy_init_comm(MPIR_Comm ** comm_ptr) MPIR_Comm *comm = NULL; if (*comm_ptr != NULL) { comm = *comm_ptr; - MPIDIU_release_lut(MPIDI_COMM(comm, map).irreg.lut.t); - MPIR_Group_release(comm->local_group); MPIDIG_destroy_comm(comm); + MPIR_Group_release(comm->local_group); MPIR_Object_release_ref(comm, &in_use); MPIR_Assertp(in_use == 0); MPII_COMML_FORGET(comm); diff --git a/src/mpid/ch4/src/mpidig_win.h b/src/mpid/ch4/src/mpidig_win.h index 6353bf7def3..63054ea36f9 100644 --- 
a/src/mpid/ch4/src/mpidig_win.h +++ b/src/mpid/ch4/src/mpidig_win.h @@ -562,18 +562,9 @@ MPL_STATIC_INLINE_PREFIX int MPIDIG_win_shared_query_part(MPIR_Win * win, int ra *disp_unit = 0; *((void **) baseptr) = NULL; } else { - int shm_rank = -1; /* find shm_rank in node_comm. Q: can we rely on comm_ptr->intranode_table? */ - int avtid, idx; - MPIDIU_comm_rank_to_pid(win->comm_ptr, rank, &idx, &avtid); - for (int i = 0; i < win->comm_ptr->node_comm->local_size; i++) { - int tmp_avtid, tmp_idx; - MPIDIU_comm_rank_to_pid(win->comm_ptr->node_comm, i, &tmp_idx, &tmp_avtid); - if (tmp_avtid == avtid && tmp_idx == idx) { - shm_rank = i; - break; - } - } + MPIR_Lpid lpid = MPIR_comm_rank_to_lpid(win->comm_ptr, rank); + int shm_rank = MPIR_Group_lpid_to_rank(win->comm_ptr->node_comm->local_group, lpid); MPIR_Assert(shm_rank >= 0); MPIDIG_win_shared_info_t *shared_table = MPIDIG_WIN(win, shared_table); From 140d11680590aaca1a016feba707ee0e441c21b5 Mon Sep 17 00:00:00 2001 From: Hui Zhou Date: Sun, 22 Dec 2024 23:14:52 -0600 Subject: [PATCH 55/59] ch4: track lpid in upid_hash Track MPIR_Lpid lpid rather than a pair of (avtid, lpid). 
--- src/mpid/ch4/include/mpidpre.h | 3 +-- src/mpid/ch4/netmod/ucx/ucx_init.c | 6 +++--- src/mpid/ch4/src/ch4_proc.c | 8 ++------ src/mpid/ch4/src/ch4_proc.h | 2 +- 4 files changed, 7 insertions(+), 12 deletions(-) diff --git a/src/mpid/ch4/include/mpidpre.h b/src/mpid/ch4/include/mpidpre.h index 7a4010ff95c..ab5b0792beb 100644 --- a/src/mpid/ch4/include/mpidpre.h +++ b/src/mpid/ch4/include/mpidpre.h @@ -620,8 +620,7 @@ typedef struct { typedef struct { void *upid; int upid_len; - int avtid; - int lpid; + MPIR_Lpid lpid; UT_hash_handle hh; } MPIDI_upid_hash; #endif diff --git a/src/mpid/ch4/netmod/ucx/ucx_init.c b/src/mpid/ch4/netmod/ucx/ucx_init.c index fd7698bbebf..8c7015ba44e 100644 --- a/src/mpid/ch4/netmod/ucx/ucx_init.c +++ b/src/mpid/ch4/netmod/ucx/ucx_init.c @@ -104,7 +104,7 @@ static int initial_address_exchange(void) ucp_ep_create(MPIDI_UCX_global.ctx[0].worker, &ep_params, &MPIDI_UCX_AV(&MPIDIU_get_av(0, node_roots[i])).dest[0][0]); MPIDI_UCX_CHK_STATUS(ucx_status); - MPIDIU_upidhash_add(ep_params.address, recv_bc_len, 0, node_roots[i]); + MPIDIU_upidhash_add(ep_params.address, recv_bc_len, node_roots[i]); } mpi_errno = MPIDU_bc_allgather(init_comm, MPIDI_UCX_global.ctx[0].if_address, (int) MPIDI_UCX_global.ctx[0].addrname_len, FALSE, @@ -119,7 +119,7 @@ static int initial_address_exchange(void) ucx_status = ucp_ep_create(MPIDI_UCX_global.ctx[0].worker, &ep_params, &MPIDI_UCX_AV(&MPIDIU_get_av(0, i)).dest[0][0]); MPIDI_UCX_CHK_STATUS(ucx_status); - MPIDIU_upidhash_add(ep_params.address, recv_bc_len, 0, i); + MPIDIU_upidhash_add(ep_params.address, recv_bc_len, i); } } mpi_errno = MPIDU_bc_table_destroy(); @@ -132,7 +132,7 @@ static int initial_address_exchange(void) ucp_ep_create(MPIDI_UCX_global.ctx[0].worker, &ep_params, &MPIDI_UCX_AV(&MPIDIU_get_av(0, i)).dest[0][0]); MPIDI_UCX_CHK_STATUS(ucx_status); - MPIDIU_upidhash_add(ep_params.address, recv_bc_len, 0, i); + MPIDIU_upidhash_add(ep_params.address, recv_bc_len, i); } mpi_errno = 
MPIDU_bc_table_destroy(); MPIR_ERR_CHECK(mpi_errno); diff --git a/src/mpid/ch4/src/ch4_proc.c b/src/mpid/ch4/src/ch4_proc.c index 2505410f4b0..3cc121f4b23 100644 --- a/src/mpid/ch4/src/ch4_proc.c +++ b/src/mpid/ch4/src/ch4_proc.c @@ -341,20 +341,17 @@ MPIDI_av_entry_t *MPIDIU_lpid_to_av_slow(MPIR_Lpid lpid) /* Store the upid, avtid, lpid in a hash to support get_local_upids and upids_to_lupids */ static MPIDI_upid_hash *upid_hash = NULL; -void MPIDIU_upidhash_add(const void *upid, int upid_len, int avtid, int lpid) +void MPIDIU_upidhash_add(const void *upid, int upid_len, MPIR_Lpid lpid) { MPIDI_upid_hash *t; t = MPL_malloc(sizeof(MPIDI_upid_hash), MPL_MEM_OTHER); - t->avtid = avtid; t->lpid = lpid; t->upid = MPL_malloc(upid_len, MPL_MEM_OTHER); memcpy(t->upid, upid, upid_len); t->upid_len = upid_len; HASH_ADD_KEYPTR(hh, upid_hash, t->upid, upid_len, t, MPL_MEM_OTHER); - MPIDIU_get_av(avtid, lpid).hash = t; - /* Do not free avt while we use upidhash - FIXME: improve it */ - MPIDIU_avt_add_ref(avtid); + MPIDIU_lpid_to_av(lpid)->hash = t; } MPIDI_upid_hash *MPIDIU_upidhash_find(const void *upid, int upid_len) @@ -369,7 +366,6 @@ void MPIDIU_upidhash_free(void) MPIDI_upid_hash *cur, *tmp; HASH_ITER(hh, upid_hash, cur, tmp) { HASH_DEL(upid_hash, cur); - MPIDIU_avt_release_ref(cur->avtid); MPL_free(cur->upid); MPL_free(cur); } diff --git a/src/mpid/ch4/src/ch4_proc.h b/src/mpid/ch4/src/ch4_proc.h index 87159555e6f..a89fc1856d2 100644 --- a/src/mpid/ch4/src/ch4_proc.h +++ b/src/mpid/ch4/src/ch4_proc.h @@ -29,7 +29,7 @@ int MPIDIU_avt_destroy(void); int MPIDIU_get_node_id(MPIR_Comm * comm, int rank, int *id_p); #ifdef MPIDI_BUILD_CH4_UPID_HASH -void MPIDIU_upidhash_add(const void *upid, int upid_len, int avtid, int lpid); +void MPIDIU_upidhash_add(const void *upid, int upid_len, MPIR_Lpid lpid); MPIDI_upid_hash *MPIDIU_upidhash_find(const void *upid, int upid_len); void MPIDIU_upidhash_free(void); #endif From 7efe6b248a4cc28754300aeb7c43f1f9c41bc4af Mon Sep 17 00:00:00 
2001 From: Hui Zhou Date: Sun, 22 Dec 2024 23:19:07 -0600 Subject: [PATCH 56/59] ch4: remove netmod api upids_to_lpids Now that we use MPIR_Lpid, we no longer need a netmod API to convert upids to lpids. The function is replaced by the netmod API insert_upid. --- src/mpid/ch4/ch4_api.txt | 5 -- src/mpid/ch4/netmod/ofi/ofi_spawn.c | 96 ----------------------------- src/mpid/ch4/netmod/ucx/ucx_spawn.c | 57 ----------------- src/mpid/ch4/src/ch4_proc.c | 23 +------ src/mpid/ch4/src/ch4_proc.h | 2 - 5 files changed, 1 insertion(+), 182 deletions(-) diff --git a/src/mpid/ch4/ch4_api.txt b/src/mpid/ch4/ch4_api.txt index 1e0b5410c1c..25657023333 100644 --- a/src/mpid/ch4/ch4_api.txt +++ b/src/mpid/ch4/ch4_api.txt @@ -91,8 +91,6 @@ Non Native API: NM : comm, local_upid_size, local_upids insert_upid: int NM : lpid, upid, upid_len - upids_to_lpids : int - NM : size, remote_upid_size, remote_upids, remote_lpids dynamic_send : int NM : remote_lpid, tag, buf, size, timeout dynamic_recv : int @@ -507,9 +505,6 @@ PARAM: recv_buf: void * recv_size: int remote_lpid: MPIR_Lpid - remote_lpids: MPIR_Lpid * - remote_upid_size: int * - remote_upids: char * req: MPIR_Request * req_p: MPIR_Request ** result_addr: void * diff --git a/src/mpid/ch4/netmod/ofi/ofi_spawn.c b/src/mpid/ch4/netmod/ofi/ofi_spawn.c index 381d1510e0b..c50bac9b065 100644 --- a/src/mpid/ch4/netmod/ofi/ofi_spawn.c +++ b/src/mpid/ch4/netmod/ofi/ofi_spawn.c @@ -225,102 +225,6 @@ static int cancel_dynamic_request(MPIDI_OFI_dynamic_process_request_t * dynamic_ goto fn_exit; } -/* the following functions are "proc" functions, but because they are only used during dynamic - * process spawning, having them here provides better context */ - -int MPIDI_OFI_upids_to_lpids(int size, int *remote_upid_size, char *remote_upids, - MPIR_Lpid * remote_lpids) -{ - int i, mpi_errno = MPI_SUCCESS; - int *new_avt_procs; - char **new_upids; - int n_new_procs = 0; - int n_avts; - char *curr_upid; - int nic = 0; - int ctx_idx = 
MPIDI_OFI_get_ctx_index(0, nic); - - MPIR_CHKLMEM_DECL(2); - - MPIR_CHKLMEM_MALLOC(new_avt_procs, int *, sizeof(int) * size, mpi_errno, "new_avt_procs", - MPL_MEM_ADDRESS); - MPIR_CHKLMEM_MALLOC(new_upids, char **, sizeof(char *) * size, mpi_errno, "new_upids", - MPL_MEM_ADDRESS); - - n_avts = MPIDIU_get_n_avts(); - - curr_upid = remote_upids; - for (i = 0; i < size; i++) { - int j, k; - char tbladdr[FI_NAME_MAX]; - int found = 0; - size_t sz = 0; - - char *hostname = curr_upid; - int hostname_len = strlen(hostname); - char *addrname = hostname + hostname_len + 1; - int addrname_len = remote_upid_size[i] - hostname_len - 1; - - for (k = 0; k < n_avts; k++) { - if (MPIDIU_get_av_table(k) == NULL) { - continue; - } - for (j = 0; j < MPIDIU_get_av_table(k)->size; j++) { - sz = MPIDI_OFI_global.addrnamelen; - MPIDI_OFI_VCI_CALL(fi_av_lookup(MPIDI_OFI_global.ctx[ctx_idx].av, - MPIDI_OFI_TO_PHYS(k, j, nic), &tbladdr, &sz), 0, - avlookup); - if (sz == addrname_len && !memcmp(tbladdr, addrname, addrname_len)) { - remote_lpids[i] = MPIDIU_GPID_CREATE(k, j); - found = 1; - break; - } - } - if (found) { - break; - } - } - - if (!found) { - new_avt_procs[n_new_procs] = i; - new_upids[n_new_procs] = curr_upid; - n_new_procs++; - } - curr_upid += remote_upid_size[i]; - } - - /* create new av_table, insert processes */ - if (n_new_procs > 0) { - int avtid; - mpi_errno = MPIDIU_new_avt(n_new_procs, &avtid); - MPIR_ERR_CHECK(mpi_errno); - - for (i = 0; i < n_new_procs; i++) { - char *hostname = new_upids[i]; - char *addrname = hostname + strlen(hostname) + 1; - - fi_addr_t addr; - MPIDI_OFI_VCI_CALL(fi_av_insert(MPIDI_OFI_global.ctx[ctx_idx].av, addrname, - 1, &addr, 0ULL, NULL), 0, avmap); - MPIR_Assert(addr != FI_ADDR_NOTAVAIL); - MPIDI_OFI_AV(&MPIDIU_get_av(avtid, i)).dest[nic][0] = addr; - - int node_id; - mpi_errno = MPIR_nodeid_lookup(hostname, &node_id); - MPIR_ERR_CHECK(mpi_errno); - MPIDIU_get_av(avtid, i).node_id = node_id; - - remote_lpids[new_avt_procs[i]] = 
MPIDIU_GPID_CREATE(avtid, i); - } - } - - fn_exit: - MPIR_CHKLMEM_FREEALL(); - return mpi_errno; - fn_fail: - goto fn_exit; -} - int MPIDI_OFI_get_local_upids(MPIR_Comm * comm, int **local_upid_size, char **local_upids) { int mpi_errno = MPI_SUCCESS; diff --git a/src/mpid/ch4/netmod/ucx/ucx_spawn.c b/src/mpid/ch4/netmod/ucx/ucx_spawn.c index 87696e370cc..5cb4de4ce76 100644 --- a/src/mpid/ch4/netmod/ucx/ucx_spawn.c +++ b/src/mpid/ch4/netmod/ucx/ucx_spawn.c @@ -265,60 +265,3 @@ int MPIDI_UCX_insert_upid(MPIR_Lpid lpid, const char *upid, int upid_len) fn_fail: goto fn_exit; } - -int MPIDI_UCX_upids_to_lpids(int size, int *remote_upid_size, char *remote_upids, - MPIR_Lpid * remote_lpids) -{ - int mpi_errno = MPI_SUCCESS; - - int n_new_procs = 0; - int *new_avt_procs; - char **new_upids; - int vci = 0; - MPIR_CHKLMEM_DECL(2); - - MPIR_CHKLMEM_MALLOC(new_avt_procs, int *, sizeof(int) * size, mpi_errno, "new_avt_procs", - MPL_MEM_ADDRESS); - MPIR_CHKLMEM_MALLOC(new_upids, char **, sizeof(char *) * size, mpi_errno, "new_upids", - MPL_MEM_ADDRESS); - - char *curr_upid = remote_upids; - for (int i = 0; i < size; i++) { - MPIDI_upid_hash *t = MPIDIU_upidhash_find(curr_upid, remote_upid_size[i]); - if (t) { - remote_lpids[i] = MPIDIU_GPID_CREATE(t->avtid, t->lpid); - } else { - new_avt_procs[n_new_procs] = i; - new_upids[n_new_procs] = curr_upid; - n_new_procs++; - - } - curr_upid += remote_upid_size[i]; - } - - /* create new av_table, insert processes */ - if (n_new_procs > 0) { - int avtid; - mpi_errno = MPIDIU_new_avt(n_new_procs, &avtid); - MPIR_ERR_CHECK(mpi_errno); - - for (int i = 0; i < n_new_procs; i++) { - ucp_ep_params_t ep_params; - ucs_status_t ucx_status; - ep_params.field_mask = UCP_EP_PARAM_FIELD_REMOTE_ADDRESS; - ep_params.address = (ucp_address_t *) new_upids[i]; - ucx_status = ucp_ep_create(MPIDI_UCX_global.ctx[vci].worker, &ep_params, - &MPIDI_UCX_AV(&MPIDIU_get_av(avtid, i)).dest[0][0]); - MPIDI_UCX_CHK_STATUS(ucx_status); - 
MPIDIU_upidhash_add(new_upids[i], remote_upid_size[new_avt_procs[i]], avtid, i); - - remote_lpids[new_avt_procs[i]] = MPIDIU_GPID_CREATE(avtid, i); - } - } - - fn_exit: - MPIR_CHKLMEM_FREEALL(); - return mpi_errno; - fn_fail: - goto fn_exit; -} diff --git a/src/mpid/ch4/src/ch4_proc.c b/src/mpid/ch4/src/ch4_proc.c index 3cc121f4b23..34f2e143c11 100644 --- a/src/mpid/ch4/src/ch4_proc.c +++ b/src/mpid/ch4/src/ch4_proc.c @@ -338,7 +338,7 @@ MPIDI_av_entry_t *MPIDIU_lpid_to_av_slow(MPIR_Lpid lpid) } #ifdef MPIDI_BUILD_CH4_UPID_HASH -/* Store the upid, avtid, lpid in a hash to support get_local_upids and upids_to_lupids */ +/* Store the upid, avtid, lpid in a hash to support get_local_upids and insert_upid */ static MPIDI_upid_hash *upid_hash = NULL; void MPIDIU_upidhash_add(const void *upid, int upid_len, MPIR_Lpid lpid) @@ -371,24 +371,3 @@ void MPIDIU_upidhash_free(void) } } #endif - -/* convert upid to gpid by netmod. - * For ofi netmod, it inserts the address and fills an av entry. - */ -int MPIDIU_upids_to_lpids(int size, int *remote_upid_size, char *remote_upids, - MPIR_Lpid * remote_lpids) -{ - int mpi_errno = MPI_SUCCESS; - MPIR_FUNC_ENTER; - - MPID_THREAD_CS_ENTER(VCI, MPIDIU_THREAD_DYNPROC_MUTEX); - mpi_errno = MPIDI_NM_upids_to_lpids(size, remote_upid_size, remote_upids, remote_lpids); - MPIR_ERR_CHECK(mpi_errno); - - fn_exit: - MPID_THREAD_CS_EXIT(VCI, MPIDIU_THREAD_DYNPROC_MUTEX); - MPIR_FUNC_EXIT; - return mpi_errno; - fn_fail: - goto fn_exit; -} diff --git a/src/mpid/ch4/src/ch4_proc.h b/src/mpid/ch4/src/ch4_proc.h index a89fc1856d2..1425bf12e37 100644 --- a/src/mpid/ch4/src/ch4_proc.h +++ b/src/mpid/ch4/src/ch4_proc.h @@ -33,8 +33,6 @@ void MPIDIU_upidhash_add(const void *upid, int upid_len, MPIR_Lpid lpid); MPIDI_upid_hash *MPIDIU_upidhash_find(const void *upid, int upid_len); void MPIDIU_upidhash_free(void); #endif -int MPIDIU_upids_to_lpids(int size, int *remote_upid_size, char *remote_upids, - MPIR_Lpid * remote_lpids); MPL_STATIC_INLINE_PREFIX 
MPIDI_av_entry_t *MPIDIU_comm_rank_to_av(MPIR_Comm * comm, int rank) { From b0b1744e45dfb3316501c40f63e0e5e4eba759df Mon Sep 17 00:00:00 2001 From: Hui Zhou Date: Mon, 23 Dec 2024 08:53:51 -0600 Subject: [PATCH 57/59] ch4: replace MPIDIU_get_av We no longer expose avtid. Replace MPIDIU_get_av with MPIDIU_lpid_to_av. Also remove unused GPID macros. --- src/mpid/ch4/include/mpidpre.h | 11 ----------- src/mpid/ch4/netmod/ofi/init_addrxchg.c | 12 ++++++------ src/mpid/ch4/netmod/ofi/ofi_impl.h | 3 --- src/mpid/ch4/netmod/ofi/ofi_init.c | 8 ++++---- src/mpid/ch4/netmod/ofi/ofi_spawn.c | 4 +--- src/mpid/ch4/netmod/ucx/ucx_init.c | 10 +++++----- src/mpid/ch4/src/ch4_types.h | 3 --- 7 files changed, 16 insertions(+), 35 deletions(-) diff --git a/src/mpid/ch4/include/mpidpre.h b/src/mpid/ch4/include/mpidpre.h index ab5b0792beb..bab94b1bcf9 100644 --- a/src/mpid/ch4/include/mpidpre.h +++ b/src/mpid/ch4/include/mpidpre.h @@ -637,17 +637,6 @@ typedef struct MPIDI_av_entry { #define HAVE_DEV_COMM_HOOK -/* - * operation for (avtid, lpid) to/from gpid - */ -#define MPIDIU_LPID_BITS 32 -#define MPIDIU_LPID_MASK 0xFFFFFFFFU -#define MPIDIU_GPID_CREATE(avtid, lpid) (((uint64_t) (avtid) << MPIDIU_LPID_BITS) | (lpid)) -#define MPIDIU_GPID_GET_AVTID(gpid) ((gpid) >> MPIDIU_LPID_BITS) -#define MPIDIU_GPID_GET_LPID(gpid) ((gpid) & MPIDIU_LPID_MASK) - -#define MPIDI_DYNPROC_MASK (0x80000000U) - int MPIDI_check_for_failed_procs(void); #ifdef HAVE_SIGNAL diff --git a/src/mpid/ch4/netmod/ofi/init_addrxchg.c b/src/mpid/ch4/netmod/ofi/init_addrxchg.c index 7a1766df84e..6cb43a1c52f 100644 --- a/src/mpid/ch4/netmod/ofi/init_addrxchg.c +++ b/src/mpid/ch4/netmod/ofi/init_addrxchg.c @@ -133,7 +133,7 @@ int MPIDI_OFI_addr_exchange_root_ctx(void) for (int i = 0; i < num_nodes; i++) { MPIR_Assert(mapped_table[i] != FI_ADDR_NOTAVAIL); - MPIDI_OFI_AV(&MPIDIU_get_av(0, node_roots[i])).dest[0][0] = mapped_table[i]; + MPIDI_OFI_AV(MPIDIU_lpid_to_av(node_roots[i])).dest[0][0] = mapped_table[i]; }
MPL_free(mapped_table); /* Then, allgather all address names using init_comm */ @@ -149,7 +149,7 @@ int MPIDI_OFI_addr_exchange_root_ctx(void) char *addrname = (char *) table + recv_bc_len * rank_map[i]; MPIDI_OFI_CALL(fi_av_insert(MPIDI_OFI_global.ctx[0].av, addrname, 1, &addr, 0ULL, NULL), avmap); - MPIDI_OFI_AV(&MPIDIU_get_av(0, i)).dest[0][0] = addr; + MPIDI_OFI_AV(MPIDIU_lpid_to_av(i)).dest[0][0] = addr; } } mpi_errno = MPIDU_bc_table_destroy(); @@ -163,7 +163,7 @@ int MPIDI_OFI_addr_exchange_root_ctx(void) for (int i = 0; i < size; i++) { MPIR_Assert(mapped_table[i] != FI_ADDR_NOTAVAIL); - MPIDI_OFI_AV(&MPIDIU_get_av(0, i)).dest[0][0] = mapped_table[i]; + MPIDI_OFI_AV(MPIDIU_lpid_to_av(i)).dest[0][0] = mapped_table[i]; } MPL_free(mapped_table); mpi_errno = MPIDU_bc_table_destroy(); @@ -173,7 +173,7 @@ int MPIDI_OFI_addr_exchange_root_ctx(void) /* check */ if (MPIDI_OFI_ENABLE_AV_TABLE) { for (int r = 0; r < size; r++) { - MPIDI_OFI_addr_t *av ATTRIBUTE((unused)) = &MPIDI_OFI_AV(&MPIDIU_get_av(0, r)); + MPIDI_OFI_addr_t *av ATTRIBUTE((unused)) = &MPIDI_OFI_AV(MPIDIU_lpid_to_av(r)); MPIR_Assert(av->dest[0][0] == get_root_av_table_index(r)); } } @@ -192,7 +192,7 @@ int MPIDI_OFI_addr_exchange_root_ctx(void) /* Macros to reduce clutter, so we can focus on the ordering logics. * Note: they are not perfectly wrapped, but tolerable since only used here. 
*/ #define GET_AV_AND_ADDRNAMES(rank) \ - MPIDI_OFI_addr_t *av ATTRIBUTE((unused)) = &MPIDI_OFI_AV(&MPIDIU_get_av(0, rank)); \ + MPIDI_OFI_addr_t *av ATTRIBUTE((unused)) = &MPIDI_OFI_AV(MPIDIU_lpid_to_av(rank)); \ char *r_names = all_names + rank * max_vcis * num_nics * name_len; #define DO_AV_INSERT(ctx_idx, nic, vci) \ @@ -346,7 +346,7 @@ int MPIDI_OFI_addr_exchange_all_ctx(void) #if MPIDI_CH4_MAX_VCIS > 1 if (MPIDI_OFI_ENABLE_AV_TABLE) { for (int r = 0; r < size; r++) { - MPIDI_OFI_addr_t *av ATTRIBUTE((unused)) = &MPIDI_OFI_AV(&MPIDIU_get_av(0, r)); + MPIDI_OFI_addr_t *av ATTRIBUTE((unused)) = &MPIDI_OFI_AV(MPIDIU_lpid_to_av(r)); for (int nic = 0; nic < num_nics; nic++) { for (int vci = 0; vci < NUM_VCIS_FOR_RANK(r); vci++) { MPIR_Assert(av->dest[nic][vci] == get_av_table_index(r, nic, vci, diff --git a/src/mpid/ch4/netmod/ofi/ofi_impl.h b/src/mpid/ch4/netmod/ofi/ofi_impl.h index ecfeb8c4c5c..556a2e4110d 100644 --- a/src/mpid/ch4/netmod/ofi/ofi_impl.h +++ b/src/mpid/ch4/netmod/ofi/ofi_impl.h @@ -31,9 +31,6 @@ ATTRIBUTE((unused)); #define MPIDI_OFI_DT(dt) ((dt)->dev.netmod.ofi) #define MPIDI_OFI_OP(op) ((op)->dev.netmod.ofi) #define MPIDI_OFI_COMM(comm) ((comm)->dev.ch4.netmod.ofi) -#define MPIDI_OFI_TO_PHYS(avtid, lpid, _nic) \ - MPIDI_OFI_AV(&MPIDIU_get_av((avtid), (lpid))).dest[_nic][0] - #define MPIDI_OFI_WIN(win) ((win)->dev.netmod.ofi) #define MPIDI_OFI_NIC_NAME(nic) (MPIDI_OFI_global.prov_use[nic] ? 
\ diff --git a/src/mpid/ch4/netmod/ofi/ofi_init.c b/src/mpid/ch4/netmod/ofi/ofi_init.c index 634d3b7facb..32c56c0bce7 100644 --- a/src/mpid/ch4/netmod/ofi/ofi_init.c +++ b/src/mpid/ch4/netmod/ofi/ofi_init.c @@ -943,7 +943,7 @@ static int flush_send(int dst, int nic, int vci, MPIDI_OFI_dynamic_process_reque { int mpi_errno = MPI_SUCCESS; - fi_addr_t addr = MPIDI_OFI_av_to_phys(&MPIDIU_get_av(0, dst), nic, vci); + fi_addr_t addr = MPIDI_OFI_av_to_phys(MPIDIU_lpid_to_av(dst), nic, vci); static int data = 0; uint64_t match_bits = MPIDI_OFI_init_sendtag(MPIDI_OFI_FLUSH_CONTEXT_ID, 0, MPIDI_OFI_FLUSH_TAG); @@ -974,7 +974,7 @@ static int flush_recv(int src, int nic, int vci, MPIDI_OFI_dynamic_process_reque { int mpi_errno = MPI_SUCCESS; - fi_addr_t addr = MPIDI_OFI_av_to_phys(&MPIDIU_get_av(0, src), nic, vci); + fi_addr_t addr = MPIDI_OFI_av_to_phys(MPIDIU_lpid_to_av(src), nic, vci); uint64_t mask_bits = 0; uint64_t match_bits = MPIDI_OFI_init_sendtag(MPIDI_OFI_FLUSH_CONTEXT_ID, 0, MPIDI_OFI_FLUSH_TAG); @@ -1555,10 +1555,10 @@ static int try_open_shared_av(struct fid_domain *domain, struct fid_av **p_av, i /* directly references the mapped fi_addr_t array instead */ fi_addr_t *mapped_table = (fi_addr_t *) av_attr.map_addr; for (int i = 0; i < MPIR_Process.size; i++) { - MPIDI_OFI_AV(&MPIDIU_get_av(0, i)).dest[nic][0] = mapped_table[i]; + MPIDI_OFI_AV(MPIDIU_lpid_to_av(i)).dest[nic][0] = mapped_table[i]; MPL_DBG_MSG_FMT(MPIDI_CH4_DBG_MAP, VERBOSE, (MPL_DBG_FDEST, " grank mapped to: rank=%d, av=%p, dest=%" PRIu64, - i, (void *) &MPIDIU_get_av(0, i), mapped_table[i])); + i, (void *) MPIDIU_lpid_to_av(i), mapped_table[i])); } ret = 1; } diff --git a/src/mpid/ch4/netmod/ofi/ofi_spawn.c b/src/mpid/ch4/netmod/ofi/ofi_spawn.c index c50bac9b065..906667b7020 100644 --- a/src/mpid/ch4/netmod/ofi/ofi_spawn.c +++ b/src/mpid/ch4/netmod/ofi/ofi_spawn.c @@ -20,9 +20,7 @@ int MPIDI_OFI_dynamic_send(MPIR_Lpid remote_lpid, int tag, const void *buf, int int nic = 0; /* dynamic process only 
use nic 0 */ int vci = 0; /* dynamic process only use vci 0 */ int ctx_idx = 0; - int avtid = MPIDIU_GPID_GET_AVTID(remote_lpid); - int lpid = MPIDIU_GPID_GET_LPID(remote_lpid); - fi_addr_t remote_addr = MPIDI_OFI_av_to_phys(&MPIDIU_get_av(avtid, lpid), nic, vci); + fi_addr_t remote_addr = MPIDI_OFI_av_to_phys(MPIDIU_lpid_to_av_slow(remote_lpid), nic, vci); MPIDI_OFI_dynamic_process_request_t req; req.done = 0; diff --git a/src/mpid/ch4/netmod/ucx/ucx_init.c b/src/mpid/ch4/netmod/ucx/ucx_init.c index 8c7015ba44e..ad4c08c8363 100644 --- a/src/mpid/ch4/netmod/ucx/ucx_init.c +++ b/src/mpid/ch4/netmod/ucx/ucx_init.c @@ -102,7 +102,7 @@ static int initial_address_exchange(void) ep_params.address = (ucp_address_t *) ((char *) table + i * recv_bc_len); ucx_status = ucp_ep_create(MPIDI_UCX_global.ctx[0].worker, &ep_params, - &MPIDI_UCX_AV(&MPIDIU_get_av(0, node_roots[i])).dest[0][0]); + &MPIDI_UCX_AV(MPIDIU_lpid_to_av(node_roots[i])).dest[0][0]); MPIDI_UCX_CHK_STATUS(ucx_status); MPIDIU_upidhash_add(ep_params.address, recv_bc_len, node_roots[i]); } @@ -117,7 +117,7 @@ static int initial_address_exchange(void) ep_params.field_mask = UCP_EP_PARAM_FIELD_REMOTE_ADDRESS; ep_params.address = (ucp_address_t *) ((char *) table + rank_map[i] * recv_bc_len); ucx_status = ucp_ep_create(MPIDI_UCX_global.ctx[0].worker, &ep_params, - &MPIDI_UCX_AV(&MPIDIU_get_av(0, i)).dest[0][0]); + &MPIDI_UCX_AV(MPIDIU_lpid_to_av(i)).dest[0][0]); MPIDI_UCX_CHK_STATUS(ucx_status); MPIDIU_upidhash_add(ep_params.address, recv_bc_len, i); } @@ -130,7 +130,7 @@ static int initial_address_exchange(void) ep_params.address = (ucp_address_t *) ((char *) table + i * recv_bc_len); ucx_status = ucp_ep_create(MPIDI_UCX_global.ctx[0].worker, &ep_params, - &MPIDI_UCX_AV(&MPIDIU_get_av(0, i)).dest[0][0]); + &MPIDI_UCX_AV(MPIDIU_lpid_to_av(i)).dest[0][0]); MPIDI_UCX_CHK_STATUS(ucx_status); MPIDIU_upidhash_add(ep_params.address, recv_bc_len, i); } @@ -180,7 +180,7 @@ static int all_vcis_address_exchange(void) 
ucp_ep_params_t ep_params; for (int vci_local = 0; vci_local < num_vcis; vci_local++) { for (int r = 0; r < size; r++) { - MPIDI_UCX_addr_t *av = &MPIDI_UCX_AV(&MPIDIU_get_av(0, r)); + MPIDI_UCX_addr_t *av = &MPIDI_UCX_AV(MPIDIU_lpid_to_av(r)); for (int vci_remote = 0; vci_remote < num_vcis; vci_remote++) { if (vci_local == 0 && vci_remote == 0) { /* don't overwrite existing addr, or bad things will happen */ @@ -369,7 +369,7 @@ int MPIDI_UCX_mpi_finalize_hook(void) int p = 0; for (int i = 0; i < MPIR_Process.size; i++) { - MPIDI_UCX_addr_t *av = &MPIDI_UCX_AV(&MPIDIU_get_av(0, i)); + MPIDI_UCX_addr_t *av = &MPIDI_UCX_AV(MPIDIU_lpid_to_av(i)); for (int vci_local = 0; vci_local < MPIDI_UCX_global.num_vcis; vci_local++) { for (int vci_remote = 0; vci_remote < MPIDI_UCX_global.num_vcis; vci_remote++) { ucp_request = ucp_disconnect_nb(av->dest[vci_local][vci_remote]); diff --git a/src/mpid/ch4/src/ch4_types.h b/src/mpid/ch4/src/ch4_types.h index e618450a745..947628a0fa8 100644 --- a/src/mpid/ch4/src/ch4_types.h +++ b/src/mpid/ch4/src/ch4_types.h @@ -213,9 +213,6 @@ typedef struct { MPIDI_dyn_av_table_t dynamic_av_table; } MPIDIU_avt_manager; -#define MPIDIU_get_av_table(avtid) (MPIDI_global.avt_mgr.av_tables[(avtid)]) -#define MPIDIU_get_av(avtid, lpid) (MPIDI_global.avt_mgr.av_tables[(avtid)]->table[(lpid)]) - typedef struct { uint64_t key; void *value; From e7316e695a28f12dae53aee203eb604760c861cf Mon Sep 17 00:00:00 2001 From: Hui Zhou Date: Tue, 24 Dec 2024 22:28:10 -0600 Subject: [PATCH 58/59] ch4/ofi: add a way to tell whether an av entry is empty When ch4-layer allocates an av table, all entries are initialized to 0. However, 0 can be a valid entry for fi_addr_t. We could initialize all entries to FI_ADDR_NOTAVAIL, but that requires an additional complexity of a netmod API. 
Instead, because the entry 0 is always the first entry to be inserted by fi_av_insert, we can simply remember the entry (MPIDI_OFI_global.lpid0) and be able to tell which entries are empty (in MPIDI_OFI_insert_upid). --- src/mpid/ch4/netmod/ofi/init_addrxchg.c | 6 ++++++ src/mpid/ch4/netmod/ofi/ofi_init.c | 3 +++ src/mpid/ch4/netmod/ofi/ofi_spawn.c | 6 +++++- src/mpid/ch4/netmod/ofi/ofi_types.h | 3 +++ src/mpid/ch4/src/ch4_proc.c | 2 -- 5 files changed, 17 insertions(+), 3 deletions(-) diff --git a/src/mpid/ch4/netmod/ofi/init_addrxchg.c b/src/mpid/ch4/netmod/ofi/init_addrxchg.c index 6cb43a1c52f..a37e9bd1f62 100644 --- a/src/mpid/ch4/netmod/ofi/init_addrxchg.c +++ b/src/mpid/ch4/netmod/ofi/init_addrxchg.c @@ -131,6 +131,9 @@ int MPIDI_OFI_addr_exchange_root_ctx(void) (MPIDI_OFI_global.ctx[0].av, table, num_nodes, mapped_table, 0ULL, NULL), avmap); + if (mapped_table[0] == 0) { + MPIDI_OFI_global.lpid0 = node_roots[0]; + } for (int i = 0; i < num_nodes; i++) { MPIR_Assert(mapped_table[i] != FI_ADDR_NOTAVAIL); MPIDI_OFI_AV(MPIDIU_lpid_to_av(node_roots[i])).dest[0][0] = mapped_table[i]; @@ -161,6 +164,9 @@ int MPIDI_OFI_addr_exchange_root_ctx(void) MPIDI_OFI_CALL(fi_av_insert (MPIDI_OFI_global.ctx[0].av, table, size, mapped_table, 0ULL, NULL), avmap); + if (mapped_table[0] == 0) { + MPIDI_OFI_global.lpid0 = 0; + } for (int i = 0; i < size; i++) { MPIR_Assert(mapped_table[i] != FI_ADDR_NOTAVAIL); MPIDI_OFI_AV(MPIDIU_lpid_to_av(i)).dest[0][0] = mapped_table[i]; diff --git a/src/mpid/ch4/netmod/ofi/ofi_init.c b/src/mpid/ch4/netmod/ofi/ofi_init.c index 32c56c0bce7..651110c5bda 100644 --- a/src/mpid/ch4/netmod/ofi/ofi_init.c +++ b/src/mpid/ch4/netmod/ofi/ofi_init.c @@ -719,6 +719,9 @@ int MPIDI_OFI_init_local(int *tag_bits) mpi_errno = ofi_pvar_init(); MPIR_ERR_CHECK(mpi_errno); + /* A way to tell which av is empty */ + MPIDI_OFI_global.lpid0 = MPIR_LPID_INVALID; + /* -------------------------------- */ /* Set up the libfabric provider(s) */ /* 
-------------------------------- */ diff --git a/src/mpid/ch4/netmod/ofi/ofi_spawn.c b/src/mpid/ch4/netmod/ofi/ofi_spawn.c index 906667b7020..187e1db4b2c 100644 --- a/src/mpid/ch4/netmod/ofi/ofi_spawn.c +++ b/src/mpid/ch4/netmod/ofi/ofi_spawn.c @@ -288,7 +288,7 @@ int MPIDI_OFI_insert_upid(MPIR_Lpid lpid, const char *upid, int upid_len) bool do_insert = false; if (lpid & MPIR_LPID_DYNAMIC_MASK) { do_insert = true; - } else if (MPIDI_OFI_AV(av).dest[0][0] == FI_ADDR_NOTAVAIL) { + } else if (MPIDI_OFI_AV(av).dest[0][0] == 0 && lpid != MPIDI_OFI_global.lpid0) { MPIDI_av_entry_t *dynamic_av = MPIDIU_find_dynamic_av(upid, upid_len); if (dynamic_av) { /* just copy it over */ @@ -312,6 +312,10 @@ int MPIDI_OFI_insert_upid(MPIR_Lpid lpid, const char *upid, int upid_len) MPIR_Assert(MPIDI_OFI_AV(av).dest[0][0] != FI_ADDR_NOTAVAIL); } + if (MPIDI_OFI_AV(av).dest[0][0] == 0) { + MPIDI_OFI_global.lpid0 = lpid; + } + fn_exit: return mpi_errno; fn_fail: diff --git a/src/mpid/ch4/netmod/ofi/ofi_types.h b/src/mpid/ch4/netmod/ofi/ofi_types.h index 9b1309fd0e5..cdfe66c5394 100644 --- a/src/mpid/ch4/netmod/ofi/ofi_types.h +++ b/src/mpid/ch4/netmod/ofi/ofi_types.h @@ -524,6 +524,9 @@ typedef struct { size_t addrnamelen; /* OFI uses the same name length within a provider. */ char pname[MPI_MAX_PROCESSOR_NAME]; int port_name_tag_mask[MPIR_MAX_CONTEXT_MASK]; + /* To support dynamic av tables, we need a way to tell which entries are empty. + * ch4 av tables are initialized to 0s. Thus we need to know which "0" is valid. 
*/ + MPIR_Lpid lpid0; /* Capability settings */ #ifdef MPIDI_OFI_ENABLE_RUNTIME_CHECKS diff --git a/src/mpid/ch4/src/ch4_proc.c b/src/mpid/ch4/src/ch4_proc.c index 34f2e143c11..a5624b8f348 100644 --- a/src/mpid/ch4/src/ch4_proc.c +++ b/src/mpid/ch4/src/ch4_proc.c @@ -104,8 +104,6 @@ int MPIDIU_new_avt(int size, int *avtid) MPIR_cc_set(&MPIDI_global.avt_mgr.av_tables[*avtid]->ref_count, 0); - /* TODO: to support dynamic processes and dynamic av insertions, we need device hooks to initialize table with invalid entries */ - MPIR_FUNC_EXIT; return mpi_errno; } From 54d2544036a9eb63491611dc2f5f274e0c2968d7 Mon Sep 17 00:00:00 2001 From: Hui Zhou Date: Wed, 25 Dec 2024 00:01:27 -0600 Subject: [PATCH 59/59] ch4: remove av table ref_count We don't really track av tables' ref_count. We simply free all av tables at finalize. Rename MPIDIU_avt_destroy to MPIDIU_avt_finalize to better reflect its role. --- src/mpid/ch4/src/ch4_init.c | 2 +- src/mpid/ch4/src/ch4_proc.c | 35 ++--------------------------------- src/mpid/ch4/src/ch4_proc.h | 4 +--- src/mpid/ch4/src/ch4_types.h | 1 - 4 files changed, 4 insertions(+), 38 deletions(-) diff --git a/src/mpid/ch4/src/ch4_init.c b/src/mpid/ch4/src/ch4_init.c index 6842fed8b76..58af18a41ba 100644 --- a/src/mpid/ch4/src/ch4_init.c +++ b/src/mpid/ch4/src/ch4_init.c @@ -820,7 +820,7 @@ int MPID_Finalize(void) MPIDU_genq_private_pool_destroy(MPIDI_global.gpu_coll_pool); - MPIDIU_avt_destroy(); + MPIDIU_avt_finalize(); mpi_errno = MPIDU_Init_shm_finalize(); MPIR_ERR_CHECK(mpi_errno); diff --git a/src/mpid/ch4/src/ch4_proc.c b/src/mpid/ch4/src/ch4_proc.c index a5624b8f348..c9442caf662 100644 --- a/src/mpid/ch4/src/ch4_proc.c +++ b/src/mpid/ch4/src/ch4_proc.c @@ -102,8 +102,6 @@ int MPIDIU_new_avt(int size, int *avtid) } MPIDI_global.avt_mgr.av_tables[*avtid] = new_av_table; - MPIR_cc_set(&MPIDI_global.avt_mgr.av_tables[*avtid]->ref_count, 0); - MPIR_FUNC_EXIT; return mpi_errno; } @@ -124,33 +122,6 @@ int MPIDIU_free_avt(int avtid) return 
mpi_errno; } -int MPIDIU_avt_add_ref(int avtid) -{ - MPIR_FUNC_ENTER; - - MPL_DBG_MSG_FMT(MPIDI_CH4_DBG_GENERAL, VERBOSE, (MPL_DBG_FDEST, " incr avtid=%d", avtid)); - MPIR_cc_inc(&MPIDI_global.avt_mgr.av_tables[avtid]->ref_count); - - MPIR_FUNC_EXIT; - return MPI_SUCCESS; -} - -int MPIDIU_avt_release_ref(int avtid) -{ - int in_use; - - MPIR_FUNC_ENTER; - - MPL_DBG_MSG_FMT(MPIDI_CH4_DBG_GENERAL, VERBOSE, (MPL_DBG_FDEST, " decr avtid=%d", avtid)); - MPIR_cc_decr(&MPIDI_global.avt_mgr.av_tables[avtid]->ref_count, &in_use); - if (!in_use) { - MPIDIU_free_avt(avtid); - } - - MPIR_FUNC_EXIT; - return MPI_SUCCESS; -} - static void init_dynamic_av_table(void); static void destroy_dynamic_av_table(void); @@ -178,7 +149,6 @@ int MPIDIU_avt_init(void) #endif MPIDI_global.avt_mgr.av_table0->size = size; - MPIR_cc_set(&MPIDI_global.avt_mgr.av_table0->ref_count, 1); for (int i = 0; i < size; i++) { MPIDI_global.avt_mgr.av_table0->table[i].is_local = @@ -194,14 +164,13 @@ int MPIDIU_avt_init(void) return mpi_errno; } -int MPIDIU_avt_destroy(void) +int MPIDIU_avt_finalize(void) { MPIR_FUNC_ENTER; for (int i = 0; i < MPIDI_global.avt_mgr.n_avts; i++) { if (MPIDI_global.avt_mgr.av_tables[i] != NULL) { - MPIDIU_avt_release_ref(i); - /*TODO: Check all references is cleared and the entry is set to NULL */ + MPIDIU_free_avt(i); } } diff --git a/src/mpid/ch4/src/ch4_proc.h b/src/mpid/ch4/src/ch4_proc.h index 1425bf12e37..706f67b5f7e 100644 --- a/src/mpid/ch4/src/ch4_proc.h +++ b/src/mpid/ch4/src/ch4_proc.h @@ -22,10 +22,8 @@ int MPIDIU_get_n_avts(void); int MPIDIU_get_avt_size(int avtid); int MPIDIU_new_avt(int size, int *avtid); int MPIDIU_free_avt(int avtid); -int MPIDIU_avt_add_ref(int avtid); -int MPIDIU_avt_release_ref(int avtid); int MPIDIU_avt_init(void); -int MPIDIU_avt_destroy(void); +int MPIDIU_avt_finalize(void); int MPIDIU_get_node_id(MPIR_Comm * comm, int rank, int *id_p); #ifdef MPIDI_BUILD_CH4_UPID_HASH diff --git a/src/mpid/ch4/src/ch4_types.h 
b/src/mpid/ch4/src/ch4_types.h index 947628a0fa8..c8ea6c1cc09 100644 --- a/src/mpid/ch4/src/ch4_types.h +++ b/src/mpid/ch4/src/ch4_types.h @@ -185,7 +185,6 @@ typedef struct MPIDIG_acc_ack_msg_t { typedef MPIDIG_acc_ack_msg_t MPIDIG_get_acc_ack_msg_t; typedef struct { - MPIR_cc_t ref_count; int size; MPIDI_av_entry_t table[]; } MPIDI_av_table_t;