From 225e98f57fed39d8707626df96e5b0aa3efe5cc3 Mon Sep 17 00:00:00 2001 From: Hui Zhou Date: Tue, 10 Dec 2024 13:28:48 -0600 Subject: [PATCH 01/19] test: remove unused test glpid This test requires to access MPICH internals, thus won't be used with the current design. --- test/mpi/group/Makefile.am | 4 ---- test/mpi/group/glpid.c | 44 -------------------------------------- 2 files changed, 48 deletions(-) delete mode 100644 test/mpi/group/glpid.c diff --git a/test/mpi/group/Makefile.am b/test/mpi/group/Makefile.am index d647c9d377a..993dab99371 100644 --- a/test/mpi/group/Makefile.am +++ b/test/mpi/group/Makefile.am @@ -16,7 +16,3 @@ noinst_PROGRAMS = \ groupcreate \ gtranks \ groupnullincl - -# glpid is a whitebox test that uses mpiimpl.h; it is unlikely to build with the -# current build system setup -#EXTRA_PROGRAMS = glpid diff --git a/test/mpi/group/glpid.c b/test/mpi/group/glpid.c deleted file mode 100644 index 06238aeb942..00000000000 --- a/test/mpi/group/glpid.c +++ /dev/null @@ -1,44 +0,0 @@ -/* - * Copyright (C) by Argonne National Laboratory - * See COPYRIGHT in top-level directory - */ - -#include -#include "mpi.h" -#include "mpiimpl.h" - -int main(int argc, char *argv[]) -{ - MPIR_Group group, *group_ptr = &group; - int i; - - MPI_Init(&argc, &argv); - - /* Setup a sample group */ - group.handle = 1; - group.ref_count = 1; - group.size = 4; - group.rank = 0; - group.idx_of_first_lpid = -1; - group.lrank_to_lpid = (MPII_Group_pmap_t *) - MPL_malloc(group.size * sizeof(MPII_Group_pmap_t), MPL_MEM_OTHER); - for (i = 0; i < group.size; i++) { - group.lrank_to_lpid[i].lrank = i; - group.lrank_to_lpid[i].lpid = group.size - i - 1; - group.lrank_to_lpid[i].next_lpid = -1; - group.lrank_to_lpid[i].flag = 0; - } - - /* Set up the group lpid list */ - MPII_Group_setup_lpid_list(group_ptr); - - /* Print the group structure */ - printf("Index of first lpid = %d\n", group.idx_of_first_lpid); - for (i = 0; i < group.size; i++) { - printf("lrank_to_lpid[%d].next_lpid 
= %d, .lpid = %d\n", - i, group.lrank_to_lpid[i].next_lpid, group.lrank_to_lpid[i].lpid); - } - - MPI_Finalize(); - return 0; -} From 0a216599030490c4a03ddb8088867e01b64f5617 Mon Sep 17 00:00:00 2001 From: Hui Zhou Date: Tue, 10 Dec 2024 13:23:06 -0600 Subject: [PATCH 02/19] group: remove unused groupdebug.c We no longer use this file. --- src/mpi/group/groupdebug.c | 77 -------------------------------------- 1 file changed, 77 deletions(-) delete mode 100644 src/mpi/group/groupdebug.c diff --git a/src/mpi/group/groupdebug.c b/src/mpi/group/groupdebug.c deleted file mode 100644 index a70b9592d2f..00000000000 --- a/src/mpi/group/groupdebug.c +++ /dev/null @@ -1,77 +0,0 @@ -/* - * Copyright (C) by Argonne National Laboratory - * See COPYRIGHT in top-level directory - */ - -#include "mpiimpl.h" -#include "group.h" - -/* style: allow:fprintf:2 sig:0 */ -/* style: PMPIuse:PMPI_Abort:2 sig:0 */ - -/* - * This file contains routines that are used only to perform testing - * and debugging of the group routines - */ -void MPITEST_Group_create(int, int, MPI_Group *); -void MPITEST_Group_print(MPI_Group); - -/* --BEGIN DEBUG-- */ -void MPITEST_Group_create(int nproc, int myrank, MPI_Group * new_group) -{ - MPIR_Group *new_group_ptr; - int i; - - new_group_ptr = (MPIR_Group *) MPIR_Handle_obj_alloc(&MPIR_Group_mem); - if (!new_group_ptr) { - fprintf(stderr, "Could not create a new group\n"); - PMPI_Abort(MPI_COMM_WORLD, 1); - } - MPIR_Object_set_ref(new_group_ptr, 1); - new_group_ptr->lrank_to_lpid = - (MPII_Group_pmap_t *) MPL_malloc(nproc * sizeof(MPII_Group_pmap_t), MPL_MEM_DEBUG); - if (!new_group_ptr->lrank_to_lpid) { - fprintf(stderr, "Could not create lrank map for new group\n"); - PMPI_Abort(MPI_COMM_WORLD, 1); - } - - new_group_ptr->rank = MPI_UNDEFINED; - for (i = 0; i < nproc; i++) { - new_group_ptr->lrank_to_lpid[i].lrank = i; - new_group_ptr->lrank_to_lpid[i].lpid = i; - } - new_group_ptr->size = nproc; - new_group_ptr->rank = myrank; - 
new_group_ptr->idx_of_first_lpid = -1; - - *new_group = new_group_ptr->handle; -} - -void MPITEST_Group_print(MPI_Group g) -{ - MPIR_Group *g_ptr; - int g_idx, size, i; - - MPIR_Group_get_ptr(g, g_ptr); - - g_idx = g_ptr->idx_of_first_lpid; - if (g_idx < 0) { - MPII_Group_setup_lpid_list(g_ptr); - g_idx = g_ptr->idx_of_first_lpid; - } - - /* Loop through these, printing the lpids by rank and in order */ - size = g_ptr->size; - fprintf(stdout, "Lpids in rank order\n"); - for (i = 0; i < size; i++) { - fprintf(stdout, "Rank %d has lpid %d\n", i, g_ptr->lrank_to_lpid[i].lpid); - } - - fprintf(stdout, "Ranks in lpid order\n"); - while (g_idx >= 0) { - fprintf(stdout, "Rank %d has lpid %d\n", g_idx, g_ptr->lrank_to_lpid[g_idx].lpid); - g_idx = g_ptr->lrank_to_lpid[g_idx].next_lpid; - } -} - -/* --END DEBUG-- */ From 976d46ce3ab03da0c5da218c9c7e7e9f118c4fb4 Mon Sep 17 00:00:00 2001 From: Hui Zhou Date: Tue, 10 Dec 2024 14:17:37 -0600 Subject: [PATCH 03/19] group: abstract group access and lpid integer type Hide the internal fields of MPIR_Group from unnecessary access. Outside group_util.c and group_impl.c, it only need assume the MPIR_Lpid integer type, creation routines based on lpid map or lpid stride description, and access routine to look up lpid from a group rank. --- src/include/mpir_group.h | 17 +++++++- src/mpi/group/grouputil.c | 84 +++++++++++++++++++++++++++++++++++++++ 2 files changed, 100 insertions(+), 1 deletion(-) diff --git a/src/include/mpir_group.h b/src/include/mpir_group.h index c40f22fe877..43e79552ec1 100644 --- a/src/include/mpir_group.h +++ b/src/include/mpir_group.h @@ -11,12 +11,19 @@ * only because they are required for the group operations (e.g., * MPI_Group_intersection) and for the scalable RMA synchronization *---------------------------------------------------------------------------*/ + +/* Abstract the integer type for lpid (process id). 
It is possible to use 32-bit + * in principle, but 64-bit is simpler since we can trivially combine + * (world_idx, world_rank). + */ +typedef uint64_t MPIR_Lpid; + /* This structure is used to implement the group operations such as MPI_Group_translate_ranks */ /* note: next_lpid (with idx_of_first_lpid in MPIR_Group) gives a linked list * in a sorted lpid ascending order */ typedef struct MPII_Group_pmap_t { - uint64_t lpid; /* local process id, from VCONN */ + MPIR_Lpid lpid; /* local process id, from VCONN */ int next_lpid; /* Index of next lpid (in lpid order) */ } MPII_Group_pmap_t; @@ -104,6 +111,14 @@ void MPIR_Group_setup_lpid_pairs(MPIR_Group *, MPIR_Group *); int MPIR_Group_create(int, MPIR_Group **); int MPIR_Group_release(MPIR_Group * group_ptr); +int MPIR_Group_create_map(int size, int rank, MPIR_Session * session_ptr, MPIR_Lpid * map, + MPIR_Group ** new_group_ptr); +int MPIR_Group_create_stride(int size, int rank, MPIR_Session * session_ptr, + MPIR_Lpid offset, MPIR_Lpid stride, MPIR_Lpid blocksize, + MPIR_Group ** new_group_ptr); +MPIR_Lpid MPIR_Group_rank_to_lpid(MPIR_Group * group, int rank); +int MPIR_Group_lpid_to_rank(MPIR_Group * group, MPIR_Lpid lpid); + int MPIR_Group_check_subset(MPIR_Group * group_ptr, MPIR_Comm * comm_ptr); void MPIR_Group_set_session_ptr(MPIR_Group * group_ptr, MPIR_Session * session_out); int MPIR_Group_init(void); diff --git a/src/mpi/group/grouputil.c b/src/mpi/group/grouputil.c index ac777e50305..414c562fe3c 100644 --- a/src/mpi/group/grouputil.c +++ b/src/mpi/group/grouputil.c @@ -94,6 +94,90 @@ int MPIR_Group_create(int nproc, MPIR_Group ** new_group_ptr) return mpi_errno; } +int MPIR_Group_create_map(int size, int rank, MPIR_Session * session_ptr, MPIR_Lpid * map, + MPIR_Group ** new_group_ptr) +{ + int mpi_errno = MPI_SUCCESS; + + if (size == 0) { + /* See 5.3.2, Group Constructors. 
For many group routines, + * the standard explicitly says to return MPI_GROUP_EMPTY; + * for others it is implied */ + *new_group_ptr = MPIR_Group_empty; + goto fn_exit; + } + + MPIR_Group *newgrp; + mpi_errno = MPIR_Group_create(size, &newgrp); + MPIR_ERR_CHECK(mpi_errno); + + newgrp->rank = rank; + MPIR_Group_set_session_ptr(newgrp, session_ptr); + + for (int i = 0; i < size; i++) { + newgrp->lrank_to_lpid[i].lpid = map[i]; + } + + *new_group_ptr = newgrp; + + fn_exit: + MPL_free(map); + return mpi_errno; + fn_fail: + goto fn_exit; +} + +int MPIR_Group_create_stride(int size, int rank, MPIR_Session * session_ptr, + MPIR_Lpid offset, MPIR_Lpid stride, MPIR_Lpid blocksize, + MPIR_Group ** new_group_ptr) +{ + int mpi_errno = MPI_SUCCESS; + MPIR_Group *newgrp; + + MPIR_Assert(size > 0); + + mpi_errno = MPIR_Group_create(size, &newgrp); + MPIR_ERR_CHECK(mpi_errno); + + newgrp->rank = rank; + MPIR_Group_set_session_ptr(newgrp, session_ptr); + + MPIR_Lpid lpid = offset; + int i = 0; + while (i < size) { + for (int j = 0; j < blocksize; j++) { + newgrp->lrank_to_lpid[i + j].lpid = lpid + j; + } + i += blocksize; + lpid += stride; + } + + *new_group_ptr = newgrp; + + fn_exit: + return mpi_errno; + fn_fail: + goto fn_exit; +} + +MPIR_Lpid MPIR_Group_rank_to_lpid(MPIR_Group * group, int rank) +{ + return group->lrank_to_lpid[rank].lpid; +} + +int MPIR_Group_lpid_to_rank(MPIR_Group * group, MPIR_Lpid lpid) +{ + /* Use linear search for now. + * Optimization, build hash map in MPIR_Group_create_map and do O(1) hash lookup + */ + for (int i = 0; i < group->size; i++) { + if (lpid == group->lrank_to_lpid[i].lpid) { + return i; + } + } + return MPI_UNDEFINED; +} + /* * return value is the first index in the list * From 729e8a7e337067ec96e5e5288e678d48f620a123 Mon Sep 17 00:00:00 2001 From: Hui Zhou Date: Tue, 10 Dec 2024 13:41:20 -0600 Subject: [PATCH 04/19] misc: use the new group rank/lpid conversion routines For most external usages, we only need MPIR_Group_rank_to_lpid. 
--- src/mpi/comm/comm_impl.c | 6 +++--- src/mpid/ch3/src/ch3u_comm.c | 2 +- src/mpid/ch3/src/ch3u_handle_connection.c | 2 +- src/mpid/ch4/src/ch4_impl.h | 5 ++++- 4 files changed, 9 insertions(+), 6 deletions(-) diff --git a/src/mpi/comm/comm_impl.c b/src/mpi/comm/comm_impl.c index 9dbba6d703f..9f361f60007 100644 --- a/src/mpi/comm/comm_impl.c +++ b/src/mpi/comm/comm_impl.c @@ -224,7 +224,7 @@ int MPII_Comm_create_calculate_mapping(MPIR_Group * group_ptr, subsetOfWorld = 1; wsize = MPIR_Process.size; for (i = 0; i < n; i++) { - uint64_t g_lpid = group_ptr->lrank_to_lpid[i].lpid; + MPIR_Lpid g_lpid = MPIR_Group_rank_to_lpid(group_ptr, i); /* This mapping is relative to comm world */ MPL_DBG_MSG_FMT(MPIR_DBG_COMM, VERBOSE, @@ -261,7 +261,7 @@ int MPII_Comm_create_calculate_mapping(MPIR_Group * group_ptr, for (j = 0; j < comm_ptr->local_size; j++) { uint64_t comm_lpid; MPID_Comm_get_lpid(comm_ptr, j, &comm_lpid, FALSE); - if (comm_lpid == group_ptr->lrank_to_lpid[i].lpid) { + if (comm_lpid == MPIR_Group_rank_to_lpid(group_ptr, i)) { mapping[i] = j; break; } @@ -800,7 +800,7 @@ int MPIR_Intercomm_create_from_groups_impl(MPIR_Group * local_group_ptr, int loc int tag = get_tag_from_stringtag(stringtag); /* FIXME: ensure lpid is from comm_world */ - uint64_t remote_lpid = remote_group_ptr->lrank_to_lpid[remote_leader].lpid; + MPIR_Lpid remote_lpid = MPIR_Group_rank_to_lpid(remote_group_ptr, remote_leader); MPIR_Assert(remote_lpid < MPIR_Process.size); mpi_errno = MPIR_Intercomm_create_impl(local_comm, local_leader, MPIR_Process.comm_world, (int) remote_lpid, diff --git a/src/mpid/ch3/src/ch3u_comm.c b/src/mpid/ch3/src/ch3u_comm.c index b704d3042e2..ce2f495055b 100644 --- a/src/mpid/ch3/src/ch3u_comm.c +++ b/src/mpid/ch3/src/ch3u_comm.c @@ -512,7 +512,7 @@ static int nonempty_intersection(MPIR_Comm *comm, MPIR_Group *group, int *flag) for (i_g = 0; i_g < group->size; ++i_g) { /* FIXME: This won't work for dynamic procs */ - MPIDI_PG_Get_vc(MPIDI_Process.my_pg, 
group->lrank_to_lpid[i_g].lpid, &vc_g); + MPIDI_PG_Get_vc(MPIDI_Process.my_pg, MPIR_Group_rank_to_lpid(group, i_g), &vc_g); for (i_c = 0; i_c < comm->remote_size; ++i_c) { MPIDI_Comm_get_vc(comm, i_c, &vc_c); if (vc_g == vc_c) { diff --git a/src/mpid/ch3/src/ch3u_handle_connection.c b/src/mpid/ch3/src/ch3u_handle_connection.c index ef5819aaf3d..17ef122cb7f 100644 --- a/src/mpid/ch3/src/ch3u_handle_connection.c +++ b/src/mpid/ch3/src/ch3u_handle_connection.c @@ -372,7 +372,7 @@ static int terminate_failed_VCs(MPIR_Group *new_failed_group) MPIDI_VC_t *vc; /* terminate the VC */ /* FIXME: This won't work for dynamic procs */ - MPIDI_PG_Get_vc(MPIDI_Process.my_pg, new_failed_group->lrank_to_lpid[i].lpid, &vc); + MPIDI_PG_Get_vc(MPIDI_Process.my_pg, MPIR_Group_rank_to_lpid(new_failed_group, i), &vc); mpi_errno = MPIDI_CH3_Connection_terminate(vc); MPIR_ERR_CHECK(mpi_errno); } diff --git a/src/mpid/ch4/src/ch4_impl.h b/src/mpid/ch4/src/ch4_impl.h index 8991052f1a5..2f5a31dc767 100644 --- a/src/mpid/ch4/src/ch4_impl.h +++ b/src/mpid/ch4/src/ch4_impl.h @@ -387,7 +387,10 @@ MPL_STATIC_INLINE_PREFIX int MPIDIU_valid_group_rank(MPIR_Comm * comm, int rank, MPIDI_NM_comm_get_gpid(comm, rank, &gpid, FALSE); - for (z = 0; z < size && gpid != grp->lrank_to_lpid[z].lpid; ++z) { + for (z = 0; z < size; ++z) { + if (gpid == MPIR_Group_rank_to_lpid(grp, z)) { + break; + } } ret = (z < size); From 87f5fac414f3f5d11cff5190793d6b85a81c27d6 Mon Sep 17 00:00:00 2001 From: Hui Zhou Date: Tue, 10 Dec 2024 14:01:29 -0600 Subject: [PATCH 05/19] comm: use MPIR_Group_create_{map, stride) Avoid access group internal fields. 
--- src/mpi/comm/comm_impl.c | 55 +++++++++++----------------------------- src/mpi/comm/ulfm_impl.c | 15 ++++++----- 2 files changed, 23 insertions(+), 47 deletions(-) diff --git a/src/mpi/comm/comm_impl.c b/src/mpi/comm/comm_impl.c index 9f361f60007..46f06b89762 100644 --- a/src/mpi/comm/comm_impl.c +++ b/src/mpi/comm/comm_impl.c @@ -68,36 +68,19 @@ int MPIR_Comm_test_threadcomm_impl(MPIR_Comm * comm_ptr, int *flag) static int comm_create_local_group(MPIR_Comm * comm_ptr) { int mpi_errno = MPI_SUCCESS; - MPIR_Group *group_ptr; - int n = comm_ptr->local_size; - - mpi_errno = MPIR_Group_create(n, &group_ptr); - MPIR_ERR_CHECK(mpi_errno); - /* Group belongs to the same session as communicator */ - MPIR_Group_set_session_ptr(group_ptr, comm_ptr->session_ptr); - - group_ptr->is_local_dense_monotonic = TRUE; + int n = comm_ptr->local_size; + MPIR_Lpid *map = MPL_malloc(n * sizeof(MPIR_Lpid), MPL_MEM_GROUP); - int comm_world_size = MPIR_Process.size; for (int i = 0; i < n; i++) { uint64_t lpid; (void) MPID_Comm_get_lpid(comm_ptr, i, &lpid, FALSE); - group_ptr->lrank_to_lpid[i].lpid = lpid; - if (lpid > comm_world_size || (i > 0 && group_ptr->lrank_to_lpid[i - 1].lpid != (lpid - 1))) { - group_ptr->is_local_dense_monotonic = FALSE; - } + map[i] = lpid; } - group_ptr->size = n; - group_ptr->rank = comm_ptr->rank; - group_ptr->idx_of_first_lpid = -1; - - comm_ptr->local_group = group_ptr; - - /* FIXME : Add a sanity check that the size of the group is the same as - * the size of the communicator. 
This helps catch corrupted - * communicators */ + mpi_errno = MPIR_Group_create_map(n, comm_ptr->rank, comm_ptr->session_ptr, map, + &comm_ptr->local_group); + MPIR_ERR_CHECK(mpi_errno); fn_exit: return mpi_errno; @@ -931,31 +914,23 @@ int MPIR_Comm_idup_with_info_impl(MPIR_Comm * comm_ptr, MPIR_Info * info, int MPIR_Comm_remote_group_impl(MPIR_Comm * comm_ptr, MPIR_Group ** group_ptr) { int mpi_errno = MPI_SUCCESS; - int i, n; - MPIR_FUNC_ENTER; + /* Create a group and populate it with the local process ids */ if (!comm_ptr->remote_group) { - n = comm_ptr->remote_size; - mpi_errno = MPIR_Group_create(n, group_ptr); - MPIR_ERR_CHECK(mpi_errno); + int n = comm_ptr->remote_size; + MPIR_Lpid *map = MPL_malloc(n * sizeof(MPIR_Lpid), MPL_MEM_GROUP); - for (i = 0; i < n; i++) { + for (int i = 0; i < n; i++) { uint64_t lpid; (void) MPID_Comm_get_lpid(comm_ptr, i, &lpid, TRUE); - (*group_ptr)->lrank_to_lpid[i].lpid = lpid; - /* TODO calculate is_local_dense_monotonic */ + map[i] = lpid; } - (*group_ptr)->size = n; - (*group_ptr)->rank = MPI_UNDEFINED; - (*group_ptr)->idx_of_first_lpid = -1; - - MPIR_Group_set_session_ptr(*group_ptr, comm_ptr->session_ptr); - - comm_ptr->remote_group = *group_ptr; - } else { - *group_ptr = comm_ptr->remote_group; + mpi_errno = MPIR_Group_create_map(n, MPI_UNDEFINED, comm_ptr->session_ptr, map, + &comm_ptr->remote_group); + MPIR_ERR_CHECK(mpi_errno); } + *group_ptr = comm_ptr->remote_group; MPIR_Group_add_ref(comm_ptr->remote_group); fn_exit: diff --git a/src/mpi/comm/ulfm_impl.c b/src/mpi/comm/ulfm_impl.c index dfd4ad6bfcf..33edffa3d11 100644 --- a/src/mpi/comm/ulfm_impl.c +++ b/src/mpi/comm/ulfm_impl.c @@ -87,21 +87,22 @@ int MPIR_Comm_get_failed_impl(MPIR_Comm * comm_ptr, MPIR_Group ** failed_group_p /* create failed_group */ int n = utarray_len(failed_procs); + MPIR_Lpid *map = MPL_malloc(n * sizeof(MPIR_Lpid), MPL_MEM_GROUP); + MPIR_Group *new_group; - mpi_errno = MPIR_Group_create(n, &new_group); - MPIR_ERR_CHECK(mpi_errno); - 
new_group->rank = MPI_UNDEFINED; + int myrank = MPI_UNDEFINED; for (int i = 0; i < utarray_len(failed_procs); i++) { int *p = (int *) utarray_eltptr(failed_procs, i); - new_group->lrank_to_lpid[i].lpid = *p; + map[i] = *p; /* if calling process is part of the group, set the rank */ if (*p == MPIR_Process.rank) { - new_group->rank = i; + myrank = i; } } - new_group->size = n; - new_group->idx_of_first_lpid = -1; + + mpi_errno = MPIR_Group_create_map(n, myrank, comm_ptr->session_ptr, map, &new_group); + MPIR_ERR_CHECK(mpi_errno); MPIR_Group *comm_group; MPIR_Comm_group_impl(comm_ptr, &comm_group); From 59c1d05a428df266adb306cfd1a88a4ed27526e9 Mon Sep 17 00:00:00 2001 From: Hui Zhou Date: Tue, 10 Dec 2024 14:58:57 -0600 Subject: [PATCH 06/19] group: rearange functions in group_impl.c Group similar functions together to facilitate refactoring. There is no changes in this commit other than moving functions around. The 4 incl/excl functions are very similar. The 3 difference/intersection/union functions are very similar. 
--- src/mpi/group/group_impl.c | 404 ++++++++++++++++++------------------- 1 file changed, 202 insertions(+), 202 deletions(-) diff --git a/src/mpi/group/group_impl.c b/src/mpi/group/group_impl.c index dbd3cd88204..848996c7d8c 100644 --- a/src/mpi/group/group_impl.c +++ b/src/mpi/group/group_impl.c @@ -18,6 +18,22 @@ int MPIR_Group_size_impl(MPIR_Group * group_ptr, int *size) return MPI_SUCCESS; } +int MPIR_Group_free_impl(MPIR_Group * group_ptr) +{ + int mpi_errno = MPI_SUCCESS; + + /* Do not free MPI_GROUP_EMPTY */ + if (group_ptr->handle != MPI_GROUP_EMPTY) { + mpi_errno = MPIR_Group_release(group_ptr); + MPIR_ERR_CHECK(mpi_errno); + } + + fn_exit: + return mpi_errno; + fn_fail: + goto fn_exit; +} + int MPIR_Group_compare_impl(MPIR_Group * group_ptr1, MPIR_Group * group_ptr2, int *result) { int mpi_errno = MPI_SUCCESS; @@ -67,77 +83,76 @@ int MPIR_Group_compare_impl(MPIR_Group * group_ptr1, MPIR_Group * group_ptr2, in return mpi_errno; } -int MPIR_Group_difference_impl(MPIR_Group * group_ptr1, MPIR_Group * group_ptr2, - MPIR_Group ** new_group_ptr) +int MPIR_Group_translate_ranks_impl(MPIR_Group * gp1, int n, const int ranks1[], + MPIR_Group * gp2, int ranks2[]) { int mpi_errno = MPI_SUCCESS; - int size1, i, k, g1_idx, g2_idx, nnew; + int i, g2_idx; uint64_t l1_pid, l2_pid; - int *flags = NULL; - MPIR_FUNC_ENTER; - /* Return a group consisting of the members of group1 that are *not* - * in group2 */ - size1 = group_ptr1->size; - /* Insure that the lpid lists are setup */ - MPIR_Group_setup_lpid_pairs(group_ptr1, group_ptr2); + MPL_DBG_MSG_S(MPIR_DBG_OTHER, VERBOSE, "gp2->is_local_dense_monotonic=%s", + (gp2->is_local_dense_monotonic ? 
"TRUE" : "FALSE")); - flags = MPL_calloc(size1, sizeof(int), MPL_MEM_OTHER); + /* Initialize the output ranks */ + for (i = 0; i < n; i++) + ranks2[i] = MPI_UNDEFINED; - g1_idx = group_ptr1->idx_of_first_lpid; - g2_idx = group_ptr2->idx_of_first_lpid; + if (gp2->size > 0 && gp2->is_local_dense_monotonic) { + /* g2 probably == group_of(MPI_COMM_WORLD); use fast, constant-time lookup */ + uint64_t lpid_offset = gp2->lrank_to_lpid[0].lpid; - nnew = size1; - while (g1_idx >= 0 && g2_idx >= 0) { - l1_pid = group_ptr1->lrank_to_lpid[g1_idx].lpid; - l2_pid = group_ptr2->lrank_to_lpid[g2_idx].lpid; - if (l1_pid < l2_pid) { - g1_idx = group_ptr1->lrank_to_lpid[g1_idx].next_lpid; - } else if (l1_pid > l2_pid) { - g2_idx = group_ptr2->lrank_to_lpid[g2_idx].next_lpid; - } else { - /* Equal */ - flags[g1_idx] = 1; - g1_idx = group_ptr1->lrank_to_lpid[g1_idx].next_lpid; - g2_idx = group_ptr2->lrank_to_lpid[g2_idx].next_lpid; - nnew--; + for (i = 0; i < n; ++i) { + uint64_t g1_lpid; + + if (ranks1[i] == MPI_PROC_NULL) { + ranks2[i] = MPI_PROC_NULL; + continue; + } + /* "adjusted" lpid from g1 */ + g1_lpid = gp1->lrank_to_lpid[ranks1[i]].lpid - lpid_offset; + if (g1_lpid < gp2->size) { + ranks2[i] = g1_lpid; + } + /* else leave UNDEFINED */ } - } - /* Create the group */ - if (nnew == 0) { - /* See 5.3.2, Group Constructors. For many group routines, - * the standard explicitly says to return MPI_GROUP_EMPTY; - * for others it is implied */ - *new_group_ptr = MPIR_Group_empty; - goto fn_exit; } else { - mpi_errno = MPIR_Group_create(nnew, new_group_ptr); - /* --BEGIN ERROR HANDLING-- */ - if (mpi_errno) { - goto fn_fail; + /* general, slow path; lookup time is dependent on the user-provided rank values! 
*/ + g2_idx = gp2->idx_of_first_lpid; + if (g2_idx < 0) { + MPII_Group_setup_lpid_list(gp2); + g2_idx = gp2->idx_of_first_lpid; } - /* --END ERROR HANDLING-- */ - (*new_group_ptr)->rank = MPI_UNDEFINED; - k = 0; - for (i = 0; i < size1; i++) { - if (!flags[i]) { - (*new_group_ptr)->lrank_to_lpid[k].lpid = group_ptr1->lrank_to_lpid[i].lpid; - if (i == group_ptr1->rank) - (*new_group_ptr)->rank = k; - k++; + if (g2_idx >= 0) { + /* g2_idx can be < 0 if the g2 group is empty */ + l2_pid = gp2->lrank_to_lpid[g2_idx].lpid; + for (i = 0; i < n; i++) { + if (ranks1[i] == MPI_PROC_NULL) { + ranks2[i] = MPI_PROC_NULL; + continue; + } + l1_pid = gp1->lrank_to_lpid[ranks1[i]].lpid; + /* Search for this l1_pid in group2. Use the following + * optimization: start from the last position in the lpid list + * if possible. A more sophisticated version could use a + * tree based or even hashed search to speed the translation. */ + if (l1_pid < l2_pid || g2_idx < 0) { + /* Start over from the beginning */ + g2_idx = gp2->idx_of_first_lpid; + l2_pid = gp2->lrank_to_lpid[g2_idx].lpid; + } + while (g2_idx >= 0 && l1_pid > l2_pid) { + g2_idx = gp2->lrank_to_lpid[g2_idx].next_lpid; + if (g2_idx >= 0) + l2_pid = gp2->lrank_to_lpid[g2_idx].lpid; + else + l2_pid = (uint64_t) - 1; + } + if (l1_pid == l2_pid) + ranks2[i] = g2_idx; } } - /* TODO calculate is_local_dense_monotonic */ } - - MPIR_Group_set_session_ptr(*new_group_ptr, group_ptr1->session_ptr); - - fn_exit: - MPL_free(flags); - MPIR_FUNC_EXIT; return mpi_errno; - fn_fail: - goto fn_exit; } int MPIR_Group_excl_impl(MPIR_Group * group_ptr, int n, const int ranks[], @@ -188,22 +203,6 @@ int MPIR_Group_excl_impl(MPIR_Group * group_ptr, int n, const int ranks[], goto fn_exit; } -int MPIR_Group_free_impl(MPIR_Group * group_ptr) -{ - int mpi_errno = MPI_SUCCESS; - - /* Do not free MPI_GROUP_EMPTY */ - if (group_ptr->handle != MPI_GROUP_EMPTY) { - mpi_errno = MPIR_Group_release(group_ptr); - MPIR_ERR_CHECK(mpi_errno); - } - - fn_exit: - 
return mpi_errno; - fn_fail: - goto fn_exit; -} - int MPIR_Group_incl_impl(MPIR_Group * group_ptr, int n, const int ranks[], MPIR_Group ** new_group_ptr) { @@ -242,79 +241,6 @@ int MPIR_Group_incl_impl(MPIR_Group * group_ptr, int n, const int ranks[], goto fn_exit; } -int MPIR_Group_intersection_impl(MPIR_Group * group_ptr1, MPIR_Group * group_ptr2, - MPIR_Group ** new_group_ptr) -{ - int mpi_errno = MPI_SUCCESS; - int size1, i, k, g1_idx, g2_idx, nnew; - uint64_t l1_pid, l2_pid; - int *flags = NULL; - - MPIR_FUNC_ENTER; - /* Return a group consisting of the members of group1 that are - * in group2 */ - size1 = group_ptr1->size; - /* Insure that the lpid lists are setup */ - MPIR_Group_setup_lpid_pairs(group_ptr1, group_ptr2); - - flags = MPL_calloc(size1, sizeof(int), MPL_MEM_OTHER); - - g1_idx = group_ptr1->idx_of_first_lpid; - g2_idx = group_ptr2->idx_of_first_lpid; - - nnew = 0; - while (g1_idx >= 0 && g2_idx >= 0) { - l1_pid = group_ptr1->lrank_to_lpid[g1_idx].lpid; - l2_pid = group_ptr2->lrank_to_lpid[g2_idx].lpid; - if (l1_pid < l2_pid) { - g1_idx = group_ptr1->lrank_to_lpid[g1_idx].next_lpid; - } else if (l1_pid > l2_pid) { - g2_idx = group_ptr2->lrank_to_lpid[g2_idx].next_lpid; - } else { - /* Equal */ - flags[g1_idx] = 1; - g1_idx = group_ptr1->lrank_to_lpid[g1_idx].next_lpid; - g2_idx = group_ptr2->lrank_to_lpid[g2_idx].next_lpid; - nnew++; - } - } - /* Create the group. 
Handle the trivial case first */ - if (nnew == 0) { - *new_group_ptr = MPIR_Group_empty; - goto fn_exit; - } - - mpi_errno = MPIR_Group_create(nnew, new_group_ptr); - MPIR_ERR_CHECK(mpi_errno); - - (*new_group_ptr)->rank = MPI_UNDEFINED; - (*new_group_ptr)->is_local_dense_monotonic = TRUE; - k = 0; - for (i = 0; i < size1; i++) { - if (flags[i]) { - uint64_t lpid = group_ptr1->lrank_to_lpid[i].lpid; - (*new_group_ptr)->lrank_to_lpid[k].lpid = lpid; - if (i == group_ptr1->rank) - (*new_group_ptr)->rank = k; - if (lpid > MPIR_Process.size || - (k > 0 && (*new_group_ptr)->lrank_to_lpid[k - 1].lpid != (lpid - 1))) { - (*new_group_ptr)->is_local_dense_monotonic = FALSE; - } - - k++; - } - } - - MPIR_Group_set_session_ptr(*new_group_ptr, group_ptr1->session_ptr); - - fn_exit: - MPL_free(flags); - MPIR_FUNC_EXIT; - return mpi_errno; - fn_fail: - goto fn_exit; -} - int MPIR_Group_range_excl_impl(MPIR_Group * group_ptr, int n, int ranges[][3], MPIR_Group ** new_group_ptr) { @@ -464,76 +390,150 @@ int MPIR_Group_range_incl_impl(MPIR_Group * group_ptr, int n, int ranges[][3], goto fn_exit; } -int MPIR_Group_translate_ranks_impl(MPIR_Group * gp1, int n, const int ranks1[], - MPIR_Group * gp2, int ranks2[]) +int MPIR_Group_difference_impl(MPIR_Group * group_ptr1, MPIR_Group * group_ptr2, + MPIR_Group ** new_group_ptr) { int mpi_errno = MPI_SUCCESS; - int i, g2_idx; + int size1, i, k, g1_idx, g2_idx, nnew; uint64_t l1_pid, l2_pid; + int *flags = NULL; - MPL_DBG_MSG_S(MPIR_DBG_OTHER, VERBOSE, "gp2->is_local_dense_monotonic=%s", - (gp2->is_local_dense_monotonic ? 
"TRUE" : "FALSE")); - - /* Initialize the output ranks */ - for (i = 0; i < n; i++) - ranks2[i] = MPI_UNDEFINED; + MPIR_FUNC_ENTER; + /* Return a group consisting of the members of group1 that are *not* + * in group2 */ + size1 = group_ptr1->size; + /* Insure that the lpid lists are setup */ + MPIR_Group_setup_lpid_pairs(group_ptr1, group_ptr2); - if (gp2->size > 0 && gp2->is_local_dense_monotonic) { - /* g2 probably == group_of(MPI_COMM_WORLD); use fast, constant-time lookup */ - uint64_t lpid_offset = gp2->lrank_to_lpid[0].lpid; + flags = MPL_calloc(size1, sizeof(int), MPL_MEM_OTHER); - for (i = 0; i < n; ++i) { - uint64_t g1_lpid; + g1_idx = group_ptr1->idx_of_first_lpid; + g2_idx = group_ptr2->idx_of_first_lpid; - if (ranks1[i] == MPI_PROC_NULL) { - ranks2[i] = MPI_PROC_NULL; - continue; - } - /* "adjusted" lpid from g1 */ - g1_lpid = gp1->lrank_to_lpid[ranks1[i]].lpid - lpid_offset; - if (g1_lpid < gp2->size) { - ranks2[i] = g1_lpid; - } - /* else leave UNDEFINED */ + nnew = size1; + while (g1_idx >= 0 && g2_idx >= 0) { + l1_pid = group_ptr1->lrank_to_lpid[g1_idx].lpid; + l2_pid = group_ptr2->lrank_to_lpid[g2_idx].lpid; + if (l1_pid < l2_pid) { + g1_idx = group_ptr1->lrank_to_lpid[g1_idx].next_lpid; + } else if (l1_pid > l2_pid) { + g2_idx = group_ptr2->lrank_to_lpid[g2_idx].next_lpid; + } else { + /* Equal */ + flags[g1_idx] = 1; + g1_idx = group_ptr1->lrank_to_lpid[g1_idx].next_lpid; + g2_idx = group_ptr2->lrank_to_lpid[g2_idx].next_lpid; + nnew--; } + } + /* Create the group */ + if (nnew == 0) { + /* See 5.3.2, Group Constructors. For many group routines, + * the standard explicitly says to return MPI_GROUP_EMPTY; + * for others it is implied */ + *new_group_ptr = MPIR_Group_empty; + goto fn_exit; } else { - /* general, slow path; lookup time is dependent on the user-provided rank values! 
*/ - g2_idx = gp2->idx_of_first_lpid; - if (g2_idx < 0) { - MPII_Group_setup_lpid_list(gp2); - g2_idx = gp2->idx_of_first_lpid; + mpi_errno = MPIR_Group_create(nnew, new_group_ptr); + /* --BEGIN ERROR HANDLING-- */ + if (mpi_errno) { + goto fn_fail; } - if (g2_idx >= 0) { - /* g2_idx can be < 0 if the g2 group is empty */ - l2_pid = gp2->lrank_to_lpid[g2_idx].lpid; - for (i = 0; i < n; i++) { - if (ranks1[i] == MPI_PROC_NULL) { - ranks2[i] = MPI_PROC_NULL; - continue; - } - l1_pid = gp1->lrank_to_lpid[ranks1[i]].lpid; - /* Search for this l1_pid in group2. Use the following - * optimization: start from the last position in the lpid list - * if possible. A more sophisticated version could use a - * tree based or even hashed search to speed the translation. */ - if (l1_pid < l2_pid || g2_idx < 0) { - /* Start over from the beginning */ - g2_idx = gp2->idx_of_first_lpid; - l2_pid = gp2->lrank_to_lpid[g2_idx].lpid; - } - while (g2_idx >= 0 && l1_pid > l2_pid) { - g2_idx = gp2->lrank_to_lpid[g2_idx].next_lpid; - if (g2_idx >= 0) - l2_pid = gp2->lrank_to_lpid[g2_idx].lpid; - else - l2_pid = (uint64_t) - 1; - } - if (l1_pid == l2_pid) - ranks2[i] = g2_idx; + /* --END ERROR HANDLING-- */ + (*new_group_ptr)->rank = MPI_UNDEFINED; + k = 0; + for (i = 0; i < size1; i++) { + if (!flags[i]) { + (*new_group_ptr)->lrank_to_lpid[k].lpid = group_ptr1->lrank_to_lpid[i].lpid; + if (i == group_ptr1->rank) + (*new_group_ptr)->rank = k; + k++; } } + /* TODO calculate is_local_dense_monotonic */ } + + MPIR_Group_set_session_ptr(*new_group_ptr, group_ptr1->session_ptr); + + fn_exit: + MPL_free(flags); + MPIR_FUNC_EXIT; return mpi_errno; + fn_fail: + goto fn_exit; +} + +int MPIR_Group_intersection_impl(MPIR_Group * group_ptr1, MPIR_Group * group_ptr2, + MPIR_Group ** new_group_ptr) +{ + int mpi_errno = MPI_SUCCESS; + int size1, i, k, g1_idx, g2_idx, nnew; + uint64_t l1_pid, l2_pid; + int *flags = NULL; + + MPIR_FUNC_ENTER; + /* Return a group consisting of the members of group1 that are + 
* in group2 */ + size1 = group_ptr1->size; + /* Insure that the lpid lists are setup */ + MPIR_Group_setup_lpid_pairs(group_ptr1, group_ptr2); + + flags = MPL_calloc(size1, sizeof(int), MPL_MEM_OTHER); + + g1_idx = group_ptr1->idx_of_first_lpid; + g2_idx = group_ptr2->idx_of_first_lpid; + + nnew = 0; + while (g1_idx >= 0 && g2_idx >= 0) { + l1_pid = group_ptr1->lrank_to_lpid[g1_idx].lpid; + l2_pid = group_ptr2->lrank_to_lpid[g2_idx].lpid; + if (l1_pid < l2_pid) { + g1_idx = group_ptr1->lrank_to_lpid[g1_idx].next_lpid; + } else if (l1_pid > l2_pid) { + g2_idx = group_ptr2->lrank_to_lpid[g2_idx].next_lpid; + } else { + /* Equal */ + flags[g1_idx] = 1; + g1_idx = group_ptr1->lrank_to_lpid[g1_idx].next_lpid; + g2_idx = group_ptr2->lrank_to_lpid[g2_idx].next_lpid; + nnew++; + } + } + /* Create the group. Handle the trivial case first */ + if (nnew == 0) { + *new_group_ptr = MPIR_Group_empty; + goto fn_exit; + } + + mpi_errno = MPIR_Group_create(nnew, new_group_ptr); + MPIR_ERR_CHECK(mpi_errno); + + (*new_group_ptr)->rank = MPI_UNDEFINED; + (*new_group_ptr)->is_local_dense_monotonic = TRUE; + k = 0; + for (i = 0; i < size1; i++) { + if (flags[i]) { + uint64_t lpid = group_ptr1->lrank_to_lpid[i].lpid; + (*new_group_ptr)->lrank_to_lpid[k].lpid = lpid; + if (i == group_ptr1->rank) + (*new_group_ptr)->rank = k; + if (lpid > MPIR_Process.size || + (k > 0 && (*new_group_ptr)->lrank_to_lpid[k - 1].lpid != (lpid - 1))) { + (*new_group_ptr)->is_local_dense_monotonic = FALSE; + } + + k++; + } + } + + MPIR_Group_set_session_ptr(*new_group_ptr, group_ptr1->session_ptr); + + fn_exit: + MPL_free(flags); + MPIR_FUNC_EXIT; + return mpi_errno; + fn_fail: + goto fn_exit; } int MPIR_Group_union_impl(MPIR_Group * group_ptr1, MPIR_Group * group_ptr2, From 8dbe66e1ecac14a9a9c2c205ec369614bd03e663 Mon Sep 17 00:00:00 2001 From: Hui Zhou Date: Tue, 10 Dec 2024 16:11:59 -0600 Subject: [PATCH 07/19] group: refactor group_impl.c to use new group interfaces Use 
MPIR_Group_{rank_to_lpid,lpid_to_rank} to avoid directly access MPIR_Group internal fields. For most group creation routines, just populate an lpid lookup map and call MPIR_Group_create_map to create the group. --- src/mpi/group/group_impl.c | 575 +++++++++++-------------------------- 1 file changed, 171 insertions(+), 404 deletions(-) diff --git a/src/mpi/group/group_impl.c b/src/mpi/group/group_impl.c index 848996c7d8c..fa123a70efc 100644 --- a/src/mpi/group/group_impl.c +++ b/src/mpi/group/group_impl.c @@ -37,7 +37,6 @@ int MPIR_Group_free_impl(MPIR_Group * group_ptr) int MPIR_Group_compare_impl(MPIR_Group * group_ptr1, MPIR_Group * group_ptr2, int *result) { int mpi_errno = MPI_SUCCESS; - int g1_idx, g2_idx, size, i; /* See if their sizes are equal */ if (group_ptr1->size != group_ptr2->size) { @@ -45,39 +44,39 @@ int MPIR_Group_compare_impl(MPIR_Group * group_ptr1, MPIR_Group * group_ptr2, in goto fn_exit; } - /* Run through the lrank to lpid lists of each group in lpid order - * to see if the same processes are involved */ - g1_idx = group_ptr1->idx_of_first_lpid; - g2_idx = group_ptr2->idx_of_first_lpid; - /* If the lpid list hasn't been created, do it now */ - if (g1_idx < 0) { - MPII_Group_setup_lpid_list(group_ptr1); - g1_idx = group_ptr1->idx_of_first_lpid; - } - if (g2_idx < 0) { - MPII_Group_setup_lpid_list(group_ptr2); - g2_idx = group_ptr2->idx_of_first_lpid; - } - while (g1_idx >= 0 && g2_idx >= 0) { - if (group_ptr1->lrank_to_lpid[g1_idx].lpid != group_ptr2->lrank_to_lpid[g2_idx].lpid) { - *result = MPI_UNEQUAL; - goto fn_exit; + int size; + size = group_ptr1->size; + + /* See if they are identical */ + bool is_ident = true; + for (int i = 0; i < size; i++) { + if (MPIR_Group_rank_to_lpid(group_ptr1, i) != MPIR_Group_rank_to_lpid(group_ptr2, i)) { + is_ident = false; + break; } - g1_idx = group_ptr1->lrank_to_lpid[g1_idx].next_lpid; - g2_idx = group_ptr2->lrank_to_lpid[g2_idx].next_lpid; } - /* See if the processes are in the same order by rank */ 
- size = group_ptr1->size; - for (i = 0; i < size; i++) { - if (group_ptr1->lrank_to_lpid[i].lpid != group_ptr2->lrank_to_lpid[i].lpid) { - *result = MPI_SIMILAR; - goto fn_exit; + if (is_ident) { + *result = MPI_IDENT; + goto fn_exit; + } + + /* See if they are similar */ + bool is_similar = true; + for (int i = 0; i < size; i++) { + MPIR_Lpid lpid = MPIR_Group_rank_to_lpid(group_ptr1, i); + if (MPI_UNDEFINED == MPIR_Group_lpid_to_rank(group_ptr2, lpid)) { + /* not found */ + is_similar = false; + break; } } - /* If we reach here, the groups are identical */ - *result = MPI_IDENT; + if (is_similar) { + *result = MPI_SIMILAR; + } else { + *result = MPI_UNEQUAL; + } fn_exit: return mpi_errno; @@ -87,71 +86,16 @@ int MPIR_Group_translate_ranks_impl(MPIR_Group * gp1, int n, const int ranks1[], MPIR_Group * gp2, int ranks2[]) { int mpi_errno = MPI_SUCCESS; - int i, g2_idx; - uint64_t l1_pid, l2_pid; - - MPL_DBG_MSG_S(MPIR_DBG_OTHER, VERBOSE, "gp2->is_local_dense_monotonic=%s", - (gp2->is_local_dense_monotonic ? "TRUE" : "FALSE")); - - /* Initialize the output ranks */ - for (i = 0; i < n; i++) - ranks2[i] = MPI_UNDEFINED; - - if (gp2->size > 0 && gp2->is_local_dense_monotonic) { - /* g2 probably == group_of(MPI_COMM_WORLD); use fast, constant-time lookup */ - uint64_t lpid_offset = gp2->lrank_to_lpid[0].lpid; - - for (i = 0; i < n; ++i) { - uint64_t g1_lpid; - if (ranks1[i] == MPI_PROC_NULL) { - ranks2[i] = MPI_PROC_NULL; - continue; - } - /* "adjusted" lpid from g1 */ - g1_lpid = gp1->lrank_to_lpid[ranks1[i]].lpid - lpid_offset; - if (g1_lpid < gp2->size) { - ranks2[i] = g1_lpid; - } - /* else leave UNDEFINED */ - } - } else { - /* general, slow path; lookup time is dependent on the user-provided rank values! 
*/ - g2_idx = gp2->idx_of_first_lpid; - if (g2_idx < 0) { - MPII_Group_setup_lpid_list(gp2); - g2_idx = gp2->idx_of_first_lpid; - } - if (g2_idx >= 0) { - /* g2_idx can be < 0 if the g2 group is empty */ - l2_pid = gp2->lrank_to_lpid[g2_idx].lpid; - for (i = 0; i < n; i++) { - if (ranks1[i] == MPI_PROC_NULL) { - ranks2[i] = MPI_PROC_NULL; - continue; - } - l1_pid = gp1->lrank_to_lpid[ranks1[i]].lpid; - /* Search for this l1_pid in group2. Use the following - * optimization: start from the last position in the lpid list - * if possible. A more sophisticated version could use a - * tree based or even hashed search to speed the translation. */ - if (l1_pid < l2_pid || g2_idx < 0) { - /* Start over from the beginning */ - g2_idx = gp2->idx_of_first_lpid; - l2_pid = gp2->lrank_to_lpid[g2_idx].lpid; - } - while (g2_idx >= 0 && l1_pid > l2_pid) { - g2_idx = gp2->lrank_to_lpid[g2_idx].next_lpid; - if (g2_idx >= 0) - l2_pid = gp2->lrank_to_lpid[g2_idx].lpid; - else - l2_pid = (uint64_t) - 1; - } - if (l1_pid == l2_pid) - ranks2[i] = g2_idx; - } + for (int i = 0; i < n; i++) { + if (ranks1[i] == MPI_PROC_NULL) { + ranks2[i] = MPI_PROC_NULL; + continue; } + MPIR_Lpid lpid = MPIR_Group_rank_to_lpid(gp1, ranks1[i]); + ranks2[i] = MPIR_Group_lpid_to_rank(gp2, lpid); } + return mpi_errno; } @@ -159,41 +103,34 @@ int MPIR_Group_excl_impl(MPIR_Group * group_ptr, int n, const int ranks[], MPIR_Group ** new_group_ptr) { int mpi_errno = MPI_SUCCESS; - int size, i, newi; - int *flags = NULL; - MPIR_FUNC_ENTER; - size = group_ptr->size; + int size = group_ptr->size; + int nnew = size - n; - /* Allocate a new group and lrank_to_lpid array */ - mpi_errno = MPIR_Group_create(size - n, new_group_ptr); - MPIR_ERR_CHECK(mpi_errno); - - (*new_group_ptr)->rank = MPI_UNDEFINED; /* Use flag fields to mark the members to *exclude* . 
*/ - - flags = MPL_calloc(size, sizeof(int), MPL_MEM_OTHER); - - for (i = 0; i < n; i++) { + int *flags = MPL_calloc(size, sizeof(int), MPL_MEM_OTHER); + for (int i = 0; i < n; i++) { flags[ranks[i]] = 1; } - newi = 0; - for (i = 0; i < size; i++) { + MPIR_Lpid *map = MPL_malloc(nnew * sizeof(MPIR_Lpid), MPL_MEM_GROUP); + MPIR_ERR_CHKANDJUMP(!map, mpi_errno, MPI_ERR_OTHER, "**nomem"); + + int myrank = MPI_UNDEFINED; + int newi = 0; + for (int i = 0; i < size; i++) { if (flags[i] == 0) { - (*new_group_ptr)->lrank_to_lpid[newi].lpid = group_ptr->lrank_to_lpid[i].lpid; - if (group_ptr->rank == i) - (*new_group_ptr)->rank = newi; + map[newi] = MPIR_Group_rank_to_lpid(group_ptr, i); + if (group_ptr->rank == i) { + myrank = newi; + } newi++; } } - (*new_group_ptr)->size = size - n; - (*new_group_ptr)->idx_of_first_lpid = -1; - /* TODO calculate is_local_dense_monotonic */ - - MPIR_Group_set_session_ptr(*new_group_ptr, group_ptr->session_ptr); + mpi_errno = MPIR_Group_create_map(nnew, myrank, group_ptr->session_ptr, map, new_group_ptr); + MPIR_ERR_CHECK(mpi_errno); fn_exit: MPL_free(flags); @@ -207,8 +144,6 @@ int MPIR_Group_incl_impl(MPIR_Group * group_ptr, int n, const int ranks[], MPIR_Group ** new_group_ptr) { int mpi_errno = MPI_SUCCESS; - int i; - MPIR_FUNC_ENTER; if (n == 0) { @@ -216,23 +151,20 @@ int MPIR_Group_incl_impl(MPIR_Group * group_ptr, int n, const int ranks[], goto fn_exit; } - /* Allocate a new group and lrank_to_lpid array */ - mpi_errno = MPIR_Group_create(n, new_group_ptr); - if (mpi_errno) - goto fn_fail; + int nnew = n; + MPIR_Lpid *map = MPL_malloc(nnew * sizeof(MPIR_Lpid), MPL_MEM_OTHER); + MPIR_ERR_CHKANDJUMP(!map, mpi_errno, MPI_ERR_OTHER, "**nomem"); - (*new_group_ptr)->rank = MPI_UNDEFINED; - for (i = 0; i < n; i++) { - (*new_group_ptr)->lrank_to_lpid[i].lpid = group_ptr->lrank_to_lpid[ranks[i]].lpid; - if (ranks[i] == group_ptr->rank) - (*new_group_ptr)->rank = i; + int myrank = MPI_UNDEFINED; + for (int i = 0; i < n; i++) { + map[i] = 
MPIR_Group_rank_to_lpid(group_ptr, ranks[i]); + if (ranks[i] == group_ptr->rank) { + myrank = i; + } } - (*new_group_ptr)->size = n; - (*new_group_ptr)->idx_of_first_lpid = -1; - /* TODO calculate is_local_dense_monotonic */ - - MPIR_Group_set_session_ptr(*new_group_ptr, group_ptr->session_ptr); + mpi_errno = MPIR_Group_create_map(nnew, myrank, group_ptr->session_ptr, map, new_group_ptr); + MPIR_ERR_CHECK(mpi_errno); fn_exit: MPIR_FUNC_EXIT; @@ -245,17 +177,15 @@ int MPIR_Group_range_excl_impl(MPIR_Group * group_ptr, int n, int ranges[][3], MPIR_Group ** new_group_ptr) { int mpi_errno = MPI_SUCCESS; - int size, i, j, k, nnew, first, last, stride; - int *flags = NULL; - MPIR_FUNC_ENTER; + /* Compute size, assuming that included ranks are valid (and distinct) */ - size = group_ptr->size; - nnew = 0; - for (i = 0; i < n; i++) { - first = ranges[i][0]; - last = ranges[i][1]; - stride = ranges[i][2]; + int size = group_ptr->size; + int nnew = 0; + for (int i = 0; i < n; i++) { + int first = ranges[i][0]; + int last = ranges[i][1]; + int stride = ranges[i][2]; /* works for stride of either sign. Error checking above * has already guaranteed stride != 0 */ nnew += 1 + (last - first) / stride; @@ -267,15 +197,6 @@ int MPIR_Group_range_excl_impl(MPIR_Group * group_ptr, int n, int ranges[][3], goto fn_exit; } - /* Allocate a new group and lrank_to_lpid array */ - mpi_errno = MPIR_Group_create(nnew, new_group_ptr); - /* --BEGIN ERROR HANDLING-- */ - if (mpi_errno) { - goto fn_fail; - } - /* --END ERROR HANDLING-- */ - (*new_group_ptr)->rank = MPI_UNDEFINED; - /* Group members are taken in rank order from the original group, * with the specified members removed. Use the flag array for that * purpose. If this was a critical routine, we could use the @@ -283,41 +204,46 @@ int MPIR_Group_range_excl_impl(MPIR_Group * group_ptr, int n, int ranges[][3], * was enabled *and* we are not MPI_THREAD_MULTIPLE, but since this * is a low-usage routine, we haven't taken that optimization. 
*/ - flags = MPL_calloc(size, sizeof(int), MPL_MEM_OTHER); + int *flags = MPL_calloc(size, sizeof(int), MPL_MEM_OTHER); - for (i = 0; i < n; i++) { - first = ranges[i][0]; - last = ranges[i][1]; - stride = ranges[i][2]; + for (int i = 0; i < n; i++) { + int first = ranges[i][0]; + int last = ranges[i][1]; + int stride = ranges[i][2]; if (stride > 0) { - for (j = first; j <= last; j += stride) { + for (int j = first; j <= last; j += stride) { flags[j] = 1; } } else { - for (j = first; j >= last; j += stride) { + for (int j = first; j >= last; j += stride) { flags[j] = 1; } } } + /* Now, run through the group and pick up the members that were * not excluded */ - k = 0; - for (i = 0; i < size; i++) { + MPIR_Lpid *map = MPL_malloc(nnew * sizeof(MPIR_Lpid), MPL_MEM_GROUP); + MPIR_ERR_CHKANDJUMP(!map, mpi_errno, MPI_ERR_OTHER, "**nomem"); + + int myrank = MPI_UNDEFINED; + int k = 0; + for (int i = 0; i < size; i++) { if (!flags[i]) { - (*new_group_ptr)->lrank_to_lpid[k].lpid = group_ptr->lrank_to_lpid[i].lpid; + map[k] = MPIR_Group_rank_to_lpid(group_ptr, i); if (group_ptr->rank == i) { - (*new_group_ptr)->rank = k; + myrank = k; } k++; } } - /* TODO calculate is_local_dense_monotonic */ + MPL_free(flags); - MPIR_Group_set_session_ptr(*new_group_ptr, group_ptr->session_ptr); + mpi_errno = MPIR_Group_create_map(nnew, myrank, group_ptr->session_ptr, map, new_group_ptr); + MPIR_ERR_CHECK(mpi_errno); fn_exit: - MPL_free(flags); MPIR_FUNC_EXIT; return mpi_errno; fn_fail: @@ -328,16 +254,14 @@ int MPIR_Group_range_incl_impl(MPIR_Group * group_ptr, int n, int ranges[][3], MPIR_Group ** new_group_ptr) { int mpi_errno = MPI_SUCCESS; - int first, last, stride, nnew, i, j, k; - MPIR_FUNC_ENTER; /* Compute size, assuming that included ranks are valid (and distinct) */ - nnew = 0; - for (i = 0; i < n; i++) { - first = ranges[i][0]; - last = ranges[i][1]; - stride = ranges[i][2]; + int nnew = 0; + for (int i = 0; i < n; i++) { + int first = ranges[i][0]; + int last = ranges[i][1]; + 
int stride = ranges[i][2]; /* works for stride of either sign. Error checking above * has already guaranteed stride != 0 */ nnew += 1 + (last - first) / stride; @@ -348,40 +272,39 @@ int MPIR_Group_range_incl_impl(MPIR_Group * group_ptr, int n, int ranges[][3], goto fn_exit; } - /* Allocate a new group and lrank_to_lpid array */ - mpi_errno = MPIR_Group_create(nnew, new_group_ptr); - if (mpi_errno) - goto fn_fail; - (*new_group_ptr)->rank = MPI_UNDEFINED; + MPIR_Lpid *map = MPL_malloc(nnew * sizeof(MPIR_Lpid), MPL_MEM_GROUP); + MPIR_ERR_CHKANDJUMP(!map, mpi_errno, MPI_ERR_OTHER, "**nomem"); /* Group members taken in order specified by the range array */ /* This could be integrated with the error checking, but since this * is a low-usage routine, we haven't taken that optimization */ - k = 0; - for (i = 0; i < n; i++) { - first = ranges[i][0]; - last = ranges[i][1]; - stride = ranges[i][2]; + int myrank = MPI_UNDEFINED; + int k = 0; + for (int i = 0; i < n; i++) { + int first = ranges[i][0]; + int last = ranges[i][1]; + int stride = ranges[i][2]; if (stride > 0) { - for (j = first; j <= last; j += stride) { - (*new_group_ptr)->lrank_to_lpid[k].lpid = group_ptr->lrank_to_lpid[j].lpid; - if (j == group_ptr->rank) - (*new_group_ptr)->rank = k; + for (int j = first; j <= last; j += stride) { + map[k] = MPIR_Group_rank_to_lpid(group_ptr, j); + if (j == group_ptr->rank) { + myrank = k; + } k++; } } else { - for (j = first; j >= last; j += stride) { - (*new_group_ptr)->lrank_to_lpid[k].lpid = group_ptr->lrank_to_lpid[j].lpid; - if (j == group_ptr->rank) - (*new_group_ptr)->rank = k; + for (int j = first; j >= last; j += stride) { + map[k] = MPIR_Group_rank_to_lpid(group_ptr, j); + if (j == group_ptr->rank) { + myrank = k; + } k++; } } } - /* TODO calculate is_local_dense_monotonic */ - - MPIR_Group_set_session_ptr(*new_group_ptr, group_ptr->session_ptr); + mpi_errno = MPIR_Group_create_map(nnew, myrank, group_ptr->session_ptr, map, new_group_ptr); + 
MPIR_ERR_CHECK(mpi_errno); fn_exit: MPIR_FUNC_EXIT; @@ -394,69 +317,32 @@ int MPIR_Group_difference_impl(MPIR_Group * group_ptr1, MPIR_Group * group_ptr2, MPIR_Group ** new_group_ptr) { int mpi_errno = MPI_SUCCESS; - int size1, i, k, g1_idx, g2_idx, nnew; - uint64_t l1_pid, l2_pid; - int *flags = NULL; - MPIR_FUNC_ENTER; - /* Return a group consisting of the members of group1 that are *not* - * in group2 */ - size1 = group_ptr1->size; - /* Insure that the lpid lists are setup */ - MPIR_Group_setup_lpid_pairs(group_ptr1, group_ptr2); - - flags = MPL_calloc(size1, sizeof(int), MPL_MEM_OTHER); - - g1_idx = group_ptr1->idx_of_first_lpid; - g2_idx = group_ptr2->idx_of_first_lpid; - - nnew = size1; - while (g1_idx >= 0 && g2_idx >= 0) { - l1_pid = group_ptr1->lrank_to_lpid[g1_idx].lpid; - l2_pid = group_ptr2->lrank_to_lpid[g2_idx].lpid; - if (l1_pid < l2_pid) { - g1_idx = group_ptr1->lrank_to_lpid[g1_idx].next_lpid; - } else if (l1_pid > l2_pid) { - g2_idx = group_ptr2->lrank_to_lpid[g2_idx].next_lpid; - } else { - /* Equal */ - flags[g1_idx] = 1; - g1_idx = group_ptr1->lrank_to_lpid[g1_idx].next_lpid; - g2_idx = group_ptr2->lrank_to_lpid[g2_idx].next_lpid; - nnew--; - } - } - /* Create the group */ - if (nnew == 0) { - /* See 5.3.2, Group Constructors. 
For many group routines, - * the standard explicitly says to return MPI_GROUP_EMPTY; - * for others it is implied */ - *new_group_ptr = MPIR_Group_empty; - goto fn_exit; - } else { - mpi_errno = MPIR_Group_create(nnew, new_group_ptr); - /* --BEGIN ERROR HANDLING-- */ - if (mpi_errno) { - goto fn_fail; - } - /* --END ERROR HANDLING-- */ - (*new_group_ptr)->rank = MPI_UNDEFINED; - k = 0; - for (i = 0; i < size1; i++) { - if (!flags[i]) { - (*new_group_ptr)->lrank_to_lpid[k].lpid = group_ptr1->lrank_to_lpid[i].lpid; - if (i == group_ptr1->rank) - (*new_group_ptr)->rank = k; - k++; + + MPIR_Assert(group_ptr1->session_ptr == group_ptr2->session_ptr); + + MPIR_Lpid *map = MPL_malloc(group_ptr1->size * sizeof(MPIR_Lpid), MPL_MEM_GROUP); + MPIR_ERR_CHKANDJUMP(!map, mpi_errno, MPI_ERR_OTHER, "**nomem"); + + int nnew = 0; + int myrank = MPI_UNDEFINED; + /* For each rank in group1, search it in group2. */ + for (int i = 0; i < group_ptr1->size; i++) { + MPIR_Lpid lpid = MPIR_Group_rank_to_lpid(group_ptr1, i); + if (MPI_UNDEFINED == MPIR_Group_lpid_to_rank(group_ptr2, lpid)) { + /* not found */ + if (i == group_ptr1->rank) { + myrank = nnew; } + map[nnew++] = lpid; } - /* TODO calculate is_local_dense_monotonic */ } - MPIR_Group_set_session_ptr(*new_group_ptr, group_ptr1->session_ptr); + /* Create the group */ + mpi_errno = MPIR_Group_create_map(nnew, myrank, group_ptr1->session_ptr, map, new_group_ptr); + MPIR_ERR_CHECK(mpi_errno); fn_exit: - MPL_free(flags); MPIR_FUNC_EXIT; return mpi_errno; fn_fail: @@ -467,69 +353,34 @@ int MPIR_Group_intersection_impl(MPIR_Group * group_ptr1, MPIR_Group * group_ptr MPIR_Group ** new_group_ptr) { int mpi_errno = MPI_SUCCESS; - int size1, i, k, g1_idx, g2_idx, nnew; - uint64_t l1_pid, l2_pid; - int *flags = NULL; - MPIR_FUNC_ENTER; - /* Return a group consisting of the members of group1 that are - * in group2 */ - size1 = group_ptr1->size; - /* Insure that the lpid lists are setup */ - MPIR_Group_setup_lpid_pairs(group_ptr1, group_ptr2); - 
- flags = MPL_calloc(size1, sizeof(int), MPL_MEM_OTHER); - - g1_idx = group_ptr1->idx_of_first_lpid; - g2_idx = group_ptr2->idx_of_first_lpid; - - nnew = 0; - while (g1_idx >= 0 && g2_idx >= 0) { - l1_pid = group_ptr1->lrank_to_lpid[g1_idx].lpid; - l2_pid = group_ptr2->lrank_to_lpid[g2_idx].lpid; - if (l1_pid < l2_pid) { - g1_idx = group_ptr1->lrank_to_lpid[g1_idx].next_lpid; - } else if (l1_pid > l2_pid) { - g2_idx = group_ptr2->lrank_to_lpid[g2_idx].next_lpid; - } else { - /* Equal */ - flags[g1_idx] = 1; - g1_idx = group_ptr1->lrank_to_lpid[g1_idx].next_lpid; - g2_idx = group_ptr2->lrank_to_lpid[g2_idx].next_lpid; - nnew++; - } - } - /* Create the group. Handle the trivial case first */ - if (nnew == 0) { - *new_group_ptr = MPIR_Group_empty; - goto fn_exit; - } - mpi_errno = MPIR_Group_create(nnew, new_group_ptr); - MPIR_ERR_CHECK(mpi_errno); + /* Similar to MPI_Group_difference, but take the ranks that are found in group2 */ - (*new_group_ptr)->rank = MPI_UNDEFINED; - (*new_group_ptr)->is_local_dense_monotonic = TRUE; - k = 0; - for (i = 0; i < size1; i++) { - if (flags[i]) { - uint64_t lpid = group_ptr1->lrank_to_lpid[i].lpid; - (*new_group_ptr)->lrank_to_lpid[k].lpid = lpid; - if (i == group_ptr1->rank) - (*new_group_ptr)->rank = k; - if (lpid > MPIR_Process.size || - (k > 0 && (*new_group_ptr)->lrank_to_lpid[k - 1].lpid != (lpid - 1))) { - (*new_group_ptr)->is_local_dense_monotonic = FALSE; - } + MPIR_Assert(group_ptr1->session_ptr == group_ptr2->session_ptr); - k++; + MPIR_Lpid *map = MPL_malloc(group_ptr1->size * sizeof(MPIR_Lpid), MPL_MEM_GROUP); + MPIR_ERR_CHKANDJUMP(!map, mpi_errno, MPI_ERR_OTHER, "**nomem"); + + int nnew = 0; + int myrank = MPI_UNDEFINED; + /* For each rank in group1, search it in group2. 
*/ + for (int i = 0; i < group_ptr1->size; i++) { + MPIR_Lpid lpid = MPIR_Group_rank_to_lpid(group_ptr1, i); + if (MPI_UNDEFINED != MPIR_Group_lpid_to_rank(group_ptr2, lpid)) { + /* found */ + if (i == group_ptr1->rank) { + myrank = nnew; + } + map[nnew++] = lpid; } } - MPIR_Group_set_session_ptr(*new_group_ptr, group_ptr1->session_ptr); + /* Create the group */ + mpi_errno = MPIR_Group_create_map(nnew, myrank, group_ptr1->session_ptr, map, new_group_ptr); + MPIR_ERR_CHECK(mpi_errno); fn_exit: - MPL_free(flags); MPIR_FUNC_EXIT; return mpi_errno; fn_fail: @@ -540,104 +391,41 @@ int MPIR_Group_union_impl(MPIR_Group * group_ptr1, MPIR_Group * group_ptr2, MPIR_Group ** new_group_ptr) { int mpi_errno = MPI_SUCCESS; - int g1_idx, g2_idx, nnew, i, k, size1, size2; - uint64_t mylpid; - int *flags = NULL; - MPIR_FUNC_ENTER; - /* Determine the size of the new group. The new group consists of all - * members of group1 plus the members of group2 that are not in group1. - */ - g1_idx = group_ptr1->idx_of_first_lpid; - g2_idx = group_ptr2->idx_of_first_lpid; - - /* If the lpid list hasn't been created, do it now */ - if (g1_idx < 0) { - MPII_Group_setup_lpid_list(group_ptr1); - g1_idx = group_ptr1->idx_of_first_lpid; - } - if (g2_idx < 0) { - MPII_Group_setup_lpid_list(group_ptr2); - g2_idx = group_ptr2->idx_of_first_lpid; - } - nnew = group_ptr1->size; - - /* Clear the flag bits on the second group. 
The flag is set if - * a member of the second group belongs to the union */ - size2 = group_ptr2->size; - flags = MPL_calloc(size2, sizeof(int), MPL_MEM_OTHER); - - /* Loop through the lists that are ordered by lpid (local process - * id) to detect which processes in group 2 are not in group 1 - */ - while (g1_idx >= 0 && g2_idx >= 0) { - uint64_t l1_pid, l2_pid; - l1_pid = group_ptr1->lrank_to_lpid[g1_idx].lpid; - l2_pid = group_ptr2->lrank_to_lpid[g2_idx].lpid; - if (l1_pid > l2_pid) { - nnew++; - flags[g2_idx] = 1; - g2_idx = group_ptr2->lrank_to_lpid[g2_idx].next_lpid; - } else if (l1_pid == l2_pid) { - g1_idx = group_ptr1->lrank_to_lpid[g1_idx].next_lpid; - g2_idx = group_ptr2->lrank_to_lpid[g2_idx].next_lpid; - } else { - /* l1 < l2 */ - g1_idx = group_ptr1->lrank_to_lpid[g1_idx].next_lpid; - } - } - /* If we hit the end of group1, add the remaining members of group 2 */ - while (g2_idx >= 0) { - nnew++; - flags[g2_idx] = 1; - g2_idx = group_ptr2->lrank_to_lpid[g2_idx].next_lpid; - } - - if (nnew == 0) { - *new_group_ptr = MPIR_Group_empty; - goto fn_exit; - } + MPIR_Assert(group_ptr1->session_ptr == group_ptr2->session_ptr); - /* Allocate a new group and lrank_to_lpid array */ - mpi_errno = MPIR_Group_create(nnew, new_group_ptr); - MPIR_ERR_CHECK(mpi_errno); + MPIR_Lpid *map = MPL_malloc((group_ptr1->size + group_ptr2->size) * sizeof(MPIR_Lpid), + MPL_MEM_GROUP); + MPIR_ERR_CHKANDJUMP(!map, mpi_errno, MPI_ERR_OTHER, "**nomem"); /* If this process is in group1, then we can set the rank now. 
* If we are not in this group, this assignment will set the * current rank to MPI_UNDEFINED */ - (*new_group_ptr)->rank = group_ptr1->rank; + int myrank = group_ptr1->rank; /* Add group1 */ - size1 = group_ptr1->size; - for (i = 0; i < size1; i++) { - (*new_group_ptr)->lrank_to_lpid[i].lpid = group_ptr1->lrank_to_lpid[i].lpid; + for (int i = 0; i < group_ptr1->size; i++) { + map[i] = MPIR_Group_rank_to_lpid(group_ptr1, i); } /* Add members of group2 that are not in group 1 */ - - if (group_ptr1->rank == MPI_UNDEFINED && group_ptr2->rank >= 0) { - mylpid = group_ptr2->lrank_to_lpid[group_ptr2->rank].lpid; - } else { - mylpid = (uint64_t) - 2; - } - k = size1; - for (i = 0; i < size2; i++) { - if (flags[i]) { - (*new_group_ptr)->lrank_to_lpid[k].lpid = group_ptr2->lrank_to_lpid[i].lpid; - if ((*new_group_ptr)->rank == MPI_UNDEFINED && - group_ptr2->lrank_to_lpid[i].lpid == mylpid) - (*new_group_ptr)->rank = k; - k++; + int nnew = group_ptr1->size; + for (int i = 0; i < group_ptr2->size; i++) { + MPIR_Lpid lpid = MPIR_Group_rank_to_lpid(group_ptr2, i); + if (MPI_UNDEFINED == MPIR_Group_lpid_to_rank(group_ptr1, lpid)) { + /* not found */ + if (i == group_ptr2->rank) { + myrank = nnew; + } + map[nnew++] = lpid; } } - /* TODO calculate is_local_dense_monotonic */ - - MPIR_Group_set_session_ptr(*new_group_ptr, group_ptr1->session_ptr); + mpi_errno = MPIR_Group_create_map(nnew, myrank, group_ptr1->session_ptr, map, new_group_ptr); + MPIR_ERR_CHECK(mpi_errno); fn_exit: - MPL_free(flags); MPIR_FUNC_EXIT; return mpi_errno; fn_fail: @@ -648,40 +436,19 @@ int MPIR_Group_from_session_pset_impl(MPIR_Session * session_ptr, const char *ps MPIR_Group ** new_group_ptr) { int mpi_errno = MPI_SUCCESS; - MPIR_Group *group_ptr; if (MPL_stricmp(pset_name, "mpi://WORLD") == 0) { - mpi_errno = MPIR_Group_create(MPIR_Process.size, &group_ptr); + mpi_errno = MPIR_Group_create_stride(MPIR_Process.size, MPIR_Process.rank, session_ptr, + 0, 1, 1, new_group_ptr); MPIR_ERR_CHECK(mpi_errno); - - 
group_ptr->size = MPIR_Process.size; - group_ptr->rank = MPIR_Process.rank; - group_ptr->is_local_dense_monotonic = TRUE; - for (int i = 0; i < group_ptr->size; i++) { - group_ptr->lrank_to_lpid[i].lpid = i; - group_ptr->lrank_to_lpid[i].next_lpid = i + 1; - } - group_ptr->lrank_to_lpid[group_ptr->size - 1].next_lpid = -1; - group_ptr->idx_of_first_lpid = 0; } else if (MPL_stricmp(pset_name, "mpi://SELF") == 0) { - mpi_errno = MPIR_Group_create(1, &group_ptr); + mpi_errno = MPIR_Group_create_stride(1, 0, session_ptr, 0, 1, 1, new_group_ptr); MPIR_ERR_CHECK(mpi_errno); - - group_ptr->size = 1; - group_ptr->rank = 0; - group_ptr->is_local_dense_monotonic = TRUE; - group_ptr->lrank_to_lpid[0].lpid = MPIR_Process.rank; - group_ptr->lrank_to_lpid[0].next_lpid = -1; - group_ptr->idx_of_first_lpid = 0; } else { /* TODO: Implement pset struct, locate pset struct ptr */ MPIR_ERR_SETANDSTMT(mpi_errno, MPI_ERR_ARG, goto fn_fail, "**psetinvalidname"); } - MPIR_Group_set_session_ptr(group_ptr, session_ptr); - - *new_group_ptr = group_ptr; - fn_exit: return mpi_errno; fn_fail: From 9b05ad0d9c3f842e8f276265f5cb442767877c4f Mon Sep 17 00:00:00 2001 From: Hui Zhou Date: Tue, 10 Dec 2024 13:14:43 -0600 Subject: [PATCH 08/19] group: refactor MPIR_Group * add option to use stride to describe group composition * remove the linked list design --- src/include/mpir_group.h | 56 +++---- src/mpi/comm/comm_impl.c | 3 - src/mpi/group/grouputil.c | 309 ++++++++++++-------------------------- 3 files changed, 122 insertions(+), 246 deletions(-) diff --git a/src/include/mpir_group.h b/src/include/mpir_group.h index 43e79552ec1..1148a8e8006 100644 --- a/src/include/mpir_group.h +++ b/src/include/mpir_group.h @@ -12,24 +12,6 @@ * MPI_Group_intersection) and for the scalable RMA synchronization *---------------------------------------------------------------------------*/ -/* Abstract the integer type for lpid (process id). 
It is possible to use 32-bit - * in principle, but 64-bit is simpler since we can trivially combine - * (world_idx, world_rank). - */ -typedef uint64_t MPIR_Lpid; - -/* This structure is used to implement the group operations such as - MPI_Group_translate_ranks */ -/* note: next_lpid (with idx_of_first_lpid in MPIR_Group) gives a linked list - * in a sorted lpid ascending order */ -typedef struct MPII_Group_pmap_t { - MPIR_Lpid lpid; /* local process id, from VCONN */ - int next_lpid; /* Index of next lpid (in lpid order) */ -} MPII_Group_pmap_t; - -/* Any changes in the MPIR_Group structure must be made to the - predefined value in MPIR_Group_builtin for MPI_GROUP_EMPTY in - src/mpi/group/grouputil.c */ /*S MPIR_Group - Description of the Group data structure @@ -60,22 +42,35 @@ typedef struct MPII_Group_pmap_t { Group-DS S*/ + +/* Abstract the integer type for lpid (process id). It is possible to use 32-bit + * in principle, but 64-bit is simpler since we can trivially combine + * (world_idx, world_rank). 
+ */ +typedef uint64_t MPIR_Lpid; + +struct MPIR_Pmap { + int size; /* same as group->size, duplicate here so Pmap is logically complete */ + bool use_map; + union { + MPIR_Lpid *map; + struct { + MPIR_Lpid offset; + MPIR_Lpid stride; + MPIR_Lpid blocksize; + } stride; + } u; +}; + struct MPIR_Group { MPIR_OBJECT_HEADER; /* adds handle and ref_count fields */ int size; /* Size of a group */ - int rank; /* rank of this process relative to this - * group */ - int idx_of_first_lpid; - MPII_Group_pmap_t *lrank_to_lpid; /* Array mapping a local rank to local - * process number */ - int is_local_dense_monotonic; /* see NOTE-G1 */ - - /* We may want some additional data for the RMA syncrhonization calls */ - /* Other, device-specific information */ + int rank; /* rank of this process relative to this group */ + struct MPIR_Pmap pmap; + MPIR_Session *session_ptr; /* Pointer to session to which this group belongs */ #ifdef MPID_DEV_GROUP_DECL MPID_DEV_GROUP_DECL #endif - MPIR_Session * session_ptr; /* Pointer to session to which this group belongs */ }; /* NOTE-G1: is_local_dense_monotonic will be true iff the group meets the @@ -104,10 +99,8 @@ extern MPIR_Group *const MPIR_Group_empty; #define MPIR_Group_release_ref(_group, _inuse) \ do { MPIR_Object_release_ref(_group, _inuse); } while (0) -void MPII_Group_setup_lpid_list(MPIR_Group *); int MPIR_Group_check_valid_ranks(MPIR_Group *, const int[], int); int MPIR_Group_check_valid_ranges(MPIR_Group *, int[][3], int); -void MPIR_Group_setup_lpid_pairs(MPIR_Group *, MPIR_Group *); int MPIR_Group_create(int, MPIR_Group **); int MPIR_Group_release(MPIR_Group * group_ptr); @@ -123,7 +116,4 @@ int MPIR_Group_check_subset(MPIR_Group * group_ptr, MPIR_Comm * comm_ptr); void MPIR_Group_set_session_ptr(MPIR_Group * group_ptr, MPIR_Session * session_out); int MPIR_Group_init(void); -/* internal functions */ -void MPII_Group_setup_lpid_list(MPIR_Group *); - #endif /* MPIR_GROUP_H_INCLUDED */ diff --git a/src/mpi/comm/comm_impl.c 
b/src/mpi/comm/comm_impl.c index 46f06b89762..56db002f58c 100644 --- a/src/mpi/comm/comm_impl.c +++ b/src/mpi/comm/comm_impl.c @@ -198,9 +198,6 @@ int MPII_Comm_create_calculate_mapping(MPIR_Group * group_ptr, * exactly the same as the ranks in comm world. */ - /* we examine the group's lpids in both the intracomm and non-comm_world cases */ - MPII_Group_setup_lpid_list(group_ptr); - /* Optimize for groups contained within MPI_COMM_WORLD. */ if (comm_ptr->comm_kind == MPIR_COMM_KIND__INTRACOMM) { int wsize; diff --git a/src/mpi/group/grouputil.c b/src/mpi/group/grouputil.c index 414c562fe3c..59c45561eca 100644 --- a/src/mpi/group/grouputil.c +++ b/src/mpi/group/grouputil.c @@ -28,10 +28,9 @@ int MPIR_Group_init(void) MPIR_Object_set_ref(&MPIR_Group_builtin[0], 1); MPIR_Group_builtin[0].size = 0; MPIR_Group_builtin[0].rank = MPI_UNDEFINED; - MPIR_Group_builtin[0].idx_of_first_lpid = -1; - MPIR_Group_builtin[0].lrank_to_lpid = NULL; + MPIR_Group_builtin[0].session_ptr = NULL; + memset(&MPIR_Group_builtin[0].pmap, 0, sizeof(struct MPIR_Pmap)); - /* TODO hook for device here? */ return mpi_errno; } @@ -44,7 +43,9 @@ int MPIR_Group_release(MPIR_Group * group_ptr) MPIR_Group_release_ref(group_ptr, &inuse); if (!inuse) { /* Only if refcount is 0 do we actually free. 
*/ - MPL_free(group_ptr->lrank_to_lpid); + if (group_ptr->pmap.use_map) { + MPL_free(group_ptr->pmap.u.map); + } if (group_ptr->session_ptr != NULL) { /* Release session */ MPIR_Session_release(group_ptr->session_ptr); @@ -73,24 +74,14 @@ int MPIR_Group_create(int nproc, MPIR_Group ** new_group_ptr) } /* --END ERROR HANDLING-- */ MPIR_Object_set_ref(*new_group_ptr, 1); - (*new_group_ptr)->lrank_to_lpid = - (MPII_Group_pmap_t *) MPL_calloc(nproc, sizeof(MPII_Group_pmap_t), MPL_MEM_GROUP); - /* --BEGIN ERROR HANDLING-- */ - if (!(*new_group_ptr)->lrank_to_lpid) { - MPIR_Handle_obj_free(&MPIR_Group_mem, *new_group_ptr); - *new_group_ptr = NULL; - MPIR_CHKMEM_SETERR(mpi_errno, nproc * sizeof(MPII_Group_pmap_t), "newgroup->lrank_to_lpid"); - return mpi_errno; - } - /* --END ERROR HANDLING-- */ - (*new_group_ptr)->size = nproc; - /* Make sure that there is no question that the list of ranks sorted - * by pids is marked as uninitialized */ - (*new_group_ptr)->idx_of_first_lpid = -1; - - (*new_group_ptr)->is_local_dense_monotonic = FALSE; + /* initialize fields */ + (*new_group_ptr)->size = nproc; + (*new_group_ptr)->rank = MPI_UNDEFINED; (*new_group_ptr)->session_ptr = NULL; + memset(&(*new_group_ptr)->pmap, 0, sizeof(struct MPIR_Pmap)); + (*new_group_ptr)->pmap.size = nproc; + return mpi_errno; } @@ -103,25 +94,25 @@ int MPIR_Group_create_map(int size, int rank, MPIR_Session * session_ptr, MPIR_L /* See 5.3.2, Group Constructors. 
For many group routines, * the standard explicitly says to return MPI_GROUP_EMPTY; * for others it is implied */ + MPL_free(map); *new_group_ptr = MPIR_Group_empty; goto fn_exit; - } + } else { + MPIR_Group *newgrp; + mpi_errno = MPIR_Group_create(size, &newgrp); + MPIR_ERR_CHECK(mpi_errno); - MPIR_Group *newgrp; - mpi_errno = MPIR_Group_create(size, &newgrp); - MPIR_ERR_CHECK(mpi_errno); + newgrp->rank = rank; + MPIR_Group_set_session_ptr(newgrp, session_ptr); - newgrp->rank = rank; - MPIR_Group_set_session_ptr(newgrp, session_ptr); + newgrp->pmap.use_map = true; + newgrp->pmap.u.map = map; - for (int i = 0; i < size; i++) { - newgrp->lrank_to_lpid[i].lpid = map[i]; + /* TODO: build hash to accelerate MPIR_Group_lpid_to_rank */ + *new_group_ptr = newgrp; } - *new_group_ptr = newgrp; - fn_exit: - MPL_free(map); return mpi_errno; fn_fail: goto fn_exit; @@ -132,176 +123,88 @@ int MPIR_Group_create_stride(int size, int rank, MPIR_Session * session_ptr, MPIR_Group ** new_group_ptr) { int mpi_errno = MPI_SUCCESS; - MPIR_Group *newgrp; - MPIR_Assert(size > 0); + if (size == 0) { + /* See 5.3.2, Group Constructors. 
For many group routines, + * the standard explicitly says to return MPI_GROUP_EMPTY; + * for others it is implied */ + *new_group_ptr = MPIR_Group_empty; + goto fn_exit; + } else { + MPIR_Group *newgrp; + mpi_errno = MPIR_Group_create(size, &newgrp); + MPIR_ERR_CHECK(mpi_errno); - mpi_errno = MPIR_Group_create(size, &newgrp); - MPIR_ERR_CHECK(mpi_errno); + newgrp->rank = rank; + MPIR_Group_set_session_ptr(newgrp, session_ptr); - newgrp->rank = rank; - MPIR_Group_set_session_ptr(newgrp, session_ptr); + newgrp->pmap.use_map = false; + newgrp->pmap.u.stride.offset = offset; + newgrp->pmap.u.stride.stride = stride; + newgrp->pmap.u.stride.blocksize = blocksize; - MPIR_Lpid lpid = offset; - int i = 0; - while (i < size) { - for (int j = 0; j < blocksize; j++) { - newgrp->lrank_to_lpid[i + j].lpid = lpid + j; - } - i += blocksize; - lpid += stride; + *new_group_ptr = newgrp; } - *new_group_ptr = newgrp; - fn_exit: return mpi_errno; fn_fail: goto fn_exit; } -MPIR_Lpid MPIR_Group_rank_to_lpid(MPIR_Group * group, int rank) +static MPIR_Lpid pmap_rank_to_lpid(struct MPIR_Pmap *pmap, int rank) { - return group->lrank_to_lpid[rank].lpid; -} + if (rank < 0 || rank >= pmap->size) { + return MPI_UNDEFINED; + } -int MPIR_Group_lpid_to_rank(MPIR_Group * group, MPIR_Lpid lpid) -{ - /* Use linear search for now. - * Optimization, build hash map in MPIR_Group_create_map and do O(1) hash lookup - */ - for (int i = 0; i < group->size; i++) { - if (lpid == group->lrank_to_lpid[i].lpid) { - return i; - } + if (pmap->use_map) { + return pmap->u.map[rank]; + } else { + MPIR_Lpid i_blk = rank / pmap->u.stride.blocksize; + MPIR_Lpid r_blk = rank % pmap->u.stride.blocksize; + return pmap->u.stride.offset + i_blk * pmap->u.stride.stride + r_blk; } - return MPI_UNDEFINED; } -/* - * return value is the first index in the list - * - * This "sorts" an lpid array by lpid value, using a simple merge sort - * algorithm. 
- * - * In actuality, it does not reorder the elements of maparray (these must remain - * in group rank order). Instead it builds the traversal order (in increasing - * lpid order) through the maparray given by the "next_lpid" fields. - */ -static int mergesort_lpidarray(MPII_Group_pmap_t maparray[], int n) +static int pmap_lpid_to_rank(struct MPIR_Pmap *pmap, MPIR_Lpid lpid) { - int idx1, idx2, first_idx, cur_idx, next_lpid, idx2_offset; - - if (n == 2) { - if (maparray[0].lpid > maparray[1].lpid) { - first_idx = 1; - maparray[0].next_lpid = -1; - maparray[1].next_lpid = 0; - } else { - first_idx = 0; - maparray[0].next_lpid = 1; - maparray[1].next_lpid = -1; + if (pmap->use_map) { + /* Use linear search for now. + * Optimization: build hash map in MPIR_Group_create_map and do O(1) hash lookup + */ + for (int rank = 0; rank < pmap->size; rank++) { + if (pmap->u.map[rank] == lpid) { + return rank; + } } - return first_idx; - } - if (n == 1) { - maparray[0].next_lpid = -1; - return 0; - } - if (n == 0) - return -1; - - /* Sort each half */ - idx2_offset = n / 2; - idx1 = mergesort_lpidarray(maparray, n / 2); - idx2 = mergesort_lpidarray(maparray + idx2_offset, n - n / 2) + idx2_offset; - /* merge the results */ - /* There are three lists: - * first_idx - points to the HEAD of the sorted, merged list - * cur_idx - points to the LAST element of the sorted, merged list - * idx1 - points to the HEAD of one sorted list - * idx2 - points to the HEAD of the other sorted list - * - * We first identify the head element of the sorted list. We then - * take elements from the remaining lists. When one list is empty, - * we add the other list to the end of sorted list. - * - * The last wrinkle is that the next_lpid fields in maparray[idx2] - * are relative to n/2, not 0 (that is, a next_lpid of 1 is - * really 1 + n/2, relative to the beginning of maparray). 
- */ - /* Find the head element */ - if (maparray[idx1].lpid > maparray[idx2].lpid) { - first_idx = idx2; - idx2 = maparray[idx2].next_lpid + idx2_offset; + return MPI_UNDEFINED; } else { - first_idx = idx1; - idx1 = maparray[idx1].next_lpid; - } + lpid -= pmap->u.stride.offset; + MPIR_Lpid i_blk = lpid / pmap->u.stride.stride; + MPIR_Lpid r_blk = lpid % pmap->u.stride.stride; - /* Merge the lists until one is empty */ - cur_idx = first_idx; - while (idx1 >= 0 && idx2 >= 0) { - if (maparray[idx1].lpid > maparray[idx2].lpid) { - next_lpid = maparray[idx2].next_lpid; - if (next_lpid >= 0) - next_lpid += idx2_offset; - maparray[cur_idx].next_lpid = idx2; - cur_idx = idx2; - idx2 = next_lpid; - } else { - next_lpid = maparray[idx1].next_lpid; - maparray[cur_idx].next_lpid = idx1; - cur_idx = idx1; - idx1 = next_lpid; + if (r_blk >= pmap->u.stride.blocksize) { + return MPI_UNDEFINED; } - } - /* Add whichever list remains */ - if (idx1 >= 0) { - maparray[cur_idx].next_lpid = idx1; - } else { - maparray[cur_idx].next_lpid = idx2; - /* Convert the rest of these next_lpid values to be - * relative to the beginning of maparray */ - while (idx2 >= 0) { - next_lpid = maparray[idx2].next_lpid; - if (next_lpid >= 0) { - next_lpid += idx2_offset; - maparray[idx2].next_lpid = next_lpid; - } - idx2 = next_lpid; + + int rank = i_blk * pmap->u.stride.blocksize + r_blk; + if (rank >= 0 && rank < pmap->size) { + return rank; + } else { + return MPI_UNDEFINED; } } - - return first_idx; } -/* - * Create a list of the lpids, in lpid order. - * - * Called by group_compare, group_translate_ranks, group_union - * - * In the case of a single main thread lock, the lock must - * be held on entry to this routine. This forces some of the routines - * noted above to hold the SINGLE_CS; which would otherwise not be required. 
- */ -void MPII_Group_setup_lpid_list(MPIR_Group * group_ptr) +int MPIR_Group_lpid_to_rank(MPIR_Group * group, MPIR_Lpid lpid) { - if (group_ptr->idx_of_first_lpid == -1) { - group_ptr->idx_of_first_lpid = - mergesort_lpidarray(group_ptr->lrank_to_lpid, group_ptr->size); - } + return pmap_lpid_to_rank(&group->pmap, lpid); } -void MPIR_Group_setup_lpid_pairs(MPIR_Group * group_ptr1, MPIR_Group * group_ptr2) +MPIR_Lpid MPIR_Group_rank_to_lpid(MPIR_Group * group, int rank) { - /* If the lpid list hasn't been created, do it now */ - if (group_ptr1->idx_of_first_lpid < 0) { - MPII_Group_setup_lpid_list(group_ptr1); - } - if (group_ptr2->idx_of_first_lpid < 0) { - MPII_Group_setup_lpid_list(group_ptr2); - } + return pmap_rank_to_lpid(&group->pmap, rank); } #ifdef HAVE_ERROR_CHECKING @@ -439,54 +342,40 @@ int MPIR_Group_check_valid_ranges(MPIR_Group * group_ptr, int ranges[][3], int n int MPIR_Group_check_subset(MPIR_Group * group_ptr, MPIR_Comm * comm_ptr) { int mpi_errno = MPI_SUCCESS; - int g1_idx, g2_idx, l1_pid, l2_pid, i; - MPII_Group_pmap_t *vmap = 0; + int vsize = comm_ptr->comm_kind == MPIR_COMM_KIND__INTERCOMM ? 
comm_ptr->local_size : comm_ptr->remote_size; - MPIR_CHKLMEM_DECL(1); - - MPIR_Assert(group_ptr != NULL); - - MPIR_CHKLMEM_MALLOC(vmap, MPII_Group_pmap_t *, - vsize * sizeof(MPII_Group_pmap_t), mpi_errno, "", MPL_MEM_GROUP); /* Initialize the vmap */ - for (i = 0; i < vsize; i++) { - MPID_Comm_get_lpid(comm_ptr, i, &vmap[i].lpid, FALSE); - vmap[i].next_lpid = 0; + MPIR_Lpid *vmap = MPL_malloc(vsize * sizeof(MPIR_Lpid), MPL_MEM_GROUP); + for (int i = 0; i < vsize; i++) { + /* FIXME: MPID_Comm_get_lpid to be removed */ + uint64_t dev_lpid; + MPID_Comm_get_lpid(comm_ptr, i, &dev_lpid, FALSE); + MPIR_Assert((dev_lpid >> 32) == 0); + vmap[i] = dev_lpid; } - MPII_Group_setup_lpid_list(group_ptr); - g1_idx = group_ptr->idx_of_first_lpid; - g2_idx = mergesort_lpidarray(vmap, vsize); - MPL_DBG_MSG_FMT(MPIR_DBG_COMM, VERBOSE, (MPL_DBG_FDEST, - "initial indices: %d %d\n", g1_idx, g2_idx)); - while (g1_idx >= 0 && g2_idx >= 0) { - l1_pid = group_ptr->lrank_to_lpid[g1_idx].lpid; - l2_pid = vmap[g2_idx].lpid; - MPL_DBG_MSG_FMT(MPIR_DBG_COMM, VERBOSE, (MPL_DBG_FDEST, - "Lpids are %d, %d\n", l1_pid, l2_pid)); - if (l1_pid < l2_pid) { - /* If we have to advance g1, we didn't find a match, so - * that's an error. 
*/ - break; - } else if (l1_pid > l2_pid) { - g2_idx = vmap[g2_idx].next_lpid; - } else { - /* Equal */ - g1_idx = group_ptr->lrank_to_lpid[g1_idx].next_lpid; - g2_idx = vmap[g2_idx].next_lpid; + for (int rank = 0; rank < group_ptr->size; rank++) { + MPIR_Lpid lpid = MPIR_Group_rank_to_lpid(group_ptr, rank); + bool found = false; + for (int i = 0; i < vsize; i++) { + if (vmap[i] == lpid) { + found = true; + break; + } + } + if (!found) { + MPIR_ERR_SET1(mpi_errno, MPI_ERR_GROUP, "**groupnotincomm", + "**groupnotincomm %d", rank); + goto fn_fail; } - MPL_DBG_MSG_FMT(MPIR_DBG_COMM, VERBOSE, (MPL_DBG_FDEST, - "g1 = %d, g2 = %d\n", g1_idx, g2_idx)); - } - - if (g1_idx >= 0) { - MPIR_ERR_SET1(mpi_errno, MPI_ERR_GROUP, "**groupnotincomm", "**groupnotincomm %d", g1_idx); } - fn_fail: - MPIR_CHKLMEM_FREEALL(); + fn_exit: + MPL_free(vmap); return mpi_errno; + fn_fail: + goto fn_exit; } #endif /* HAVE_ERROR_CHECKING */ From ed910e96540a3adbb96d2e72f0bf6e8eb1e246fa Mon Sep 17 00:00:00 2001 From: Hui Zhou Date: Wed, 11 Dec 2024 09:20:39 -0600 Subject: [PATCH 09/19] ---- START HERE ---- --- dummy | 1 + 1 file changed, 1 insertion(+) create mode 100644 dummy diff --git a/dummy b/dummy new file mode 100644 index 00000000000..d00491fd7e5 --- /dev/null +++ b/dummy @@ -0,0 +1 @@ +1 From 37b9eb9a176e67a7246ae9d335222cb5be9e5f6c Mon Sep 17 00:00:00 2001 From: Hui Zhou Date: Wed, 11 Dec 2024 19:14:31 -0600 Subject: [PATCH 10/19] mpid/ch4: remove MPIDI_NM_comm_get_gpid This is the same as MPID_Comm_get_lpid. NOTE: we will remove MPID_Comm_get_lpid as well once we move the ownership of lpid to the MPIR-layer. 
--- src/mpid/ch4/ch4_api.txt | 4 ---- src/mpid/ch4/netmod/ofi/ofi_proc.h | 16 ---------------- src/mpid/ch4/netmod/ucx/ucx_proc.h | 17 ----------------- src/mpid/ch4/src/ch4_impl.h | 6 +++--- src/mpid/ch4/src/ch4_init.c | 2 +- 5 files changed, 4 insertions(+), 41 deletions(-) diff --git a/src/mpid/ch4/ch4_api.txt b/src/mpid/ch4/ch4_api.txt index c1778e546ff..e9a2e2b7e5d 100644 --- a/src/mpid/ch4/ch4_api.txt +++ b/src/mpid/ch4/ch4_api.txt @@ -87,8 +87,6 @@ Non Native API: am_tag_recv : int NM*: rank, comm, handler_id, tag, buf-2, count, datatype, src_vci, dst_vci, rreq SHM*: rank, comm, handler_id, tag, buf-2, count, datatype, src_vci, dst_vci, rreq - comm_get_gpid : int - NM*: comm_ptr, idx, gpid_ptr, is_remote get_local_upids : int NM : comm, local_upid_size, local_upids upids_to_gpids : int @@ -477,8 +475,6 @@ PARAM: local_upid_size: int ** local_upids: char ** lock_type: int - gpid_ptr: uint64_t * - lpids: const int[] made_progress: int * message: MPIR_Request * message_p: MPIR_Request ** diff --git a/src/mpid/ch4/netmod/ofi/ofi_proc.h b/src/mpid/ch4/netmod/ofi/ofi_proc.h index b23e6ec531d..c7ab1f2fb7f 100644 --- a/src/mpid/ch4/netmod/ofi/ofi_proc.h +++ b/src/mpid/ch4/netmod/ofi/ofi_proc.h @@ -20,20 +20,4 @@ MPL_STATIC_INLINE_PREFIX int MPIDI_NM_rank_is_local(int rank, MPIR_Comm * comm) return ret; } -MPL_STATIC_INLINE_PREFIX int MPIDI_NM_comm_get_gpid(MPIR_Comm * comm_ptr, - int idx, uint64_t * gpid_ptr, bool is_remote) -{ - int avtid = 0, lpid = 0; - if (comm_ptr->comm_kind == MPIR_COMM_KIND__INTRACOMM) - MPIDIU_comm_rank_to_pid(comm_ptr, idx, &lpid, &avtid); - else if (is_remote) - MPIDIU_comm_rank_to_pid(comm_ptr, idx, &lpid, &avtid); - else { - MPIDIU_comm_rank_to_pid_local(comm_ptr, idx, &lpid, &avtid); - } - - *gpid_ptr = MPIDIU_GPID_CREATE(avtid, lpid); - return MPI_SUCCESS; -} - #endif /* OFI_PROC_H_INCLUDED */ diff --git a/src/mpid/ch4/netmod/ucx/ucx_proc.h b/src/mpid/ch4/netmod/ucx/ucx_proc.h index 066670c014a..b8481ffd6a6 100644 --- 
a/src/mpid/ch4/netmod/ucx/ucx_proc.h +++ b/src/mpid/ch4/netmod/ucx/ucx_proc.h @@ -19,21 +19,4 @@ MPL_STATIC_INLINE_PREFIX int MPIDI_NM_rank_is_local(int rank, MPIR_Comm * comm) return ret; } -MPL_STATIC_INLINE_PREFIX int MPIDI_NM_comm_get_gpid(MPIR_Comm * comm_ptr, - int idx, uint64_t * gpid_ptr, bool is_remote) -{ - int avtid = 0, lpid = 0; - if (comm_ptr->comm_kind == MPIR_COMM_KIND__INTRACOMM) { - MPIDIU_comm_rank_to_pid(comm_ptr, idx, &lpid, &avtid); - } else if (is_remote) { - MPIDIU_comm_rank_to_pid(comm_ptr, idx, &lpid, &avtid); - } else { - MPIDIU_comm_rank_to_pid_local(comm_ptr, idx, &lpid, &avtid); - } - - *gpid_ptr = MPIDIU_GPID_CREATE(avtid, lpid); - return MPI_SUCCESS; - -} - #endif /* UCX_PROC_H_INCLUDED */ diff --git a/src/mpid/ch4/src/ch4_impl.h b/src/mpid/ch4/src/ch4_impl.h index 2f5a31dc767..6fa918db043 100644 --- a/src/mpid/ch4/src/ch4_impl.h +++ b/src/mpid/ch4/src/ch4_impl.h @@ -378,17 +378,17 @@ MPL_STATIC_INLINE_PREFIX void MPIDIG_win_hash_clear(MPIR_Win * win) /* We assume this routine is never called with rank=MPI_PROC_NULL. 
*/ MPL_STATIC_INLINE_PREFIX int MPIDIU_valid_group_rank(MPIR_Comm * comm, int rank, MPIR_Group * grp) { - uint64_t gpid; + MPIR_Lpid lpid; int size = grp->size; int z; int ret; MPIR_FUNC_ENTER; - MPIDI_NM_comm_get_gpid(comm, rank, &gpid, FALSE); + MPID_Comm_get_lpid(comm, rank, &lpid, FALSE); for (z = 0; z < size; ++z) { - if (gpid == MPIR_Group_rank_to_lpid(grp, z)) { + if (lpid == MPIR_Group_rank_to_lpid(grp, z)) { break; } } diff --git a/src/mpid/ch4/src/ch4_init.c b/src/mpid/ch4/src/ch4_init.c index 365a12b37ad..e09357352c7 100644 --- a/src/mpid/ch4/src/ch4_init.c +++ b/src/mpid/ch4/src/ch4_init.c @@ -1073,7 +1073,7 @@ int MPID_Free_mem(void *user_buf) goto fn_exit; } -int MPID_Comm_get_lpid(MPIR_Comm * comm_ptr, int idx, uint64_t * lpid_ptr, bool is_remote) +int MPID_Comm_get_lpid(MPIR_Comm * comm_ptr, int idx, MPIR_Lpid * lpid_ptr, bool is_remote) { int mpi_errno = MPI_SUCCESS; int avtid = 0, lpid = 0; From 4a7fd12aa09708948eb1809462312260fb87f5eb Mon Sep 17 00:00:00 2001 From: Hui Zhou Date: Wed, 11 Dec 2024 18:57:37 -0600 Subject: [PATCH 11/19] mpid: replace usage of uint64_t lpid with MPIR_Lpid There is no real difference between lpid and gpid. Thus rename gpid in the device layer to lpid for clarification. Replace the usage of uint64_t as the type of lpid to MPIR_Lpid. This improves consistency. 
--- src/mpid/ch3/include/mpidpost.h | 5 +-- src/mpid/ch3/include/mpidpre.h | 1 - src/mpid/ch3/src/mpid_vc.c | 16 ++++----- src/mpid/ch4/ch4_api.txt | 10 +++--- src/mpid/ch4/include/mpidch4.h | 6 ++-- src/mpid/ch4/netmod/ofi/ofi_spawn.c | 14 ++++---- src/mpid/ch4/netmod/ucx/ucx_spawn.c | 14 ++++---- src/mpid/ch4/src/ch4_comm.c | 56 ++++++++++++++++------------- src/mpid/ch4/src/ch4_impl.h | 2 +- src/mpid/ch4/src/ch4_proc.c | 6 ++-- src/mpid/ch4/src/ch4_proc.h | 4 +-- src/mpid/ch4/src/ch4_spawn.c | 16 ++++----- src/mpid/ch4/src/ch4i_comm.c | 16 ++++----- src/mpid/ch4/src/ch4i_comm.h | 2 +- 14 files changed, 88 insertions(+), 80 deletions(-) diff --git a/src/mpid/ch3/include/mpidpost.h b/src/mpid/ch3/include/mpidpost.h index 6f76c6aedc1..6b95a0814b8 100644 --- a/src/mpid/ch3/include/mpidpost.h +++ b/src/mpid/ch3/include/mpidpost.h @@ -188,10 +188,11 @@ int MPIDI_PG_ForwardPGInfo( MPIR_Comm *peer_ptr, MPIR_Comm *comm_ptr, int root ); int MPID_Intercomm_exchange_map( MPIR_Comm *local_comm_ptr, int local_leader, MPIR_Comm *peer_comm_ptr, int remote_leader, - int *remote_size, uint64_t **remote_lpids, + int *remote_size, MPIR_Lpid **remote_lpids, int *is_low_group); int MPID_Create_intercomm_from_lpids( MPIR_Comm *newcomm_ptr, - int size, const uint64_t lpids[] ); + int size, const MPIR_Lpid lpids[] ); +int MPID_Comm_get_lpid(MPIR_Comm *comm_ptr, int idx, MPIR_Lpid *lpid_ptr, bool is_remote); #define MPID_INTERCOMM_NO_DYNPROC(comm) (0) diff --git a/src/mpid/ch3/include/mpidpre.h b/src/mpid/ch3/include/mpidpre.h index 595434c3aff..cac9d262c43 100644 --- a/src/mpid/ch3/include/mpidpre.h +++ b/src/mpid/ch3/include/mpidpre.h @@ -829,7 +829,6 @@ int MPID_Progress_poke(void); int MPID_Get_processor_name( char *name, int namelen, int *resultlen); int MPID_Get_universe_size(int * universe_size); -int MPID_Comm_get_lpid(MPIR_Comm *comm_ptr, int idx, uint64_t *lpid_ptr, bool is_remote); #define MPID_Request_create_from_comm(kind, comm) MPIR_Request_create(kind) void 
MPID_Request_create_hook(MPIR_Request *); diff --git a/src/mpid/ch3/src/mpid_vc.c b/src/mpid/ch3/src/mpid_vc.c index 81cb71c91e6..be840afacbf 100644 --- a/src/mpid/ch3/src/mpid_vc.c +++ b/src/mpid/ch3/src/mpid_vc.c @@ -241,7 +241,7 @@ int MPIDI_VCR_Dup(MPIDI_VCR orig_vcr, MPIDI_VCR * new_vcr) /*@ MPID_Comm_get_lpid - Get the local process ID for a given VC reference @*/ -int MPID_Comm_get_lpid(MPIR_Comm *comm_ptr, int idx, uint64_t *lpid_ptr, bool is_remote) +int MPID_Comm_get_lpid(MPIR_Comm *comm_ptr, int idx, MPIR_Lpid *lpid_ptr, bool is_remote) { MPIR_FUNC_ENTER; @@ -383,7 +383,7 @@ static inline int MPIDI_LPID_GetAllInComm(MPIR_Comm *comm_ptr, int local_size, int mpi_errno = MPI_SUCCESS; MPIR_Assert( comm_ptr->local_size == local_size ); for (i=0; ilocal_size; i++) { - uint64_t tmp_lpid; + MPIR_Lpid tmp_lpid; mpi_errno |= MPID_Comm_get_lpid( comm_ptr, i, &tmp_lpid, FALSE ); local_lpids[i] = tmp_lpid; } @@ -461,13 +461,13 @@ static int check_disjoint_lpids(uint64_t lpids1[], int n1, uint64_t lpids2[], in @*/ int MPID_Intercomm_exchange_map(MPIR_Comm *local_comm_ptr, int local_leader, MPIR_Comm *peer_comm_ptr, int remote_leader, - int *remote_size, uint64_t **remote_lpids, + int *remote_size, MPIR_Lpid **remote_lpids, int *is_low_group) { int mpi_errno = MPI_SUCCESS; int singlePG; int local_size; - uint64_t *local_lpids=0; + MPIR_Lpid *local_lpids=0; MPIDI_Gpid *local_gpids=NULL, *remote_gpids=NULL; int comm_info[2]; int cts_tag; @@ -500,9 +500,9 @@ int MPID_Intercomm_exchange_map(MPIR_Comm *local_comm_ptr, int local_leader, /* With this information, we can now send and receive the global process ids from the peer. 
*/ MPIR_CHKLMEM_MALLOC(remote_gpids,MPIDI_Gpid*,(*remote_size)*sizeof(MPIDI_Gpid), mpi_errno,"remote_gpids", MPL_MEM_DYNAMIC); - *remote_lpids = (uint64_t*) MPL_malloc((*remote_size)*sizeof(uint64_t), MPL_MEM_ADDRESS); + *remote_lpids = MPL_malloc((*remote_size)*sizeof(MPIR_Lpid), MPL_MEM_ADDRESS); MPIR_CHKLMEM_MALLOC(local_gpids,MPIDI_Gpid*,local_size*sizeof(MPIDI_Gpid), mpi_errno,"local_gpids", MPL_MEM_DYNAMIC); - MPIR_CHKLMEM_MALLOC(local_lpids,uint64_t*,local_size*sizeof(uint64_t), mpi_errno,"local_lpids", MPL_MEM_DYNAMIC); + MPIR_CHKLMEM_MALLOC(local_lpids,MPIR_Lpid*,local_size*sizeof(MPIR_Lpid), mpi_errno,"local_lpids", MPL_MEM_DYNAMIC); mpi_errno = MPIDI_GPID_GetAllInComm( local_comm_ptr, local_size, local_gpids, &singlePG ); MPIR_ERR_CHECK(mpi_errno); @@ -570,7 +570,7 @@ int MPID_Intercomm_exchange_map(MPIR_Comm *local_comm_ptr, int local_leader, MPIR_ERR_CHECK(mpi_errno); *remote_size = comm_info[0]; MPIR_CHKLMEM_MALLOC(remote_gpids,MPIDI_Gpid*,(*remote_size)*sizeof(MPIDI_Gpid), mpi_errno,"remote_gpids", MPL_MEM_DYNAMIC); - *remote_lpids = (uint64_t*) MPL_malloc((*remote_size)*sizeof(uint64_t), MPL_MEM_ADDRESS); + *remote_lpids = MPL_malloc((*remote_size)*sizeof(MPIR_Lpid), MPL_MEM_ADDRESS); mpi_errno = MPIR_Bcast( remote_gpids, (*remote_size)*sizeof(MPIDI_Gpid), MPI_BYTE, local_leader, local_comm_ptr, MPIR_ERR_NONE ); MPIR_ERR_CHECK(mpi_errno); @@ -621,7 +621,7 @@ int MPID_Intercomm_exchange_map(MPIR_Comm *local_comm_ptr, int local_leader, 'MPI_Comm_connect/MPI_Comm_accept'. Thus, it is only used for intercommunicators. 
@*/ int MPID_Create_intercomm_from_lpids( MPIR_Comm *newcomm_ptr, - int size, const uint64_t lpids[] ) + int size, const MPIR_Lpid lpids[] ) { int mpi_errno = MPI_SUCCESS; MPIR_Comm *commworld_ptr; diff --git a/src/mpid/ch4/ch4_api.txt b/src/mpid/ch4/ch4_api.txt index e9a2e2b7e5d..9165d4b8ed0 100644 --- a/src/mpid/ch4/ch4_api.txt +++ b/src/mpid/ch4/ch4_api.txt @@ -89,10 +89,10 @@ Non Native API: SHM*: rank, comm, handler_id, tag, buf-2, count, datatype, src_vci, dst_vci, rreq get_local_upids : int NM : comm, local_upid_size, local_upids - upids_to_gpids : int - NM : size, remote_upid_size, remote_upids, remote_gpids + upids_to_lpids : int + NM : size, remote_upid_size, remote_upids, remote_lpids dynamic_send : int - NM : remote_gpid, tag, buf, size, timeout + NM : remote_lpid, tag, buf, size, timeout dynamic_recv : int NM : tag, buf-2, size, timeout mpi_comm_commit_pre_hook : int @@ -499,8 +499,8 @@ PARAM: recvcounts: const MPI_Aint * recvtype: MPI_Datatype recvtypes: const MPI_Datatype[] - remote_gpid: uint64_t - remote_gpids: uint64_t * + remote_lpid: MPIR_Lpid + remote_lpids: MPIR_Lpid * remote_upid_size: int * remote_upids: char * req: MPIR_Request * diff --git a/src/mpid/ch4/include/mpidch4.h b/src/mpid/ch4/include/mpidch4.h index 3dd3528efbc..f3f57a722c9 100644 --- a/src/mpid/ch4/include/mpidch4.h +++ b/src/mpid/ch4/include/mpidch4.h @@ -26,7 +26,7 @@ int MPID_Comm_get_all_failed_procs(MPIR_Comm *, MPIR_Group **, int); int MPID_Comm_revoke(MPIR_Comm *, int); int MPID_Comm_failure_ack(MPIR_Comm *); MPL_STATIC_INLINE_PREFIX int MPID_Comm_AS_enabled(MPIR_Comm *) MPL_STATIC_INLINE_SUFFIX; -int MPID_Comm_get_lpid(MPIR_Comm *, int, uint64_t *, bool); +int MPID_Comm_get_lpid(MPIR_Comm *, int, MPIR_Lpid *, bool); int MPID_CS_finalize(void); int MPID_Finalize(void); int MPID_Get_universe_size(int *); @@ -167,8 +167,8 @@ int MPID_Type_commit_hook(MPIR_Datatype *); int MPID_Type_free_hook(MPIR_Datatype *); int MPID_Op_commit_hook(MPIR_Op *); int 
MPID_Op_free_hook(MPIR_Op *); -int MPID_Intercomm_exchange_map(MPIR_Comm *, int, MPIR_Comm *, int, int *, uint64_t **, int *); -int MPID_Create_intercomm_from_lpids(MPIR_Comm *, int, const uint64_t[]); +int MPID_Intercomm_exchange_map(MPIR_Comm *, int, MPIR_Comm *, int, int *, MPIR_Lpid **, int *); +int MPID_Create_intercomm_from_lpids(MPIR_Comm *, int, const MPIR_Lpid[]); int MPID_Comm_commit_pre_hook(MPIR_Comm *); int MPID_Comm_free_hook(MPIR_Comm *); int MPID_Comm_set_hints(MPIR_Comm *, MPIR_Info *); diff --git a/src/mpid/ch4/netmod/ofi/ofi_spawn.c b/src/mpid/ch4/netmod/ofi/ofi_spawn.c index dbe0171f5dd..df0234b3bd1 100644 --- a/src/mpid/ch4/netmod/ofi/ofi_spawn.c +++ b/src/mpid/ch4/netmod/ofi/ofi_spawn.c @@ -7,7 +7,7 @@ #include "ofi_impl.h" #include "ofi_noinline.h" -int MPIDI_OFI_dynamic_send(uint64_t remote_gpid, int tag, const void *buf, int size, int timeout) +int MPIDI_OFI_dynamic_send(MPIR_Lpid remote_lpid, int tag, const void *buf, int size, int timeout) { int mpi_errno = MPI_SUCCESS; @@ -16,8 +16,8 @@ int MPIDI_OFI_dynamic_send(uint64_t remote_gpid, int tag, const void *buf, int s int nic = 0; /* dynamic process only use nic 0 */ int vci = 0; /* dynamic process only use vci 0 */ int ctx_idx = 0; - int avtid = MPIDIU_GPID_GET_AVTID(remote_gpid); - int lpid = MPIDIU_GPID_GET_LPID(remote_gpid); + int avtid = MPIDIU_GPID_GET_AVTID(remote_lpid); + int lpid = MPIDIU_GPID_GET_LPID(remote_lpid); fi_addr_t remote_addr = MPIDI_OFI_av_to_phys(&MPIDIU_get_av(avtid, lpid), nic, vci); MPID_THREAD_CS_ENTER(VCI, MPIDI_VCI(vci).lock); @@ -135,8 +135,8 @@ int MPIDI_OFI_dynamic_recv(int tag, void *buf, int size, int timeout) /* the following functions are "proc" functions, but because they are only used during dynamic * process spawning, having them here provides better context */ -int MPIDI_OFI_upids_to_gpids(int size, int *remote_upid_size, char *remote_upids, - uint64_t * remote_gpids) +int MPIDI_OFI_upids_to_lpids(int size, int *remote_upid_size, char *remote_upids, 
+ MPIR_Lpid * remote_lpids) { int i, mpi_errno = MPI_SUCCESS; int *new_avt_procs; @@ -178,7 +178,7 @@ int MPIDI_OFI_upids_to_gpids(int size, int *remote_upid_size, char *remote_upids MPIDI_OFI_TO_PHYS(k, j, nic), &tbladdr, &sz), 0, avlookup); if (sz == addrname_len && !memcmp(tbladdr, addrname, addrname_len)) { - remote_gpids[i] = MPIDIU_GPID_CREATE(k, j); + remote_lpids[i] = MPIDIU_GPID_CREATE(k, j); found = 1; break; } @@ -217,7 +217,7 @@ int MPIDI_OFI_upids_to_gpids(int size, int *remote_upid_size, char *remote_upids MPIR_ERR_CHECK(mpi_errno); MPIDIU_get_av(avtid, i).node_id = node_id; - remote_gpids[new_avt_procs[i]] = MPIDIU_GPID_CREATE(avtid, i); + remote_lpids[new_avt_procs[i]] = MPIDIU_GPID_CREATE(avtid, i); } } diff --git a/src/mpid/ch4/netmod/ucx/ucx_spawn.c b/src/mpid/ch4/netmod/ucx/ucx_spawn.c index 05e888d5639..e78dc2a0af3 100644 --- a/src/mpid/ch4/netmod/ucx/ucx_spawn.c +++ b/src/mpid/ch4/netmod/ucx/ucx_spawn.c @@ -20,7 +20,7 @@ static void dynamic_recv_cb(void *request, ucs_status_t status, *done = true; } -int MPIDI_UCX_dynamic_send(uint64_t remote_gpid, int tag, const void *buf, int size, int timeout) +int MPIDI_UCX_dynamic_send(MPIR_Lpid remote_lpid, int tag, const void *buf, int size, int timeout) { int mpi_errno = MPI_SUCCESS; @@ -29,8 +29,8 @@ int MPIDI_UCX_dynamic_send(uint64_t remote_gpid, int tag, const void *buf, int s MPID_THREAD_CS_ENTER(VCI, MPIDI_VCI(vci).lock); - int avtid = MPIDIU_GPID_GET_AVTID(remote_gpid); - int lpid = MPIDIU_GPID_GET_LPID(remote_gpid); + int avtid = MPIDIU_GPID_GET_AVTID(remote_lpid); + int lpid = MPIDIU_GPID_GET_LPID(remote_lpid); ucp_ep_h ep = MPIDI_UCX_AV_TO_EP(&MPIDIU_get_av(avtid, lpid), vci, vci); bool done = false; @@ -147,8 +147,8 @@ int MPIDI_UCX_get_local_upids(MPIR_Comm * comm, int **local_upid_size, char **lo goto fn_exit; } -int MPIDI_UCX_upids_to_gpids(int size, int *remote_upid_size, char *remote_upids, - uint64_t * remote_gpids) +int MPIDI_UCX_upids_to_lpids(int size, int *remote_upid_size, char 
*remote_upids, + MPIR_Lpid * remote_lpids) { int mpi_errno = MPI_SUCCESS; @@ -167,7 +167,7 @@ int MPIDI_UCX_upids_to_gpids(int size, int *remote_upid_size, char *remote_upids for (int i = 0; i < size; i++) { MPIDI_upid_hash *t = MPIDIU_upidhash_find(curr_upid, remote_upid_size[i]); if (t) { - remote_gpids[i] = MPIDIU_GPID_CREATE(t->avtid, t->lpid); + remote_lpids[i] = MPIDIU_GPID_CREATE(t->avtid, t->lpid); } else { new_avt_procs[n_new_procs] = i; new_upids[n_new_procs] = curr_upid; @@ -193,7 +193,7 @@ int MPIDI_UCX_upids_to_gpids(int size, int *remote_upid_size, char *remote_upids MPIDI_UCX_CHK_STATUS(ucx_status); MPIDIU_upidhash_add(new_upids[i], remote_upid_size[new_avt_procs[i]], avtid, i); - remote_gpids[new_avt_procs[i]] = MPIDIU_GPID_CREATE(avtid, i); + remote_lpids[new_avt_procs[i]] = MPIDIU_GPID_CREATE(avtid, i); } } diff --git a/src/mpid/ch4/src/ch4_comm.c b/src/mpid/ch4/src/ch4_comm.c index 808d6f6e21b..aa705061b22 100644 --- a/src/mpid/ch4/src/ch4_comm.c +++ b/src/mpid/ch4/src/ch4_comm.c @@ -391,7 +391,7 @@ int MPID_Comm_set_hints(MPIR_Comm * comm_ptr, MPIR_Info * info_ptr) } int MPID_Intercomm_exchange_map(MPIR_Comm * local_comm, int local_leader, MPIR_Comm * peer_comm, - int remote_leader, int *remote_size, uint64_t ** remote_gpids, + int remote_leader, int *remote_size, MPIR_Lpid ** remote_lpids, int *is_low_group) { int mpi_errno = MPI_SUCCESS; @@ -402,7 +402,7 @@ int MPID_Intercomm_exchange_map(MPIR_Comm * local_comm, int local_leader, MPIR_C int cts_tag = 0; int pure_intracomm = 1; int local_size = 0; - uint64_t *local_gpids = NULL; + MPIR_Lpid *local_lpids = NULL; int *local_upid_size = NULL, *remote_upid_size = NULL; int upid_send_size = 0, upid_recv_size = 0; char *local_upids = NULL, *remote_upids = NULL; @@ -462,13 +462,13 @@ int MPID_Intercomm_exchange_map(MPIR_Comm * local_comm, int local_leader, MPIR_C (MPL_DBG_FDEST, "local size = %d, remote size = %d, pure_intracomm = %d", local_size, *remote_size, pure_intracomm)); - 
MPIR_CHKPMEM_MALLOC((*remote_gpids), uint64_t *, (*remote_size) * sizeof(uint64_t), - mpi_errno, "remote_gpids", MPL_MEM_ADDRESS); - MPIR_CHKLMEM_MALLOC(local_gpids, uint64_t *, local_size * sizeof(uint64_t), - mpi_errno, "local_gpids", MPL_MEM_ADDRESS); + MPIR_CHKPMEM_MALLOC((*remote_lpids), MPIR_Lpid *, (*remote_size) * sizeof(MPIR_Lpid), + mpi_errno, "remote_lpids", MPL_MEM_ADDRESS); + MPIR_CHKLMEM_MALLOC(local_lpids, MPIR_Lpid *, local_size * sizeof(MPIR_Lpid), + mpi_errno, "local_lpids", MPL_MEM_ADDRESS); for (i = 0; i < local_size; i++) { MPIDIU_comm_rank_to_pid(local_comm, i, &lpid, &avtid); - local_gpids[i] = MPIDIU_GPID_CREATE(avtid, lpid); + local_lpids[i] = MPIDIU_GPID_CREATE(avtid, lpid); } /* TODO: optimizations -- @@ -506,12 +506,12 @@ int MPID_Intercomm_exchange_map(MPIR_Comm * local_comm, int local_leader, MPIR_C MPIR_ERR_CHECK(mpi_errno); /* Stage 1.2 convert remote UPID to GPID and get GPID for local group */ - MPIDIU_upids_to_gpids(*remote_size, remote_upid_size, remote_upids, *remote_gpids); + MPIDIU_upids_to_lpids(*remote_size, remote_upid_size, remote_upids, *remote_lpids); } else { /* Stage 1.1f only exchange GPIDS if no dynamic process involved */ - mpi_errno = MPIC_Sendrecv(local_gpids, local_size, MPI_UINT64_T, + mpi_errno = MPIC_Sendrecv(local_lpids, local_size, MPI_UINT64_T, remote_leader, cts_tag, - *remote_gpids, *remote_size, MPI_UINT64_T, + *remote_lpids, *remote_size, MPI_UINT64_T, remote_leader, cts_tag, peer_comm, MPI_STATUS_IGNORE, MPIR_ERR_NONE); MPIR_ERR_CHECK(mpi_errno); @@ -536,8 +536,8 @@ int MPID_Intercomm_exchange_map(MPIR_Comm * local_comm, int local_leader, MPIR_C { /* Now that we have both the local and remote processes, * check for any overlap */ - mpi_errno = MPIDI_check_disjoint_gpids(local_gpids, local_size, - *remote_gpids, *remote_size); + mpi_errno = MPIDI_check_disjoint_lpids(local_lpids, local_size, + *remote_lpids, *remote_size); MPIR_ERR_CHECK(mpi_errno); } MPID_END_ERROR_CHECKS; @@ -552,7 +552,7 @@ int 
MPID_Intercomm_exchange_map(MPIR_Comm * local_comm, int local_leader, MPIR_C * local group is always smaller than remote */ if (pure_intracomm) { - *is_low_group = local_gpids[0] < (*remote_gpids)[0]; + *is_low_group = local_lpids[0] < (*remote_lpids)[0]; } else { if (local_upid_size[0] == remote_upid_size[0]) { *is_low_group = memcmp(local_upids, remote_upids, local_upid_size[0]); @@ -568,7 +568,7 @@ int MPID_Intercomm_exchange_map(MPIR_Comm * local_comm, int local_leader, MPIR_C /* At this point, we're done with the local lpids; they'll * be freed with the other local memory on exit */ - local_gpids = NULL; + local_lpids = NULL; } /* @@ -578,7 +578,7 @@ int MPID_Intercomm_exchange_map(MPIR_Comm * local_comm, int local_leader, MPIR_C (MPL_DBG_FDEST, "Intercomm map exchange stage 2: intra-group")); mpi_errno = MPIDIU_Intercomm_map_bcast_intra(local_comm, local_leader, remote_size, is_low_group, pure_intracomm, - remote_upid_size, remote_upids, remote_gpids); + remote_upid_size, remote_upids, remote_lpids); MPIR_ERR_CHECK(mpi_errno); MPIR_CHKPMEM_COMMIT(); @@ -590,14 +590,14 @@ int MPID_Intercomm_exchange_map(MPIR_Comm * local_comm, int local_leader, MPIR_C return mpi_errno; fn_fail: MPIR_CHKPMEM_REAP(); - *remote_gpids = NULL; + *remote_lpids = NULL; goto fn_exit; } int MPIDIU_Intercomm_map_bcast_intra(MPIR_Comm * local_comm, int local_leader, int *remote_size, int *is_low_group, int pure_intracomm, int *remote_upid_size, char *remote_upids, - uint64_t ** remote_gpids) + MPIR_Lpid ** remote_lpids) { int mpi_errno = MPI_SUCCESS; int i; @@ -611,6 +611,14 @@ int MPIDIU_Intercomm_map_bcast_intra(MPIR_Comm * local_comm, int local_leader, i MPIR_FUNC_ENTER; + MPI_Datatype lpid_datatype; + if (sizeof(MPIR_Lpid) == 8) { + lpid_datatype = MPI_UINT64_T; + } else { + MPIR_Assert(sizeof(MPIR_Lpid) == 4); + lpid_datatype = MPI_UINT32_T; + } + if (local_comm->rank == local_leader) { if (!pure_intracomm) { for (i = 0; i < (*remote_size); i++) { @@ -633,7 +641,7 @@ int 
MPIDIU_Intercomm_map_bcast_intra(MPIR_Comm * local_comm, int local_leader, i local_leader, local_comm, MPIR_ERR_NONE); MPIR_ERR_CHECK(mpi_errno); } else { - mpi_errno = MPIR_Bcast_allcomm_auto(*remote_gpids, *remote_size, MPI_UINT64_T, + mpi_errno = MPIR_Bcast_allcomm_auto(*remote_lpids, *remote_size, lpid_datatype, local_leader, local_comm, MPIR_ERR_NONE); } } else { @@ -645,8 +653,8 @@ int MPIDIU_Intercomm_map_bcast_intra(MPIR_Comm * local_comm, int local_leader, i *is_low_group = map_info[2]; pure_intracomm = map_info[3]; - MPIR_CHKPMEM_MALLOC((*remote_gpids), uint64_t *, (*remote_size) * sizeof(uint64_t), - mpi_errno, "remote_gpids", MPL_MEM_COMM); + MPIR_CHKPMEM_MALLOC((*remote_lpids), MPIR_Lpid *, (*remote_size) * sizeof(MPIR_Lpid), + mpi_errno, "remote_lpids", MPL_MEM_COMM); if (!pure_intracomm) { MPIR_CHKLMEM_MALLOC(_remote_upid_size, int *, (*remote_size) * sizeof(int), mpi_errno, "_remote_upid_size", MPL_MEM_COMM); @@ -659,9 +667,9 @@ int MPIDIU_Intercomm_map_bcast_intra(MPIR_Comm * local_comm, int local_leader, i local_leader, local_comm, MPIR_ERR_NONE); MPIR_ERR_CHECK(mpi_errno); - MPIDIU_upids_to_gpids(*remote_size, _remote_upid_size, _remote_upids, *remote_gpids); + MPIDIU_upids_to_lpids(*remote_size, _remote_upid_size, _remote_upids, *remote_lpids); } else { - mpi_errno = MPIR_Bcast_allcomm_auto(*remote_gpids, *remote_size, MPI_UINT64_T, + mpi_errno = MPIR_Bcast_allcomm_auto(*remote_lpids, *remote_size, lpid_datatype, local_leader, local_comm, MPIR_ERR_NONE); } } @@ -673,11 +681,11 @@ int MPIDIU_Intercomm_map_bcast_intra(MPIR_Comm * local_comm, int local_leader, i return mpi_errno; fn_fail: MPIR_CHKPMEM_REAP(); - *remote_gpids = NULL; + *remote_lpids = NULL; goto fn_exit; } -int MPID_Create_intercomm_from_lpids(MPIR_Comm * newcomm_ptr, int size, const uint64_t lpids[]) +int MPID_Create_intercomm_from_lpids(MPIR_Comm * newcomm_ptr, int size, const MPIR_Lpid lpids[]) { int mpi_errno = MPI_SUCCESS, i; MPIR_FUNC_ENTER; diff --git 
a/src/mpid/ch4/src/ch4_impl.h b/src/mpid/ch4/src/ch4_impl.h index 6fa918db043..7726ee90992 100644 --- a/src/mpid/ch4/src/ch4_impl.h +++ b/src/mpid/ch4/src/ch4_impl.h @@ -16,7 +16,7 @@ int MPIDIU_Intercomm_map_bcast_intra(MPIR_Comm * local_comm, int local_leader, int *remote_size, int *is_low_group, int pure_intracomm, int *remote_upid_size, char *remote_upids, - uint64_t ** remote_gpids); + MPIR_Lpid ** remote_lpids); int MPIDIG_get_context_index(uint64_t context_id); uint64_t MPIDIG_generate_win_id(MPIR_Comm * comm_ptr); diff --git a/src/mpid/ch4/src/ch4_proc.c b/src/mpid/ch4/src/ch4_proc.c index 56cb70b48c1..01b182da582 100644 --- a/src/mpid/ch4/src/ch4_proc.c +++ b/src/mpid/ch4/src/ch4_proc.c @@ -249,14 +249,14 @@ void MPIDIU_upidhash_free(void) /* convert upid to gpid by netmod. * For ofi netmod, it inserts the address and fills an av entry. */ -int MPIDIU_upids_to_gpids(int size, int *remote_upid_size, char *remote_upids, - uint64_t * remote_gpids) +int MPIDIU_upids_to_lpids(int size, int *remote_upid_size, char *remote_upids, + MPIR_Lpid * remote_lpids) { int mpi_errno = MPI_SUCCESS; MPIR_FUNC_ENTER; MPID_THREAD_CS_ENTER(VCI, MPIDIU_THREAD_DYNPROC_MUTEX); - mpi_errno = MPIDI_NM_upids_to_gpids(size, remote_upid_size, remote_upids, remote_gpids); + mpi_errno = MPIDI_NM_upids_to_lpids(size, remote_upid_size, remote_upids, remote_lpids); MPIR_ERR_CHECK(mpi_errno); fn_exit: diff --git a/src/mpid/ch4/src/ch4_proc.h b/src/mpid/ch4/src/ch4_proc.h index 749afce18f5..0bb9bbbbda8 100644 --- a/src/mpid/ch4/src/ch4_proc.h +++ b/src/mpid/ch4/src/ch4_proc.h @@ -33,8 +33,8 @@ void MPIDIU_upidhash_add(const void *upid, int upid_len, int avtid, int lpid); MPIDI_upid_hash *MPIDIU_upidhash_find(const void *upid, int upid_len); void MPIDIU_upidhash_free(void); #endif -int MPIDIU_upids_to_gpids(int size, int *remote_upid_size, char *remote_upids, - uint64_t * remote_gpids); +int MPIDIU_upids_to_lpids(int size, int *remote_upid_size, char *remote_upids, + MPIR_Lpid * remote_lpids); 
int MPIDIU_alloc_lut(MPIDI_rank_map_lut_t ** lut, int size); int MPIDIU_release_lut(MPIDI_rank_map_lut_t * lut); int MPIDIU_alloc_mlut(MPIDI_rank_map_mlut_t ** mlut, int size); diff --git a/src/mpid/ch4/src/ch4_spawn.c b/src/mpid/ch4/src/ch4_spawn.c index 6b59d171620..10241261336 100644 --- a/src/mpid/ch4/src/ch4_spawn.c +++ b/src/mpid/ch4/src/ch4_spawn.c @@ -290,7 +290,7 @@ static int peer_intercomm_create(char *remote_addrname, int len, int tag, { int mpi_errno = MPI_SUCCESS; int context_id, recvcontext_id; - uint64_t remote_gpid; + MPIR_Lpid remote_lpid; mpi_errno = MPIR_Get_contextid_sparse(MPIR_Process.comm_self, &recvcontext_id, FALSE); MPIR_ERR_CHECK(mpi_errno); @@ -299,8 +299,8 @@ static int peer_intercomm_create(char *remote_addrname, int len, int tag, if (is_sender) { /* insert remote address */ int addrname_len = len; - uint64_t *remote_gpids = &remote_gpid; - mpi_errno = MPIDIU_upids_to_gpids(1, &addrname_len, remote_addrname, remote_gpids); + MPIR_Lpid *remote_lpids = &remote_lpid; + mpi_errno = MPIDIU_upids_to_lpids(1, &addrname_len, remote_addrname, remote_lpids); MPIR_ERR_CHECK(mpi_errno); /* fill hdr with context_id and addrname */ @@ -317,7 +317,7 @@ static int peer_intercomm_create(char *remote_addrname, int len, int tag, /* send remote context_id + addrname */ int hdr_sz = sizeof(hdr) - MPIDI_DYNPROC_NAME_MAX + hdr.addrname_len; - mpi_errno = MPIDI_NM_dynamic_send(remote_gpid, tag, &hdr, hdr_sz, timeout); + mpi_errno = MPIDI_NM_dynamic_send(remote_lpid, tag, &hdr, hdr_sz, timeout); MPL_free(addrname); MPL_free(addrname_size); MPIR_ERR_CHECK(mpi_errno); @@ -333,19 +333,19 @@ static int peer_intercomm_create(char *remote_addrname, int len, int tag, /* insert remote address */ int addrname_len = hdr.addrname_len; - uint64_t *remote_gpids = &remote_gpid; - mpi_errno = MPIDIU_upids_to_gpids(1, &addrname_len, hdr.addrname, remote_gpids); + MPIR_Lpid *remote_lpids = &remote_lpid; + mpi_errno = MPIDIU_upids_to_lpids(1, &addrname_len, hdr.addrname, 
remote_lpids); MPIR_ERR_CHECK(mpi_errno); /* send remote context_id */ hdr.context_id = recvcontext_id; - mpi_errno = MPIDI_NM_dynamic_send(remote_gpid, tag, &hdr, sizeof(hdr.context_id), timeout); + mpi_errno = MPIDI_NM_dynamic_send(remote_lpid, tag, &hdr, sizeof(hdr.context_id), timeout); MPIR_ERR_CHECK(mpi_errno); } /* create peer intercomm */ mpi_errno = MPIR_peer_intercomm_create(context_id, recvcontext_id, - remote_gpid, is_sender, newcomm); + remote_lpid, is_sender, newcomm); MPIR_ERR_CHECK(mpi_errno); fn_exit: diff --git a/src/mpid/ch4/src/ch4i_comm.c b/src/mpid/ch4/src/ch4i_comm.c index 7a8b5a97d9f..d8a3fe3e9f6 100644 --- a/src/mpid/ch4/src/ch4i_comm.c +++ b/src/mpid/ch4/src/ch4i_comm.c @@ -928,7 +928,7 @@ static uint64_t shrink(uint64_t x, int num_low_bits) return ((x >> 32) << num_low_bits) + (x & 0xffffffff); } -int MPIDI_check_disjoint_gpids(uint64_t gpids1[], int n1, uint64_t gpids2[], int n2) +int MPIDI_check_disjoint_lpids(MPIR_Lpid lpids1[], int n1, MPIR_Lpid lpids2[], int n2) { int mpi_errno = MPI_SUCCESS; uint32_t gpidmaskPrealloc[128]; @@ -944,12 +944,12 @@ int MPIDI_check_disjoint_gpids(uint64_t gpids1[], int n1, uint64_t gpids2[], int /* Find the max low-32-bit gpid */ uint64_t max_lpid = 0; for (int i = 0; i < n1; i++) { - uint64_t n = gpids1[i] & 0xffffffff; + uint64_t n = lpids1[i] & 0xffffffff; if (n > max_lpid) max_lpid = n; } for (int i = 0; i < n2; i++) { - uint64_t n = gpids2[i] & 0xffffffff; + uint64_t n = lpids2[i] & 0xffffffff; if (n > max_lpid) max_lpid = n; } @@ -958,12 +958,12 @@ int MPIDI_check_disjoint_gpids(uint64_t gpids1[], int n1, uint64_t gpids2[], int uint64_t max_gpid = 0; for (int i = 0; i < n1; i++) { - uint64_t n = shrink(gpids1[i], num_low_bits); + uint64_t n = shrink(lpids1[i], num_low_bits); if (n > max_gpid) max_gpid = n; } for (int i = 0; i < n2; i++) { - uint64_t n = shrink(gpids2[i], num_low_bits); + uint64_t n = shrink(lpids2[i], num_low_bits); if (n > max_gpid) max_gpid = n; } @@ -981,7 +981,7 @@ int 
MPIDI_check_disjoint_gpids(uint64_t gpids1[], int n1, uint64_t gpids2[], int /* Set the bits for the first array */ for (int i = 0; i < n1; i++) { - uint64_t n = shrink(gpids1[i], num_low_bits); + uint64_t n = shrink(lpids1[i], num_low_bits); int idx = n / 32; int bit = n % 32; gpidmask[idx] = gpidmask[idx] | (1 << bit); @@ -990,12 +990,12 @@ int MPIDI_check_disjoint_gpids(uint64_t gpids1[], int n1, uint64_t gpids2[], int /* Look for any duplicates in the second array */ for (int i = 0; i < n2; i++) { - uint64_t n = shrink(gpids2[i], num_low_bits); + uint64_t n = shrink(lpids2[i], num_low_bits); int idx = n / 32; int bit = n % 32; if (gpidmask[idx] & (1 << bit)) { MPIR_ERR_SET1(mpi_errno, MPI_ERR_COMM, - "**dupprocesses", "**dupprocesses %d", gpids2[i]); + "**dupprocesses", "**dupprocesses %d", (int) lpids2[i]); goto fn_fail; } /* Add a check on duplicates *within* group 2 */ diff --git a/src/mpid/ch4/src/ch4i_comm.h b/src/mpid/ch4/src/ch4i_comm.h index 823d945ded4..40032de5698 100644 --- a/src/mpid/ch4/src/ch4i_comm.h +++ b/src/mpid/ch4/src/ch4i_comm.h @@ -9,6 +9,6 @@ #include "ch4_types.h" int MPIDI_comm_create_rank_map(MPIR_Comm * comm); -int MPIDI_check_disjoint_gpids(uint64_t gpids1[], int n1, uint64_t gpids2[], int n2); +int MPIDI_check_disjoint_lpids(MPIR_Lpid lpids1[], int n1, MPIR_Lpid lpids2[], int n2); #endif /* CH4I_COMM_H_INCLUDED */ From b18ac2743bb139d52c031095999bcc9a04afbfd0 Mon Sep 17 00:00:00 2001 From: Hui Zhou Date: Wed, 11 Dec 2024 14:39:45 -0600 Subject: [PATCH 12/19] group: add MPIR_Worlds We need a device-independent way of identifying processes. One way is to use the combination of (world_idx, world_rank). Thus, we need to maintain a list of worlds so that the world_idx points to the world record. This may not fit in the concept of MPI group, but since the group needs a way of identifying processes, it seems most closely related. The first world, world_idx 0, is always initialized at init.
Due to session re-init, we need to make sure to reset num_worlds to 0 at finalize. New worlds will be added upon spawning or connecting dynamic processes (to-be-implemented). --- src/include/mpir_group.h | 26 ++++++++++++++++++++++++++ src/mpi/group/grouputil.c | 30 ++++++++++++++++++++++++++++++ src/mpi/init/mpir_init.c | 1 + src/util/mpir_pmi.c | 3 +++ 4 files changed, 60 insertions(+) diff --git a/src/include/mpir_group.h b/src/include/mpir_group.h index 1148a8e8006..5f8c619f1e0 100644 --- a/src/include/mpir_group.h +++ b/src/include/mpir_group.h @@ -43,6 +43,31 @@ S*/ +/* Worlds - + * We need a device-independent way of identifying processes. Assuming the concept of + * "worlds", we can describe a process with (world_idx, world_rank). + * + * The world_idx is a local id because each process may not see all worlds. Thus, + * each process only can maintain a list of worlds as it encounters them. Thus, + * a process id derived from (world_idx, world_rank) is referred as LPID, or + * "local process id". + * + * Each process should maintain a table of worlds with sufficient information so + * processes can match worlds upon connection or making address exchange. + */ + +#define MPIR_NAMESPACE_MAX 128 +struct MPIR_World { + char namespace[MPIR_NAMESPACE_MAX]; + /* other useful fields */ + int num_procs; +}; + +extern struct MPIR_World MPIR_Worlds[]; + +int MPIR_add_world(const char *namespace, int num_procs); +int MPIR_find_world(const char *namespace); + /* Abstract the integer type for lpid (process id). It is possible to use 32-bit * in principle, but 64-bit is simpler since we can trivially combine * (world_idx, world_rank).
@@ -115,5 +140,6 @@ int MPIR_Group_lpid_to_rank(MPIR_Group * group, MPIR_Lpid lpid); int MPIR_Group_check_subset(MPIR_Group * group_ptr, MPIR_Comm * comm_ptr); void MPIR_Group_set_session_ptr(MPIR_Group * group_ptr, MPIR_Session * session_out); int MPIR_Group_init(void); +void MPIR_Group_finalize(void); #endif /* MPIR_GROUP_H_INCLUDED */ diff --git a/src/mpi/group/grouputil.c b/src/mpi/group/grouputil.c index 59c45561eca..9186cdaf5e5 100644 --- a/src/mpi/group/grouputil.c +++ b/src/mpi/group/grouputil.c @@ -6,6 +6,32 @@ #include "mpiimpl.h" #include "group.h" +/* Global world list. + * world_idx, part of MPIR_Lpid, points to this array */ +#define MPIR_MAX_WORLDS 1024 +static int num_worlds = 0; +struct MPIR_World MPIR_Worlds[MPIR_MAX_WORLDS]; + +int MPIR_add_world(const char *namespace, int num_procs) +{ + int world_idx = num_worlds++; + + MPL_strncpy(MPIR_Worlds[world_idx].namespace, namespace, MPIR_NAMESPACE_MAX); + MPIR_Worlds[world_idx].num_procs = num_procs; + + return world_idx; +} + +int MPIR_find_world(const char *namespace) +{ + for (int i = 0; i < num_worlds; i++) { + if (strncmp(MPIR_Worlds[i].namespace, namespace, MPIR_NAMESPACE_MAX) == 0) { + return i; + } + } + return -1; +} + /* Preallocated group objects */ MPIR_Group MPIR_Group_builtin[MPIR_GROUP_N_BUILTIN]; MPIR_Group MPIR_Group_direct[MPIR_GROUP_PREALLOC]; @@ -34,6 +60,10 @@ int MPIR_Group_init(void) return mpi_errno; } +void MPIR_Group_finalize(void) +{ + num_worlds = 0; +} int MPIR_Group_release(MPIR_Group * group_ptr) { diff --git a/src/mpi/init/mpir_init.c b/src/mpi/init/mpir_init.c index 2f1c115aa13..6e04cabd400 100644 --- a/src/mpi/init/mpir_init.c +++ b/src/mpi/init/mpir_init.c @@ -484,6 +484,7 @@ int MPII_Finalize(MPIR_Session * session_ptr) MPII_thread_mutex_destroy(); MPIR_Typerep_finalize(); + MPIR_Group_finalize(); MPL_atomic_store_int(&MPIR_Process.mpich_state, MPICH_MPI_STATE__UNINITIALIZED); fn_exit: diff --git a/src/util/mpir_pmi.c b/src/util/mpir_pmi.c index 
9aff4e38dfa..d2b9eae8e5d 100644 --- a/src/util/mpir_pmi.c +++ b/src/util/mpir_pmi.c @@ -168,6 +168,9 @@ int MPIR_pmi_init(void) pmi_connected = true; } + int world_idx = MPIR_add_world(pmi_kvs_name, size); + MPIR_Assertp(world_idx == 0); + MPIR_Process.has_parent = has_parent; MPIR_Process.rank = rank; MPIR_Process.size = size; From c28b290e7bb5626e75473037adaf8fdc5d6234bb Mon Sep 17 00:00:00 2001 From: Hui Zhou Date: Wed, 11 Dec 2024 16:22:31 -0600 Subject: [PATCH 13/19] group: add builtin MPIR_GROUP_{WORLD,SELF} Add builtin MPIR_GROUP_WORLD and MPIR_GROUP_SELF, so we can create builtin communicators from builtin groups. --- src/include/mpir_group.h | 7 +++++++ src/include/mpir_objects.h | 2 +- src/mpi/group/group_impl.c | 9 ++++----- src/mpi/group/grouputil.c | 28 +++++++++++++++++++++++++++- 4 files changed, 39 insertions(+), 7 deletions(-) diff --git a/src/include/mpir_group.h b/src/include/mpir_group.h index 5f8c619f1e0..73fa1274425 100644 --- a/src/include/mpir_group.h +++ b/src/include/mpir_group.h @@ -43,6 +43,13 @@ S*/ +/* In addition to MPI_GROUP_EMPTY, internally we have a few more builtins */ +#define MPIR_GROUP_WORLD ((MPI_Group)0x48000001) +#define MPIR_GROUP_SELF ((MPI_Group)0x48000002) + +#define MPIR_GROUP_WORLD_PTR (MPIR_Group_builtin + 1) +#define MPIR_GROUP_SELF_PTR (MPIR_Group_builtin + 2) + /* Worlds - * We need a device-independent way of identifying processes. Assuming the concept of * "worlds", we can describe a process with (world_idx, world_rank). 
diff --git a/src/include/mpir_objects.h b/src/include/mpir_objects.h index 89e7aea8d35..2f2ffeb6dae 100644 --- a/src/include/mpir_objects.h +++ b/src/include/mpir_objects.h @@ -210,7 +210,7 @@ const char *MPIR_Handle_get_kind_str(int kind); #define MPIR_COMM_PREALLOC 8 #endif -#define MPIR_GROUP_N_BUILTIN 1 +#define MPIR_GROUP_N_BUILTIN 3 #ifdef MPID_GROUP_PREALLOC #define MPIR_GROUP_PREALLOC MPID_GROUP_PREALLOC #else diff --git a/src/mpi/group/group_impl.c b/src/mpi/group/group_impl.c index fa123a70efc..e10a2a486d1 100644 --- a/src/mpi/group/group_impl.c +++ b/src/mpi/group/group_impl.c @@ -438,12 +438,11 @@ int MPIR_Group_from_session_pset_impl(MPIR_Session * session_ptr, const char *ps int mpi_errno = MPI_SUCCESS; if (MPL_stricmp(pset_name, "mpi://WORLD") == 0) { - mpi_errno = MPIR_Group_create_stride(MPIR_Process.size, MPIR_Process.rank, session_ptr, - 0, 1, 1, new_group_ptr); - MPIR_ERR_CHECK(mpi_errno); + *new_group_ptr = MPIR_GROUP_WORLD_PTR; + MPIR_Group_add_ref(*new_group_ptr); } else if (MPL_stricmp(pset_name, "mpi://SELF") == 0) { - mpi_errno = MPIR_Group_create_stride(1, 0, session_ptr, 0, 1, 1, new_group_ptr); - MPIR_ERR_CHECK(mpi_errno); + *new_group_ptr = MPIR_GROUP_SELF_PTR; + MPIR_Group_add_ref(*new_group_ptr); } else { /* TODO: Implement pset struct, locate pset struct ptr */ MPIR_ERR_SETANDSTMT(mpi_errno, MPI_ERR_ARG, goto fn_fail, "**psetinvalidname"); diff --git a/src/mpi/group/grouputil.c b/src/mpi/group/grouputil.c index 9186cdaf5e5..2fdbd209289 100644 --- a/src/mpi/group/grouputil.c +++ b/src/mpi/group/grouputil.c @@ -48,7 +48,9 @@ int MPIR_Group_init(void) { int mpi_errno = MPI_SUCCESS; - MPIR_Assert(MPIR_GROUP_N_BUILTIN == 1); /* update this func if this ever triggers */ + MPIR_Assert(MPIR_GROUP_N_BUILTIN == 3); /* update this func if this ever triggers */ + + struct MPIR_Pmap *pmap; MPIR_Group_builtin[0].handle = MPI_GROUP_EMPTY; MPIR_Object_set_ref(&MPIR_Group_builtin[0], 1); @@ -57,6 +59,30 @@ int MPIR_Group_init(void) 
MPIR_Group_builtin[0].session_ptr = NULL; memset(&MPIR_Group_builtin[0].pmap, 0, sizeof(struct MPIR_Pmap)); + MPIR_Group_builtin[1].handle = MPIR_GROUP_WORLD; + MPIR_Object_set_ref(&MPIR_Group_builtin[1], 1); + MPIR_Group_builtin[1].size = MPIR_Process.size; + MPIR_Group_builtin[1].rank = MPIR_Process.rank; + MPIR_Group_builtin[1].session_ptr = NULL; + pmap = &MPIR_Group_builtin[1].pmap; + pmap->size = MPIR_Process.size; + pmap->use_map = false; + pmap->u.stride.offset = 0; + pmap->u.stride.stride = 1; + pmap->u.stride.blocksize = 1; + + MPIR_Group_builtin[2].handle = MPIR_GROUP_SELF; + MPIR_Object_set_ref(&MPIR_Group_builtin[2], 1); + MPIR_Group_builtin[2].size = 1; + MPIR_Group_builtin[2].rank = 0; + MPIR_Group_builtin[2].session_ptr = NULL; + pmap = &MPIR_Group_builtin[2].pmap; + pmap->size = 1; + pmap->use_map = false; + pmap->u.stride.offset = MPIR_Process.rank; + pmap->u.stride.stride = 1; + pmap->u.stride.blocksize = 1; + return mpi_errno; } From 21b288e07d156406dcd9e951e0e1b39c2090b83a Mon Sep 17 00:00:00 2001 From: Hui Zhou Date: Fri, 13 Dec 2024 07:43:05 -0600 Subject: [PATCH 14/19] group: add MPIR_Group_dup Internally the only reason to duplicate a group is to copy from NULL session to a new session. Otherwise, we can just use the same group and increment the reference count. 
--- src/include/mpir_group.h | 1 + src/mpi/group/group_impl.c | 8 ++++---- src/mpi/group/grouputil.c | 35 +++++++++++++++++++++++++++++++++++ 3 files changed, 40 insertions(+), 4 deletions(-) diff --git a/src/include/mpir_group.h b/src/include/mpir_group.h index 73fa1274425..da43528e168 100644 --- a/src/include/mpir_group.h +++ b/src/include/mpir_group.h @@ -136,6 +136,7 @@ int MPIR_Group_check_valid_ranges(MPIR_Group *, int[][3], int); int MPIR_Group_create(int, MPIR_Group **); int MPIR_Group_release(MPIR_Group * group_ptr); +int MPIR_Group_dup(MPIR_Group * old_group, MPIR_Session * session_ptr, MPIR_Group ** new_group_ptr); int MPIR_Group_create_map(int size, int rank, MPIR_Session * session_ptr, MPIR_Lpid * map, MPIR_Group ** new_group_ptr); int MPIR_Group_create_stride(int size, int rank, MPIR_Session * session_ptr, diff --git a/src/mpi/group/group_impl.c b/src/mpi/group/group_impl.c index e10a2a486d1..8e09e216554 100644 --- a/src/mpi/group/group_impl.c +++ b/src/mpi/group/group_impl.c @@ -438,11 +438,11 @@ int MPIR_Group_from_session_pset_impl(MPIR_Session * session_ptr, const char *ps int mpi_errno = MPI_SUCCESS; if (MPL_stricmp(pset_name, "mpi://WORLD") == 0) { - *new_group_ptr = MPIR_GROUP_WORLD_PTR; - MPIR_Group_add_ref(*new_group_ptr); + mpi_errno = MPIR_Group_dup(MPIR_GROUP_WORLD_PTR, session_ptr, new_group_ptr); + MPIR_ERR_CHECK(mpi_errno); } else if (MPL_stricmp(pset_name, "mpi://SELF") == 0) { - *new_group_ptr = MPIR_GROUP_SELF_PTR; - MPIR_Group_add_ref(*new_group_ptr); + mpi_errno = MPIR_Group_dup(MPIR_GROUP_SELF_PTR, session_ptr, new_group_ptr); + MPIR_ERR_CHECK(mpi_errno); } else { /* TODO: Implement pset struct, locate pset struct ptr */ MPIR_ERR_SETANDSTMT(mpi_errno, MPI_ERR_ARG, goto fn_fail, "**psetinvalidname"); diff --git a/src/mpi/group/grouputil.c b/src/mpi/group/grouputil.c index 2fdbd209289..939e321a56c 100644 --- a/src/mpi/group/grouputil.c +++ b/src/mpi/group/grouputil.c @@ -141,6 +141,41 @@ int MPIR_Group_create(int nproc, MPIR_Group 
** new_group_ptr) return mpi_errno; } +/* Internally the only reason to duplicate a group is to copy from NULL session to a new session. + * Otherwise, we can just use the same group and increment the reference count. + */ +int MPIR_Group_dup(MPIR_Group * old_group, MPIR_Session * session_ptr, MPIR_Group ** new_group_ptr) +{ + int mpi_errno = MPI_SUCCESS; + MPIR_Group *new_group; + + new_group = (MPIR_Group *) MPIR_Handle_obj_alloc(&MPIR_Group_mem); + if (!new_group) { + mpi_errno = MPIR_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE, "MPIR_Group_dup", + __LINE__, MPI_ERR_OTHER, "**nomem", 0); + goto fn_fail; + } + MPIR_Object_set_ref(new_group, 1); + + /* initialize fields */ + new_group->size = old_group->size; + new_group->rank = old_group->rank; + MPIR_Group_set_session_ptr(new_group, session_ptr); + memcpy(&new_group->pmap, &old_group->pmap, sizeof(struct MPIR_Pmap)); + if (old_group->pmap.use_map) { + new_group->pmap.u.map = MPL_malloc(old_group->size * sizeof(MPIR_Lpid), MPL_MEM_GROUP); + MPIR_ERR_CHKANDJUMP(!new_group->pmap.u.map, mpi_errno, MPI_ERR_OTHER, "**nomem"); + memcpy(new_group->pmap.u.map, old_group->pmap.u.map, old_group->size * sizeof(MPIR_Lpid)); + } + + *new_group_ptr = new_group; + + fn_exit: + return mpi_errno; + fn_fail: + goto fn_exit; +} + int MPIR_Group_create_map(int size, int rank, MPIR_Session * session_ptr, MPIR_Lpid * map, MPIR_Group ** new_group_ptr) { From b24bbcb8b20fafbfdcc20aa70258e40dd85ab2bd Mon Sep 17 00:00:00 2001 From: Hui Zhou Date: Thu, 12 Dec 2024 13:45:00 -0600 Subject: [PATCH 15/19] binding/group: remove error check in MPI_Group_free Since builting groups can be returned to users, they should be allowed to free. They are reference counted anyway. 
--- src/binding/c/group_api.txt | 12 ------------ 1 file changed, 12 deletions(-) diff --git a/src/binding/c/group_api.txt b/src/binding/c/group_api.txt index dd2074024d0..532389d61ac 100644 --- a/src/binding/c/group_api.txt +++ b/src/binding/c/group_api.txt @@ -37,18 +37,6 @@ MPI_Group_excl: MPI_Group_free: .desc: Frees a group -{ -- error_check -- - /* Cannot free the predefined groups, but allow GROUP_EMPTY - * because otherwise many tests fail */ - if ((HANDLE_IS_BUILTIN(*group)) && *group != MPI_GROUP_EMPTY) { - mpi_errno = MPIR_Err_create_code(MPI_SUCCESS, - MPIR_ERR_RECOVERABLE, __func__, __LINE__, - MPI_ERR_GROUP, "**groupperm", 0); - } - if (mpi_errno) { - goto fn_fail; - } -} MPI_Group_incl: .desc: Produces a group by reordering an existing group and taking only listed members From 6082a682cb29cae052fbfbf677a80e9e9a425610 Mon Sep 17 00:00:00 2001 From: Hui Zhou Date: Wed, 11 Dec 2024 16:45:22 -0600 Subject: [PATCH 16/19] comm: always set local_group and remote_group To make MPI group a first-class citizen, we will always have group before creating communicators, so that when the device layer activates communicators, e.g. in MPID_Comm_commit_pre_hook, it can rely on the group to look up the involved processes. It also removes the necessity to maintain any other process addressing schemes.
--- src/include/mpir_comm.h | 6 +-- src/mpi/comm/builtin_comms.c | 9 ++++ src/mpi/comm/comm_impl.c | 93 +++++++++++++++++++++++++++++++----- src/mpi/comm/comm_split.c | 13 +++++ src/mpi/comm/commutil.c | 34 +++++++++++++ src/mpid/ch3/src/ch3u_port.c | 24 +++++++++- src/mpid/ch4/src/ch4_comm.c | 4 ++ src/mpid/ch4/src/init_comm.c | 15 +++++- 8 files changed, 179 insertions(+), 19 deletions(-) diff --git a/src/include/mpir_comm.h b/src/include/mpir_comm.h index 8af43abc6d7..af50031ebaf 100644 --- a/src/include/mpir_comm.h +++ b/src/include/mpir_comm.h @@ -166,9 +166,9 @@ struct MPIR_Comm { int rank; /* Value of MPI_Comm_rank */ MPIR_Attribute *attributes; /* List of attributes */ int local_size; /* Value of MPI_Comm_size for local group */ - MPIR_Group *local_group, /* Groups in communicator. */ - *remote_group; /* The local and remote groups are the - * same for intra communicators */ + MPIR_Group *local_group; /* Groups in communicator. */ + MPIR_Group *remote_group; /* The remote group in a inter communicator. + * Must be NULL in a intra communicator. 
*/ MPIR_Comm_kind_t comm_kind; /* MPIR_COMM_KIND__INTRACOMM or MPIR_COMM_KIND__INTERCOMM */ char name[MPI_MAX_OBJECT_NAME]; /* Required for MPI-2 */ MPIR_Errhandler *errhandler; /* Pointer to the error handler structure */ diff --git a/src/mpi/comm/builtin_comms.c b/src/mpi/comm/builtin_comms.c index 16a75588036..7e0273a677f 100644 --- a/src/mpi/comm/builtin_comms.c +++ b/src/mpi/comm/builtin_comms.c @@ -30,6 +30,9 @@ int MPIR_init_comm_world(void) MPIR_Process.comm_world->remote_size = MPIR_Process.size; MPIR_Process.comm_world->local_size = MPIR_Process.size; + MPIR_Process.comm_world->local_group = MPIR_GROUP_WORLD_PTR; + MPIR_Group_add_ref(MPIR_GROUP_WORLD_PTR); + mpi_errno = MPIR_Comm_commit(MPIR_Process.comm_world); MPIR_ERR_CHECK(mpi_errno); @@ -59,6 +62,9 @@ int MPIR_init_comm_self(void) MPIR_Process.comm_self->remote_size = 1; MPIR_Process.comm_self->local_size = 1; + MPIR_Process.comm_self->local_group = MPIR_GROUP_SELF_PTR; + MPIR_Group_add_ref(MPIR_GROUP_SELF_PTR); + mpi_errno = MPIR_Comm_commit(MPIR_Process.comm_self); MPIR_ERR_CHECK(mpi_errno); @@ -91,6 +97,9 @@ int MPIR_init_icomm_world(void) MPIR_Process.icomm_world->remote_size = MPIR_Process.size; MPIR_Process.icomm_world->local_size = MPIR_Process.size; + MPIR_Process.icomm_world->local_group = MPIR_GROUP_WORLD_PTR; + MPIR_Group_add_ref(MPIR_GROUP_WORLD_PTR); + mpi_errno = MPIR_Comm_commit(MPIR_Process.icomm_world); MPIR_ERR_CHECK(mpi_errno); diff --git a/src/mpi/comm/comm_impl.c b/src/mpi/comm/comm_impl.c index 56db002f58c..746b2825b6a 100644 --- a/src/mpi/comm/comm_impl.c +++ b/src/mpi/comm/comm_impl.c @@ -337,8 +337,7 @@ int MPIR_Comm_create_intra(MPIR_Comm * comm_ptr, MPIR_Group * group_ptr, MPIR_Co (*newcomm_ptr)->local_group = group_ptr; MPIR_Group_add_ref(group_ptr); - (*newcomm_ptr)->remote_group = group_ptr; - MPIR_Group_add_ref(group_ptr); + (*newcomm_ptr)->remote_group = NULL; (*newcomm_ptr)->context_id = (*newcomm_ptr)->recvcontext_id; (*newcomm_ptr)->remote_size = 
(*newcomm_ptr)->local_size = n; @@ -382,15 +381,12 @@ int MPIR_Comm_create_inter(MPIR_Comm * comm_ptr, MPIR_Group * group_ptr, MPIR_Co int mpi_errno = MPI_SUCCESS; int new_context_id; int *mapping = NULL; - int *remote_mapping = NULL; - MPIR_Comm *mapping_comm = NULL; - int remote_size = -1; - int rinfo[2]; MPIR_CHKLMEM_DECL(1); MPIR_FUNC_ENTER; MPIR_Assert(comm_ptr->comm_kind == MPIR_COMM_KIND__INTERCOMM); + MPIR_Session *session_ptr = comm_ptr->session_ptr; /* Create a new communicator from the specified group members */ @@ -409,6 +405,7 @@ int MPIR_Comm_create_inter(MPIR_Comm * comm_ptr, MPIR_Group * group_ptr, MPIR_Co MPIR_Assert(new_context_id != 0); MPIR_Assert(new_context_id != comm_ptr->recvcontext_id); + MPIR_Comm *mapping_comm; mpi_errno = MPII_Comm_create_calculate_mapping(group_ptr, comm_ptr, &mapping, &mapping_comm); MPIR_ERR_CHECK(mpi_errno); @@ -434,7 +431,7 @@ int MPIR_Comm_create_inter(MPIR_Comm * comm_ptr, MPIR_Group * group_ptr, MPIR_Co (*newcomm_ptr)->is_low_group = comm_ptr->is_low_group; - MPIR_Comm_set_session_ptr(*newcomm_ptr, comm_ptr->session_ptr); + MPIR_Comm_set_session_ptr(*newcomm_ptr, session_ptr); } /* There is an additional step. We must communicate the @@ -445,6 +442,11 @@ int MPIR_Comm_create_inter(MPIR_Comm * comm_ptr, MPIR_Group * group_ptr, MPIR_Co * in the remote group, from which the remote network address * mapping can be constructed. 
We need to use the "collective" * context in the original intercommunicator */ + + int remote_size = -1; + int *remote_mapping; /* a list of remote ranks */ + int rinfo[2]; + if (comm_ptr->rank == 0) { int info[2]; info[0] = new_context_id; @@ -494,6 +496,7 @@ int MPIR_Comm_create_inter(MPIR_Comm * comm_ptr, MPIR_Group * group_ptr, MPIR_Co MPIR_Assert(remote_size >= 0); + if (group_ptr->rank != MPI_UNDEFINED) { (*newcomm_ptr)->remote_size = remote_size; /* Now, everyone has the remote_mapping, and can apply that to @@ -505,6 +508,23 @@ int MPIR_Comm_create_inter(MPIR_Comm * comm_ptr, MPIR_Group * group_ptr, MPIR_Co mapping, remote_mapping, mapping_comm, *newcomm_ptr); MPIR_ERR_CHECK(mpi_errno); + /* create remote_group. + * FIXME: we can directly exchange group maps once we get rid of comm mappers */ + MPIR_Group *remote_group; + + MPIR_Lpid *remote_map; + remote_map = MPL_malloc(remote_size * sizeof(MPIR_Lpid), MPL_MEM_GROUP); + MPIR_ERR_CHKANDJUMP(!remote_map, mpi_errno, MPI_ERR_OTHER, "**nomem"); + + MPIR_Group *mapping_group = mapping_comm->remote_group; + MPIR_Assert(mapping_group); + for (int i = 0; i < remote_size; i++) { + remote_map[i] = MPIR_Group_rank_to_lpid(mapping_group, remote_mapping[i]); + } + mpi_errno = MPIR_Group_create_map(remote_size, MPI_UNDEFINED, session_ptr, remote_map, + &remote_group); + (*newcomm_ptr)->remote_group = remote_group; + (*newcomm_ptr)->tainted = comm_ptr->tainted; mpi_errno = MPIR_Comm_commit(*newcomm_ptr); MPIR_ERR_CHECK(mpi_errno); @@ -605,8 +625,7 @@ int MPIR_Comm_create_group_impl(MPIR_Comm * comm_ptr, MPIR_Group * group_ptr, in (*newcomm_ptr)->local_group = group_ptr; MPIR_Group_add_ref(group_ptr); - (*newcomm_ptr)->remote_group = group_ptr; - MPIR_Group_add_ref(group_ptr); + (*newcomm_ptr)->remote_group = NULL; (*newcomm_ptr)->context_id = (*newcomm_ptr)->recvcontext_id; (*newcomm_ptr)->remote_size = (*newcomm_ptr)->local_size = n; @@ -913,6 +932,9 @@ int MPIR_Comm_remote_group_impl(MPIR_Comm * comm_ptr, MPIR_Group ** 
group_ptr) int mpi_errno = MPI_SUCCESS; MPIR_FUNC_ENTER; + /* FIXME: remove the following remote_group creation once this assertion passes */ + MPIR_Assert(comm_ptr->comm_kind == MPIR_COMM_KIND__INTERCOMM && comm_ptr->remote_group); + /* Create a group and populate it with the local process ids */ if (!comm_ptr->remote_group) { int n = comm_ptr->remote_size; @@ -965,6 +987,7 @@ int MPIR_Intercomm_create_impl(MPIR_Comm * local_comm_ptr, int local_leader, uint64_t *remote_lpids = NULL; int comm_info[3]; int is_low_group = 0; + MPIR_Session *session_ptr = local_comm_ptr->session_ptr; MPIR_FUNC_ENTER; @@ -1042,7 +1065,14 @@ int MPIR_Intercomm_create_impl(MPIR_Comm * local_comm_ptr, int local_leader, (*new_intercomm_ptr)->local_comm = 0; (*new_intercomm_ptr)->is_low_group = is_low_group; - MPIR_Comm_set_session_ptr(*new_intercomm_ptr, local_comm_ptr->session_ptr); + (*new_intercomm_ptr)->local_group = local_comm_ptr->local_group; + MPIR_Group_add_ref(local_comm_ptr->local_group); + + /* construct remote_group */ + mpi_errno = MPIR_Group_create_map(remote_size, MPI_UNDEFINED, session_ptr, remote_lpids, + &(*new_intercomm_ptr)->remote_group); + + MPIR_Comm_set_session_ptr(*new_intercomm_ptr, session_ptr); mpi_errno = MPID_Create_intercomm_from_lpids(*new_intercomm_ptr, remote_size, remote_lpids); if (mpi_errno) @@ -1064,8 +1094,6 @@ int MPIR_Intercomm_create_impl(MPIR_Comm * local_comm_ptr, int local_leader, fn_exit: - MPL_free(remote_lpids); - remote_lpids = NULL; MPIR_FUNC_EXIT; return mpi_errno; fn_fail: @@ -1106,6 +1134,15 @@ int MPIR_peer_intercomm_create(int context_id, int recvcontext_id, } MPID_THREAD_CS_EXIT(VCI, comm_self->mutex); + MPIR_Session *session_ptr = NULL; /* Can we just use NULL session since peer_intercomm is always temporary? 
*/ + MPIR_Lpid my_lpid = MPIR_Group_rank_to_lpid(comm_self->local_group, 0); + mpi_errno = MPIR_Group_create_stride(1, 0, session_ptr, my_lpid, 1, 1, + &(*newcomm)->local_group); + MPIR_ERR_CHECK(mpi_errno); + mpi_errno = MPIR_Group_create_stride(1, 0, session_ptr, remote_lpid, 1, 1, + &(*newcomm)->remote_group); + MPIR_ERR_CHECK(mpi_errno); + (*newcomm)->tainted = 1; mpi_errno = MPIR_Comm_commit(*newcomm); MPIR_ERR_CHECK(mpi_errno); @@ -1222,6 +1259,37 @@ int MPIR_Intercomm_merge_impl(MPIR_Comm * comm_ptr, int high, MPIR_Comm ** new_i MPIR_Comm_set_session_ptr(*new_intracomm_ptr, comm_ptr->session_ptr); + /* construct local_group */ + MPIR_Group *new_local_group; + + MPIR_Lpid *map; + map = MPL_malloc(new_size * sizeof(MPIR_Lpid), MPL_MEM_GROUP); + MPIR_ERR_CHKANDJUMP(!map, mpi_errno, MPI_ERR_OTHER, "**nomem"); + + int myrank; + MPIR_Group *group1, *group2; + if (local_high) { + group1 = comm_ptr->remote_group; + group2 = comm_ptr->local_group; + myrank = group1->size + group2->rank; + } else { + group1 = comm_ptr->local_group; + group2 = comm_ptr->remote_group; + myrank = group1->rank; + } + for (int i = 0; i < group1->size; i++) { + map[i] = MPIR_Group_rank_to_lpid(group1, i); + } + for (int i = 0; i < group2->size; i++) { + map[group1->size + i] = MPIR_Group_rank_to_lpid(group2, i); + } + + mpi_errno = MPIR_Group_create_map(new_size, myrank, comm_ptr->session_ptr, map, + &new_local_group); + + (*new_intracomm_ptr)->local_group = new_local_group; + MPIR_Group_add_ref(new_local_group); + /* Now we know which group comes first. 
Build the new mapping * from the existing comm */ mpi_errno = create_and_map(comm_ptr, local_high, (*new_intracomm_ptr)); @@ -1260,6 +1328,7 @@ int MPIR_Intercomm_merge_impl(MPIR_Comm * comm_ptr, int high, MPIR_Comm ** new_i (*new_intracomm_ptr)->recvcontext_id = new_context_id; MPIR_Comm_set_session_ptr(*new_intracomm_ptr, comm_ptr->session_ptr); + (*new_intracomm_ptr)->local_group = new_local_group; mpi_errno = create_and_map(comm_ptr, local_high, (*new_intracomm_ptr)); MPIR_ERR_CHECK(mpi_errno); diff --git a/src/mpi/comm/comm_split.c b/src/mpi/comm/comm_split.c index 7c5519278e4..4c0e5a826c2 100644 --- a/src/mpi/comm/comm_split.c +++ b/src/mpi/comm/comm_split.c @@ -292,6 +292,10 @@ int MPIR_Comm_split_impl(MPIR_Comm * comm_ptr, int color, int key, MPIR_Comm ** (*newcomm_ptr)->rank = i; } + mpi_errno = MPIR_Group_incl_impl(comm_ptr->local_group, new_size, mapper->src_mapping, + &(*newcomm_ptr)->local_group); + MPIR_ERR_CHECK(mpi_errno); + /* For the remote group, the situation is more complicated. * We need to find the size of our "partner" group in the * remote comm. 
The easiest way (in terms of code) is for @@ -313,6 +317,11 @@ int MPIR_Comm_split_impl(MPIR_Comm * comm_ptr, int color, int key, MPIR_Comm ** for (i = 0; i < new_remote_size; i++) mapper->src_mapping[i] = remotekeytable[i].color; + mpi_errno = MPIR_Group_incl_impl(comm_ptr->remote_group, + new_remote_size, mapper->src_mapping, + &(*newcomm_ptr)->remote_group); + MPIR_ERR_CHECK(mpi_errno); + (*newcomm_ptr)->context_id = remote_context_id; (*newcomm_ptr)->remote_size = new_remote_size; (*newcomm_ptr)->local_comm = 0; @@ -331,6 +340,10 @@ int MPIR_Comm_split_impl(MPIR_Comm * comm_ptr, int color, int key, MPIR_Comm ** if (keytable[i].color == comm_ptr->rank) (*newcomm_ptr)->rank = i; } + + mpi_errno = MPIR_Group_incl_impl(comm_ptr->local_group, new_size, mapper->src_mapping, + &(*newcomm_ptr)->local_group); + MPIR_ERR_CHECK(mpi_errno); } /* Inherit the error handler (if any) */ diff --git a/src/mpi/comm/commutil.c b/src/mpi/comm/commutil.c index 9a51e8565ee..da824bff420 100644 --- a/src/mpi/comm/commutil.c +++ b/src/mpi/comm/commutil.c @@ -382,6 +382,10 @@ int MPII_Setup_intercomm_localcomm(MPIR_Comm * intercomm_ptr) mpi_errno = MPII_Comm_init(localcomm_ptr); MPIR_ERR_CHECK(mpi_errno); + MPIR_Assert(intercomm_ptr->local_group); + localcomm_ptr->local_group = intercomm_ptr->local_group; + MPIR_Group_add_ref(intercomm_ptr->local_group); + MPIR_Comm_set_session_ptr(localcomm_ptr, intercomm_ptr->session_ptr); /* use the parent intercomm's recv ctx as the basis for our ctx */ @@ -687,6 +691,14 @@ int MPIR_Comm_create_subcomms(MPIR_Comm * comm) /* Copy relevant hints to node_comm */ propagate_hints_to_subcomm(comm, comm->node_comm); + /* construct local_group */ + MPIR_Group *parent_group = comm->local_group; + MPIR_Assert(parent_group); + mpi_errno = MPIR_Group_incl_impl(parent_group, num_local, local_procs, + &comm->node_comm->local_group); + MPIR_ERR_CHECK(mpi_errno); + + /* mapper */ MPIR_Comm_map_irregular(comm->node_comm, comm, local_procs, num_local, 
MPIR_COMM_MAP_DIR__L2L, NULL); mpi_errno = MPIR_Comm_commit_internal(comm->node_comm); @@ -714,6 +726,14 @@ int MPIR_Comm_create_subcomms(MPIR_Comm * comm) /* Copy relevant hints to node_roots_comm */ propagate_hints_to_subcomm(comm, comm->node_roots_comm); + /* construct local_group */ + MPIR_Group *parent_group = comm->local_group; + MPIR_Assert(parent_group); + mpi_errno = MPIR_Group_incl_impl(parent_group, num_external, external_procs, + &comm->node_roots_comm->local_group); + MPIR_ERR_CHECK(mpi_errno); + + /* mapper */ MPIR_Comm_map_irregular(comm->node_roots_comm, comm, external_procs, num_external, MPIR_COMM_MAP_DIR__L2L, NULL); mpi_errno = MPIR_Comm_commit_internal(comm->node_roots_comm); @@ -961,6 +981,13 @@ int MPII_Comm_copy(MPIR_Comm * comm_ptr, int size, MPIR_Info * info, MPIR_Comm * newcomm_ptr->comm_kind = comm_ptr->comm_kind; newcomm_ptr->local_comm = 0; + newcomm_ptr->local_group = comm_ptr->local_group; + MPIR_Group_add_ref(comm_ptr->local_group); + if (comm_ptr->comm_kind == MPIR_COMM_KIND__INTERCOMM) { + newcomm_ptr->remote_group = comm_ptr->remote_group; + MPIR_Group_add_ref(comm_ptr->remote_group); + } + MPIR_Comm_set_session_ptr(newcomm_ptr, comm_ptr->session_ptr); /* There are two cases here - size is the same as the old communicator, @@ -1059,6 +1086,13 @@ int MPII_Comm_copy_data(MPIR_Comm * comm_ptr, MPIR_Info * info, MPIR_Comm ** out newcomm_ptr->comm_kind = comm_ptr->comm_kind; newcomm_ptr->local_comm = 0; + newcomm_ptr->local_group = comm_ptr->local_group; + MPIR_Group_add_ref(comm_ptr->local_group); + if (comm_ptr->comm_kind == MPIR_COMM_KIND__INTERCOMM) { + newcomm_ptr->remote_group = comm_ptr->remote_group; + MPIR_Group_add_ref(comm_ptr->remote_group); + } + if (comm_ptr->comm_kind == MPIR_COMM_KIND__INTRACOMM) MPIR_Comm_map_dup(newcomm_ptr, comm_ptr, MPIR_COMM_MAP_DIR__L2L); else diff --git a/src/mpid/ch3/src/ch3u_port.c b/src/mpid/ch3/src/ch3u_port.c index bd6c8bebfeb..39249e73035 100644 --- a/src/mpid/ch3/src/ch3u_port.c +++ 
b/src/mpid/ch3/src/ch3u_port.c @@ -544,6 +544,13 @@ static int MPIDI_CH3I_Initialize_tmp_comm(MPIR_Comm **comm_pptr, MPIR_Coll_comm_init(tmp_comm); + MPIR_Lpid local_lpid = tmp_comm->dev.local_vcrt->vcr_table[0]->lpid; + MPIR_Lpid remote_lpid = tmp_comm->dev.vcrt->vcr_table[0]->lpid; + mpi_errno = MPIR_Group_create_stride(1, 0, commself_ptr->session_ptr, local_lpid, 1, 1, + &tmp_comm->local_group); + mpi_errno = MPIR_Group_create_stride(1, 0, commself_ptr->session_ptr, remote_lpid, 1, 1, + &tmp_comm->remote_group); + /* Even though this is a tmp comm and we don't call MPI_Comm_commit, we still need to call the creation hook because the destruction hook will be called in comm_release */ @@ -1337,8 +1344,6 @@ static int SetupNewIntercomm( MPIR_Comm *comm_ptr, int remote_comm_size, intercomm->remote_size = remote_comm_size; intercomm->local_size = comm_ptr->local_size; intercomm->rank = comm_ptr->rank; - intercomm->local_group = NULL; - intercomm->remote_group = NULL; intercomm->comm_kind = MPIR_COMM_KIND__INTERCOMM; intercomm->local_comm = NULL; @@ -1356,6 +1361,21 @@ static int SetupNewIntercomm( MPIR_Comm *comm_ptr, int remote_comm_size, remote_translation[i].pg_rank, &intercomm->dev.vcrt->vcr_table[i]); } + intercomm->local_group = comm_ptr->local_group; + MPIR_Group_add_ref(comm_ptr->local_group); + + MPIR_Lpid *remote_map; + remote_map = MPL_malloc(remote_comm_size * sizeof(MPIR_Lpid), MPL_MEM_GROUP); + MPIR_ERR_CHKANDJUMP(!remote_map, mpi_errno, MPI_ERR_OTHER, "**nomem"); + for (i=0; i < intercomm->remote_size; i++) { + MPIDI_PG_t *pg = remote_pg[remote_translation[i].pg_index]; + int rank = remote_translation[i].pg_rank; + remote_map[i] = pg->vct[rank].lpid; + } + mpi_errno = MPIR_Group_create_map(remote_comm_size, MPI_UNDEFINED, comm_ptr->session_ptr, + remote_map, &intercomm->remote_group); + MPIR_ERR_CHECK(mpi_errno); + mpi_errno = MPIR_Comm_commit(intercomm); MPIR_ERR_CHECK(mpi_errno); diff --git a/src/mpid/ch4/src/ch4_comm.c b/src/mpid/ch4/src/ch4_comm.c 
index aa705061b22..2ca6c693cfa 100644 --- a/src/mpid/ch4/src/ch4_comm.c +++ b/src/mpid/ch4/src/ch4_comm.c @@ -795,6 +795,10 @@ int MPIDI_Comm_create_multi_leaders(MPIR_Comm * comm) MPIR_Comm_map_irregular(MPIDI_COMM(comm, multi_leads_comm), comm, external_procs, num_external, MPIR_COMM_MAP_DIR__L2L, NULL); + mpi_errno = MPIR_Group_incl_impl(comm->local_group, num_external, external_procs, + &MPIDI_COMM(comm, multi_leads_comm)->local_group); + MPIR_ERR_CHECK(mpi_errno); + /* Notify device of communicator creation */ mpi_errno = MPID_Comm_commit_pre_hook(MPIDI_COMM(comm, multi_leads_comm)); if (mpi_errno) diff --git a/src/mpid/ch4/src/init_comm.c b/src/mpid/ch4/src/init_comm.c index e546337bd6f..249d7700324 100644 --- a/src/mpid/ch4/src/init_comm.c +++ b/src/mpid/ch4/src/init_comm.c @@ -33,6 +33,17 @@ int MPIDI_create_init_comm(MPIR_Comm ** comm) init_comm->remote_size = node_roots_comm_size; init_comm->local_size = node_roots_comm_size; init_comm->coll.pof2 = MPL_pof2(node_roots_comm_size); + + MPIR_Lpid *map; + map = MPL_malloc(node_roots_comm_size * sizeof(MPIR_Lpid), MPL_MEM_GROUP); + MPIR_ERR_CHKANDJUMP(!map, mpi_errno, MPI_ERR_OTHER, "**nomem"); + for (i = 0; i < node_roots_comm_size; ++i) { + map[i] = MPIR_Process.node_root_map[i]; + } + mpi_errno = MPIR_Group_create_map(node_roots_comm_size, node_roots_comm_rank, NULL, + map, &init_comm->local_group); + MPIR_ERR_CHECK(mpi_errno); + MPIDI_COMM(init_comm, map).mode = MPIDI_RANK_MAP_LUT_INTRA; mpi_errno = MPIDIU_alloc_lut(&lut, node_roots_comm_size); MPIR_ERR_CHECK(mpi_errno); @@ -47,8 +58,8 @@ int MPIDI_create_init_comm(MPIR_Comm ** comm) mpi_errno = MPIDIG_init_comm(init_comm); MPIR_ERR_CHECK(mpi_errno); /* hacky, consider a separate MPIDI_{NM,SHM}_init_comm_hook - * to initialize the init_comm, e.g. to eliminate potential - * runtime features for stability during init */ + * to initialize the init_comm, e.g. 
to eliminate potential + * runtime features for stability during init */ mpi_errno = MPIDI_NM_mpi_comm_commit_pre_hook(init_comm); MPIR_ERR_CHECK(mpi_errno); From 138bad2fb56e3d96fe7fe6e0ef58a69948baaed4 Mon Sep 17 00:00:00 2001 From: Hui Zhou Date: Thu, 12 Dec 2024 22:53:50 -0600 Subject: [PATCH 17/19] group: avoid freeing MPIR_Group_empty Many places we just return MPIR_Group_empty without increment the ref_count. This is fixable. But for now, let's avoid freeing it. --- src/mpi/group/grouputil.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/src/mpi/group/grouputil.c b/src/mpi/group/grouputil.c index 939e321a56c..b61a85d1b15 100644 --- a/src/mpi/group/grouputil.c +++ b/src/mpi/group/grouputil.c @@ -94,10 +94,16 @@ void MPIR_Group_finalize(void) int MPIR_Group_release(MPIR_Group * group_ptr) { int mpi_errno = MPI_SUCCESS; - int inuse; + /* MPIR_Group_empty was not properly reference counted - FIXME */ + if (group_ptr == MPIR_Group_empty) { + goto fn_exit; + } + + int inuse; MPIR_Group_release_ref(group_ptr, &inuse); if (!inuse) { + MPIR_Assert(!HANDLE_IS_BUILTIN(group_ptr->handle)); /* Only if refcount is 0 do we actually free. */ if (group_ptr->pmap.use_map) { MPL_free(group_ptr->pmap.u.map); @@ -108,6 +114,8 @@ int MPIR_Group_release(MPIR_Group * group_ptr) } MPIR_Handle_obj_free(&MPIR_Group_mem, group_ptr); } + + fn_exit: return mpi_errno; } From 6d281b9c4410e156c6052af8fc3e47a16fc658a6 Mon Sep 17 00:00:00 2001 From: Hui Zhou Date: Thu, 12 Dec 2024 23:39:02 -0600 Subject: [PATCH 18/19] ch4: release init_comm->local_group The init_comm does the release manually. 
--- src/mpid/ch4/src/init_comm.c | 1 + 1 file changed, 1 insertion(+) diff --git a/src/mpid/ch4/src/init_comm.c b/src/mpid/ch4/src/init_comm.c index 249d7700324..17915496417 100644 --- a/src/mpid/ch4/src/init_comm.c +++ b/src/mpid/ch4/src/init_comm.c @@ -78,6 +78,7 @@ void MPIDI_destroy_init_comm(MPIR_Comm ** comm_ptr) if (*comm_ptr != NULL) { comm = *comm_ptr; MPIDIU_release_lut(MPIDI_COMM(comm, map).irreg.lut.t); + MPIR_Group_release(comm->local_group); MPIDIG_destroy_comm(comm); MPIR_Object_release_ref(comm, &in_use); MPIR_Assertp(in_use == 0); From 5a53a625ee681fd32b40c46f5ef9e826a6d5edeb Mon Sep 17 00:00:00 2001 From: Hui Zhou Date: Thu, 12 Dec 2024 11:06:53 -0600 Subject: [PATCH 19/19] ch4: assert group before communicator commit Add assertions to make sure the local_group and remote_group (for inter communicators) are always set before MPID_Comm_commit_pre_hook. --- src/mpid/ch4/src/ch4_comm.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/mpid/ch4/src/ch4_comm.c b/src/mpid/ch4/src/ch4_comm.c index 2ca6c693cfa..8429acb8290 100644 --- a/src/mpid/ch4/src/ch4_comm.c +++ b/src/mpid/ch4/src/ch4_comm.c @@ -140,6 +140,9 @@ int MPID_Comm_commit_pre_hook(MPIR_Comm * comm) int mpi_errno; MPIR_FUNC_ENTER; + MPIR_Assert(comm->local_group); + MPIR_Assert(comm->comm_kind == MPIR_COMM_KIND__INTRACOMM || comm->remote_group); + if (comm == MPIR_Process.comm_world) { MPIDI_COMM(comm, map).mode = MPIDI_RANK_MAP_DIRECT_INTRA; MPIDI_COMM(comm, map).avtid = 0;