diff --git a/dummy b/dummy new file mode 100644 index 00000000000..e8183f05f5d --- /dev/null +++ b/dummy @@ -0,0 +1,3 @@ +1 +1 +1 diff --git a/maint/local_python/binding_c.py b/maint/local_python/binding_c.py index f219ee4e194..2654a58cc75 100644 --- a/maint/local_python/binding_c.py +++ b/maint/local_python/binding_c.py @@ -1106,7 +1106,7 @@ def out_can_be_undefined(p): G.out.append("int ret = " + static_call + ";") for l in post_filters: G.out.append(l) - if re.match(r'MPI_(Init|Init_thread|Session_init)$', func_name, re.IGNORECASE): + if re.match(r'MPI_(Init|Init_thread|Session_init|T_init_thread)$', func_name, re.IGNORECASE): G.out.append("ABI_init_builtins();") G.out.append("return ret;") G.out.append("DEDENT") diff --git a/src/binding/abi/mpi_abi_util.h b/src/binding/abi/mpi_abi_util.h index 5793b044c87..07be78989d8 100644 --- a/src/binding/abi/mpi_abi_util.h +++ b/src/binding/abi/mpi_abi_util.h @@ -137,6 +137,7 @@ static inline ABI_Datatype ABI_Datatype_from_mpi(MPI_Datatype in) return (ABI_Datatype) ((intptr_t) ABI_DATATYPE_NULL + i); } } + MPIR_Assert(0); } MPIR_Datatype *ptr; MPIR_Datatype_get_ptr(in, ptr); diff --git a/src/binding/c/group_api.txt b/src/binding/c/group_api.txt index dd2074024d0..532389d61ac 100644 --- a/src/binding/c/group_api.txt +++ b/src/binding/c/group_api.txt @@ -37,18 +37,6 @@ MPI_Group_excl: MPI_Group_free: .desc: Frees a group -{ -- error_check -- - /* Cannot free the predefined groups, but allow GROUP_EMPTY - * because otherwise many tests fail */ - if ((HANDLE_IS_BUILTIN(*group)) && *group != MPI_GROUP_EMPTY) { - mpi_errno = MPIR_Err_create_code(MPI_SUCCESS, - MPIR_ERR_RECOVERABLE, __func__, __LINE__, - MPI_ERR_GROUP, "**groupperm", 0); - } - if (mpi_errno) { - goto fn_fail; - } -} MPI_Group_incl: .desc: Produces a group by reordering an existing group and taking only listed members diff --git a/src/include/mpiimpl.h b/src/include/mpiimpl.h index d665d3e2eff..c118802410d 100644 --- a/src/include/mpiimpl.h +++ 
b/src/include/mpiimpl.h @@ -169,6 +169,7 @@ typedef struct MPIR_Stream MPIR_Stream; #include "mpir_errhandler.h" #include "mpir_attr_generic.h" #include "mpir_contextid.h" +#include "mpir_lpid.h" #include "mpir_status.h" #include "mpir_debugger.h" #include "mpir_op.h" diff --git a/src/include/mpir_comm.h b/src/include/mpir_comm.h index 8af43abc6d7..2820359d26b 100644 --- a/src/include/mpir_comm.h +++ b/src/include/mpir_comm.h @@ -31,44 +31,6 @@ typedef enum MPIR_Comm_hierarchy_kind_t { MPIR_COMM_HIERARCHY_KIND__MULTI_LEADS = 4, /* is the multi_leaders_comm for a node */ } MPIR_Comm_hierarchy_kind_t; -typedef enum { - MPIR_COMM_MAP_TYPE__DUP, - MPIR_COMM_MAP_TYPE__IRREGULAR -} MPIR_Comm_map_type_t; - -/* direction of mapping: local to local, local to remote, remote to - * local, remote to remote */ -typedef enum { - MPIR_COMM_MAP_DIR__L2L, - MPIR_COMM_MAP_DIR__L2R, - MPIR_COMM_MAP_DIR__R2L, - MPIR_COMM_MAP_DIR__R2R -} MPIR_Comm_map_dir_t; - -typedef struct MPIR_Comm_map { - MPIR_Comm_map_type_t type; - - struct MPIR_Comm *src_comm; - - /* mapping direction for intercomms, which contain local and - * remote groups */ - MPIR_Comm_map_dir_t dir; - - /* only valid for irregular map type */ - int src_mapping_size; - int *src_mapping; - int free_mapping; /* we allocated the mapping */ - - struct MPIR_Comm_map *next; -} MPIR_Comm_map_t; - -int MPIR_Comm_map_irregular(struct MPIR_Comm *newcomm, struct MPIR_Comm *src_comm, - int *src_mapping, int src_mapping_size, - MPIR_Comm_map_dir_t dir, MPIR_Comm_map_t ** map); -int MPIR_Comm_map_dup(struct MPIR_Comm *newcomm, struct MPIR_Comm *src_comm, - MPIR_Comm_map_dir_t dir); -int MPIR_Comm_map_free(struct MPIR_Comm *comm); - /* Communicator info hint */ #define MPIR_COMM_HINT_TYPE_BOOL 0 #define MPIR_COMM_HINT_TYPE_INT 1 @@ -166,9 +128,9 @@ struct MPIR_Comm { int rank; /* Value of MPI_Comm_rank */ MPIR_Attribute *attributes; /* List of attributes */ int local_size; /* Value of MPI_Comm_size for local group */ - MPIR_Group 
*local_group, /* Groups in communicator. */ - *remote_group; /* The local and remote groups are the - * same for intra communicators */ + MPIR_Group *local_group; /* Groups in communicator. */ + MPIR_Group *remote_group; /* The remote group in a inter communicator. + * Must be NULL in a intra communicator. */ MPIR_Comm_kind_t comm_kind; /* MPIR_COMM_KIND__INTRACOMM or MPIR_COMM_KIND__INTERCOMM */ char name[MPI_MAX_OBJECT_NAME]; /* Required for MPI-2 */ MPIR_Errhandler *errhandler; /* Pointer to the error handler structure */ @@ -254,12 +216,6 @@ struct MPIR_Comm { hcoll_comm_priv_t hcoll_priv; #endif /* HAVE_HCOLL */ - /* the mapper is temporarily filled out in order to allow the - * device to setup its network addresses. it will be freed after - * the device has initialized the comm. */ - MPIR_Comm_map_t *mapper_head; - MPIR_Comm_map_t *mapper_tail; - enum { MPIR_STREAM_COMM_NONE, MPIR_STREAM_COMM_SINGLE, MPIR_STREAM_COMM_MULTIPLEX } stream_comm_type; union { @@ -296,6 +252,15 @@ void MPIR_stream_comm_free(MPIR_Comm * comm_ptr); int MPIR_Comm_copy_stream(MPIR_Comm * oldcomm, MPIR_Comm * newcomm); int MPIR_get_local_gpu_stream(MPIR_Comm * comm_ptr, MPL_gpu_stream_t * gpu_stream); +MPL_STATIC_INLINE_PREFIX MPIR_Lpid MPIR_comm_rank_to_lpid(MPIR_Comm * comm_ptr, int rank) +{ + if (comm_ptr->comm_kind == MPIR_COMM_KIND__INTRACOMM) { + return MPIR_Group_rank_to_lpid(comm_ptr->local_group, rank); + } else { + return MPIR_Group_rank_to_lpid(comm_ptr->remote_group, rank); + } +} + MPL_STATIC_INLINE_PREFIX MPIR_Stream *MPIR_stream_comm_get_local_stream(MPIR_Comm * comm_ptr) { if (comm_ptr->stream_comm_type == MPIR_STREAM_COMM_SINGLE) { @@ -377,10 +342,6 @@ int MPIR_Comm_commit(MPIR_Comm *); int MPIR_Comm_is_parent_comm(MPIR_Comm *); -/* peer intercomm is an internal 1-to-1 intercomm used for connecting dynamic processes */ -int MPIR_peer_intercomm_create(int context_id, int recvcontext_id, - uint64_t remote_lpid, int is_low_group, MPIR_Comm ** newcomm); - #define 
MPIR_Comm_rank(comm_ptr) ((comm_ptr)->rank) #define MPIR_Comm_size(comm_ptr) ((comm_ptr)->local_size) @@ -420,6 +381,10 @@ int MPIR_Comm_split_type(MPIR_Comm * comm_ptr, int split_type, int key, MPIR_Inf int MPIR_Comm_split_type_neighborhood(MPIR_Comm * comm_ptr, int split_type, int key, MPIR_Info * info_ptr, MPIR_Comm ** newcomm_ptr); +int MPIR_Intercomm_create_timeout(MPIR_Comm * local_comm_ptr, int local_leader, + MPIR_Comm * peer_comm_ptr, int remote_leader, + int tag, int timeout, MPIR_Comm ** new_intercomm_ptr); + /* Preallocated comm objects. There are 3: comm_world, comm_self, and a private (non-user accessible) dup of comm world that is provided if needed in MPI_Finalize. Having a separate version of comm_world @@ -449,16 +414,6 @@ int MPII_Comm_copy_data(MPIR_Comm * comm_ptr, MPIR_Info * info, MPIR_Comm ** out int MPII_Setup_intercomm_localcomm(MPIR_Comm *); -/* comm_create helper functions, used by both comm_create and comm_create_group */ -int MPII_Comm_create_calculate_mapping(MPIR_Group * group_ptr, - MPIR_Comm * comm_ptr, - int **mapping_out, MPIR_Comm ** mapping_comm); - -int MPII_Comm_create_map(int local_n, - int remote_n, - int *local_mapping, - int *remote_mapping, MPIR_Comm * mapping_comm, MPIR_Comm * newcomm); - int MPII_Comm_set_hints(MPIR_Comm * comm_ptr, MPIR_Info * info, bool in_comm_create); int MPII_Comm_get_hints(MPIR_Comm * comm_ptr, MPIR_Info * info); int MPII_Comm_check_hints(MPIR_Comm * comm_ptr); diff --git a/src/include/mpir_group.h b/src/include/mpir_group.h index c40f22fe877..441a542f0ce 100644 --- a/src/include/mpir_group.h +++ b/src/include/mpir_group.h @@ -11,18 +11,7 @@ * only because they are required for the group operations (e.g., * MPI_Group_intersection) and for the scalable RMA synchronization *---------------------------------------------------------------------------*/ -/* This structure is used to implement the group operations such as - MPI_Group_translate_ranks */ -/* note: next_lpid (with idx_of_first_lpid in 
MPIR_Group) gives a linked list - * in a sorted lpid ascending order */ -typedef struct MPII_Group_pmap_t { - uint64_t lpid; /* local process id, from VCONN */ - int next_lpid; /* Index of next lpid (in lpid order) */ -} MPII_Group_pmap_t; - -/* Any changes in the MPIR_Group structure must be made to the - predefined value in MPIR_Group_builtin for MPI_GROUP_EMPTY in - src/mpi/group/grouputil.c */ + /*S MPIR_Group - Description of the Group data structure @@ -53,22 +42,34 @@ typedef struct MPII_Group_pmap_t { Group-DS S*/ + +/* In addition to MPI_GROUP_EMPTY, internally we have a few more builtins */ +#define MPIR_GROUP_WORLD ((MPI_Group)0x48000001) +#define MPIR_GROUP_SELF ((MPI_Group)0x48000002) + +#define MPIR_GROUP_WORLD_PTR (MPIR_Group_builtin + 1) +#define MPIR_GROUP_SELF_PTR (MPIR_Group_builtin + 2) + +struct MPIR_Pmap { + bool use_map; + union { + MPIR_Lpid *map; + struct { + MPIR_Lpid offset; + MPIR_Lpid stride; + } stride; + } u; +}; + struct MPIR_Group { MPIR_OBJECT_HEADER; /* adds handle and ref_count fields */ int size; /* Size of a group */ - int rank; /* rank of this process relative to this - * group */ - int idx_of_first_lpid; - MPII_Group_pmap_t *lrank_to_lpid; /* Array mapping a local rank to local - * process number */ - int is_local_dense_monotonic; /* see NOTE-G1 */ - - /* We may want some additional data for the RMA syncrhonization calls */ - /* Other, device-specific information */ + int rank; /* rank of this process relative to this group */ + struct MPIR_Pmap pmap; + MPIR_Session *session_ptr; /* Pointer to session to which this group belongs */ #ifdef MPID_DEV_GROUP_DECL MPID_DEV_GROUP_DECL #endif - MPIR_Session * session_ptr; /* Pointer to session to which this group belongs */ }; /* NOTE-G1: is_local_dense_monotonic will be true iff the group meets the @@ -97,18 +98,34 @@ extern MPIR_Group *const MPIR_Group_empty; #define MPIR_Group_release_ref(_group, _inuse) \ do { MPIR_Object_release_ref(_group, _inuse); } while (0) -void 
MPII_Group_setup_lpid_list(MPIR_Group *); int MPIR_Group_check_valid_ranks(MPIR_Group *, const int[], int); int MPIR_Group_check_valid_ranges(MPIR_Group *, int[][3], int); -void MPIR_Group_setup_lpid_pairs(MPIR_Group *, MPIR_Group *); int MPIR_Group_create(int, MPIR_Group **); int MPIR_Group_release(MPIR_Group * group_ptr); +int MPIR_Group_dup(MPIR_Group * old_group, MPIR_Session * session_ptr, MPIR_Group ** new_group_ptr); +int MPIR_Group_create_map(int size, int rank, MPIR_Session * session_ptr, MPIR_Lpid * map, + MPIR_Group ** new_group_ptr); +int MPIR_Group_create_stride(int size, int rank, MPIR_Session * session_ptr, + MPIR_Lpid offset, MPIR_Lpid stride, MPIR_Group ** new_group_ptr); +int MPIR_Group_lpid_to_rank(MPIR_Group * group, MPIR_Lpid lpid); + int MPIR_Group_check_subset(MPIR_Group * group_ptr, MPIR_Comm * comm_ptr); void MPIR_Group_set_session_ptr(MPIR_Group * group_ptr, MPIR_Session * session_out); int MPIR_Group_init(void); - -/* internal functions */ -void MPII_Group_setup_lpid_list(MPIR_Group *); +int MPIR_Group_finalize(void); + +MPL_STATIC_INLINE_PREFIX MPIR_Lpid MPIR_Group_rank_to_lpid(MPIR_Group * group, int rank) +{ + if (rank < 0 || rank >= group->size) { + return MPI_UNDEFINED; + } + + if (group->pmap.use_map) { + return group->pmap.u.map[rank]; + } else { + return group->pmap.u.stride.offset + rank * group->pmap.u.stride.stride; + } +} #endif /* MPIR_GROUP_H_INCLUDED */ diff --git a/src/include/mpir_lpid.h b/src/include/mpir_lpid.h new file mode 100644 index 00000000000..dfa16ac75c3 --- /dev/null +++ b/src/include/mpir_lpid.h @@ -0,0 +1,46 @@ +/* + * Copyright (C) by Argonne National Laboratory + * See COPYRIGHT in top-level directory + */ + +#ifndef MPIR_LPID_H_INCLUDED +#define MPIR_LPID_H_INCLUDED + +/* Worlds - + * We need a device-independent way of identifying processes. Assuming the concept of + * "worlds", we can describe a process with (world_idx, world_rank). 
+ * + * The world_idx is a local id because each process may not see all worlds. Thus, + * each process only can maintain a list of worlds as it encounters them. Thus, + * a process id derived from (world_idx, world_rank) is referred as LPID, or + * "local process id". + * + * Each process should maintain a table of worlds with sufficient information so + * processes can match worlds upon connection or making address exchange. + */ + +#define MPIR_NAMESPACE_MAX 128 +struct MPIR_World { + char namespace[MPIR_NAMESPACE_MAX]; + /* other useful fields */ + int num_procs; +}; + +extern struct MPIR_World MPIR_Worlds[]; + +int MPIR_add_world(const char *namespace, int num_procs); +int MPIR_find_world(const char *namespace); + +/* Abstract the integer type for lpid (process id). It is possible to use 32-bit + * in principle, but 64-bit is simpler since we can trivially combine + * (world_idx, world_rank). + */ +typedef int64_t MPIR_Lpid; + +#define MPIR_LPID_WORLD_INDEX(lpid) ((lpid) >> 32) +#define MPIR_LPID_WORLD_RANK(lpid) ((lpid) & 0xffffffff) +#define MPIR_LPID_FROM(world_idx, world_rank) (((uint64_t)(world_idx) << 32) | (world_rank)) +#define MPIR_LPID_DYNAMIC_MASK ((MPIR_Lpid)0x1 << 62) /* MPIR_Lpid is signed, avoid using the signed bit */ +#define MPIR_LPID_INVALID 0xffffffff + +#endif /* MPIR_LPID_H_INCLUDED */ diff --git a/src/include/mpir_mem.h b/src/include/mpir_mem.h index 147e67ef7bb..4eb03415105 100644 --- a/src/include/mpir_mem.h +++ b/src/include/mpir_mem.h @@ -110,6 +110,12 @@ extern "C" { int mpiu_chklmem_stk_sp_=0; \ MPIR_AssertDeclValue(const int mpiu_chklmem_stk_sz_,n_) +#define MPIR_CHKLMEM_ADD(pointer_) \ + do { \ + MPIR_Assert(mpiu_chklmem_stk_sp_= 0; --dim) tree_ut_hierarchy_init(&hierarchy[dim]); diff --git a/src/mpi/comm/builtin_comms.c b/src/mpi/comm/builtin_comms.c index 16a75588036..7e0273a677f 100644 --- a/src/mpi/comm/builtin_comms.c +++ b/src/mpi/comm/builtin_comms.c @@ -30,6 +30,9 @@ int MPIR_init_comm_world(void) 
MPIR_Process.comm_world->remote_size = MPIR_Process.size; MPIR_Process.comm_world->local_size = MPIR_Process.size; + MPIR_Process.comm_world->local_group = MPIR_GROUP_WORLD_PTR; + MPIR_Group_add_ref(MPIR_GROUP_WORLD_PTR); + mpi_errno = MPIR_Comm_commit(MPIR_Process.comm_world); MPIR_ERR_CHECK(mpi_errno); @@ -59,6 +62,9 @@ int MPIR_init_comm_self(void) MPIR_Process.comm_self->remote_size = 1; MPIR_Process.comm_self->local_size = 1; + MPIR_Process.comm_self->local_group = MPIR_GROUP_SELF_PTR; + MPIR_Group_add_ref(MPIR_GROUP_SELF_PTR); + mpi_errno = MPIR_Comm_commit(MPIR_Process.comm_self); MPIR_ERR_CHECK(mpi_errno); @@ -91,6 +97,9 @@ int MPIR_init_icomm_world(void) MPIR_Process.icomm_world->remote_size = MPIR_Process.size; MPIR_Process.icomm_world->local_size = MPIR_Process.size; + MPIR_Process.icomm_world->local_group = MPIR_GROUP_WORLD_PTR; + MPIR_Group_add_ref(MPIR_GROUP_WORLD_PTR); + mpi_errno = MPIR_Comm_commit(MPIR_Process.icomm_world); MPIR_ERR_CHECK(mpi_errno); diff --git a/src/mpi/comm/comm_impl.c b/src/mpi/comm/comm_impl.c index 9dbba6d703f..be1e858cffa 100644 --- a/src/mpi/comm/comm_impl.c +++ b/src/mpi/comm/comm_impl.c @@ -68,36 +68,17 @@ int MPIR_Comm_test_threadcomm_impl(MPIR_Comm * comm_ptr, int *flag) static int comm_create_local_group(MPIR_Comm * comm_ptr) { int mpi_errno = MPI_SUCCESS; - MPIR_Group *group_ptr; - int n = comm_ptr->local_size; - - mpi_errno = MPIR_Group_create(n, &group_ptr); - MPIR_ERR_CHECK(mpi_errno); - - /* Group belongs to the same session as communicator */ - MPIR_Group_set_session_ptr(group_ptr, comm_ptr->session_ptr); - group_ptr->is_local_dense_monotonic = TRUE; + int n = comm_ptr->local_size; + MPIR_Lpid *map = MPL_malloc(n * sizeof(MPIR_Lpid), MPL_MEM_GROUP); - int comm_world_size = MPIR_Process.size; for (int i = 0; i < n; i++) { - uint64_t lpid; - (void) MPID_Comm_get_lpid(comm_ptr, i, &lpid, FALSE); - group_ptr->lrank_to_lpid[i].lpid = lpid; - if (lpid > comm_world_size || (i > 0 && group_ptr->lrank_to_lpid[i - 1].lpid 
!= (lpid - 1))) { - group_ptr->is_local_dense_monotonic = FALSE; - } + map[i] = MPIR_Group_rank_to_lpid(comm_ptr->local_group, i); } - group_ptr->size = n; - group_ptr->rank = comm_ptr->rank; - group_ptr->idx_of_first_lpid = -1; - - comm_ptr->local_group = group_ptr; - - /* FIXME : Add a sanity check that the size of the group is the same as - * the size of the communicator. This helps catch corrupted - * communicators */ + mpi_errno = MPIR_Group_create_map(n, comm_ptr->rank, comm_ptr->session_ptr, map, + &comm_ptr->local_group); + MPIR_ERR_CHECK(mpi_errno); fn_exit: return mpi_errno; @@ -177,137 +158,6 @@ int MPIR_Comm_compare_impl(MPIR_Comm * comm_ptr1, MPIR_Comm * comm_ptr2, int *re goto fn_exit; } -/* This function allocates and calculates an array (*mapping_out) such that - * (*mapping_out)[i] is the rank in (*mapping_comm) corresponding to local - * rank i in the given group_ptr. - * - * Ownership of the (*mapping_out) array is transferred to the caller who is - * responsible for freeing it. */ -int MPII_Comm_create_calculate_mapping(MPIR_Group * group_ptr, - MPIR_Comm * comm_ptr, - int **mapping_out, MPIR_Comm ** mapping_comm) -{ - int mpi_errno = MPI_SUCCESS; - int subsetOfWorld = 0; - int i, j; - int n; - int *mapping = 0; - MPIR_CHKPMEM_DECL(1); - - MPIR_FUNC_ENTER; - - *mapping_out = NULL; - *mapping_comm = comm_ptr; - - n = group_ptr->size; - MPIR_CHKPMEM_MALLOC(mapping, int *, n * sizeof(int), mpi_errno, "mapping", MPL_MEM_ADDRESS); - - /* Make sure that the processes for this group are contained within - * the input communicator. Also identify the mapping from the ranks of - * the old communicator to the new communicator. - * We do this by matching the lpids of the members of the group - * with the lpids of the members of the input communicator. - * It is an error if the group contains a reference to an lpid that - * does not exist in the communicator. 
- * - * An important special case is groups (and communicators) that - * are subsets of MPI_COMM_WORLD. In this case, the lpids are - * exactly the same as the ranks in comm world. - */ - - /* we examine the group's lpids in both the intracomm and non-comm_world cases */ - MPII_Group_setup_lpid_list(group_ptr); - - /* Optimize for groups contained within MPI_COMM_WORLD. */ - if (comm_ptr->comm_kind == MPIR_COMM_KIND__INTRACOMM) { - int wsize; - subsetOfWorld = 1; - wsize = MPIR_Process.size; - for (i = 0; i < n; i++) { - uint64_t g_lpid = group_ptr->lrank_to_lpid[i].lpid; - - /* This mapping is relative to comm world */ - MPL_DBG_MSG_FMT(MPIR_DBG_COMM, VERBOSE, - (MPL_DBG_FDEST, - "comm-create - mapping into world[%d] = %" PRIu64, i, g_lpid)); - if (g_lpid < wsize) { - mapping[i] = g_lpid; - } else { - subsetOfWorld = 0; - break; - } - } - } - MPL_DBG_MSG_D(MPIR_DBG_COMM, VERBOSE, "subsetOfWorld=%d", subsetOfWorld); - if (subsetOfWorld) { -#ifdef HAVE_ERROR_CHECKING - { - MPID_BEGIN_ERROR_CHECKS; - { - mpi_errno = MPIR_Group_check_subset(group_ptr, comm_ptr); - MPIR_ERR_CHECK(mpi_errno); - } - MPID_END_ERROR_CHECKS; - } -#endif - /* Override the comm to be used with the mapping array. 
*/ - *mapping_comm = MPIR_Process.comm_world; - } else { - for (i = 0; i < n; i++) { - /* mapping[i] is the rank in the communicator of the process - * that is the ith element of the group */ - /* FIXME : BUBBLE SORT */ - mapping[i] = -1; - for (j = 0; j < comm_ptr->local_size; j++) { - uint64_t comm_lpid; - MPID_Comm_get_lpid(comm_ptr, j, &comm_lpid, FALSE); - if (comm_lpid == group_ptr->lrank_to_lpid[i].lpid) { - mapping[i] = j; - break; - } - } - MPIR_ERR_CHKANDJUMP1(mapping[i] == -1, mpi_errno, MPI_ERR_GROUP, - "**groupnotincomm", "**groupnotincomm %d", i); - } - } - - MPIR_Assert(mapping != NULL); - *mapping_out = mapping; - MPL_VG_CHECK_MEM_IS_DEFINED(*mapping_out, n * sizeof(**mapping_out)); - - MPIR_CHKPMEM_COMMIT(); - fn_exit: - MPIR_FUNC_EXIT; - return mpi_errno; - fn_fail: - MPIR_CHKPMEM_REAP(); - goto fn_exit; -} - -/* mapping[i] is equivalent network mapping between the old - * communicator and the new communicator. Index 'i' in the old - * communicator has the same network address as 'mapping[i]' in the - * new communicator. */ -/* WARNING: local_mapping and remote_mapping are stored in this - * function. The caller is responsible for their storage and will - * need to retain them till Comm_commit. 
*/ -int MPII_Comm_create_map(int local_n, - int remote_n, - int *local_mapping, - int *remote_mapping, MPIR_Comm * mapping_comm, MPIR_Comm * newcomm) -{ - int mpi_errno = MPI_SUCCESS; - - MPIR_Comm_map_irregular(newcomm, mapping_comm, local_mapping, - local_n, MPIR_COMM_MAP_DIR__L2L, NULL); - if (mapping_comm->comm_kind == MPIR_COMM_KIND__INTERCOMM) { - MPIR_Comm_map_irregular(newcomm, mapping_comm, remote_mapping, - remote_n, MPIR_COMM_MAP_DIR__R2R, NULL); - } - return mpi_errno; -} - - /* comm create impl for intracommunicators, assumes that the standard error * checking has already taken place in the calling function */ int MPIR_Comm_create_intra(MPIR_Comm * comm_ptr, MPIR_Group * group_ptr, MPIR_Comm ** newcomm_ptr) @@ -320,6 +170,10 @@ int MPIR_Comm_create_intra(MPIR_Comm * comm_ptr, MPIR_Group * group_ptr, MPIR_Co MPIR_FUNC_ENTER; MPIR_Assert(comm_ptr->comm_kind == MPIR_COMM_KIND__INTRACOMM); +#ifdef HAVE_ERROR_CHECKING + mpi_errno = MPIR_Group_check_subset(group_ptr, comm_ptr); + MPIR_ERR_CHECK(mpi_errno); +#endif n = group_ptr->size; *newcomm_ptr = NULL; @@ -337,12 +191,6 @@ int MPIR_Comm_create_intra(MPIR_Comm * comm_ptr, MPIR_Group * group_ptr, MPIR_Co MPIR_Assert(new_context_id != 0); if (group_ptr->rank != MPI_UNDEFINED) { - MPIR_Comm *mapping_comm = NULL; - - mpi_errno = MPII_Comm_create_calculate_mapping(group_ptr, comm_ptr, - &mapping, &mapping_comm); - MPIR_ERR_CHECK(mpi_errno); - /* Get the new communicator structure and context id */ mpi_errno = MPIR_Comm_create(newcomm_ptr); @@ -357,18 +205,12 @@ int MPIR_Comm_create_intra(MPIR_Comm * comm_ptr, MPIR_Group * group_ptr, MPIR_Co (*newcomm_ptr)->local_group = group_ptr; MPIR_Group_add_ref(group_ptr); - (*newcomm_ptr)->remote_group = group_ptr; - MPIR_Group_add_ref(group_ptr); + (*newcomm_ptr)->remote_group = NULL; (*newcomm_ptr)->context_id = (*newcomm_ptr)->recvcontext_id; (*newcomm_ptr)->remote_size = (*newcomm_ptr)->local_size = n; MPIR_Comm_set_session_ptr(*newcomm_ptr, comm_ptr->session_ptr); - 
/* Setup the communicator's network address mapping. This is for the remote group, - * which is the same as the local group for intracommunicators */ - mpi_errno = MPII_Comm_create_map(n, 0, mapping, NULL, mapping_comm, *newcomm_ptr); - MPIR_ERR_CHECK(mpi_errno); - (*newcomm_ptr)->tainted = comm_ptr->tainted; mpi_errno = MPIR_Comm_commit(*newcomm_ptr); MPIR_ERR_CHECK(mpi_errno); @@ -400,17 +242,11 @@ int MPIR_Comm_create_intra(MPIR_Comm * comm_ptr, MPIR_Group * group_ptr, MPIR_Co int MPIR_Comm_create_inter(MPIR_Comm * comm_ptr, MPIR_Group * group_ptr, MPIR_Comm ** newcomm_ptr) { int mpi_errno = MPI_SUCCESS; - int new_context_id; - int *mapping = NULL; - int *remote_mapping = NULL; - MPIR_Comm *mapping_comm = NULL; - int remote_size = -1; - int rinfo[2]; - MPIR_CHKLMEM_DECL(1); - + MPIR_CHKLMEM_DECL(2); MPIR_FUNC_ENTER; MPIR_Assert(comm_ptr->comm_kind == MPIR_COMM_KIND__INTERCOMM); + MPIR_Session *session_ptr = comm_ptr->session_ptr; /* Create a new communicator from the specified group members */ @@ -424,39 +260,12 @@ int MPIR_Comm_create_inter(MPIR_Comm * comm_ptr, MPIR_Group * group_ptr, MPIR_Co if (!comm_ptr->local_comm) { MPII_Setup_intercomm_localcomm(comm_ptr); } + int new_context_id; mpi_errno = MPIR_Get_contextid_sparse(comm_ptr->local_comm, &new_context_id, FALSE); MPIR_ERR_CHECK(mpi_errno); MPIR_Assert(new_context_id != 0); MPIR_Assert(new_context_id != comm_ptr->recvcontext_id); - mpi_errno = MPII_Comm_create_calculate_mapping(group_ptr, comm_ptr, &mapping, &mapping_comm); - MPIR_ERR_CHECK(mpi_errno); - - *newcomm_ptr = NULL; - - if (group_ptr->rank != MPI_UNDEFINED) { - /* Get the new communicator structure and context id */ - mpi_errno = MPIR_Comm_create(newcomm_ptr); - if (mpi_errno) - goto fn_fail; - - (*newcomm_ptr)->recvcontext_id = new_context_id; - (*newcomm_ptr)->rank = group_ptr->rank; - (*newcomm_ptr)->comm_kind = comm_ptr->comm_kind; - /* Since the group has been provided, let the new communicator know - * about the group */ - 
(*newcomm_ptr)->local_comm = 0; - (*newcomm_ptr)->local_group = group_ptr; - MPIR_Group_add_ref(group_ptr); - - (*newcomm_ptr)->local_size = group_ptr->size; - (*newcomm_ptr)->remote_group = 0; - - (*newcomm_ptr)->is_low_group = comm_ptr->is_low_group; - - MPIR_Comm_set_session_ptr(*newcomm_ptr, comm_ptr->session_ptr); - } - /* There is an additional step. We must communicate the * information on the local context id and the group members, * given by the ranks so that the remote process can construct the @@ -465,6 +274,12 @@ int MPIR_Comm_create_inter(MPIR_Comm * comm_ptr, MPIR_Group * group_ptr, MPIR_Co * in the remote group, from which the remote network address * mapping can be constructed. We need to use the "collective" * context in the original intercommunicator */ + + int remote_size = -1; + int context_id; + int *remote_mapping; /* a list of remote ranks */ + int rinfo[2]; + if (comm_ptr->rank == 0) { int info[2]; info[0] = new_context_id; @@ -474,14 +289,21 @@ int MPIR_Comm_create_inter(MPIR_Comm * comm_ptr, MPIR_Group * group_ptr, MPIR_Co rinfo, 2, MPI_INT, 0, 0, comm_ptr, MPI_STATUS_IGNORE, MPIR_ERR_NONE); MPIR_ERR_CHECK(mpi_errno); - if (*newcomm_ptr != NULL) { - (*newcomm_ptr)->context_id = rinfo[0]; - } + context_id = rinfo[0]; remote_size = rinfo[1]; - MPIR_CHKLMEM_MALLOC(remote_mapping, int *, - remote_size * sizeof(int), - mpi_errno, "remote_mapping", MPL_MEM_ADDRESS); + int *mapping; + MPIR_CHKLMEM_MALLOC(mapping, int *, group_ptr->size * sizeof(int), + mpi_errno, "mapping", MPL_MEM_OTHER); + + /* effectively MPIR_Group_translate_ranks_impl */ + for (int i = 0; i < group_ptr->size; i++) { + MPIR_Lpid lpid = MPIR_Group_rank_to_lpid(group_ptr, i); + mapping[i] = MPIR_Group_lpid_to_rank(comm_ptr->local_group, lpid); + } + + MPIR_CHKLMEM_MALLOC(remote_mapping, int *, remote_size * sizeof(int), + mpi_errno, "remote_mapping", MPL_MEM_OTHER); /* Populate and exchange the ranks */ mpi_errno = MPIC_Sendrecv(mapping, group_ptr->size, MPI_INT, 0, 0, @@ 
-500,57 +322,63 @@ int MPIR_Comm_create_inter(MPIR_Comm * comm_ptr, MPIR_Group * group_ptr, MPIR_Co /* Broadcast to the other members of the local group */ mpi_errno = MPIR_Bcast(rinfo, 2, MPI_INT, 0, comm_ptr->local_comm, MPIR_ERR_NONE); MPIR_ERR_CHECK(mpi_errno); - if (*newcomm_ptr != NULL) { - (*newcomm_ptr)->context_id = rinfo[0]; - } + + context_id = rinfo[0]; remote_size = rinfo[1]; - MPIR_CHKLMEM_MALLOC(remote_mapping, int *, - remote_size * sizeof(int), - mpi_errno, "remote_mapping", MPL_MEM_ADDRESS); + + MPIR_CHKLMEM_MALLOC(remote_mapping, int *, remote_size * sizeof(int), + mpi_errno, "remote_mapping", MPL_MEM_OTHER); + mpi_errno = MPIR_Bcast(remote_mapping, remote_size, MPI_INT, 0, comm_ptr->local_comm, MPIR_ERR_NONE); MPIR_ERR_CHECK(mpi_errno); } MPIR_Assert(remote_size >= 0); + if (group_ptr->rank == MPI_UNDEFINED || remote_size <= 0) { + /* If we are not part of the group, or - + * It's possible that no members of the other side of comm were + * members of the group that they passed, which we only know after + * receiving/bcasting the remote_size above. We must return + * MPI_COMM_NULL in this case. + */ + MPIR_Free_contextid(new_context_id); + *newcomm_ptr = NULL; + goto fn_exit; + } + /* FIXME: the branch was kept to minimize line changes. Remove the if-check. */ if (group_ptr->rank != MPI_UNDEFINED) { + /* Get the new communicator structure and context id */ + mpi_errno = MPIR_Comm_create(newcomm_ptr); + MPIR_ERR_CHECK(mpi_errno); + + (*newcomm_ptr)->context_id = context_id; (*newcomm_ptr)->remote_size = remote_size; - /* Now, everyone has the remote_mapping, and can apply that to - * the network address mapping. 
*/ + (*newcomm_ptr)->recvcontext_id = new_context_id; + (*newcomm_ptr)->rank = group_ptr->rank; + (*newcomm_ptr)->comm_kind = comm_ptr->comm_kind; + /* Since the group has been provided, let the new communicator know + * about the group */ + (*newcomm_ptr)->local_comm = 0; + (*newcomm_ptr)->local_size = group_ptr->size; + (*newcomm_ptr)->local_group = group_ptr; + MPIR_Group_add_ref(group_ptr); - /* Setup the communicator's network addresses from the local mapping. */ - mpi_errno = MPII_Comm_create_map(group_ptr->size, - remote_size, - mapping, remote_mapping, mapping_comm, *newcomm_ptr); - MPIR_ERR_CHECK(mpi_errno); + mpi_errno = MPIR_Group_incl_impl(comm_ptr->remote_group, rinfo[1], remote_mapping, + &(*newcomm_ptr)->remote_group); + + (*newcomm_ptr)->is_low_group = comm_ptr->is_low_group; + + MPIR_Comm_set_session_ptr(*newcomm_ptr, session_ptr); (*newcomm_ptr)->tainted = comm_ptr->tainted; mpi_errno = MPIR_Comm_commit(*newcomm_ptr); MPIR_ERR_CHECK(mpi_errno); - - if (remote_size <= 0) { - /* It's possible that no members of the other side of comm were - * members of the group that they passed, which we only know after - * receiving/bcasting the remote_size above. We must return - * MPI_COMM_NULL in this case, but we can't free the newcomm_ptr - * immediately after the communication above because - * MPIR_Comm_release won't work correctly with a half-constructed - * comm. 
*/ - mpi_errno = MPIR_Comm_release(*newcomm_ptr); - MPIR_ERR_CHECK(mpi_errno); - *newcomm_ptr = NULL; - } - } else { - /* This process is not in the group */ - MPIR_Free_contextid(new_context_id); - *newcomm_ptr = NULL; } fn_exit: MPIR_CHKLMEM_FREEALL(); - MPL_free(mapping); - MPIR_FUNC_EXIT; return mpi_errno; fn_fail: @@ -582,7 +410,6 @@ int MPIR_Comm_create_group_impl(MPIR_Comm * comm_ptr, MPIR_Group * group_ptr, in { int mpi_errno = MPI_SUCCESS; int new_context_id = 0; - int *mapping = NULL; int n; MPIR_FUNC_ENTER; @@ -598,8 +425,6 @@ int MPIR_Comm_create_group_impl(MPIR_Comm * comm_ptr, MPIR_Group * group_ptr, in /* Create a new communicator from the specified group members */ if (group_ptr->rank != MPI_UNDEFINED) { - MPIR_Comm *mapping_comm = NULL; - /* For this routine, creation of the id is collective over the input *group*, so processes not in the group do not participate. */ @@ -607,10 +432,6 @@ int MPIR_Comm_create_group_impl(MPIR_Comm * comm_ptr, MPIR_Group * group_ptr, in MPIR_ERR_CHECK(mpi_errno); MPIR_Assert(new_context_id != 0); - mpi_errno = MPII_Comm_create_calculate_mapping(group_ptr, comm_ptr, - &mapping, &mapping_comm); - MPIR_ERR_CHECK(mpi_errno); - /* Get the new communicator structure and context id */ mpi_errno = MPIR_Comm_create(newcomm_ptr); @@ -625,18 +446,12 @@ int MPIR_Comm_create_group_impl(MPIR_Comm * comm_ptr, MPIR_Group * group_ptr, in (*newcomm_ptr)->local_group = group_ptr; MPIR_Group_add_ref(group_ptr); - (*newcomm_ptr)->remote_group = group_ptr; - MPIR_Group_add_ref(group_ptr); + (*newcomm_ptr)->remote_group = NULL; (*newcomm_ptr)->context_id = (*newcomm_ptr)->recvcontext_id; (*newcomm_ptr)->remote_size = (*newcomm_ptr)->local_size = n; MPIR_Comm_set_session_ptr(*newcomm_ptr, group_ptr->session_ptr); - /* Setup the communicator's vc table. 
This is for the remote group, - * which is the same as the local group for intracommunicators */ - mpi_errno = MPII_Comm_create_map(n, 0, mapping, NULL, mapping_comm, *newcomm_ptr); - MPIR_ERR_CHECK(mpi_errno); - (*newcomm_ptr)->tainted = comm_ptr->tainted; mpi_errno = MPIR_Comm_commit(*newcomm_ptr); MPIR_ERR_CHECK(mpi_errno); @@ -646,8 +461,6 @@ int MPIR_Comm_create_group_impl(MPIR_Comm * comm_ptr, MPIR_Group * group_ptr, in } fn_exit: - MPL_free(mapping); - MPIR_FUNC_EXIT; return mpi_errno; fn_fail: @@ -800,7 +613,7 @@ int MPIR_Intercomm_create_from_groups_impl(MPIR_Group * local_group_ptr, int loc int tag = get_tag_from_stringtag(stringtag); /* FIXME: ensure lpid is from comm_world */ - uint64_t remote_lpid = remote_group_ptr->lrank_to_lpid[remote_leader].lpid; + MPIR_Lpid remote_lpid = MPIR_Group_rank_to_lpid(remote_group_ptr, remote_leader); MPIR_Assert(remote_lpid < MPIR_Process.size); mpi_errno = MPIR_Intercomm_create_impl(local_comm, local_leader, MPIR_Process.comm_world, (int) remote_lpid, @@ -931,31 +744,24 @@ int MPIR_Comm_idup_with_info_impl(MPIR_Comm * comm_ptr, MPIR_Info * info, int MPIR_Comm_remote_group_impl(MPIR_Comm * comm_ptr, MPIR_Group ** group_ptr) { int mpi_errno = MPI_SUCCESS; - int i, n; - MPIR_FUNC_ENTER; + + /* FIXME: remove the following remote_group creation once this assertion passes */ + MPIR_Assert(comm_ptr->comm_kind == MPIR_COMM_KIND__INTERCOMM && comm_ptr->remote_group); + /* Create a group and populate it with the local process ids */ if (!comm_ptr->remote_group) { - n = comm_ptr->remote_size; - mpi_errno = MPIR_Group_create(n, group_ptr); - MPIR_ERR_CHECK(mpi_errno); + int n = comm_ptr->remote_size; + MPIR_Lpid *map = MPL_malloc(n * sizeof(MPIR_Lpid), MPL_MEM_GROUP); - for (i = 0; i < n; i++) { - uint64_t lpid; - (void) MPID_Comm_get_lpid(comm_ptr, i, &lpid, TRUE); - (*group_ptr)->lrank_to_lpid[i].lpid = lpid; - /* TODO calculate is_local_dense_monotonic */ + for (int i = 0; i < n; i++) { + map[i] = 
MPIR_Group_rank_to_lpid(comm_ptr->remote_group, i); } - (*group_ptr)->size = n; - (*group_ptr)->rank = MPI_UNDEFINED; - (*group_ptr)->idx_of_first_lpid = -1; - - MPIR_Group_set_session_ptr(*group_ptr, comm_ptr->session_ptr); - - comm_ptr->remote_group = *group_ptr; - } else { - *group_ptr = comm_ptr->remote_group; + mpi_errno = MPIR_Group_create_map(n, MPI_UNDEFINED, comm_ptr->session_ptr, map, + &comm_ptr->remote_group); + MPIR_ERR_CHECK(mpi_errno); } + *group_ptr = comm_ptr->remote_group; MPIR_Group_add_ref(comm_ptr->remote_group); fn_exit: @@ -983,74 +789,82 @@ int MPIR_Comm_set_info_impl(MPIR_Comm * comm_ptr, MPIR_Info * info_ptr) goto fn_exit; } +/* arbitrarily determine which group is the low_group by comparing + * world namespaces and world ranks */ +static int determine_low_group(MPIR_Lpid remote_lpid, bool * is_low_group_out) +{ + int mpi_errno = MPI_SUCCESS; + + int my_world_idx = 0; + int my_world_rank = MPIR_Process.rank; + int remote_world_idx = MPIR_LPID_WORLD_INDEX(remote_lpid); + int remote_world_rank = MPIR_LPID_WORLD_RANK(remote_lpid); + + if (my_world_idx == remote_world_idx) { + /* same world, just compare world ranks */ + MPIR_Assert(my_world_idx == 0); + *is_low_group_out = (my_world_rank < remote_world_rank); + } else { + /* different world, compare namespace */ + int cmp_result = strncmp(MPIR_Worlds[my_world_idx].namespace, + MPIR_Worlds[remote_world_idx].namespace, + MPIR_NAMESPACE_MAX); + MPIR_Assert(cmp_result != 0); + if (cmp_result < 0) + *is_low_group_out = false; + else + *is_low_group_out = true; + } + + return mpi_errno; +} + int MPIR_Intercomm_create_impl(MPIR_Comm * local_comm_ptr, int local_leader, MPIR_Comm * peer_comm_ptr, int remote_leader, int tag, MPIR_Comm ** new_intercomm_ptr) +{ + return MPIR_Intercomm_create_timeout(local_comm_ptr, local_leader, + peer_comm_ptr, remote_leader, tag, 0, new_intercomm_ptr); +} + +int MPIR_Intercomm_create_timeout(MPIR_Comm * local_comm_ptr, int local_leader, + MPIR_Comm * peer_comm_ptr, int 
remote_leader, + int tag, int timeout, MPIR_Comm ** new_intercomm_ptr) { int mpi_errno = MPI_SUCCESS; - int final_context_id, recvcontext_id; int remote_size = 0; - uint64_t *remote_lpids = NULL; - int comm_info[3]; - int is_low_group = 0; + MPIR_Lpid *remote_lpids = NULL; + MPIR_Session *session_ptr = local_comm_ptr->session_ptr; MPIR_FUNC_ENTER; - /* Shift tag into the tagged coll space */ - tag |= MPIR_TAG_COLL_BIT; - - mpi_errno = MPID_Intercomm_exchange_map(local_comm_ptr, local_leader, - peer_comm_ptr, remote_leader, - &remote_size, &remote_lpids, &is_low_group); - MPIR_ERR_CHECK(mpi_errno); - /* * Create the contexts. Each group will have a context for sending * to the other group. All processes must be involved. Because * we know that the local and remote groups are disjoint, this * step will complete */ - MPL_DBG_MSG_FMT(MPIR_DBG_COMM, VERBOSE, - (MPL_DBG_FDEST, "About to get contextid (local_size=%d) on rank %d", - local_comm_ptr->local_size, local_comm_ptr->rank)); /* In the multi-threaded case, MPIR_Get_contextid_sparse assumes that the * calling routine already holds the single critical section */ /* TODO: Make sure this is tag-safe */ + int recvcontext_id = MPIR_INVALID_CONTEXT_ID; mpi_errno = MPIR_Get_contextid_sparse(local_comm_ptr, &recvcontext_id, FALSE); MPIR_ERR_CHECK(mpi_errno); MPIR_Assert(recvcontext_id != 0); - MPL_DBG_MSG_FMT(MPIR_DBG_COMM, VERBOSE, (MPL_DBG_FDEST, "Got contextid=%d", recvcontext_id)); - - /* Leaders can now swap context ids and then broadcast the value - * to the local group of processes */ - if (local_comm_ptr->rank == local_leader) { - int remote_context_id; - - mpi_errno = - MPIC_Sendrecv(&recvcontext_id, 1, MPIR_CONTEXT_ID_T_DATATYPE, remote_leader, tag, - &remote_context_id, 1, MPIR_CONTEXT_ID_T_DATATYPE, remote_leader, tag, - peer_comm_ptr, MPI_STATUS_IGNORE, MPIR_ERR_NONE); - MPIR_ERR_CHECK(mpi_errno); - final_context_id = remote_context_id; + /* Shift tag into the tagged coll space */ + tag |= MPIR_TAG_COLL_BIT; - 
/* Now, send all of our local processes the remote_lpids, - * along with the final context id */ - comm_info[0] = final_context_id; - MPL_DBG_MSG(MPIR_DBG_COMM, VERBOSE, "About to bcast on local_comm"); - mpi_errno = MPIR_Bcast(comm_info, 1, MPI_INT, local_leader, local_comm_ptr, MPIR_ERR_NONE); - MPIR_ERR_CHECK(mpi_errno); - MPL_DBG_MSG_D(MPIR_DBG_COMM, VERBOSE, "end of bcast on local_comm of size %d", - local_comm_ptr->local_size); - } else { - /* we're the other processes */ - MPL_DBG_MSG(MPIR_DBG_COMM, VERBOSE, "About to receive bcast on local_comm"); - mpi_errno = MPIR_Bcast(comm_info, 1, MPI_INT, local_leader, local_comm_ptr, MPIR_ERR_NONE); - MPIR_ERR_CHECK(mpi_errno); + int remote_context_id; + mpi_errno = MPID_Intercomm_exchange(local_comm_ptr, local_leader, + peer_comm_ptr, remote_leader, tag, + recvcontext_id, &remote_context_id, + &remote_size, &remote_lpids, timeout); + MPIR_ERR_CHECK(mpi_errno); - /* Extract the context and group sign information */ - final_context_id = comm_info[0]; - } + bool is_low_group; + mpi_errno = determine_low_group(remote_lpids[0], &is_low_group); + MPIR_ERR_CHECK(mpi_errno); /* At last, we now have the information that we need to build the * intercommunicator */ @@ -1058,10 +872,9 @@ int MPIR_Intercomm_create_impl(MPIR_Comm * local_comm_ptr, int local_leader, /* All processes in the local_comm now build the communicator */ mpi_errno = MPIR_Comm_create(new_intercomm_ptr); - if (mpi_errno) - goto fn_fail; + MPIR_ERR_CHECK(mpi_errno); - (*new_intercomm_ptr)->context_id = final_context_id; + (*new_intercomm_ptr)->context_id = remote_context_id; (*new_intercomm_ptr)->recvcontext_id = recvcontext_id; (*new_intercomm_ptr)->remote_size = remote_size; (*new_intercomm_ptr)->local_size = local_comm_ptr->local_size; @@ -1070,13 +883,18 @@ int MPIR_Intercomm_create_impl(MPIR_Comm * local_comm_ptr, int local_leader, (*new_intercomm_ptr)->local_comm = 0; (*new_intercomm_ptr)->is_low_group = is_low_group; - 
MPIR_Comm_set_session_ptr(*new_intercomm_ptr, local_comm_ptr->session_ptr); + (*new_intercomm_ptr)->local_group = local_comm_ptr->local_group; + MPIR_Group_add_ref(local_comm_ptr->local_group); - mpi_errno = MPID_Create_intercomm_from_lpids(*new_intercomm_ptr, remote_size, remote_lpids); - if (mpi_errno) - goto fn_fail; + /* construct remote_group */ + mpi_errno = MPIR_Group_create_map(remote_size, MPI_UNDEFINED, session_ptr, remote_lpids, + &(*new_intercomm_ptr)->remote_group); + MPIR_ERR_CHECK(mpi_errno); - MPIR_Comm_map_dup(*new_intercomm_ptr, local_comm_ptr, MPIR_COMM_MAP_DIR__L2L); + MPIR_Comm_set_session_ptr(*new_intercomm_ptr, session_ptr); + + mpi_errno = MPID_Create_intercomm_from_lpids(*new_intercomm_ptr, remote_size, remote_lpids); + MPIR_ERR_CHECK(mpi_errno); /* Inherit the error handler (if any) */ MPID_THREAD_CS_ENTER(VCI, local_comm_ptr->mutex); @@ -1092,90 +910,15 @@ int MPIR_Intercomm_create_impl(MPIR_Comm * local_comm_ptr, int local_leader, fn_exit: - MPL_free(remote_lpids); - remote_lpids = NULL; MPIR_FUNC_EXIT; return mpi_errno; fn_fail: - goto fn_exit; -} - -/* Peer intercomm is a 1-to-1 intercomm, internally created by device layer - * to facilitate connecting dynamic processes */ - -int MPIR_peer_intercomm_create(int context_id, int recvcontext_id, - uint64_t remote_lpid, int is_low_group, MPIR_Comm ** newcomm) -{ - int mpi_errno = MPI_SUCCESS; - - mpi_errno = MPIR_Comm_create(newcomm); - MPIR_ERR_CHECK(mpi_errno); - - (*newcomm)->context_id = context_id; - (*newcomm)->recvcontext_id = recvcontext_id; - (*newcomm)->remote_size = 1; - (*newcomm)->local_size = 1; - (*newcomm)->rank = 0; - (*newcomm)->comm_kind = MPIR_COMM_KIND__INTERCOMM; - (*newcomm)->local_comm = 0; - (*newcomm)->is_low_group = is_low_group; - - mpi_errno = MPID_Create_intercomm_from_lpids(*newcomm, 1, &remote_lpid); - MPIR_ERR_CHECK(mpi_errno); - - MPIR_Comm *comm_self = MPIR_Process.comm_self; - MPIR_Comm_map_dup(*newcomm, comm_self, MPIR_COMM_MAP_DIR__L2L); - - /* Inherit 
the error handler */ - MPID_THREAD_CS_ENTER(VCI, comm_self->mutex); - (*newcomm)->errhandler = comm_self->errhandler; - if (comm_self->errhandler) { - MPIR_Errhandler_add_ref(comm_self->errhandler); + if (recvcontext_id != MPIR_INVALID_CONTEXT_ID) { + MPIR_Free_contextid(recvcontext_id); } - MPID_THREAD_CS_EXIT(VCI, comm_self->mutex); - - (*newcomm)->tainted = 1; - mpi_errno = MPIR_Comm_commit(*newcomm); - MPIR_ERR_CHECK(mpi_errno); - - fn_exit: - return mpi_errno; - fn_fail: goto fn_exit; } -/* This function creates mapping for new communicator - * basing on network addresses of existing communicator. - */ - -static int create_and_map(MPIR_Comm * comm_ptr, int local_high, MPIR_Comm * new_intracomm_ptr) -{ - int mpi_errno = MPI_SUCCESS; - int i; - - /* Now we know which group comes first. Build the new mapping - * from the existing comm */ - if (local_high) { - /* remote group first */ - MPIR_Comm_map_dup(new_intracomm_ptr, comm_ptr, MPIR_COMM_MAP_DIR__R2L); - - MPIR_Comm_map_dup(new_intracomm_ptr, comm_ptr, MPIR_COMM_MAP_DIR__L2L); - for (i = 0; i < comm_ptr->local_size; i++) - if (i == comm_ptr->rank) - new_intracomm_ptr->rank = comm_ptr->remote_size + i; - } else { - /* local group first */ - MPIR_Comm_map_dup(new_intracomm_ptr, comm_ptr, MPIR_COMM_MAP_DIR__L2L); - for (i = 0; i < comm_ptr->local_size; i++) - if (i == comm_ptr->rank) - new_intracomm_ptr->rank = i; - - MPIR_Comm_map_dup(new_intracomm_ptr, comm_ptr, MPIR_COMM_MAP_DIR__R2L); - } - - return mpi_errno; -} - int MPIR_Intercomm_merge_impl(MPIR_Comm * comm_ptr, int high, MPIR_Comm ** new_intracomm_ptr) { int mpi_errno = MPI_SUCCESS; @@ -1245,15 +988,43 @@ int MPIR_Intercomm_merge_impl(MPIR_Comm * comm_ptr, int high, MPIR_Comm ** new_i } (*new_intracomm_ptr)->recvcontext_id = (*new_intracomm_ptr)->context_id; (*new_intracomm_ptr)->remote_size = (*new_intracomm_ptr)->local_size = new_size; - (*new_intracomm_ptr)->rank = -1; (*new_intracomm_ptr)->comm_kind = MPIR_COMM_KIND__INTRACOMM; + 
(*new_intracomm_ptr)->remote_group = NULL; MPIR_Comm_set_session_ptr(*new_intracomm_ptr, comm_ptr->session_ptr); - /* Now we know which group comes first. Build the new mapping - * from the existing comm */ - mpi_errno = create_and_map(comm_ptr, local_high, (*new_intracomm_ptr)); - MPIR_ERR_CHECK(mpi_errno); + /* construct local_group */ + MPIR_Group *new_local_group; + + MPIR_Lpid *map; + map = MPL_malloc(new_size * sizeof(MPIR_Lpid), MPL_MEM_GROUP); + MPIR_ERR_CHKANDJUMP(!map, mpi_errno, MPI_ERR_OTHER, "**nomem"); + + int myrank; + MPIR_Group *group1, *group2; + if (local_high) { + group1 = comm_ptr->remote_group; + group2 = comm_ptr->local_group; + myrank = group1->size + group2->rank; + } else { + group1 = comm_ptr->local_group; + group2 = comm_ptr->remote_group; + myrank = group1->rank; + } + for (int i = 0; i < group1->size; i++) { + map[i] = MPIR_Group_rank_to_lpid(group1, i); + } + for (int i = 0; i < group2->size; i++) { + map[group1->size + i] = MPIR_Group_rank_to_lpid(group2, i); + } + + mpi_errno = MPIR_Group_create_map(new_size, myrank, comm_ptr->session_ptr, map, + &new_local_group); + + (*new_intracomm_ptr)->local_group = new_local_group; + MPIR_Group_add_ref(new_local_group); + + (*new_intracomm_ptr)->rank = myrank; /* We've setup a temporary context id, based on the context id * used by the intercomm. 
This allows us to perform the allreduce @@ -1282,15 +1053,14 @@ int MPIR_Intercomm_merge_impl(MPIR_Comm * comm_ptr, int high, MPIR_Comm ** new_i MPIR_ERR_CHECK(mpi_errno); (*new_intracomm_ptr)->remote_size = (*new_intracomm_ptr)->local_size = new_size; - (*new_intracomm_ptr)->rank = -1; + (*new_intracomm_ptr)->rank = myrank; (*new_intracomm_ptr)->comm_kind = MPIR_COMM_KIND__INTRACOMM; (*new_intracomm_ptr)->context_id = new_context_id; (*new_intracomm_ptr)->recvcontext_id = new_context_id; + (*new_intracomm_ptr)->remote_group = NULL; MPIR_Comm_set_session_ptr(*new_intracomm_ptr, comm_ptr->session_ptr); - - mpi_errno = create_and_map(comm_ptr, local_high, (*new_intracomm_ptr)); - MPIR_ERR_CHECK(mpi_errno); + (*new_intracomm_ptr)->local_group = new_local_group; (*new_intracomm_ptr)->tainted = 1; mpi_errno = MPIR_Comm_commit((*new_intracomm_ptr)); diff --git a/src/mpi/comm/comm_split.c b/src/mpi/comm/comm_split.c index 7c5519278e4..3d3d95187de 100644 --- a/src/mpi/comm/comm_split.c +++ b/src/mpi/comm/comm_split.c @@ -89,7 +89,6 @@ int MPIR_Comm_split_impl(MPIR_Comm * comm_ptr, int color, int key, MPIR_Comm ** first_entry = 0, first_remote_entry = 0, *last_ptr; int in_newcomm; /* TRUE iff *newcomm should be populated */ int new_context_id, remote_context_id; - MPIR_Comm_map_t *mapper; MPIR_CHKLMEM_DECL(4); rank = comm_ptr->rank; @@ -283,15 +282,21 @@ int MPIR_Comm_split_impl(MPIR_Comm * comm_ptr, int color, int key, MPIR_Comm ** * corresponding process in the input communicator */ MPIU_Sort_inttable(remotekeytable, new_remote_size); - MPIR_Comm_map_irregular(*newcomm_ptr, comm_ptr, NULL, - new_size, MPIR_COMM_MAP_DIR__L2L, &mapper); + int *local_ranks; + local_ranks = MPL_malloc(new_size * sizeof(int), MPL_MEM_OTHER); + MPIR_ERR_CHKANDJUMP(!local_ranks, mpi_errno, MPI_ERR_OTHER, "**nomem"); for (i = 0; i < new_size; i++) { - mapper->src_mapping[i] = keytable[i].color; + local_ranks[i] = keytable[i].color; if (keytable[i].color == comm_ptr->rank) (*newcomm_ptr)->rank = 
i; } + mpi_errno = MPIR_Group_incl_impl(comm_ptr->local_group, new_size, local_ranks, + &(*newcomm_ptr)->local_group); + MPIR_ERR_CHECK(mpi_errno); + MPL_free(local_ranks); + /* For the remote group, the situation is more complicated. * We need to find the size of our "partner" group in the * remote comm. The easiest way (in terms of code) is for @@ -307,11 +312,19 @@ int MPIR_Comm_split_impl(MPIR_Comm * comm_ptr, int color, int key, MPIR_Comm ** * is required to return MPI_COMM_NULL instead of an intercomm * with an empty remote group. */ - MPIR_Comm_map_irregular(*newcomm_ptr, comm_ptr, NULL, - new_remote_size, MPIR_COMM_MAP_DIR__R2R, &mapper); + int *remote_ranks; + remote_ranks = MPL_malloc(new_remote_size * sizeof(int), MPL_MEM_OTHER); + MPIR_ERR_CHKANDJUMP(!remote_ranks, mpi_errno, MPI_ERR_OTHER, "**nomem"); + + for (i = 0; i < new_remote_size; i++) { + remote_ranks[i] = remotekeytable[i].color; + } - for (i = 0; i < new_remote_size; i++) - mapper->src_mapping[i] = remotekeytable[i].color; + mpi_errno = MPIR_Group_incl_impl(comm_ptr->remote_group, + new_remote_size, remote_ranks, + &(*newcomm_ptr)->remote_group); + MPIR_ERR_CHECK(mpi_errno); + MPL_free(remote_ranks); (*newcomm_ptr)->context_id = remote_context_id; (*newcomm_ptr)->remote_size = new_remote_size; @@ -323,14 +336,20 @@ int MPIR_Comm_split_impl(MPIR_Comm * comm_ptr, int color, int key, MPIR_Comm ** (*newcomm_ptr)->context_id = (*newcomm_ptr)->recvcontext_id; (*newcomm_ptr)->remote_size = new_size; - MPIR_Comm_map_irregular(*newcomm_ptr, comm_ptr, NULL, - new_size, MPIR_COMM_MAP_DIR__L2L, &mapper); + int *local_ranks; + local_ranks = MPL_malloc(new_size * sizeof(int), MPL_MEM_OTHER); + MPIR_ERR_CHKANDJUMP(!local_ranks, mpi_errno, MPI_ERR_OTHER, "**nomem"); for (i = 0; i < new_size; i++) { - mapper->src_mapping[i] = keytable[i].color; + local_ranks[i] = keytable[i].color; if (keytable[i].color == comm_ptr->rank) (*newcomm_ptr)->rank = i; } + + mpi_errno = MPIR_Group_incl_impl(comm_ptr->local_group, 
new_size, local_ranks, + &(*newcomm_ptr)->local_group); + MPIR_ERR_CHECK(mpi_errno); + MPL_free(local_ranks); } /* Inherit the error handler (if any) */ diff --git a/src/mpi/comm/commutil.c b/src/mpi/comm/commutil.c index 9a51e8565ee..df0540087bc 100644 --- a/src/mpi/comm/commutil.c +++ b/src/mpi/comm/commutil.c @@ -309,8 +309,6 @@ int MPII_Comm_init(MPIR_Comm * comm_p) /* Initialize the revoked flag as false */ comm_p->revoked = 0; - comm_p->mapper_head = NULL; - comm_p->mapper_tail = NULL; comm_p->threadcomm = NULL; MPIR_stream_comm_init(comm_p); @@ -382,6 +380,10 @@ int MPII_Setup_intercomm_localcomm(MPIR_Comm * intercomm_ptr) mpi_errno = MPII_Comm_init(localcomm_ptr); MPIR_ERR_CHECK(mpi_errno); + MPIR_Assert(intercomm_ptr->local_group); + localcomm_ptr->local_group = intercomm_ptr->local_group; + MPIR_Group_add_ref(intercomm_ptr->local_group); + MPIR_Comm_set_session_ptr(localcomm_ptr, intercomm_ptr->session_ptr); /* use the parent intercomm's recv ctx as the basis for our ctx */ @@ -403,10 +405,6 @@ int MPII_Setup_intercomm_localcomm(MPIR_Comm * intercomm_ptr) localcomm_ptr->local_size = intercomm_ptr->local_size; localcomm_ptr->rank = intercomm_ptr->rank; - MPIR_Comm_map_dup(localcomm_ptr, intercomm_ptr, MPIR_COMM_MAP_DIR__L2L); - - /* TODO More advanced version: if the group is available, dup it by - * increasing the reference count instead of recreating it later */ /* FIXME : No local functions for the topology routines */ intercomm_ptr->local_comm = localcomm_ptr; @@ -424,99 +422,6 @@ int MPII_Setup_intercomm_localcomm(MPIR_Comm * intercomm_ptr) return mpi_errno; } -int MPIR_Comm_map_irregular(MPIR_Comm * newcomm, MPIR_Comm * src_comm, - int *src_mapping, int src_mapping_size, - MPIR_Comm_map_dir_t dir, MPIR_Comm_map_t ** map) -{ - int mpi_errno = MPI_SUCCESS; - MPIR_Comm_map_t *mapper; - MPIR_CHKPMEM_DECL(3); - - MPIR_FUNC_ENTER; - - MPIR_CHKPMEM_MALLOC(mapper, MPIR_Comm_map_t *, sizeof(MPIR_Comm_map_t), mpi_errno, "mapper", - MPL_MEM_COMM); - - 
mapper->type = MPIR_COMM_MAP_TYPE__IRREGULAR; - mapper->src_comm = src_comm; - mapper->dir = dir; - mapper->src_mapping_size = src_mapping_size; - - if (src_mapping) { - mapper->src_mapping = src_mapping; - mapper->free_mapping = 0; - } else { - MPIR_CHKPMEM_MALLOC(mapper->src_mapping, int *, - src_mapping_size * sizeof(int), mpi_errno, "mapper mapping", - MPL_MEM_COMM); - mapper->free_mapping = 1; - } - - mapper->next = NULL; - - LL_APPEND(newcomm->mapper_head, newcomm->mapper_tail, mapper); - - if (map) - *map = mapper; - - fn_exit: - MPIR_CHKPMEM_COMMIT(); - MPIR_FUNC_EXIT; - return mpi_errno; - fn_fail: - MPIR_CHKPMEM_REAP(); - goto fn_exit; -} - -int MPIR_Comm_map_dup(MPIR_Comm * newcomm, MPIR_Comm * src_comm, MPIR_Comm_map_dir_t dir) -{ - int mpi_errno = MPI_SUCCESS; - MPIR_Comm_map_t *mapper; - MPIR_CHKPMEM_DECL(1); - - MPIR_FUNC_ENTER; - - MPIR_CHKPMEM_MALLOC(mapper, MPIR_Comm_map_t *, sizeof(MPIR_Comm_map_t), mpi_errno, "mapper", - MPL_MEM_COMM); - - mapper->type = MPIR_COMM_MAP_TYPE__DUP; - mapper->src_comm = src_comm; - mapper->dir = dir; - - mapper->next = NULL; - - LL_APPEND(newcomm->mapper_head, newcomm->mapper_tail, mapper); - - fn_exit: - MPIR_CHKPMEM_COMMIT(); - MPIR_FUNC_EXIT; - return mpi_errno; - fn_fail: - MPIR_CHKPMEM_REAP(); - goto fn_exit; -} - - -int MPIR_Comm_map_free(MPIR_Comm * comm) -{ - int mpi_errno = MPI_SUCCESS; - MPIR_Comm_map_t *mapper, *tmp; - - MPIR_FUNC_ENTER; - - for (mapper = comm->mapper_head; mapper;) { - tmp = mapper->next; - if (mapper->type == MPIR_COMM_MAP_TYPE__IRREGULAR && mapper->free_mapping) - MPL_free(mapper->src_mapping); - MPL_free(mapper); - mapper = tmp; - } - comm->mapper_head = NULL; - - MPIR_FUNC_EXIT; - return mpi_errno; -} - static int get_node_count(MPIR_Comm * comm, int *node_count) { int mpi_errno = MPI_SUCCESS; @@ -581,8 +486,6 @@ static int MPIR_Comm_commit_internal(MPIR_Comm * comm) mpi_errno = get_node_count(comm, &comm->node_count); MPIR_ERR_CHECK(mpi_errno); - MPIR_Comm_map_free(comm); - fn_exit: 
MPIR_FUNC_EXIT; return mpi_errno; @@ -687,8 +590,13 @@ int MPIR_Comm_create_subcomms(MPIR_Comm * comm) /* Copy relevant hints to node_comm */ propagate_hints_to_subcomm(comm, comm->node_comm); - MPIR_Comm_map_irregular(comm->node_comm, comm, local_procs, num_local, - MPIR_COMM_MAP_DIR__L2L, NULL); + /* construct local_group */ + MPIR_Group *parent_group = comm->local_group; + MPIR_Assert(parent_group); + mpi_errno = MPIR_Group_incl_impl(parent_group, num_local, local_procs, + &comm->node_comm->local_group); + MPIR_ERR_CHECK(mpi_errno); + mpi_errno = MPIR_Comm_commit_internal(comm->node_comm); MPIR_ERR_CHECK(mpi_errno); } @@ -714,8 +622,13 @@ int MPIR_Comm_create_subcomms(MPIR_Comm * comm) /* Copy relevant hints to node_roots_comm */ propagate_hints_to_subcomm(comm, comm->node_roots_comm); - MPIR_Comm_map_irregular(comm->node_roots_comm, comm, external_procs, num_external, - MPIR_COMM_MAP_DIR__L2L, NULL); + /* construct local_group */ + MPIR_Group *parent_group = comm->local_group; + MPIR_Assert(parent_group); + mpi_errno = MPIR_Group_incl_impl(parent_group, num_external, external_procs, + &comm->node_roots_comm->local_group); + MPIR_ERR_CHECK(mpi_errno); + mpi_errno = MPIR_Comm_commit_internal(comm->node_roots_comm); MPIR_ERR_CHECK(mpi_errno); } @@ -919,7 +832,6 @@ int MPII_Comm_copy(MPIR_Comm * comm_ptr, int size, MPIR_Info * info, MPIR_Comm * int mpi_errno = MPI_SUCCESS; int new_context_id, new_recvcontext_id; MPIR_Comm *newcomm_ptr = NULL; - MPIR_Comm_map_t *map = NULL; MPIR_FUNC_ENTER; @@ -961,39 +873,15 @@ int MPII_Comm_copy(MPIR_Comm * comm_ptr, int size, MPIR_Info * info, MPIR_Comm * newcomm_ptr->comm_kind = comm_ptr->comm_kind; newcomm_ptr->local_comm = 0; - MPIR_Comm_set_session_ptr(newcomm_ptr, comm_ptr->session_ptr); - - /* There are two cases here - size is the same as the old communicator, - * or it is smaller. If the size is the same, we can just add a reference. - * Otherwise, we need to create a new network address mapping. 
Note that this is the - * test that matches the test on rank above. */ - if (size == comm_ptr->local_size) { - /* Duplicate the network address mapping */ - if (comm_ptr->comm_kind == MPIR_COMM_KIND__INTRACOMM) - MPIR_Comm_map_dup(newcomm_ptr, comm_ptr, MPIR_COMM_MAP_DIR__L2L); - else - MPIR_Comm_map_dup(newcomm_ptr, comm_ptr, MPIR_COMM_MAP_DIR__R2R); - } else { - int i; - - if (comm_ptr->comm_kind == MPIR_COMM_KIND__INTRACOMM) - MPIR_Comm_map_irregular(newcomm_ptr, comm_ptr, NULL, size, MPIR_COMM_MAP_DIR__L2L, - &map); - else - MPIR_Comm_map_irregular(newcomm_ptr, comm_ptr, NULL, size, MPIR_COMM_MAP_DIR__R2R, - &map); - for (i = 0; i < size; i++) { - /* For rank i in the new communicator, find the corresponding - * rank in the input communicator */ - map->src_mapping[i] = i; - } - } - - /* If it is an intercomm, duplicate the local network address references */ + newcomm_ptr->local_group = comm_ptr->local_group; + MPIR_Group_add_ref(comm_ptr->local_group); if (comm_ptr->comm_kind == MPIR_COMM_KIND__INTERCOMM) { - MPIR_Comm_map_dup(newcomm_ptr, comm_ptr, MPIR_COMM_MAP_DIR__L2L); + newcomm_ptr->remote_group = comm_ptr->remote_group; + MPIR_Group_add_ref(comm_ptr->remote_group); } + MPIR_Comm_set_session_ptr(newcomm_ptr, comm_ptr->session_ptr); + /* Set the sizes and ranks */ newcomm_ptr->rank = comm_ptr->rank; if (comm_ptr->comm_kind == MPIR_COMM_KIND__INTERCOMM) { @@ -1059,14 +947,11 @@ int MPII_Comm_copy_data(MPIR_Comm * comm_ptr, MPIR_Info * info, MPIR_Comm ** out newcomm_ptr->comm_kind = comm_ptr->comm_kind; newcomm_ptr->local_comm = 0; - if (comm_ptr->comm_kind == MPIR_COMM_KIND__INTRACOMM) - MPIR_Comm_map_dup(newcomm_ptr, comm_ptr, MPIR_COMM_MAP_DIR__L2L); - else - MPIR_Comm_map_dup(newcomm_ptr, comm_ptr, MPIR_COMM_MAP_DIR__R2R); - - /* If it is an intercomm, duplicate the network address mapping */ + newcomm_ptr->local_group = comm_ptr->local_group; + MPIR_Group_add_ref(comm_ptr->local_group); if (comm_ptr->comm_kind == MPIR_COMM_KIND__INTERCOMM) { - 
MPIR_Comm_map_dup(newcomm_ptr, comm_ptr, MPIR_COMM_MAP_DIR__L2L); + newcomm_ptr->remote_group = comm_ptr->remote_group; + MPIR_Group_add_ref(comm_ptr->remote_group); } /* Set the sizes and ranks */ diff --git a/src/mpi/comm/contextid.c b/src/mpi/comm/contextid.c index 9fab45bc789..c0a72060eca 100644 --- a/src/mpi/comm/contextid.c +++ b/src/mpi/comm/contextid.c @@ -753,7 +753,7 @@ static int sched_cb_gcn_allocate_cid(MPIR_Comm * comm, int tag, void *state) * Therefore, we set tag_up as lower bound for the operation. tag_ub is used by * most of the other blocking operations, but tag is always >0, so this * should be fine. - * 2.) We need odering between multiple idup operations on the same communicator. + * 2.) We need ordering between multiple idup operations on the same communicator. * The problem here is that the iallreduce operations of the first iteration * are not necessarily completed in the same order as they are issued, also on the * same communicator. To avoid deadlocks, we cannot add the elements to the @@ -790,7 +790,6 @@ static int sched_cb_gcn_allocate_cid(MPIR_Comm * comm, int tag, void *state) /* In the case of failure, the new communicator was half created. * So we need to clean the memory allocated for it. 
*/ MPII_COMML_FORGET(st->new_comm); - MPIR_Comm_map_free(st->new_comm); MPIR_Handle_obj_free(&MPIR_Comm_mem, st->new_comm); MPL_free(st); goto fn_exit; diff --git a/src/mpi/comm/ulfm_impl.c b/src/mpi/comm/ulfm_impl.c index dfd4ad6bfcf..33edffa3d11 100644 --- a/src/mpi/comm/ulfm_impl.c +++ b/src/mpi/comm/ulfm_impl.c @@ -87,21 +87,22 @@ int MPIR_Comm_get_failed_impl(MPIR_Comm * comm_ptr, MPIR_Group ** failed_group_p /* create failed_group */ int n = utarray_len(failed_procs); + MPIR_Lpid *map = MPL_malloc(n * sizeof(MPIR_Lpid), MPL_MEM_GROUP); + MPIR_Group *new_group; - mpi_errno = MPIR_Group_create(n, &new_group); - MPIR_ERR_CHECK(mpi_errno); - new_group->rank = MPI_UNDEFINED; + int myrank = MPI_UNDEFINED; for (int i = 0; i < utarray_len(failed_procs); i++) { int *p = (int *) utarray_eltptr(failed_procs, i); - new_group->lrank_to_lpid[i].lpid = *p; + map[i] = *p; /* if calling process is part of the group, set the rank */ if (*p == MPIR_Process.rank) { - new_group->rank = i; + myrank = i; } } - new_group->size = n; - new_group->idx_of_first_lpid = -1; + + mpi_errno = MPIR_Group_create_map(n, myrank, comm_ptr->session_ptr, map, &new_group); + MPIR_ERR_CHECK(mpi_errno); MPIR_Group *comm_group; MPIR_Comm_group_impl(comm_ptr, &comm_group); diff --git a/src/mpi/errhan/errnames.txt b/src/mpi/errhan/errnames.txt index 9198104035d..54d8d2e49e5 100644 --- a/src/mpi/errhan/errnames.txt +++ b/src/mpi/errhan/errnames.txt @@ -902,6 +902,8 @@ is too big (> MPIU_SHMW_GHND_SZ) **iface_notfound %s:The network interface, \"%s\", specified in MPIR_CVAR_CH3_NETWORK_IFACE was not found. 
**procnamefailed:Failed to get processor name +**procnotfound:Process not found +**procnotfound %d:Process %d not found **notsuppmultithread:this functionality is not supported when the thread level is greater than MPI_THREAD_SINGLE **valuetoolarge:Value is too large to store diff --git a/src/mpi/group/group_impl.c b/src/mpi/group/group_impl.c index dbd3cd88204..8e09e216554 100644 --- a/src/mpi/group/group_impl.c +++ b/src/mpi/group/group_impl.c @@ -18,10 +18,25 @@ int MPIR_Group_size_impl(MPIR_Group * group_ptr, int *size) return MPI_SUCCESS; } +int MPIR_Group_free_impl(MPIR_Group * group_ptr) +{ + int mpi_errno = MPI_SUCCESS; + + /* Do not free MPI_GROUP_EMPTY */ + if (group_ptr->handle != MPI_GROUP_EMPTY) { + mpi_errno = MPIR_Group_release(group_ptr); + MPIR_ERR_CHECK(mpi_errno); + } + + fn_exit: + return mpi_errno; + fn_fail: + goto fn_exit; +} + int MPIR_Group_compare_impl(MPIR_Group * group_ptr1, MPIR_Group * group_ptr2, int *result) { int mpi_errno = MPI_SUCCESS; - int g1_idx, g2_idx, size, i; /* See if their sizes are equal */ if (group_ptr1->size != group_ptr2->size) { @@ -29,156 +44,93 @@ int MPIR_Group_compare_impl(MPIR_Group * group_ptr1, MPIR_Group * group_ptr2, in goto fn_exit; } - /* Run through the lrank to lpid lists of each group in lpid order - * to see if the same processes are involved */ - g1_idx = group_ptr1->idx_of_first_lpid; - g2_idx = group_ptr2->idx_of_first_lpid; - /* If the lpid list hasn't been created, do it now */ - if (g1_idx < 0) { - MPII_Group_setup_lpid_list(group_ptr1); - g1_idx = group_ptr1->idx_of_first_lpid; - } - if (g2_idx < 0) { - MPII_Group_setup_lpid_list(group_ptr2); - g2_idx = group_ptr2->idx_of_first_lpid; - } - while (g1_idx >= 0 && g2_idx >= 0) { - if (group_ptr1->lrank_to_lpid[g1_idx].lpid != group_ptr2->lrank_to_lpid[g2_idx].lpid) { - *result = MPI_UNEQUAL; - goto fn_exit; + int size; + size = group_ptr1->size; + + /* See if they are identical */ + bool is_ident = true; + for (int i = 0; i < size; i++) { + if 
(MPIR_Group_rank_to_lpid(group_ptr1, i) != MPIR_Group_rank_to_lpid(group_ptr2, i)) { + is_ident = false; + break; } - g1_idx = group_ptr1->lrank_to_lpid[g1_idx].next_lpid; - g2_idx = group_ptr2->lrank_to_lpid[g2_idx].next_lpid; } - /* See if the processes are in the same order by rank */ - size = group_ptr1->size; - for (i = 0; i < size; i++) { - if (group_ptr1->lrank_to_lpid[i].lpid != group_ptr2->lrank_to_lpid[i].lpid) { - *result = MPI_SIMILAR; - goto fn_exit; + if (is_ident) { + *result = MPI_IDENT; + goto fn_exit; + } + + /* See if they are similar */ + bool is_similar = true; + for (int i = 0; i < size; i++) { + MPIR_Lpid lpid = MPIR_Group_rank_to_lpid(group_ptr1, i); + if (MPI_UNDEFINED == MPIR_Group_lpid_to_rank(group_ptr2, lpid)) { + /* not found */ + is_similar = false; + break; } } - /* If we reach here, the groups are identical */ - *result = MPI_IDENT; + if (is_similar) { + *result = MPI_SIMILAR; + } else { + *result = MPI_UNEQUAL; + } fn_exit: return mpi_errno; } -int MPIR_Group_difference_impl(MPIR_Group * group_ptr1, MPIR_Group * group_ptr2, - MPIR_Group ** new_group_ptr) +int MPIR_Group_translate_ranks_impl(MPIR_Group * gp1, int n, const int ranks1[], + MPIR_Group * gp2, int ranks2[]) { int mpi_errno = MPI_SUCCESS; - int size1, i, k, g1_idx, g2_idx, nnew; - uint64_t l1_pid, l2_pid; - int *flags = NULL; - MPIR_FUNC_ENTER; - /* Return a group consisting of the members of group1 that are *not* - * in group2 */ - size1 = group_ptr1->size; - /* Insure that the lpid lists are setup */ - MPIR_Group_setup_lpid_pairs(group_ptr1, group_ptr2); - - flags = MPL_calloc(size1, sizeof(int), MPL_MEM_OTHER); - - g1_idx = group_ptr1->idx_of_first_lpid; - g2_idx = group_ptr2->idx_of_first_lpid; - - nnew = size1; - while (g1_idx >= 0 && g2_idx >= 0) { - l1_pid = group_ptr1->lrank_to_lpid[g1_idx].lpid; - l2_pid = group_ptr2->lrank_to_lpid[g2_idx].lpid; - if (l1_pid < l2_pid) { - g1_idx = group_ptr1->lrank_to_lpid[g1_idx].next_lpid; - } else if (l1_pid > l2_pid) { - 
g2_idx = group_ptr2->lrank_to_lpid[g2_idx].next_lpid; - } else { - /* Equal */ - flags[g1_idx] = 1; - g1_idx = group_ptr1->lrank_to_lpid[g1_idx].next_lpid; - g2_idx = group_ptr2->lrank_to_lpid[g2_idx].next_lpid; - nnew--; - } - } - /* Create the group */ - if (nnew == 0) { - /* See 5.3.2, Group Constructors. For many group routines, - * the standard explicitly says to return MPI_GROUP_EMPTY; - * for others it is implied */ - *new_group_ptr = MPIR_Group_empty; - goto fn_exit; - } else { - mpi_errno = MPIR_Group_create(nnew, new_group_ptr); - /* --BEGIN ERROR HANDLING-- */ - if (mpi_errno) { - goto fn_fail; - } - /* --END ERROR HANDLING-- */ - (*new_group_ptr)->rank = MPI_UNDEFINED; - k = 0; - for (i = 0; i < size1; i++) { - if (!flags[i]) { - (*new_group_ptr)->lrank_to_lpid[k].lpid = group_ptr1->lrank_to_lpid[i].lpid; - if (i == group_ptr1->rank) - (*new_group_ptr)->rank = k; - k++; - } + for (int i = 0; i < n; i++) { + if (ranks1[i] == MPI_PROC_NULL) { + ranks2[i] = MPI_PROC_NULL; + continue; } - /* TODO calculate is_local_dense_monotonic */ + MPIR_Lpid lpid = MPIR_Group_rank_to_lpid(gp1, ranks1[i]); + ranks2[i] = MPIR_Group_lpid_to_rank(gp2, lpid); } - MPIR_Group_set_session_ptr(*new_group_ptr, group_ptr1->session_ptr); - - fn_exit: - MPL_free(flags); - MPIR_FUNC_EXIT; return mpi_errno; - fn_fail: - goto fn_exit; } int MPIR_Group_excl_impl(MPIR_Group * group_ptr, int n, const int ranks[], MPIR_Group ** new_group_ptr) { int mpi_errno = MPI_SUCCESS; - int size, i, newi; - int *flags = NULL; - MPIR_FUNC_ENTER; - size = group_ptr->size; - - /* Allocate a new group and lrank_to_lpid array */ - mpi_errno = MPIR_Group_create(size - n, new_group_ptr); - MPIR_ERR_CHECK(mpi_errno); + int size = group_ptr->size; + int nnew = size - n; - (*new_group_ptr)->rank = MPI_UNDEFINED; /* Use flag fields to mark the members to *exclude* . 
*/ - - flags = MPL_calloc(size, sizeof(int), MPL_MEM_OTHER); - - for (i = 0; i < n; i++) { + int *flags = MPL_calloc(size, sizeof(int), MPL_MEM_OTHER); + for (int i = 0; i < n; i++) { flags[ranks[i]] = 1; } - newi = 0; - for (i = 0; i < size; i++) { + MPIR_Lpid *map = MPL_malloc(nnew * sizeof(MPIR_Lpid), MPL_MEM_GROUP); + MPIR_ERR_CHKANDJUMP(!map, mpi_errno, MPI_ERR_OTHER, "**nomem"); + + int myrank = MPI_UNDEFINED; + int newi = 0; + for (int i = 0; i < size; i++) { if (flags[i] == 0) { - (*new_group_ptr)->lrank_to_lpid[newi].lpid = group_ptr->lrank_to_lpid[i].lpid; - if (group_ptr->rank == i) - (*new_group_ptr)->rank = newi; + map[newi] = MPIR_Group_rank_to_lpid(group_ptr, i); + if (group_ptr->rank == i) { + myrank = newi; + } newi++; } } - (*new_group_ptr)->size = size - n; - (*new_group_ptr)->idx_of_first_lpid = -1; - /* TODO calculate is_local_dense_monotonic */ - - MPIR_Group_set_session_ptr(*new_group_ptr, group_ptr->session_ptr); + mpi_errno = MPIR_Group_create_map(nnew, myrank, group_ptr->session_ptr, map, new_group_ptr); + MPIR_ERR_CHECK(mpi_errno); fn_exit: MPL_free(flags); @@ -188,28 +140,10 @@ int MPIR_Group_excl_impl(MPIR_Group * group_ptr, int n, const int ranks[], goto fn_exit; } -int MPIR_Group_free_impl(MPIR_Group * group_ptr) -{ - int mpi_errno = MPI_SUCCESS; - - /* Do not free MPI_GROUP_EMPTY */ - if (group_ptr->handle != MPI_GROUP_EMPTY) { - mpi_errno = MPIR_Group_release(group_ptr); - MPIR_ERR_CHECK(mpi_errno); - } - - fn_exit: - return mpi_errno; - fn_fail: - goto fn_exit; -} - int MPIR_Group_incl_impl(MPIR_Group * group_ptr, int n, const int ranks[], MPIR_Group ** new_group_ptr) { int mpi_errno = MPI_SUCCESS; - int i; - MPIR_FUNC_ENTER; if (n == 0) { @@ -217,98 +151,22 @@ int MPIR_Group_incl_impl(MPIR_Group * group_ptr, int n, const int ranks[], goto fn_exit; } - /* Allocate a new group and lrank_to_lpid array */ - mpi_errno = MPIR_Group_create(n, new_group_ptr); - if (mpi_errno) - goto fn_fail; - - (*new_group_ptr)->rank = MPI_UNDEFINED; - 
for (i = 0; i < n; i++) { - (*new_group_ptr)->lrank_to_lpid[i].lpid = group_ptr->lrank_to_lpid[ranks[i]].lpid; - if (ranks[i] == group_ptr->rank) - (*new_group_ptr)->rank = i; - } - (*new_group_ptr)->size = n; - (*new_group_ptr)->idx_of_first_lpid = -1; - /* TODO calculate is_local_dense_monotonic */ - - MPIR_Group_set_session_ptr(*new_group_ptr, group_ptr->session_ptr); - + int nnew = n; + MPIR_Lpid *map = MPL_malloc(nnew * sizeof(MPIR_Lpid), MPL_MEM_OTHER); + MPIR_ERR_CHKANDJUMP(!map, mpi_errno, MPI_ERR_OTHER, "**nomem"); - fn_exit: - MPIR_FUNC_EXIT; - return mpi_errno; - fn_fail: - goto fn_exit; -} - -int MPIR_Group_intersection_impl(MPIR_Group * group_ptr1, MPIR_Group * group_ptr2, - MPIR_Group ** new_group_ptr) -{ - int mpi_errno = MPI_SUCCESS; - int size1, i, k, g1_idx, g2_idx, nnew; - uint64_t l1_pid, l2_pid; - int *flags = NULL; - - MPIR_FUNC_ENTER; - /* Return a group consisting of the members of group1 that are - * in group2 */ - size1 = group_ptr1->size; - /* Insure that the lpid lists are setup */ - MPIR_Group_setup_lpid_pairs(group_ptr1, group_ptr2); - - flags = MPL_calloc(size1, sizeof(int), MPL_MEM_OTHER); - - g1_idx = group_ptr1->idx_of_first_lpid; - g2_idx = group_ptr2->idx_of_first_lpid; - - nnew = 0; - while (g1_idx >= 0 && g2_idx >= 0) { - l1_pid = group_ptr1->lrank_to_lpid[g1_idx].lpid; - l2_pid = group_ptr2->lrank_to_lpid[g2_idx].lpid; - if (l1_pid < l2_pid) { - g1_idx = group_ptr1->lrank_to_lpid[g1_idx].next_lpid; - } else if (l1_pid > l2_pid) { - g2_idx = group_ptr2->lrank_to_lpid[g2_idx].next_lpid; - } else { - /* Equal */ - flags[g1_idx] = 1; - g1_idx = group_ptr1->lrank_to_lpid[g1_idx].next_lpid; - g2_idx = group_ptr2->lrank_to_lpid[g2_idx].next_lpid; - nnew++; + int myrank = MPI_UNDEFINED; + for (int i = 0; i < n; i++) { + map[i] = MPIR_Group_rank_to_lpid(group_ptr, ranks[i]); + if (ranks[i] == group_ptr->rank) { + myrank = i; } } - /* Create the group. 
Handle the trivial case first */ - if (nnew == 0) { - *new_group_ptr = MPIR_Group_empty; - goto fn_exit; - } - mpi_errno = MPIR_Group_create(nnew, new_group_ptr); + mpi_errno = MPIR_Group_create_map(nnew, myrank, group_ptr->session_ptr, map, new_group_ptr); MPIR_ERR_CHECK(mpi_errno); - (*new_group_ptr)->rank = MPI_UNDEFINED; - (*new_group_ptr)->is_local_dense_monotonic = TRUE; - k = 0; - for (i = 0; i < size1; i++) { - if (flags[i]) { - uint64_t lpid = group_ptr1->lrank_to_lpid[i].lpid; - (*new_group_ptr)->lrank_to_lpid[k].lpid = lpid; - if (i == group_ptr1->rank) - (*new_group_ptr)->rank = k; - if (lpid > MPIR_Process.size || - (k > 0 && (*new_group_ptr)->lrank_to_lpid[k - 1].lpid != (lpid - 1))) { - (*new_group_ptr)->is_local_dense_monotonic = FALSE; - } - - k++; - } - } - - MPIR_Group_set_session_ptr(*new_group_ptr, group_ptr1->session_ptr); - fn_exit: - MPL_free(flags); MPIR_FUNC_EXIT; return mpi_errno; fn_fail: @@ -319,17 +177,15 @@ int MPIR_Group_range_excl_impl(MPIR_Group * group_ptr, int n, int ranges[][3], MPIR_Group ** new_group_ptr) { int mpi_errno = MPI_SUCCESS; - int size, i, j, k, nnew, first, last, stride; - int *flags = NULL; - MPIR_FUNC_ENTER; + /* Compute size, assuming that included ranks are valid (and distinct) */ - size = group_ptr->size; - nnew = 0; - for (i = 0; i < n; i++) { - first = ranges[i][0]; - last = ranges[i][1]; - stride = ranges[i][2]; + int size = group_ptr->size; + int nnew = 0; + for (int i = 0; i < n; i++) { + int first = ranges[i][0]; + int last = ranges[i][1]; + int stride = ranges[i][2]; /* works for stride of either sign. 
Error checking above * has already guaranteed stride != 0 */ nnew += 1 + (last - first) / stride; @@ -341,15 +197,6 @@ int MPIR_Group_range_excl_impl(MPIR_Group * group_ptr, int n, int ranges[][3], goto fn_exit; } - /* Allocate a new group and lrank_to_lpid array */ - mpi_errno = MPIR_Group_create(nnew, new_group_ptr); - /* --BEGIN ERROR HANDLING-- */ - if (mpi_errno) { - goto fn_fail; - } - /* --END ERROR HANDLING-- */ - (*new_group_ptr)->rank = MPI_UNDEFINED; - /* Group members are taken in rank order from the original group, * with the specified members removed. Use the flag array for that * purpose. If this was a critical routine, we could use the @@ -357,41 +204,46 @@ int MPIR_Group_range_excl_impl(MPIR_Group * group_ptr, int n, int ranges[][3], * was enabled *and* we are not MPI_THREAD_MULTIPLE, but since this * is a low-usage routine, we haven't taken that optimization. */ - flags = MPL_calloc(size, sizeof(int), MPL_MEM_OTHER); + int *flags = MPL_calloc(size, sizeof(int), MPL_MEM_OTHER); - for (i = 0; i < n; i++) { - first = ranges[i][0]; - last = ranges[i][1]; - stride = ranges[i][2]; + for (int i = 0; i < n; i++) { + int first = ranges[i][0]; + int last = ranges[i][1]; + int stride = ranges[i][2]; if (stride > 0) { - for (j = first; j <= last; j += stride) { + for (int j = first; j <= last; j += stride) { flags[j] = 1; } } else { - for (j = first; j >= last; j += stride) { + for (int j = first; j >= last; j += stride) { flags[j] = 1; } } } + /* Now, run through the group and pick up the members that were * not excluded */ - k = 0; - for (i = 0; i < size; i++) { + MPIR_Lpid *map = MPL_malloc(nnew * sizeof(MPIR_Lpid), MPL_MEM_GROUP); + MPIR_ERR_CHKANDJUMP(!map, mpi_errno, MPI_ERR_OTHER, "**nomem"); + + int myrank = MPI_UNDEFINED; + int k = 0; + for (int i = 0; i < size; i++) { if (!flags[i]) { - (*new_group_ptr)->lrank_to_lpid[k].lpid = group_ptr->lrank_to_lpid[i].lpid; + map[k] = MPIR_Group_rank_to_lpid(group_ptr, i); if (group_ptr->rank == i) { - 
(*new_group_ptr)->rank = k; + myrank = k; } k++; } } - /* TODO calculate is_local_dense_monotonic */ + MPL_free(flags); - MPIR_Group_set_session_ptr(*new_group_ptr, group_ptr->session_ptr); + mpi_errno = MPIR_Group_create_map(nnew, myrank, group_ptr->session_ptr, map, new_group_ptr); + MPIR_ERR_CHECK(mpi_errno); fn_exit: - MPL_free(flags); MPIR_FUNC_EXIT; return mpi_errno; fn_fail: @@ -402,16 +254,14 @@ int MPIR_Group_range_incl_impl(MPIR_Group * group_ptr, int n, int ranges[][3], MPIR_Group ** new_group_ptr) { int mpi_errno = MPI_SUCCESS; - int first, last, stride, nnew, i, j, k; - MPIR_FUNC_ENTER; /* Compute size, assuming that included ranks are valid (and distinct) */ - nnew = 0; - for (i = 0; i < n; i++) { - first = ranges[i][0]; - last = ranges[i][1]; - stride = ranges[i][2]; + int nnew = 0; + for (int i = 0; i < n; i++) { + int first = ranges[i][0]; + int last = ranges[i][1]; + int stride = ranges[i][2]; /* works for stride of either sign. Error checking above * has already guaranteed stride != 0 */ nnew += 1 + (last - first) / stride; @@ -422,40 +272,39 @@ int MPIR_Group_range_incl_impl(MPIR_Group * group_ptr, int n, int ranges[][3], goto fn_exit; } - /* Allocate a new group and lrank_to_lpid array */ - mpi_errno = MPIR_Group_create(nnew, new_group_ptr); - if (mpi_errno) - goto fn_fail; - (*new_group_ptr)->rank = MPI_UNDEFINED; + MPIR_Lpid *map = MPL_malloc(nnew * sizeof(MPIR_Lpid), MPL_MEM_GROUP); + MPIR_ERR_CHKANDJUMP(!map, mpi_errno, MPI_ERR_OTHER, "**nomem"); /* Group members taken in order specified by the range array */ /* This could be integrated with the error checking, but since this * is a low-usage routine, we haven't taken that optimization */ - k = 0; - for (i = 0; i < n; i++) { - first = ranges[i][0]; - last = ranges[i][1]; - stride = ranges[i][2]; + int myrank = MPI_UNDEFINED; + int k = 0; + for (int i = 0; i < n; i++) { + int first = ranges[i][0]; + int last = ranges[i][1]; + int stride = ranges[i][2]; if (stride > 0) { - for (j = first; j 
<= last; j += stride) { - (*new_group_ptr)->lrank_to_lpid[k].lpid = group_ptr->lrank_to_lpid[j].lpid; - if (j == group_ptr->rank) - (*new_group_ptr)->rank = k; + for (int j = first; j <= last; j += stride) { + map[k] = MPIR_Group_rank_to_lpid(group_ptr, j); + if (j == group_ptr->rank) { + myrank = k; + } k++; } } else { - for (j = first; j >= last; j += stride) { - (*new_group_ptr)->lrank_to_lpid[k].lpid = group_ptr->lrank_to_lpid[j].lpid; - if (j == group_ptr->rank) - (*new_group_ptr)->rank = k; + for (int j = first; j >= last; j += stride) { + map[k] = MPIR_Group_rank_to_lpid(group_ptr, j); + if (j == group_ptr->rank) { + myrank = k; + } k++; } } } - /* TODO calculate is_local_dense_monotonic */ - - MPIR_Group_set_session_ptr(*new_group_ptr, group_ptr->session_ptr); + mpi_errno = MPIR_Group_create_map(nnew, myrank, group_ptr->session_ptr, map, new_group_ptr); + MPIR_ERR_CHECK(mpi_errno); fn_exit: MPIR_FUNC_EXIT; @@ -464,180 +313,119 @@ int MPIR_Group_range_incl_impl(MPIR_Group * group_ptr, int n, int ranges[][3], goto fn_exit; } -int MPIR_Group_translate_ranks_impl(MPIR_Group * gp1, int n, const int ranks1[], - MPIR_Group * gp2, int ranks2[]) +int MPIR_Group_difference_impl(MPIR_Group * group_ptr1, MPIR_Group * group_ptr2, + MPIR_Group ** new_group_ptr) { int mpi_errno = MPI_SUCCESS; - int i, g2_idx; - uint64_t l1_pid, l2_pid; - - MPL_DBG_MSG_S(MPIR_DBG_OTHER, VERBOSE, "gp2->is_local_dense_monotonic=%s", - (gp2->is_local_dense_monotonic ? 
"TRUE" : "FALSE")); - - /* Initialize the output ranks */ - for (i = 0; i < n; i++) - ranks2[i] = MPI_UNDEFINED; + MPIR_FUNC_ENTER; - if (gp2->size > 0 && gp2->is_local_dense_monotonic) { - /* g2 probably == group_of(MPI_COMM_WORLD); use fast, constant-time lookup */ - uint64_t lpid_offset = gp2->lrank_to_lpid[0].lpid; + MPIR_Assert(group_ptr1->session_ptr == group_ptr2->session_ptr); - for (i = 0; i < n; ++i) { - uint64_t g1_lpid; + MPIR_Lpid *map = MPL_malloc(group_ptr1->size * sizeof(MPIR_Lpid), MPL_MEM_GROUP); + MPIR_ERR_CHKANDJUMP(!map, mpi_errno, MPI_ERR_OTHER, "**nomem"); - if (ranks1[i] == MPI_PROC_NULL) { - ranks2[i] = MPI_PROC_NULL; - continue; - } - /* "adjusted" lpid from g1 */ - g1_lpid = gp1->lrank_to_lpid[ranks1[i]].lpid - lpid_offset; - if (g1_lpid < gp2->size) { - ranks2[i] = g1_lpid; - } - /* else leave UNDEFINED */ - } - } else { - /* general, slow path; lookup time is dependent on the user-provided rank values! */ - g2_idx = gp2->idx_of_first_lpid; - if (g2_idx < 0) { - MPII_Group_setup_lpid_list(gp2); - g2_idx = gp2->idx_of_first_lpid; - } - if (g2_idx >= 0) { - /* g2_idx can be < 0 if the g2 group is empty */ - l2_pid = gp2->lrank_to_lpid[g2_idx].lpid; - for (i = 0; i < n; i++) { - if (ranks1[i] == MPI_PROC_NULL) { - ranks2[i] = MPI_PROC_NULL; - continue; - } - l1_pid = gp1->lrank_to_lpid[ranks1[i]].lpid; - /* Search for this l1_pid in group2. Use the following - * optimization: start from the last position in the lpid list - * if possible. A more sophisticated version could use a - * tree based or even hashed search to speed the translation. 
*/ - if (l1_pid < l2_pid || g2_idx < 0) { - /* Start over from the beginning */ - g2_idx = gp2->idx_of_first_lpid; - l2_pid = gp2->lrank_to_lpid[g2_idx].lpid; - } - while (g2_idx >= 0 && l1_pid > l2_pid) { - g2_idx = gp2->lrank_to_lpid[g2_idx].next_lpid; - if (g2_idx >= 0) - l2_pid = gp2->lrank_to_lpid[g2_idx].lpid; - else - l2_pid = (uint64_t) - 1; - } - if (l1_pid == l2_pid) - ranks2[i] = g2_idx; + int nnew = 0; + int myrank = MPI_UNDEFINED; + /* For each rank in group1, search it in group2. */ + for (int i = 0; i < group_ptr1->size; i++) { + MPIR_Lpid lpid = MPIR_Group_rank_to_lpid(group_ptr1, i); + if (MPI_UNDEFINED == MPIR_Group_lpid_to_rank(group_ptr2, lpid)) { + /* not found */ + if (i == group_ptr1->rank) { + myrank = nnew; } + map[nnew++] = lpid; } } + + /* Create the group */ + mpi_errno = MPIR_Group_create_map(nnew, myrank, group_ptr1->session_ptr, map, new_group_ptr); + MPIR_ERR_CHECK(mpi_errno); + + fn_exit: + MPIR_FUNC_EXIT; return mpi_errno; + fn_fail: + goto fn_exit; } -int MPIR_Group_union_impl(MPIR_Group * group_ptr1, MPIR_Group * group_ptr2, - MPIR_Group ** new_group_ptr) +int MPIR_Group_intersection_impl(MPIR_Group * group_ptr1, MPIR_Group * group_ptr2, + MPIR_Group ** new_group_ptr) { int mpi_errno = MPI_SUCCESS; - int g1_idx, g2_idx, nnew, i, k, size1, size2; - uint64_t mylpid; - int *flags = NULL; - MPIR_FUNC_ENTER; - /* Determine the size of the new group. The new group consists of all - * members of group1 plus the members of group2 that are not in group1. 
- */ - g1_idx = group_ptr1->idx_of_first_lpid; - g2_idx = group_ptr2->idx_of_first_lpid; + /* Similar to MPI_Group_difference, but take the ranks that are found in group2 */ - /* If the lpid list hasn't been created, do it now */ - if (g1_idx < 0) { - MPII_Group_setup_lpid_list(group_ptr1); - g1_idx = group_ptr1->idx_of_first_lpid; - } - if (g2_idx < 0) { - MPII_Group_setup_lpid_list(group_ptr2); - g2_idx = group_ptr2->idx_of_first_lpid; - } - nnew = group_ptr1->size; - - /* Clear the flag bits on the second group. The flag is set if - * a member of the second group belongs to the union */ - size2 = group_ptr2->size; - flags = MPL_calloc(size2, sizeof(int), MPL_MEM_OTHER); - - /* Loop through the lists that are ordered by lpid (local process - * id) to detect which processes in group 2 are not in group 1 - */ - while (g1_idx >= 0 && g2_idx >= 0) { - uint64_t l1_pid, l2_pid; - l1_pid = group_ptr1->lrank_to_lpid[g1_idx].lpid; - l2_pid = group_ptr2->lrank_to_lpid[g2_idx].lpid; - if (l1_pid > l2_pid) { - nnew++; - flags[g2_idx] = 1; - g2_idx = group_ptr2->lrank_to_lpid[g2_idx].next_lpid; - } else if (l1_pid == l2_pid) { - g1_idx = group_ptr1->lrank_to_lpid[g1_idx].next_lpid; - g2_idx = group_ptr2->lrank_to_lpid[g2_idx].next_lpid; - } else { - /* l1 < l2 */ - g1_idx = group_ptr1->lrank_to_lpid[g1_idx].next_lpid; - } - } - /* If we hit the end of group1, add the remaining members of group 2 */ - while (g2_idx >= 0) { - nnew++; - flags[g2_idx] = 1; - g2_idx = group_ptr2->lrank_to_lpid[g2_idx].next_lpid; - } + MPIR_Assert(group_ptr1->session_ptr == group_ptr2->session_ptr); - if (nnew == 0) { - *new_group_ptr = MPIR_Group_empty; - goto fn_exit; + MPIR_Lpid *map = MPL_malloc(group_ptr1->size * sizeof(MPIR_Lpid), MPL_MEM_GROUP); + MPIR_ERR_CHKANDJUMP(!map, mpi_errno, MPI_ERR_OTHER, "**nomem"); + + int nnew = 0; + int myrank = MPI_UNDEFINED; + /* For each rank in group1, search it in group2. 
*/ + for (int i = 0; i < group_ptr1->size; i++) { + MPIR_Lpid lpid = MPIR_Group_rank_to_lpid(group_ptr1, i); + if (MPI_UNDEFINED != MPIR_Group_lpid_to_rank(group_ptr2, lpid)) { + /* found */ + if (i == group_ptr1->rank) { + myrank = nnew; + } + map[nnew++] = lpid; + } } - /* Allocate a new group and lrank_to_lpid array */ - mpi_errno = MPIR_Group_create(nnew, new_group_ptr); + /* Create the group */ + mpi_errno = MPIR_Group_create_map(nnew, myrank, group_ptr1->session_ptr, map, new_group_ptr); MPIR_ERR_CHECK(mpi_errno); + fn_exit: + MPIR_FUNC_EXIT; + return mpi_errno; + fn_fail: + goto fn_exit; +} + +int MPIR_Group_union_impl(MPIR_Group * group_ptr1, MPIR_Group * group_ptr2, + MPIR_Group ** new_group_ptr) +{ + int mpi_errno = MPI_SUCCESS; + MPIR_FUNC_ENTER; + + MPIR_Assert(group_ptr1->session_ptr == group_ptr2->session_ptr); + + MPIR_Lpid *map = MPL_malloc((group_ptr1->size + group_ptr2->size) * sizeof(MPIR_Lpid), + MPL_MEM_GROUP); + MPIR_ERR_CHKANDJUMP(!map, mpi_errno, MPI_ERR_OTHER, "**nomem"); + /* If this process is in group1, then we can set the rank now. 
* If we are not in this group, this assignment will set the * current rank to MPI_UNDEFINED */ - (*new_group_ptr)->rank = group_ptr1->rank; + int myrank = group_ptr1->rank; /* Add group1 */ - size1 = group_ptr1->size; - for (i = 0; i < size1; i++) { - (*new_group_ptr)->lrank_to_lpid[i].lpid = group_ptr1->lrank_to_lpid[i].lpid; + for (int i = 0; i < group_ptr1->size; i++) { + map[i] = MPIR_Group_rank_to_lpid(group_ptr1, i); } /* Add members of group2 that are not in group 1 */ - - if (group_ptr1->rank == MPI_UNDEFINED && group_ptr2->rank >= 0) { - mylpid = group_ptr2->lrank_to_lpid[group_ptr2->rank].lpid; - } else { - mylpid = (uint64_t) - 2; - } - k = size1; - for (i = 0; i < size2; i++) { - if (flags[i]) { - (*new_group_ptr)->lrank_to_lpid[k].lpid = group_ptr2->lrank_to_lpid[i].lpid; - if ((*new_group_ptr)->rank == MPI_UNDEFINED && - group_ptr2->lrank_to_lpid[i].lpid == mylpid) - (*new_group_ptr)->rank = k; - k++; + int nnew = group_ptr1->size; + for (int i = 0; i < group_ptr2->size; i++) { + MPIR_Lpid lpid = MPIR_Group_rank_to_lpid(group_ptr2, i); + if (MPI_UNDEFINED == MPIR_Group_lpid_to_rank(group_ptr1, lpid)) { + /* not found */ + if (i == group_ptr2->rank) { + myrank = nnew; + } + map[nnew++] = lpid; } } - /* TODO calculate is_local_dense_monotonic */ - - MPIR_Group_set_session_ptr(*new_group_ptr, group_ptr1->session_ptr); + mpi_errno = MPIR_Group_create_map(nnew, myrank, group_ptr1->session_ptr, map, new_group_ptr); + MPIR_ERR_CHECK(mpi_errno); fn_exit: - MPL_free(flags); MPIR_FUNC_EXIT; return mpi_errno; fn_fail: @@ -648,40 +436,18 @@ int MPIR_Group_from_session_pset_impl(MPIR_Session * session_ptr, const char *ps MPIR_Group ** new_group_ptr) { int mpi_errno = MPI_SUCCESS; - MPIR_Group *group_ptr; if (MPL_stricmp(pset_name, "mpi://WORLD") == 0) { - mpi_errno = MPIR_Group_create(MPIR_Process.size, &group_ptr); + mpi_errno = MPIR_Group_dup(MPIR_GROUP_WORLD_PTR, session_ptr, new_group_ptr); MPIR_ERR_CHECK(mpi_errno); - - group_ptr->size = MPIR_Process.size; - 
group_ptr->rank = MPIR_Process.rank; - group_ptr->is_local_dense_monotonic = TRUE; - for (int i = 0; i < group_ptr->size; i++) { - group_ptr->lrank_to_lpid[i].lpid = i; - group_ptr->lrank_to_lpid[i].next_lpid = i + 1; - } - group_ptr->lrank_to_lpid[group_ptr->size - 1].next_lpid = -1; - group_ptr->idx_of_first_lpid = 0; } else if (MPL_stricmp(pset_name, "mpi://SELF") == 0) { - mpi_errno = MPIR_Group_create(1, &group_ptr); + mpi_errno = MPIR_Group_dup(MPIR_GROUP_SELF_PTR, session_ptr, new_group_ptr); MPIR_ERR_CHECK(mpi_errno); - - group_ptr->size = 1; - group_ptr->rank = 0; - group_ptr->is_local_dense_monotonic = TRUE; - group_ptr->lrank_to_lpid[0].lpid = MPIR_Process.rank; - group_ptr->lrank_to_lpid[0].next_lpid = -1; - group_ptr->idx_of_first_lpid = 0; } else { /* TODO: Implement pset struct, locate pset struct ptr */ MPIR_ERR_SETANDSTMT(mpi_errno, MPI_ERR_ARG, goto fn_fail, "**psetinvalidname"); } - MPIR_Group_set_session_ptr(group_ptr, session_ptr); - - *new_group_ptr = group_ptr; - fn_exit: return mpi_errno; fn_fail: diff --git a/src/mpi/group/groupdebug.c b/src/mpi/group/groupdebug.c deleted file mode 100644 index a70b9592d2f..00000000000 --- a/src/mpi/group/groupdebug.c +++ /dev/null @@ -1,77 +0,0 @@ -/* - * Copyright (C) by Argonne National Laboratory - * See COPYRIGHT in top-level directory - */ - -#include "mpiimpl.h" -#include "group.h" - -/* style: allow:fprintf:2 sig:0 */ -/* style: PMPIuse:PMPI_Abort:2 sig:0 */ - -/* - * This file contains routines that are used only to perform testing - * and debugging of the group routines - */ -void MPITEST_Group_create(int, int, MPI_Group *); -void MPITEST_Group_print(MPI_Group); - -/* --BEGIN DEBUG-- */ -void MPITEST_Group_create(int nproc, int myrank, MPI_Group * new_group) -{ - MPIR_Group *new_group_ptr; - int i; - - new_group_ptr = (MPIR_Group *) MPIR_Handle_obj_alloc(&MPIR_Group_mem); - if (!new_group_ptr) { - fprintf(stderr, "Could not create a new group\n"); - PMPI_Abort(MPI_COMM_WORLD, 1); - } - 
MPIR_Object_set_ref(new_group_ptr, 1); - new_group_ptr->lrank_to_lpid = - (MPII_Group_pmap_t *) MPL_malloc(nproc * sizeof(MPII_Group_pmap_t), MPL_MEM_DEBUG); - if (!new_group_ptr->lrank_to_lpid) { - fprintf(stderr, "Could not create lrank map for new group\n"); - PMPI_Abort(MPI_COMM_WORLD, 1); - } - - new_group_ptr->rank = MPI_UNDEFINED; - for (i = 0; i < nproc; i++) { - new_group_ptr->lrank_to_lpid[i].lrank = i; - new_group_ptr->lrank_to_lpid[i].lpid = i; - } - new_group_ptr->size = nproc; - new_group_ptr->rank = myrank; - new_group_ptr->idx_of_first_lpid = -1; - - *new_group = new_group_ptr->handle; -} - -void MPITEST_Group_print(MPI_Group g) -{ - MPIR_Group *g_ptr; - int g_idx, size, i; - - MPIR_Group_get_ptr(g, g_ptr); - - g_idx = g_ptr->idx_of_first_lpid; - if (g_idx < 0) { - MPII_Group_setup_lpid_list(g_ptr); - g_idx = g_ptr->idx_of_first_lpid; - } - - /* Loop through these, printing the lpids by rank and in order */ - size = g_ptr->size; - fprintf(stdout, "Lpids in rank order\n"); - for (i = 0; i < size; i++) { - fprintf(stdout, "Rank %d has lpid %d\n", i, g_ptr->lrank_to_lpid[i].lpid); - } - - fprintf(stdout, "Ranks in lpid order\n"); - while (g_idx >= 0) { - fprintf(stdout, "Rank %d has lpid %d\n", g_idx, g_ptr->lrank_to_lpid[g_idx].lpid); - g_idx = g_ptr->lrank_to_lpid[g_idx].next_lpid; - } -} - -/* --END DEBUG-- */ diff --git a/src/mpi/group/grouputil.c b/src/mpi/group/grouputil.c index ac777e50305..a0c1037e003 100644 --- a/src/mpi/group/grouputil.c +++ b/src/mpi/group/grouputil.c @@ -6,6 +6,32 @@ #include "mpiimpl.h" #include "group.h" +/* Global world list. 
+ * world_idx, part of MPIR_Lpid, points to this array */ +#define MPIR_MAX_WORLDS 1024 +static int num_worlds = 0; +struct MPIR_World MPIR_Worlds[MPIR_MAX_WORLDS]; + +int MPIR_add_world(const char *namespace, int num_procs) +{ + int world_idx = num_worlds++; + + MPL_strncpy(MPIR_Worlds[world_idx].namespace, namespace, MPIR_NAMESPACE_MAX); + MPIR_Worlds[world_idx].num_procs = num_procs; + + return world_idx; +} + +int MPIR_find_world(const char *namespace) +{ + for (int i = 0; i < num_worlds; i++) { + if (strncmp(MPIR_Worlds[i].namespace, namespace, MPIR_NAMESPACE_MAX) == 0) { + return i; + } + } + return -1; +} + /* Preallocated group objects */ MPIR_Group MPIR_Group_builtin[MPIR_GROUP_N_BUILTIN]; MPIR_Group MPIR_Group_direct[MPIR_GROUP_PREALLOC]; @@ -22,35 +48,75 @@ int MPIR_Group_init(void) { int mpi_errno = MPI_SUCCESS; - MPIR_Assert(MPIR_GROUP_N_BUILTIN == 1); /* update this func if this ever triggers */ + MPIR_Assert(MPIR_GROUP_N_BUILTIN == 3); /* update this func if this ever triggers */ + + struct MPIR_Pmap *pmap; MPIR_Group_builtin[0].handle = MPI_GROUP_EMPTY; MPIR_Object_set_ref(&MPIR_Group_builtin[0], 1); MPIR_Group_builtin[0].size = 0; MPIR_Group_builtin[0].rank = MPI_UNDEFINED; - MPIR_Group_builtin[0].idx_of_first_lpid = -1; - MPIR_Group_builtin[0].lrank_to_lpid = NULL; + MPIR_Group_builtin[0].session_ptr = NULL; + memset(&MPIR_Group_builtin[0].pmap, 0, sizeof(struct MPIR_Pmap)); + + MPIR_Group_builtin[1].handle = MPIR_GROUP_WORLD; + MPIR_Object_set_ref(&MPIR_Group_builtin[1], 1); + MPIR_Group_builtin[1].size = MPIR_Process.size; + MPIR_Group_builtin[1].rank = MPIR_Process.rank; + MPIR_Group_builtin[1].session_ptr = NULL; + pmap = &MPIR_Group_builtin[1].pmap; + pmap->use_map = false; + pmap->u.stride.offset = 0; + pmap->u.stride.stride = 1; + + MPIR_Group_builtin[2].handle = MPIR_GROUP_SELF; + MPIR_Object_set_ref(&MPIR_Group_builtin[2], 1); + MPIR_Group_builtin[2].size = 1; + MPIR_Group_builtin[2].rank = 0; + MPIR_Group_builtin[2].session_ptr = NULL; + 
pmap = &MPIR_Group_builtin[2].pmap; + pmap->use_map = false; + pmap->u.stride.offset = MPIR_Process.rank; + pmap->u.stride.stride = 1; - /* TODO hook for device here? */ return mpi_errno; } +int MPIR_Group_finalize(void) +{ + num_worlds = 0; + + return MPI_SUCCESS; +} int MPIR_Group_release(MPIR_Group * group_ptr) { int mpi_errno = MPI_SUCCESS; - int inuse; + /* MPIR_Group_empty was not properly reference counted - FIXME */ + if (group_ptr == MPIR_Group_empty) { + goto fn_exit; + } + + int inuse; MPIR_Group_release_ref(group_ptr, &inuse); if (!inuse) { + MPIR_Assert(!HANDLE_IS_BUILTIN(group_ptr->handle)); /* Only if refcount is 0 do we actually free. */ - MPL_free(group_ptr->lrank_to_lpid); + if (group_ptr->pmap.use_map) { + MPL_free(group_ptr->pmap.u.map); + } if (group_ptr->session_ptr != NULL) { /* Release session */ MPIR_Session_release(group_ptr->session_ptr); } +#ifdef MPID_DEV_GROUP_DECL + mpi_errno = MPID_Group_free_hook(group_ptr); +#endif MPIR_Handle_obj_free(&MPIR_Group_mem, group_ptr); } + + fn_exit: return mpi_errno; } @@ -73,151 +139,133 @@ int MPIR_Group_create(int nproc, MPIR_Group ** new_group_ptr) } /* --END ERROR HANDLING-- */ MPIR_Object_set_ref(*new_group_ptr, 1); - (*new_group_ptr)->lrank_to_lpid = - (MPII_Group_pmap_t *) MPL_calloc(nproc, sizeof(MPII_Group_pmap_t), MPL_MEM_GROUP); - /* --BEGIN ERROR HANDLING-- */ - if (!(*new_group_ptr)->lrank_to_lpid) { - MPIR_Handle_obj_free(&MPIR_Group_mem, *new_group_ptr); - *new_group_ptr = NULL; - MPIR_CHKMEM_SETERR(mpi_errno, nproc * sizeof(MPII_Group_pmap_t), "newgroup->lrank_to_lpid"); - return mpi_errno; - } - /* --END ERROR HANDLING-- */ - (*new_group_ptr)->size = nproc; - /* Make sure that there is no question that the list of ranks sorted - * by pids is marked as uninitialized */ - (*new_group_ptr)->idx_of_first_lpid = -1; - - (*new_group_ptr)->is_local_dense_monotonic = FALSE; + /* initialize fields */ + (*new_group_ptr)->size = nproc; + (*new_group_ptr)->rank = MPI_UNDEFINED; 
(*new_group_ptr)->session_ptr = NULL; + memset(&(*new_group_ptr)->pmap, 0, sizeof(struct MPIR_Pmap)); +#ifdef MPID_DEV_GROUP_DECL + mpi_errno = MPID_Group_init_hook(*new_group_ptr); +#endif + return mpi_errno; } -/* - * return value is the first index in the list - * - * This "sorts" an lpid array by lpid value, using a simple merge sort - * algorithm. - * - * In actuality, it does not reorder the elements of maparray (these must remain - * in group rank order). Instead it builds the traversal order (in increasing - * lpid order) through the maparray given by the "next_lpid" fields. +/* Internally the only reason to duplicate a group is to copy from NULL session to a new session. + * Otherwise, we can just use the same group and increment the reference count. */ -static int mergesort_lpidarray(MPII_Group_pmap_t maparray[], int n) +int MPIR_Group_dup(MPIR_Group * old_group, MPIR_Session * session_ptr, MPIR_Group ** new_group_ptr) { - int idx1, idx2, first_idx, cur_idx, next_lpid, idx2_offset; + int mpi_errno = MPI_SUCCESS; - if (n == 2) { - if (maparray[0].lpid > maparray[1].lpid) { - first_idx = 1; - maparray[0].next_lpid = -1; - maparray[1].next_lpid = 0; - } else { - first_idx = 0; - maparray[0].next_lpid = 1; - maparray[1].next_lpid = -1; + *new_group_ptr = (MPIR_Group *) MPIR_Handle_obj_alloc(&MPIR_Group_mem); + MPIR_ERR_CHKANDJUMP(!*new_group_ptr, mpi_errno, MPI_ERR_OTHER, "**nomem"); + MPIR_Object_set_ref(*new_group_ptr, 1); + + (*new_group_ptr)->size = old_group->size; + (*new_group_ptr)->rank = old_group->rank; + MPIR_Group_set_session_ptr(*new_group_ptr, session_ptr); + memcpy(&(*new_group_ptr)->pmap, &old_group->pmap, sizeof(struct MPIR_Pmap)); + + if (old_group->pmap.use_map) { + int size = old_group->size; + MPIR_Lpid *map = MPL_malloc(size * sizeof(MPIR_Lpid), MPL_MEM_GROUP); + MPIR_ERR_CHKANDJUMP(!map, mpi_errno, MPI_ERR_OTHER, "**nomem"); + for (int i = 0; i < size; i++) { + map[i] = old_group->pmap.u.map[i]; } - return first_idx; - } - if (n == 1) { 
- maparray[0].next_lpid = -1; - return 0; + + (*new_group_ptr)->pmap.u.map = map; } - if (n == 0) - return -1; - - /* Sort each half */ - idx2_offset = n / 2; - idx1 = mergesort_lpidarray(maparray, n / 2); - idx2 = mergesort_lpidarray(maparray + idx2_offset, n - n / 2) + idx2_offset; - /* merge the results */ - /* There are three lists: - * first_idx - points to the HEAD of the sorted, merged list - * cur_idx - points to the LAST element of the sorted, merged list - * idx1 - points to the HEAD of one sorted list - * idx2 - points to the HEAD of the other sorted list - * - * We first identify the head element of the sorted list. We then - * take elements from the remaining lists. When one list is empty, - * we add the other list to the end of sorted list. - * - * The last wrinkle is that the next_lpid fields in maparray[idx2] - * are relative to n/2, not 0 (that is, a next_lpid of 1 is - * really 1 + n/2, relative to the beginning of maparray). - */ - /* Find the head element */ - if (maparray[idx1].lpid > maparray[idx2].lpid) { - first_idx = idx2; - idx2 = maparray[idx2].next_lpid + idx2_offset; +#ifdef MPID_DEV_GROUP_DECL + mpi_errno = MPID_Group_init_hook(*new_group_ptr); +#endif + + fn_exit: + return mpi_errno; + fn_fail: + goto fn_exit; +} + +static bool check_map_is_strided(int size, MPIR_Lpid * map, + MPIR_Lpid * offset_out, MPIR_Lpid * stride_out); +int MPIR_Group_create_map(int size, int rank, MPIR_Session * session_ptr, MPIR_Lpid * map, + MPIR_Group ** new_group_ptr) +{ + int mpi_errno = MPI_SUCCESS; + + if (size == 0) { + /* See 5.3.2, Group Constructors. 
For many group routines, + * the standard explicitly says to return MPI_GROUP_EMPTY; + * for others it is implied */ + MPL_free(map); + *new_group_ptr = MPIR_Group_empty; + MPIR_Group_add_ref(*new_group_ptr); + goto fn_exit; } else { - first_idx = idx1; - idx1 = maparray[idx1].next_lpid; - } + MPIR_Group *newgrp; + mpi_errno = MPIR_Group_create(size, &newgrp); + MPIR_ERR_CHECK(mpi_errno); - /* Merge the lists until one is empty */ - cur_idx = first_idx; - while (idx1 >= 0 && idx2 >= 0) { - if (maparray[idx1].lpid > maparray[idx2].lpid) { - next_lpid = maparray[idx2].next_lpid; - if (next_lpid >= 0) - next_lpid += idx2_offset; - maparray[cur_idx].next_lpid = idx2; - cur_idx = idx2; - idx2 = next_lpid; + newgrp->rank = rank; + MPIR_Group_set_session_ptr(newgrp, session_ptr); + + if (check_map_is_strided(size, map, &newgrp->pmap.u.stride.offset, + &newgrp->pmap.u.stride.stride)) { + newgrp->pmap.use_map = false; + MPL_free(map); } else { - next_lpid = maparray[idx1].next_lpid; - maparray[cur_idx].next_lpid = idx1; - cur_idx = idx1; - idx1 = next_lpid; - } - } - /* Add whichever list remains */ - if (idx1 >= 0) { - maparray[cur_idx].next_lpid = idx1; - } else { - maparray[cur_idx].next_lpid = idx2; - /* Convert the rest of these next_lpid values to be - * relative to the beginning of maparray */ - while (idx2 >= 0) { - next_lpid = maparray[idx2].next_lpid; - if (next_lpid >= 0) { - next_lpid += idx2_offset; - maparray[idx2].next_lpid = next_lpid; - } - idx2 = next_lpid; + newgrp->pmap.use_map = true; + newgrp->pmap.u.map = map; + /* TODO: build hash to accelerate MPIR_Group_lpid_to_rank */ } + + *new_group_ptr = newgrp; } - return first_idx; + fn_exit: + return mpi_errno; + fn_fail: + goto fn_exit; } -/* - * Create a list of the lpids, in lpid order. - * - * Called by group_compare, group_translate_ranks, group_union - * - * In the case of a single main thread lock, the lock must - * be held on entry to this routine. 
This forces some of the routines - * noted above to hold the SINGLE_CS; which would otherwise not be required. - */ -void MPII_Group_setup_lpid_list(MPIR_Group * group_ptr) +int MPIR_Group_create_stride(int size, int rank, MPIR_Session * session_ptr, + MPIR_Lpid offset, MPIR_Lpid stride, MPIR_Group ** new_group_ptr) { - if (group_ptr->idx_of_first_lpid == -1) { - group_ptr->idx_of_first_lpid = - mergesort_lpidarray(group_ptr->lrank_to_lpid, group_ptr->size); + int mpi_errno = MPI_SUCCESS; + + if (size == 0) { + /* See 5.3.2, Group Constructors. For many group routines, + * the standard explicitly says to return MPI_GROUP_EMPTY; + * for others it is implied */ + *new_group_ptr = MPIR_Group_empty; + goto fn_exit; + } else { + MPIR_Group *newgrp; + mpi_errno = MPIR_Group_create(size, &newgrp); + MPIR_ERR_CHECK(mpi_errno); + + newgrp->rank = rank; + MPIR_Group_set_session_ptr(newgrp, session_ptr); + + newgrp->pmap.use_map = false; + newgrp->pmap.u.stride.offset = offset; + newgrp->pmap.u.stride.stride = stride; + + *new_group_ptr = newgrp; } + + fn_exit: + return mpi_errno; + fn_fail: + goto fn_exit; } -void MPIR_Group_setup_lpid_pairs(MPIR_Group * group_ptr1, MPIR_Group * group_ptr2) +static int pmap_lpid_to_rank(struct MPIR_Pmap *pmap, int size, MPIR_Lpid lpid); +int MPIR_Group_lpid_to_rank(MPIR_Group * group, MPIR_Lpid lpid) { - /* If the lpid list hasn't been created, do it now */ - if (group_ptr1->idx_of_first_lpid < 0) { - MPII_Group_setup_lpid_list(group_ptr1); - } - if (group_ptr2->idx_of_first_lpid < 0) { - MPII_Group_setup_lpid_list(group_ptr2); - } + return pmap_lpid_to_rank(&group->pmap, group->size, lpid); } #ifdef HAVE_ERROR_CHECKING @@ -355,54 +403,20 @@ int MPIR_Group_check_valid_ranges(MPIR_Group * group_ptr, int ranges[][3], int n int MPIR_Group_check_subset(MPIR_Group * group_ptr, MPIR_Comm * comm_ptr) { int mpi_errno = MPI_SUCCESS; - int g1_idx, g2_idx, l1_pid, l2_pid, i; - MPII_Group_pmap_t *vmap = 0; - int vsize = comm_ptr->comm_kind == 
MPIR_COMM_KIND__INTERCOMM ? comm_ptr->local_size : - comm_ptr->remote_size; - MPIR_CHKLMEM_DECL(1); - - MPIR_Assert(group_ptr != NULL); - - MPIR_CHKLMEM_MALLOC(vmap, MPII_Group_pmap_t *, - vsize * sizeof(MPII_Group_pmap_t), mpi_errno, "", MPL_MEM_GROUP); - /* Initialize the vmap */ - for (i = 0; i < vsize; i++) { - MPID_Comm_get_lpid(comm_ptr, i, &vmap[i].lpid, FALSE); - vmap[i].next_lpid = 0; - } - MPII_Group_setup_lpid_list(group_ptr); - g1_idx = group_ptr->idx_of_first_lpid; - g2_idx = mergesort_lpidarray(vmap, vsize); - MPL_DBG_MSG_FMT(MPIR_DBG_COMM, VERBOSE, (MPL_DBG_FDEST, - "initial indices: %d %d\n", g1_idx, g2_idx)); - while (g1_idx >= 0 && g2_idx >= 0) { - l1_pid = group_ptr->lrank_to_lpid[g1_idx].lpid; - l2_pid = vmap[g2_idx].lpid; - MPL_DBG_MSG_FMT(MPIR_DBG_COMM, VERBOSE, (MPL_DBG_FDEST, - "Lpids are %d, %d\n", l1_pid, l2_pid)); - if (l1_pid < l2_pid) { - /* If we have to advance g1, we didn't find a match, so - * that's an error. */ - break; - } else if (l1_pid > l2_pid) { - g2_idx = vmap[g2_idx].next_lpid; - } else { - /* Equal */ - g1_idx = group_ptr->lrank_to_lpid[g1_idx].next_lpid; - g2_idx = vmap[g2_idx].next_lpid; + for (int rank = 0; rank < group_ptr->size; rank++) { + MPIR_Lpid lpid = MPIR_Group_rank_to_lpid(group_ptr, rank); + int r = MPIR_Group_lpid_to_rank(comm_ptr->local_group, lpid); + if (r == MPI_UNDEFINED) { + MPIR_ERR_SETANDJUMP1(mpi_errno, MPI_ERR_GROUP, "**groupnotincomm", + "**groupnotincomm %d", rank); } - MPL_DBG_MSG_FMT(MPIR_DBG_COMM, VERBOSE, (MPL_DBG_FDEST, - "g1 = %d, g2 = %d\n", g1_idx, g2_idx)); - } - - if (g1_idx >= 0) { - MPIR_ERR_SET1(mpi_errno, MPI_ERR_GROUP, "**groupnotincomm", "**groupnotincomm %d", g1_idx); } - fn_fail: - MPIR_CHKLMEM_FREEALL(); + fn_exit: return mpi_errno; + fn_fail: + goto fn_exit; } #endif /* HAVE_ERROR_CHECKING */ @@ -416,3 +430,53 @@ void MPIR_Group_set_session_ptr(MPIR_Group * group_ptr, MPIR_Session * session_p MPIR_Session_add_ref(session_ptr); } } + +/* internal static routines */ + +static 
bool check_map_is_strided(int size, MPIR_Lpid * map, + MPIR_Lpid * offset_out, MPIR_Lpid * stride_out) +{ + MPIR_Assert(size > 0); + for (int i = 0; i < size; i++) { + MPIR_Assert(map[i] != MPI_UNDEFINED); + } + if (size == 1) { + *offset_out = map[0]; + *stride_out = 1; + return true; + } else { + MPIR_Lpid offset, stride; + offset = map[0]; + stride = map[1] - map[0]; + for (int i = 1; i < size; i++) { + if (map[i] - map[i - 1] != stride) { + return false; + } + } + *offset_out = offset; + *stride_out = stride; + return true; + } +} + +static int pmap_lpid_to_rank(struct MPIR_Pmap *pmap, int size, MPIR_Lpid lpid) +{ + if (pmap->use_map) { + /* Use linear search for now. + * Optimization: build hash map in MPIR_Group_create_map and do O(1) hash lookup + */ + for (int rank = 0; rank < size; rank++) { + if (pmap->u.map[rank] == lpid) { + return rank; + } + } + return MPI_UNDEFINED; + } else { + int rank = (lpid - pmap->u.stride.offset) / pmap->u.stride.stride; + if (rank < 0 || rank >= size || + lpid != rank * pmap->u.stride.stride + pmap->u.stride.offset) { + return MPI_UNDEFINED; + } + return rank; + } +} diff --git a/src/mpi/init/mpir_init.c b/src/mpi/init/mpir_init.c index 2f1c115aa13..a20038d8b40 100644 --- a/src/mpi/init/mpir_init.c +++ b/src/mpi/init/mpir_init.c @@ -479,6 +479,8 @@ int MPII_Finalize(MPIR_Session * session_ptr) MPL_free(MPIR_Process.memory_alloc_kinds); MPIR_Process.memory_alloc_kinds = NULL; + MPIR_Group_finalize(); + /* All memory should be freed at this point */ MPII_finalize_memory_tracing(); diff --git a/src/mpid/ch3/include/mpidimpl.h b/src/mpid/ch3/include/mpidimpl.h index 1400271797f..1234fbb6539 100644 --- a/src/mpid/ch3/include/mpidimpl.h +++ b/src/mpid/ch3/include/mpidimpl.h @@ -484,7 +484,7 @@ typedef int (*MPIDI_PG_Destroy_fn_t)(MPIDI_PG_t * pg); int MPIDI_VCRT_Create(int size, struct MPIDI_VCRT **vcrt_ptr); int MPIDI_VCRT_Add_ref(struct MPIDI_VCRT *vcrt); -int MPIDI_VCRT_Release(struct MPIDI_VCRT *vcrt, int isDisconnect); +int 
MPIDI_VCRT_Release(struct MPIDI_VCRT *vcrt); int MPIDI_VCR_Dup(MPIDI_VCR orig_vcr, MPIDI_VCR * new_vcr); int MPIDI_PG_Init(MPIDI_PG_Compare_ids_fn_t, MPIDI_PG_Destroy_fn_t); @@ -1128,7 +1128,7 @@ int MPIDI_CH3I_Get_accumulate(const void *origin_addr, MPI_Aint origin_count, MPIDI_CH3_Progress_signal_completion - Inform the progress engine that a pending request has completed. - IMPLEMENTORS: + IMPLEMENTERS: In a single-threaded environment, this routine can be implemented by incrementing a request completion counter. In a multi-threaded environment, the request completion counter must be atomically @@ -1229,10 +1229,10 @@ int MPIDI_CH3I_VC_post_sockconnect(MPIDI_VC_t * ); all processes in comm*/ int MPID_PG_BCast( MPIR_Comm *peercomm_p, MPIR_Comm *comm_p, int root ); -/* Channel defintitions */ +/* Channel definitions */ /*@ - MPIDI_CH3_iStartMsg - A non-blocking request to send a CH3 packet. A r - equest object is allocated only if the send could not be completed + MPIDI_CH3_iStartMsg - A non-blocking request to send a CH3 packet. A + request object is allocated only if the send could not be completed immediately. Input Parameters: @@ -1282,7 +1282,7 @@ int MPIDI_CH3_iStartMsg(MPIDI_VC_t * vc, void * pkt, intptr_t pkt_sz, packet structure and the vector may be allocated on the stack. - IMPLEMENTORS: + IMPLEMENTERS: If the send can not be completed immediately, the CH3 packet structure and the vector must be stored internally until the request is complete. @@ -1346,7 +1346,7 @@ int MPIDI_CH3_iSend(MPIDI_VC_t * vc, MPIR_Request * sreq, void * pkt, packet structure and the vector may be allocated on the stack. - IMPLEMENTORS: + IMPLEMENTERS: If the send can not be completed immediately, the packet structure and the vector must be stored internally until the request is complete. 
diff --git a/src/mpid/ch3/include/mpidpost.h b/src/mpid/ch3/include/mpidpost.h index 6f76c6aedc1..231bccec1bf 100644 --- a/src/mpid/ch3/include/mpidpost.h +++ b/src/mpid/ch3/include/mpidpost.h @@ -45,7 +45,7 @@ } .ve - IMPLEMENTORS: + IMPLEMENTERS: A multi-threaded implementation might save the current value of a request completion counter in the state. @*/ @@ -66,7 +66,7 @@ void MPIDI_CH3_Progress_start(MPID_Progress_state * state); NOTE: MPIDI_CH3_Progress_start/end() need to be called. - IMPLEMENTORS: + IMPLEMENTERS: A multi-threaded implementation would return immediately if the a request had been completed between the call to MPIDI_CH3_Progress_start() and MPIDI_CH3_Progress_wait(). This could be @@ -110,7 +110,7 @@ int MPIDI_CH3_Progress_test(void); Return value: An mpi error code. - IMPLEMENTORS: + IMPLEMENTERS: This routine is similar to MPIDI_CH3_Progress_test but may not be as thorough in its attempt to satisfy all outstanding communication. @@ -182,18 +182,16 @@ static inline int MPID_Progress_test(MPID_Progress_state * state) /* state is un int MPIDI_GPID_GetAllInComm( MPIR_Comm *comm_ptr, int local_size, MPIDI_Gpid local_gpids[], int *singlePG ); int MPIDI_GPID_Get( MPIR_Comm *comm_ptr, int rank, MPIDI_Gpid *gpid ); -int MPIDI_GPID_ToLpidArray( int size, MPIDI_Gpid gpid[], uint64_t lpid[] ); +int MPIDI_GPID_ToLpidArray( int size, MPIDI_Gpid gpid[], MPIR_Lpid lpid[] ); int MPIDI_PG_ForwardPGInfo( MPIR_Comm *peer_ptr, MPIR_Comm *comm_ptr, int nPGids, const MPIDI_Gpid gpids[], int root ); -int MPID_Intercomm_exchange_map( MPIR_Comm *local_comm_ptr, int local_leader, - MPIR_Comm *peer_comm_ptr, int remote_leader, - int *remote_size, uint64_t **remote_lpids, - int *is_low_group); +int MPID_Intercomm_exchange(MPIR_Comm *local_comm_ptr, int local_leader, + MPIR_Comm *peer_comm_ptr, int remote_leader, + int tag, int context_id, int *remote_context_id, + int *remote_size, MPIR_Lpid **remote_lpids, int timeout); int MPID_Create_intercomm_from_lpids( MPIR_Comm 
*newcomm_ptr, - int size, const uint64_t lpids[] ); - -#define MPID_INTERCOMM_NO_DYNPROC(comm) (0) + int size, const MPIR_Lpid lpids[] ); /* ULFM support */ MPL_STATIC_INLINE_PREFIX int MPID_Comm_AS_enabled(MPIR_Comm * comm_ptr) diff --git a/src/mpid/ch3/include/mpidpre.h b/src/mpid/ch3/include/mpidpre.h index 595434c3aff..4177ca80a8e 100644 --- a/src/mpid/ch3/include/mpidpre.h +++ b/src/mpid/ch3/include/mpidpre.h @@ -182,10 +182,6 @@ typedef struct MPIDI_CH3I_comm * waiting for a revoke message before we can release * the context id */ - int is_disconnected; /* set to TRUE if this communicator was - * disconnected as a part of - * MPI_COMM_DISCONNECT; FALSE otherwise. */ - struct MPIDI_VCRT *vcrt; /* virtual connection reference table */ struct MPIDI_VCRT *local_vcrt; /* local virtual connection reference table */ @@ -195,6 +191,11 @@ typedef struct MPIDI_CH3I_comm } MPIDI_CH3I_comm_t; +/* add vcrt to MPIR_Group so we can inherit it whenever possible */ +#define MPID_DEV_GROUP_DECL struct MPIDI_VCRT *ch3_vcrt; +int MPID_Group_init_hook(MPIR_Group * group_ptr); +int MPID_Group_free_hook(MPIR_Group * group_ptr); + #define MPID_DEV_COMM_DECL MPIDI_CH3I_comm_t dev; #ifndef DEFINED_REQ @@ -480,7 +481,7 @@ typedef struct MPIDI_Request { * 4. The callback function can complete other requests, thus * calling those requests' callback functions. However, the * recursion depth of request completion function is limited. - * If we ever need deeper recurisve calls, we need to change + * If we ever need deeper recursive calls, we need to change * to an iterative design instead of a recursive design for * request completion. 
* @@ -829,7 +830,6 @@ int MPID_Progress_poke(void); int MPID_Get_processor_name( char *name, int namelen, int *resultlen); int MPID_Get_universe_size(int * universe_size); -int MPID_Comm_get_lpid(MPIR_Comm *comm_ptr, int idx, uint64_t *lpid_ptr, bool is_remote); #define MPID_Request_create_from_comm(kind, comm) MPIR_Request_create(kind) void MPID_Request_create_hook(MPIR_Request *); diff --git a/src/mpid/ch3/src/ch3u_comm.c b/src/mpid/ch3/src/ch3u_comm.c index b704d3042e2..8af02db70e3 100644 --- a/src/mpid/ch3/src/ch3u_comm.c +++ b/src/mpid/ch3/src/ch3u_comm.c @@ -111,75 +111,65 @@ int MPIDI_CH3I_Comm_init(void) goto fn_exit; } - -static void dup_vcrt(struct MPIDI_VCRT *src_vcrt, struct MPIDI_VCRT **dest_vcrt, - MPIR_Comm_map_t *mapper, int src_comm_size, int vcrt_size, - int vcrt_offset) +static int create_vcrt_from_group(MPIR_Group *group, struct MPIDI_VCRT **vcrt_out) { - int flag, i; - - /* try to find the simple case where the new comm is a simple - * duplicate of the previous comm. in that case, we simply add a - * reference to the previous VCRT instead of recreating it. */ - if (mapper->type == MPIR_COMM_MAP_TYPE__DUP && src_comm_size == vcrt_size) { - *dest_vcrt = src_vcrt; - MPIDI_VCRT_Add_ref(src_vcrt); - return; - } - else if (mapper->type == MPIR_COMM_MAP_TYPE__IRREGULAR && - mapper->src_mapping_size == vcrt_size) { - /* if the mapping array is exactly the same as the original - * comm's VC list, there is no need to create a new VCRT. 
- * instead simply point to the original comm's VCRT and bump - * up it's reference count */ - flag = 1; - for (i = 0; i < mapper->src_mapping_size; i++) - if (mapper->src_mapping[i] != i) - flag = 0; + int mpi_errno = MPI_SUCCESS; - if (flag) { - *dest_vcrt = src_vcrt; - MPIDI_VCRT_Add_ref(src_vcrt); - return; - } + if (group->ch3_vcrt) { + MPIDI_VCRT_Add_ref(group->ch3_vcrt); + *vcrt_out = group->ch3_vcrt; + goto fn_exit; } - /* we are in the more complex case where we need to allocate a new - * VCRT */ - - if (!vcrt_offset) - MPIDI_VCRT_Create(vcrt_size, dest_vcrt); + struct MPIDI_VCRT *vcrt; + mpi_errno = MPIDI_VCRT_Create(group->size, &vcrt); + MPIR_ERR_CHECK(mpi_errno); - if (mapper->type == MPIR_COMM_MAP_TYPE__DUP) { - for (i = 0; i < src_comm_size; i++) - MPIDI_VCR_Dup(src_vcrt->vcr_table[i], - &((*dest_vcrt)->vcr_table[i + vcrt_offset])); - } - else { - for (i = 0; i < mapper->src_mapping_size; i++) - MPIDI_VCR_Dup(src_vcrt->vcr_table[mapper->src_mapping[i]], - &((*dest_vcrt)->vcr_table[i + vcrt_offset])); + *vcrt_out = vcrt; + + for (int i = 0; i < group->size; i++) { + MPIR_Lpid lpid = MPIR_Group_rank_to_lpid(group, i); + /* Currently ch3 does not synchronize pg with MPIR_worlds. All lpid are contiguous + * with world_idx = 0. We can tell whether it is a spawned process by checking whether + * it is >= world size. + */ + if (lpid < MPIR_Process.size) { + MPIDI_VCR_Dup(&MPIDI_Process.my_pg->vct[lpid], &vcrt->vcr_table[i]); + } else { + /* search PGs to find the vc. 
Not particularly efficient, but likely not critical */ + /* TODO: Build a vc hash for dynamic processes */ + MPIDI_PG_iterator iter; + MPIDI_PG_Get_iterator(&iter); + bool found_it = false; + while (MPIDI_PG_Has_next(&iter)) { + MPIDI_PG_t *pg; + MPIDI_PG_Get_next(&iter, &pg); + for (int j = 0; j < pg->size; j++) { + if (pg->vct[j].lpid == lpid) { + MPIDI_VCR_Dup(&pg->vct[j], &vcrt->vcr_table[i]); + found_it = true; + break; + } + } + if (found_it) { + break; + } + pg = pg->next; + } + MPIR_ERR_CHKANDJUMP1(!found_it, mpi_errno, MPI_ERR_OTHER, "**procnotfound", + "**procnotfound %d", i); + } } -} -static inline int map_size(MPIR_Comm_map_t map) -{ - if (map.type == MPIR_COMM_MAP_TYPE__IRREGULAR) - return map.src_mapping_size; - else if (map.dir == MPIR_COMM_MAP_DIR__L2L || map.dir == MPIR_COMM_MAP_DIR__L2R) - return map.src_comm->local_size; - else - return map.src_comm->remote_size; + fn_exit: + return mpi_errno; + fn_fail: + goto fn_exit; } int MPIDI_CH3I_Comm_commit_pre_hook(MPIR_Comm *comm) { int mpi_errno = MPI_SUCCESS; - hook_elt *elt; - MPIR_Comm_map_t *mapper; - MPIR_Comm *src_comm; - int vcrt_size, vcrt_offset; - MPIR_FUNC_ENTER; if (comm == MPIR_Process.comm_world) { @@ -198,6 +188,7 @@ int MPIDI_CH3I_Comm_commit_pre_hook(MPIR_Comm *comm) for (int p = 0; p < MPIR_Process.size; p++) { MPIDI_VCR_Dup(&MPIDI_Process.my_pg->vct[p], &comm->dev.vcrt->vcr_table[p]); } + goto done_vcrt; } else if (comm == MPIR_Process.comm_self) { comm->rank = 0; comm->remote_size = 1; @@ -211,6 +202,7 @@ int MPIDI_CH3I_Comm_commit_pre_hook(MPIR_Comm *comm) } MPIDI_VCR_Dup(&MPIDI_Process.my_pg->vct[MPIR_Process.rank], &comm->dev.vcrt->vcr_table[0]); + goto done_vcrt; } else if (comm == MPIR_Process.icomm_world) { comm->rank = MPIR_Process.rank; comm->remote_size = MPIR_Process.size; @@ -218,104 +210,35 @@ int MPIDI_CH3I_Comm_commit_pre_hook(MPIR_Comm *comm) MPIDI_VCRT_Add_ref(MPIR_Process.comm_world->dev.vcrt ); comm->dev.vcrt = MPIR_Process.comm_world->dev.vcrt; + goto done_vcrt; 
} - /* initialize the is_disconnected variable to FALSE. this will be - * set to TRUE if the communicator is freed by an - * MPI_COMM_DISCONNECT call. */ - comm->dev.is_disconnected = 0; - - /* do some sanity checks */ - LL_FOREACH(comm->mapper_head, mapper) { - if (mapper->src_comm->comm_kind == MPIR_COMM_KIND__INTRACOMM) - MPIR_Assertp(mapper->dir == MPIR_COMM_MAP_DIR__L2L || - mapper->dir == MPIR_COMM_MAP_DIR__L2R); - if (comm->comm_kind == MPIR_COMM_KIND__INTRACOMM) - MPIR_Assertp(mapper->dir == MPIR_COMM_MAP_DIR__L2L || - mapper->dir == MPIR_COMM_MAP_DIR__R2L); - } - - /* First, handle all the mappers that contribute to the local part - * of the comm */ - vcrt_size = 0; - LL_FOREACH(comm->mapper_head, mapper) { - if (mapper->dir == MPIR_COMM_MAP_DIR__L2R || - mapper->dir == MPIR_COMM_MAP_DIR__R2R) - continue; - - vcrt_size += map_size(*mapper); + if (comm->comm_kind == MPIR_COMM_KIND__INTRACOMM) { + mpi_errno = create_vcrt_from_group(comm->local_group, &comm->dev.vcrt); + MPIR_ERR_CHECK(mpi_errno); + } else { + mpi_errno = create_vcrt_from_group(comm->local_group, &comm->dev.local_vcrt); + MPIR_ERR_CHECK(mpi_errno); + mpi_errno = create_vcrt_from_group(comm->remote_group, &comm->dev.vcrt); + MPIR_ERR_CHECK(mpi_errno); } - vcrt_offset = 0; - LL_FOREACH(comm->mapper_head, mapper) { - src_comm = mapper->src_comm; - - if (mapper->dir == MPIR_COMM_MAP_DIR__L2R || - mapper->dir == MPIR_COMM_MAP_DIR__R2R) - continue; - if (mapper->dir == MPIR_COMM_MAP_DIR__L2L) { - if (src_comm->comm_kind == MPIR_COMM_KIND__INTRACOMM && comm->comm_kind == MPIR_COMM_KIND__INTRACOMM) { - dup_vcrt(src_comm->dev.vcrt, &comm->dev.vcrt, mapper, mapper->src_comm->local_size, - vcrt_size, vcrt_offset); - } - else if (src_comm->comm_kind == MPIR_COMM_KIND__INTRACOMM && comm->comm_kind == MPIR_COMM_KIND__INTERCOMM) - dup_vcrt(src_comm->dev.vcrt, &comm->dev.local_vcrt, mapper, mapper->src_comm->local_size, - vcrt_size, vcrt_offset); - else if (src_comm->comm_kind == MPIR_COMM_KIND__INTERCOMM && 
comm->comm_kind == MPIR_COMM_KIND__INTRACOMM) { - dup_vcrt(src_comm->dev.local_vcrt, &comm->dev.vcrt, mapper, mapper->src_comm->local_size, - vcrt_size, vcrt_offset); - } - else - dup_vcrt(src_comm->dev.local_vcrt, &comm->dev.local_vcrt, mapper, - mapper->src_comm->local_size, vcrt_size, vcrt_offset); - } - else { /* mapper->dir == MPIR_COMM_MAP_DIR__R2L */ - MPIR_Assert(src_comm->comm_kind == MPIR_COMM_KIND__INTERCOMM); - if (comm->comm_kind == MPIR_COMM_KIND__INTRACOMM) { - dup_vcrt(src_comm->dev.vcrt, &comm->dev.vcrt, mapper, mapper->src_comm->remote_size, - vcrt_size, vcrt_offset); - } - else - dup_vcrt(src_comm->dev.vcrt, &comm->dev.local_vcrt, mapper, mapper->src_comm->remote_size, - vcrt_size, vcrt_offset); + done_vcrt: + /* add vcrt to the comm groups if they are not there */ + if (comm->comm_kind == MPIR_COMM_KIND__INTRACOMM) { + if (comm->local_group->ch3_vcrt == NULL) { + MPIDI_VCRT_Add_ref(comm->dev.vcrt); + comm->local_group->ch3_vcrt = comm->dev.vcrt; } - vcrt_offset += map_size(*mapper); - } - - /* Next, handle all the mappers that contribute to the remote part - * of the comm (only valid for intercomms) */ - vcrt_size = 0; - LL_FOREACH(comm->mapper_head, mapper) { - if (mapper->dir == MPIR_COMM_MAP_DIR__L2L || - mapper->dir == MPIR_COMM_MAP_DIR__R2L) - continue; - - vcrt_size += map_size(*mapper); - } - vcrt_offset = 0; - LL_FOREACH(comm->mapper_head, mapper) { - src_comm = mapper->src_comm; - - if (mapper->dir == MPIR_COMM_MAP_DIR__L2L || - mapper->dir == MPIR_COMM_MAP_DIR__R2L) - continue; - - MPIR_Assert(comm->comm_kind == MPIR_COMM_KIND__INTERCOMM); - - if (mapper->dir == MPIR_COMM_MAP_DIR__L2R) { - if (src_comm->comm_kind == MPIR_COMM_KIND__INTRACOMM) - dup_vcrt(src_comm->dev.vcrt, &comm->dev.vcrt, mapper, mapper->src_comm->local_size, - vcrt_size, vcrt_offset); - else - dup_vcrt(src_comm->dev.local_vcrt, &comm->dev.vcrt, mapper, - mapper->src_comm->local_size, vcrt_size, vcrt_offset); + } else { + if (comm->local_group->ch3_vcrt == NULL) { + 
MPIDI_VCRT_Add_ref(comm->dev.local_vcrt); + comm->local_group->ch3_vcrt = comm->dev.local_vcrt; } - else { /* mapper->dir == MPIR_COMM_MAP_DIR__R2R */ - MPIR_Assert(src_comm->comm_kind == MPIR_COMM_KIND__INTERCOMM); - dup_vcrt(src_comm->dev.vcrt, &comm->dev.vcrt, mapper, mapper->src_comm->remote_size, - vcrt_size, vcrt_offset); + if (comm->remote_group->ch3_vcrt == NULL) { + MPIDI_VCRT_Add_ref(comm->dev.vcrt); + comm->remote_group->ch3_vcrt = comm->dev.vcrt; } - vcrt_offset += map_size(*mapper); } if (comm->comm_kind == MPIR_COMM_KIND__INTERCOMM) { @@ -326,6 +249,7 @@ int MPIDI_CH3I_Comm_commit_pre_hook(MPIR_Comm *comm) } } + hook_elt *elt; LL_FOREACH(create_hooks_head, elt) { mpi_errno = elt->hook_fn(comm, elt->param); if (mpi_errno) MPIR_ERR_POP(mpi_errno);; @@ -359,11 +283,11 @@ int MPIDI_CH3I_Comm_destroy_hook(MPIR_Comm *comm) MPIR_ERR_CHECK(mpi_errno); } - mpi_errno = MPIDI_VCRT_Release(comm->dev.vcrt, comm->dev.is_disconnected); + mpi_errno = MPIDI_VCRT_Release(comm->dev.vcrt); MPIR_ERR_CHECK(mpi_errno); if (comm->comm_kind == MPIR_COMM_KIND__INTERCOMM) { - mpi_errno = MPIDI_VCRT_Release(comm->dev.local_vcrt, comm->dev.is_disconnected); + mpi_errno = MPIDI_VCRT_Release(comm->dev.local_vcrt); MPIR_ERR_CHECK(mpi_errno); } @@ -512,7 +436,7 @@ static int nonempty_intersection(MPIR_Comm *comm, MPIR_Group *group, int *flag) for (i_g = 0; i_g < group->size; ++i_g) { /* FIXME: This won't work for dynamic procs */ - MPIDI_PG_Get_vc(MPIDI_Process.my_pg, group->lrank_to_lpid[i_g].lpid, &vc_g); + MPIDI_PG_Get_vc(MPIDI_Process.my_pg, MPIR_Group_rank_to_lpid(group, i_g), &vc_g); for (i_c = 0; i_c < comm->remote_size; ++i_c) { MPIDI_Comm_get_vc(comm, i_c, &vc_c); if (vc_g == vc_c) { @@ -581,3 +505,19 @@ void MPIDI_CH3I_Comm_find(int context_id, MPIR_Comm **comm) MPIR_FUNC_EXIT; } + +int MPID_Group_init_hook(MPIR_Group * group_ptr) +{ + group_ptr->ch3_vcrt = NULL; + return MPI_SUCCESS; +} + +int MPID_Group_free_hook(MPIR_Group * group_ptr) +{ + int mpi_errno = MPI_SUCCESS; + 
+ if (group_ptr->ch3_vcrt) { + mpi_errno = MPIDI_VCRT_Release(group_ptr->ch3_vcrt); + } + return mpi_errno; +} diff --git a/src/mpid/ch3/src/ch3u_handle_connection.c b/src/mpid/ch3/src/ch3u_handle_connection.c index ef5819aaf3d..17ef122cb7f 100644 --- a/src/mpid/ch3/src/ch3u_handle_connection.c +++ b/src/mpid/ch3/src/ch3u_handle_connection.c @@ -372,7 +372,7 @@ static int terminate_failed_VCs(MPIR_Group *new_failed_group) MPIDI_VC_t *vc; /* terminate the VC */ /* FIXME: This won't work for dynamic procs */ - MPIDI_PG_Get_vc(MPIDI_Process.my_pg, new_failed_group->lrank_to_lpid[i].lpid, &vc); + MPIDI_PG_Get_vc(MPIDI_Process.my_pg, MPIR_Group_rank_to_lpid(new_failed_group, i), &vc); mpi_errno = MPIDI_CH3_Connection_terminate(vc); MPIR_ERR_CHECK(mpi_errno); } diff --git a/src/mpid/ch3/src/ch3u_port.c b/src/mpid/ch3/src/ch3u_port.c index bd6c8bebfeb..fa1d29bf069 100644 --- a/src/mpid/ch3/src/ch3u_port.c +++ b/src/mpid/ch3/src/ch3u_port.c @@ -487,12 +487,10 @@ static int MPIDI_CH3I_Initialize_tmp_comm(MPIR_Comm **comm_pptr, MPIDI_VC_t *vc_ptr, int is_low_group, int context_id_offset) { int mpi_errno = MPI_SUCCESS; - MPIR_Comm *tmp_comm, *commself_ptr; + MPIR_Comm *tmp_comm; MPIR_FUNC_ENTER; - MPIR_Comm_get_ptr( MPI_COMM_SELF, commself_ptr ); - /* WDG-old code allocated a context id that was then discarded */ mpi_errno = MPIR_Comm_create(&tmp_comm); MPIR_ERR_CHECK(mpi_errno); @@ -524,11 +522,6 @@ static int MPIDI_CH3I_Initialize_tmp_comm(MPIR_Comm **comm_pptr, /* No pg structure needed since vc has already been set up (connection has been established). */ - /* Point local vcrt at those of commself_ptr */ - /* FIXME: Explain why */ - tmp_comm->dev.local_vcrt = commself_ptr->dev.vcrt; - MPIDI_VCRT_Add_ref(commself_ptr->dev.vcrt); - /* No pg needed since connection has already been formed. 
FIXME - ensure that the comm_release code does not try to free an unallocated pg */ @@ -542,14 +535,6 @@ static int MPIDI_CH3I_Initialize_tmp_comm(MPIR_Comm **comm_pptr, /* FIXME: Why do we do a dup here? */ MPIDI_VCR_Dup(vc_ptr, &tmp_comm->dev.vcrt->vcr_table[0]); - MPIR_Coll_comm_init(tmp_comm); - - /* Even though this is a tmp comm and we don't call - MPI_Comm_commit, we still need to call the creation hook - because the destruction hook will be called in comm_release */ - mpi_errno = MPID_Comm_commit_pre_hook(tmp_comm); - MPIR_ERR_CHECK(mpi_errno); - *comm_pptr = tmp_comm; fn_exit: @@ -559,6 +544,22 @@ static int MPIDI_CH3I_Initialize_tmp_comm(MPIR_Comm **comm_pptr, goto fn_exit; } +static int MPIDI_CH3I_Release_tmp_comm(MPIR_Comm *tmp_comm) +{ + int mpi_errno = MPI_SUCCESS; + + mpi_errno = MPIDI_VCRT_Release(tmp_comm->dev.vcrt); + MPIR_ERR_CHECK(mpi_errno); + + MPIR_Free_contextid(tmp_comm->recvcontext_id); + MPIR_Handle_obj_free(&MPIR_Comm_mem, tmp_comm); + + fn_exit: + return mpi_errno; + fn_fail: + goto fn_exit; +} + /* ------------------------------------------------------------------------- */ /* MPIDI_Comm_connect() @@ -745,7 +746,7 @@ int MPIDI_Comm_connect(const char *port_name, MPIR_Info *info, int root, MPIR_ERR_CHECK(mpi_errno); /* All communication with remote root done. Release the communicator. */ - MPIR_Comm_release(tmp_comm); + MPIDI_CH3I_Release_tmp_comm(tmp_comm); } /*printf("connect:barrier\n");fflush(stdout);*/ @@ -1276,7 +1277,7 @@ int MPIDI_Comm_accept(const char *port_name, MPIR_Info *info, int root, MPIR_ERR_CHECK(mpi_errno); /* All communication with remote root done. Release the communicator. 
*/ - MPIR_Comm_release(tmp_comm); + MPIDI_CH3I_Release_tmp_comm(tmp_comm); } MPL_DBG_MSG(MPIDI_CH3_DBG_CONNECT,VERBOSE,"Barrier"); @@ -1337,24 +1338,23 @@ static int SetupNewIntercomm( MPIR_Comm *comm_ptr, int remote_comm_size, intercomm->remote_size = remote_comm_size; intercomm->local_size = comm_ptr->local_size; intercomm->rank = comm_ptr->rank; - intercomm->local_group = NULL; - intercomm->remote_group = NULL; intercomm->comm_kind = MPIR_COMM_KIND__INTERCOMM; intercomm->local_comm = NULL; - /* Point local vcrt at those of incoming intracommunicator */ - intercomm->dev.local_vcrt = comm_ptr->dev.vcrt; - MPIDI_VCRT_Add_ref(comm_ptr->dev.vcrt); + intercomm->local_group = comm_ptr->local_group; + MPIR_Group_add_ref(comm_ptr->local_group); - /* Set up VC reference table */ - mpi_errno = MPIDI_VCRT_Create(intercomm->remote_size, &intercomm->dev.vcrt); - if (mpi_errno != MPI_SUCCESS) { - MPIR_ERR_SETANDJUMP(mpi_errno,MPI_ERR_OTHER, "**init_vcrt"); - } + MPIR_Lpid *remote_map; + remote_map = MPL_malloc(remote_comm_size * sizeof(MPIR_Lpid), MPL_MEM_GROUP); + MPIR_ERR_CHKANDJUMP(!remote_map, mpi_errno, MPI_ERR_OTHER, "**nomem"); for (i=0; i < intercomm->remote_size; i++) { - MPIDI_PG_Dup_vcr(remote_pg[remote_translation[i].pg_index], - remote_translation[i].pg_rank, &intercomm->dev.vcrt->vcr_table[i]); + MPIDI_PG_t *pg = remote_pg[remote_translation[i].pg_index]; + int rank = remote_translation[i].pg_rank; + remote_map[i] = pg->vct[rank].lpid; } + mpi_errno = MPIR_Group_create_map(remote_comm_size, MPI_UNDEFINED, comm_ptr->session_ptr, + remote_map, &intercomm->remote_group); + MPIR_ERR_CHECK(mpi_errno); mpi_errno = MPIR_Comm_commit(intercomm); MPIR_ERR_CHECK(mpi_errno); diff --git a/src/mpid/ch3/src/mpid_comm_disconnect.c b/src/mpid/ch3/src/mpid_comm_disconnect.c index cec8a198e2f..2809dd51ff3 100644 --- a/src/mpid/ch3/src/mpid_comm_disconnect.c +++ b/src/mpid/ch3/src/mpid_comm_disconnect.c @@ -27,10 +27,6 @@ int MPID_Comm_disconnect(MPIR_Comm *comm_ptr) 
MPIR_ERR_SETANDJUMP(mpi_errno,MPIX_ERR_REVOKED,"**revoked"); } - /* it's more than a comm_release, but ok for now */ - /* FIXME: Describe what more might be required */ - /* MPIU_PG_Printall( stdout ); */ - comm_ptr->dev.is_disconnected = 1; mpi_errno = MPIR_Comm_release(comm_ptr); MPIR_ERR_CHECK(mpi_errno); /* If any of the VCs were released by this Comm_release, wait diff --git a/src/mpid/ch3/src/mpid_vc.c b/src/mpid/ch3/src/mpid_vc.c index 81cb71c91e6..83e2a67c1b2 100644 --- a/src/mpid/ch3/src/mpid_vc.c +++ b/src/mpid/ch3/src/mpid_vc.c @@ -106,7 +106,7 @@ int MPIDI_VCRT_Add_ref(struct MPIDI_VCRT *vcrt) Notes: @*/ -int MPIDI_VCRT_Release(struct MPIDI_VCRT *vcrt, int isDisconnect ) +int MPIDI_VCRT_Release(struct MPIDI_VCRT *vcrt) { int in_use; int mpi_errno = MPI_SUCCESS; @@ -130,24 +130,8 @@ int MPIDI_VCRT_Release(struct MPIDI_VCRT *vcrt, int isDisconnect ) MPIDI_VC_release_ref(vc, &in_use); - /* Dynamic connections start with a refcount of 2 instead of 1. - * That way we can distinguish between an MPI_Free and an - * MPI_Comm_disconnect. */ - /* XXX DJG FIXME-MT should we be checking this? */ - /* probably not, need to do something like the following instead: */ -#if 0 - if (isDisconnect) { - MPIR_Assert(in_use); - /* FIXME this is still bogus, the VCRT may contain a mix of - * dynamic and non-dynamic VCs, so the ref_count isn't - * guaranteed to have started at 2. The best thing to do might - * be to avoid overloading the reference counting this way and - * use a separate check for dynamic VCs (another flag? compare - * PGs?) 
*/ - MPIR_Object_release_ref(vc, &in_use); - } -#endif - if (isDisconnect && MPIR_Object_get_ref(vc) == 1) { + if (vc->lpid >= MPIR_Process.size && MPIR_Object_get_ref(vc) == 1) { + /* release vc from dynamic process */ MPIDI_VC_release_ref(vc, &in_use); } @@ -238,25 +222,6 @@ int MPIDI_VCR_Dup(MPIDI_VCR orig_vcr, MPIDI_VCR * new_vcr) return MPI_SUCCESS; } -/*@ - MPID_Comm_get_lpid - Get the local process ID for a given VC reference - @*/ -int MPID_Comm_get_lpid(MPIR_Comm *comm_ptr, int idx, uint64_t *lpid_ptr, bool is_remote) -{ - - MPIR_FUNC_ENTER; - - if (comm_ptr->comm_kind == MPIR_COMM_KIND__INTRACOMM) - *lpid_ptr = comm_ptr->dev.vcrt->vcr_table[idx]->lpid; - else if (is_remote) - *lpid_ptr = comm_ptr->dev.vcrt->vcr_table[idx]->lpid; - else - *lpid_ptr = comm_ptr->dev.local_vcrt->vcr_table[idx]->lpid; - - MPIR_FUNC_EXIT; - return MPI_SUCCESS; -} - /* * The following routines convert to/from the global pids, which are * represented as pairs of ints (process group id, rank in that process group) @@ -321,7 +286,7 @@ int MPIDI_GPID_Get( MPIR_Comm *comm_ptr, int rank, MPIDI_Gpid *in_gpid ) * the GPIDs. Note that this code requires that all processes * have information on the process groups. 
*/ -int MPIDI_GPID_ToLpidArray( int size, MPIDI_Gpid in_gpid[], uint64_t lpid[] ) +int MPIDI_GPID_ToLpidArray( int size, MPIDI_Gpid in_gpid[], MPIR_Lpid lpid[] ) { int i, mpi_errno = MPI_SUCCESS; int pgid; @@ -377,15 +342,12 @@ int MPIDI_GPID_ToLpidArray( int size, MPIDI_Gpid in_gpid[], uint64_t lpid[] ) } static inline int MPIDI_LPID_GetAllInComm(MPIR_Comm *comm_ptr, int local_size, - uint64_t local_lpids[]) + MPIR_Lpid local_lpids[]) { - int i; int mpi_errno = MPI_SUCCESS; MPIR_Assert( comm_ptr->local_size == local_size ); - for (i=0; ilocal_size; i++) { - uint64_t tmp_lpid; - mpi_errno |= MPID_Comm_get_lpid( comm_ptr, i, &tmp_lpid, FALSE ); - local_lpids[i] = tmp_lpid; + for (int i=0; ilocal_size; i++) { + local_lpids[i] = comm_ptr->dev.vcrt->vcr_table[i]->lpid; } return mpi_errno; } @@ -395,7 +357,7 @@ static inline int MPIDI_LPID_GetAllInComm(MPIR_Comm *comm_ptr, int local_size, /*@ check_disjoint_lpids - Exchange address mapping for intercomm creation. @*/ -static int check_disjoint_lpids(uint64_t lpids1[], int n1, uint64_t lpids2[], int n2) +static int check_disjoint_lpids(MPIR_Lpid lpids1[], int n1, MPIR_Lpid lpids2[], int n2) { int i, mask_size, idx, bit; uint64_t maxlpid = 0; @@ -457,24 +419,20 @@ static int check_disjoint_lpids(uint64_t lpids1[], int n1, uint64_t lpids2[], in #endif /* HAVE_ERROR_CHECKING */ /*@ - MPID_Intercomm_exchange_map - Exchange address mapping for intercomm creation. + MPID_Intercomm_exchange - Exchange remote info for intercomm creation. 
@*/ -int MPID_Intercomm_exchange_map(MPIR_Comm *local_comm_ptr, int local_leader, - MPIR_Comm *peer_comm_ptr, int remote_leader, - int *remote_size, uint64_t **remote_lpids, - int *is_low_group) +int MPID_Intercomm_exchange(MPIR_Comm *local_comm_ptr, int local_leader, + MPIR_Comm *peer_comm_ptr, int remote_leader, int tag, + int context_id, int *remote_context_id, + int *remote_size, MPIR_Lpid **remote_lpids, int timeout /* unused */) { int mpi_errno = MPI_SUCCESS; int singlePG; int local_size; - uint64_t *local_lpids=0; + MPIR_Lpid *local_lpids=0; MPIDI_Gpid *local_gpids=NULL, *remote_gpids=NULL; - int comm_info[2]; - int cts_tag; MPIR_CHKLMEM_DECL(3); - cts_tag = 0 | MPIR_TAG_COLL_BIT; - if (local_comm_ptr->rank == local_leader) { /* First, exchange the group information. If we were certain @@ -488,30 +446,32 @@ int MPID_Intercomm_exchange_map(MPIR_Comm *local_comm_ptr, int local_leader, /* printf( "About to sendrecv in intercomm_create\n" );fflush(stdout);*/ MPL_DBG_MSG_FMT(MPIDI_CH3_DBG_OTHER,VERBOSE,(MPL_DBG_FDEST,"rank %d sendrecv to rank %d", peer_comm_ptr->rank, remote_leader)); - mpi_errno = MPIC_Sendrecv( &local_size, 1, MPI_INT, - remote_leader, cts_tag, - remote_size, 1, MPI_INT, - remote_leader, cts_tag, - peer_comm_ptr, MPI_STATUS_IGNORE, MPIR_ERR_NONE ); + int local_ints[2] = {local_size, context_id}; + int remote_ints[2]; + mpi_errno = MPIC_Sendrecv(local_ints, 2, MPI_INT, remote_leader, tag, + remote_ints, 2, MPI_INT, remote_leader, tag, + peer_comm_ptr, MPI_STATUS_IGNORE, MPIR_ERR_NONE ); MPIR_ERR_CHECK(mpi_errno); + *remote_size = remote_ints[0]; + *remote_context_id = remote_ints[1]; MPL_DBG_MSG_FMT(MPIDI_CH3_DBG_OTHER,VERBOSE,(MPL_DBG_FDEST, "local size = %d, remote size = %d", local_size, *remote_size )); /* With this information, we can now send and receive the global process ids from the peer. 
*/ MPIR_CHKLMEM_MALLOC(remote_gpids,MPIDI_Gpid*,(*remote_size)*sizeof(MPIDI_Gpid), mpi_errno,"remote_gpids", MPL_MEM_DYNAMIC); - *remote_lpids = (uint64_t*) MPL_malloc((*remote_size)*sizeof(uint64_t), MPL_MEM_ADDRESS); + *remote_lpids = MPL_malloc((*remote_size)*sizeof(MPIR_Lpid), MPL_MEM_ADDRESS); MPIR_CHKLMEM_MALLOC(local_gpids,MPIDI_Gpid*,local_size*sizeof(MPIDI_Gpid), mpi_errno,"local_gpids", MPL_MEM_DYNAMIC); - MPIR_CHKLMEM_MALLOC(local_lpids,uint64_t*,local_size*sizeof(uint64_t), mpi_errno,"local_lpids", MPL_MEM_DYNAMIC); + MPIR_CHKLMEM_MALLOC(local_lpids,MPIR_Lpid*,local_size*sizeof(MPIR_Lpid), mpi_errno,"local_lpids", MPL_MEM_DYNAMIC); mpi_errno = MPIDI_GPID_GetAllInComm( local_comm_ptr, local_size, local_gpids, &singlePG ); MPIR_ERR_CHECK(mpi_errno); /* Exchange the lpid arrays */ mpi_errno = MPIC_Sendrecv( local_gpids, local_size*sizeof(MPIDI_Gpid), MPI_BYTE, - remote_leader, cts_tag, + remote_leader, tag, remote_gpids, (*remote_size)*sizeof(MPIDI_Gpid), MPI_BYTE, - remote_leader, cts_tag, peer_comm_ptr, + remote_leader, tag, peer_comm_ptr, MPI_STATUS_IGNORE, MPIR_ERR_NONE ); MPIR_ERR_CHECK(mpi_errno); @@ -537,22 +497,18 @@ int MPID_Intercomm_exchange_map(MPIR_Comm *local_comm_ptr, int local_leader, } # endif /* HAVE_ERROR_CHECKING */ - /* Make an arbitrary decision about which group of process is - the low group. 
The LEADERS do this by comparing the - local process ids of the 0th member of the two groups */ - (*is_low_group) = local_lpids[0] < (*remote_lpids)[0]; - /* At this point, we're done with the local lpids; they'll be freed with the other local memory on exit */ } /* End of the first phase of the leader communication */ /* Leaders can now swap context ids and then broadcast the value to the local group of processes */ + int comm_info[3]; if (local_comm_ptr->rank == local_leader) { /* Now, send all of our local processes the remote_lpids, along with the final context id */ comm_info[0] = *remote_size; - comm_info[1] = *is_low_group; + comm_info[1] = *remote_context_id; MPL_DBG_MSG(MPIDI_CH3_DBG_OTHER,VERBOSE,"About to bcast on local_comm"); mpi_errno = MPIR_Bcast( comm_info, 2, MPI_INT, local_leader, local_comm_ptr, MPIR_ERR_NONE ); MPIR_ERR_CHECK(mpi_errno); @@ -570,17 +526,17 @@ int MPID_Intercomm_exchange_map(MPIR_Comm *local_comm_ptr, int local_leader, MPIR_ERR_CHECK(mpi_errno); *remote_size = comm_info[0]; MPIR_CHKLMEM_MALLOC(remote_gpids,MPIDI_Gpid*,(*remote_size)*sizeof(MPIDI_Gpid), mpi_errno,"remote_gpids", MPL_MEM_DYNAMIC); - *remote_lpids = (uint64_t*) MPL_malloc((*remote_size)*sizeof(uint64_t), MPL_MEM_ADDRESS); + *remote_lpids = MPL_malloc((*remote_size)*sizeof(MPIR_Lpid), MPL_MEM_ADDRESS); mpi_errno = MPIR_Bcast( remote_gpids, (*remote_size)*sizeof(MPIDI_Gpid), MPI_BYTE, local_leader, local_comm_ptr, MPIR_ERR_NONE ); MPIR_ERR_CHECK(mpi_errno); /* Extract the context and group sign information */ - *is_low_group = comm_info[1]; + *remote_context_id = comm_info[1]; } /* Finish up by giving the device the opportunity to update - any other infomration among these processes. Note that the + any other information among these processes. Note that the new intercomm has not been set up; in fact, we haven't yet attempted to set up the connection tables. 
@@ -621,67 +577,11 @@ int MPID_Intercomm_exchange_map(MPIR_Comm *local_comm_ptr, int local_leader, 'MPI_Comm_connect/MPI_Comm_accept'. Thus, it is only used for intercommunicators. @*/ int MPID_Create_intercomm_from_lpids( MPIR_Comm *newcomm_ptr, - int size, const uint64_t lpids[] ) + int size, const MPIR_Lpid lpids[] ) { int mpi_errno = MPI_SUCCESS; - MPIR_Comm *commworld_ptr; - int i; - MPIDI_PG_iterator iter; - commworld_ptr = MPIR_Process.comm_world; - /* Setup the communicator's vc table: remote group */ - MPIDI_VCRT_Create( size, &newcomm_ptr->dev.vcrt ); - for (i=0; irank, i, lpids[i] ); */ - if (lpids[i] < commworld_ptr->remote_size) { - vc = commworld_ptr->dev.vcrt->vcr_table[lpids[i]]; - } - else { - /* We must find the corresponding vcr for a given lpid */ - /* For now, this means iterating through the process groups */ - MPIDI_PG_t *pg = 0; - int j; - - MPIDI_PG_Get_iterator(&iter); - /* Skip comm_world */ - MPIDI_PG_Get_next( &iter, &pg ); - do { - MPIDI_PG_Get_next( &iter, &pg ); - MPIR_ERR_CHKINTERNAL(!pg, mpi_errno, "no pg"); - /* FIXME: a quick check on the min/max values of the lpid - for this process group could help speed this search */ - for (j=0; jsize; j++) { - /*printf( "Checking lpid %d against %d in pg %s\n", - lpids[i], pg->vct[j].lpid, (char *)pg->id ); - fflush(stdout); */ - if (pg->vct[j].lpid == lpids[i]) { - vc = &pg->vct[j]; - /*printf( "found vc %x for lpid = %d in another pg\n", - (int)vc, lpids[i] );*/ - break; - } - } - } while (!vc); - } - - /* printf( "about to dup vc %x for lpid = %d in another pg\n", - (int)vc, lpids[i] ); */ - /* Note that his will increment the ref count for the associate - PG if necessary. 
*/ - MPIDI_VCR_Dup( vc, &newcomm_ptr->dev.vcrt->vcr_table[i] ); - } -fn_exit: return mpi_errno; -fn_fail: - goto fn_exit; } /* The following is a temporary hook to ensure that all processes in diff --git a/src/mpid/ch3/src/mpidi_pg.c b/src/mpid/ch3/src/mpidi_pg.c index 5db84999bb9..364ec260a34 100644 --- a/src/mpid/ch3/src/mpidi_pg.c +++ b/src/mpid/ch3/src/mpidi_pg.c @@ -44,6 +44,12 @@ int MPIDI_PG_Init(MPIDI_PG_Compare_ids_fn_t compare_ids_fn, MPIDI_PG_Compare_ids_fn = compare_ids_fn; MPIDI_PG_Destroy_fn = destroy_fn; + /* initialize the device fields in builtin groups */ +#ifdef MPID_DEV_GROUP_DECL + for (int i = 0; i < MPIR_GROUP_N_BUILTIN; i++) { + MPID_Group_init_hook(MPIR_Group_builtin + i); + } +#endif return mpi_errno; } @@ -64,6 +70,13 @@ int MPIDI_PG_Finalize(void) MPIU_PG_Printall( stdout ); } + /* release the vcrt in builtin groups, since they don't really get freed */ +#ifdef MPID_DEV_GROUP_DECL + for (int i = 0; i < MPIR_GROUP_N_BUILTIN; i++) { + MPID_Group_free_hook(MPIR_Group_builtin + i); + } +#endif + /* Free the storage associated with the process groups */ pg = MPIDI_PG_list; while (pg) { diff --git a/src/mpid/ch4/ch4_api.txt b/src/mpid/ch4/ch4_api.txt index c1778e546ff..25657023333 100644 --- a/src/mpid/ch4/ch4_api.txt +++ b/src/mpid/ch4/ch4_api.txt @@ -87,16 +87,16 @@ Non Native API: am_tag_recv : int NM*: rank, comm, handler_id, tag, buf-2, count, datatype, src_vci, dst_vci, rreq SHM*: rank, comm, handler_id, tag, buf-2, count, datatype, src_vci, dst_vci, rreq - comm_get_gpid : int - NM*: comm_ptr, idx, gpid_ptr, is_remote get_local_upids : int NM : comm, local_upid_size, local_upids - upids_to_gpids : int - NM : size, remote_upid_size, remote_upids, remote_gpids + insert_upid: int + NM : lpid, upid, upid_len dynamic_send : int - NM : remote_gpid, tag, buf, size, timeout + NM : remote_lpid, tag, buf, size, timeout dynamic_recv : int NM : tag, buf-2, size, timeout + dynamic_sendrecv : int + NM : remote_lpid, tag, send_buf, send_size, recv_buf, 
recv_size, timeout mpi_comm_commit_pre_hook : int NM : comm SHM : comm @@ -477,8 +477,7 @@ PARAM: local_upid_size: int ** local_upids: char ** lock_type: int - gpid_ptr: uint64_t * - lpids: const int[] + lpid: MPIR_Lpid made_progress: int * message: MPIR_Request * message_p: MPIR_Request ** @@ -503,10 +502,9 @@ PARAM: recvcounts: const MPI_Aint * recvtype: MPI_Datatype recvtypes: const MPI_Datatype[] - remote_gpid: uint64_t - remote_gpids: uint64_t * - remote_upid_size: int * - remote_upids: char * + recv_buf: void * + recv_size: int + remote_lpid: MPIR_Lpid req: MPIR_Request * req_p: MPIR_Request ** result_addr: void * @@ -521,6 +519,8 @@ PARAM: sendcounts: const MPI_Aint * sendtype: MPI_Datatype sendtypes: const MPI_Datatype[] + send_buf: const void * + send_size: int size: int size_p: MPI_Aint * size-2: MPI_Aint @@ -538,6 +538,8 @@ PARAM: target_rank: int timeout: int type: MPIR_Datatype * + upid: const char * + upid_len: int vci: int void: win: MPIR_Win * diff --git a/src/mpid/ch4/include/mpidch4.h b/src/mpid/ch4/include/mpidch4.h index 3dd3528efbc..ba4b043b14a 100644 --- a/src/mpid/ch4/include/mpidch4.h +++ b/src/mpid/ch4/include/mpidch4.h @@ -26,7 +26,6 @@ int MPID_Comm_get_all_failed_procs(MPIR_Comm *, MPIR_Group **, int); int MPID_Comm_revoke(MPIR_Comm *, int); int MPID_Comm_failure_ack(MPIR_Comm *); MPL_STATIC_INLINE_PREFIX int MPID_Comm_AS_enabled(MPIR_Comm *) MPL_STATIC_INLINE_SUFFIX; -int MPID_Comm_get_lpid(MPIR_Comm *, int, uint64_t *, bool); int MPID_CS_finalize(void); int MPID_Finalize(void); int MPID_Get_universe_size(int *); @@ -167,8 +166,11 @@ int MPID_Type_commit_hook(MPIR_Datatype *); int MPID_Type_free_hook(MPIR_Datatype *); int MPID_Op_commit_hook(MPIR_Op *); int MPID_Op_free_hook(MPIR_Op *); -int MPID_Intercomm_exchange_map(MPIR_Comm *, int, MPIR_Comm *, int, int *, uint64_t **, int *); -int MPID_Create_intercomm_from_lpids(MPIR_Comm *, int, const uint64_t[]); +int MPID_Intercomm_exchange(MPIR_Comm * local_comm, int local_leader, + MPIR_Comm 
* peer_comm, int remote_leader, int tag, + int context_id, int *remote_context_id_out, + int *remote_size_out, MPIR_Lpid ** remote_lpids_out, int timeout); +int MPID_Create_intercomm_from_lpids(MPIR_Comm *, int, const MPIR_Lpid[]); int MPID_Comm_commit_pre_hook(MPIR_Comm *); int MPID_Comm_free_hook(MPIR_Comm *); int MPID_Comm_set_hints(MPIR_Comm *, MPIR_Info *); diff --git a/src/mpid/ch4/include/mpidpre.h b/src/mpid/ch4/include/mpidpre.h index 3735200e2c8..bab94b1bcf9 100644 --- a/src/mpid/ch4/include/mpidpre.h +++ b/src/mpid/ch4/include/mpidpre.h @@ -561,69 +561,6 @@ typedef struct MPIDIG_comm_t { #endif } MPIDIG_comm_t; -#define MPIDI_CALC_STRIDE(rank, stride, blocksize, offset) \ - ((rank) / (blocksize) * ((stride) - (blocksize)) + (rank) + (offset)) - -#define MPIDI_CALC_STRIDE_SIMPLE(rank, stride, offset) \ - ((rank) * (stride) + (offset)) - -typedef enum { - MPIDI_RANK_MAP_DIRECT, - MPIDI_RANK_MAP_DIRECT_INTRA, - MPIDI_RANK_MAP_OFFSET, - MPIDI_RANK_MAP_OFFSET_INTRA, - MPIDI_RANK_MAP_STRIDE, - MPIDI_RANK_MAP_STRIDE_INTRA, - MPIDI_RANK_MAP_STRIDE_BLOCK, - MPIDI_RANK_MAP_STRIDE_BLOCK_INTRA, - MPIDI_RANK_MAP_LUT, - MPIDI_RANK_MAP_LUT_INTRA, - MPIDI_RANK_MAP_MLUT, - MPIDI_RANK_MAP_NONE -} MPIDI_rank_map_mode; - -typedef int MPIDI_lpid_t; -typedef struct { - int avtid; - int lpid; -} MPIDI_gpid_t; - -typedef struct { - MPIR_cc_t ref_count; - MPIDI_lpid_t lpid[]; -} MPIDI_rank_map_lut_t; - -typedef struct { - MPIR_cc_t ref_count; - MPIDI_gpid_t gpid[]; -} MPIDI_rank_map_mlut_t; - -typedef struct { - MPIDI_rank_map_mode mode; - int avtid; - int size; - - union { - int offset; - struct { - int offset; - int stride; - int blocksize; - } stride; - } reg; - - union { - struct { - MPIDI_rank_map_lut_t *t; - MPIDI_lpid_t *lpid; - } lut; - struct { - MPIDI_rank_map_mlut_t *t; - MPIDI_gpid_t *gpid; - } mlut; - } irreg; -} MPIDI_rank_map_t; - typedef struct MPIDI_Devcomm_t { struct { /* The first fields are used by the AM(MPIDIG) apis */ @@ -638,8 +575,6 @@ typedef struct 
MPIDI_Devcomm_t { MPIDI_SHM_COMM_DECL} shm; #endif - MPIDI_rank_map_t map; - MPIDI_rank_map_t local_map; struct MPIR_Comm *multi_leads_comm; /* sub communicators related for multi-leaders based implementation */ struct MPIR_Comm *inter_node_leads_comm, *sub_node_comm, *intra_node_leads_comm; @@ -685,8 +620,7 @@ typedef struct { typedef struct { void *upid; int upid_len; - int avtid; - int lpid; + MPIR_Lpid lpid; UT_hash_handle hh; } MPIDI_upid_hash; #endif @@ -703,20 +637,6 @@ typedef struct MPIDI_av_entry { #define HAVE_DEV_COMM_HOOK -/* - * operation for (avtid, lpid) to/from gpid - */ -#define MPIDIU_LPID_BITS 32 -#define MPIDIU_LPID_MASK 0xFFFFFFFFU -#define MPIDIU_GPID_CREATE(avtid, lpid) (((uint64_t) (avtid) << MPIDIU_LPID_BITS) | (lpid)) -#define MPIDIU_GPID_GET_AVTID(gpid) ((gpid) >> MPIDIU_LPID_BITS) -#define MPIDIU_GPID_GET_LPID(gpid) ((gpid) & MPIDIU_LPID_MASK) - -#define MPIDI_DYNPROC_MASK (0x80000000U) - -#define MPID_INTERCOMM_NO_DYNPROC(comm) \ - (MPIDI_COMM((comm),map).avtid == 0 && MPIDI_COMM((comm),local_map).avtid == 0) - int MPIDI_check_for_failed_procs(void); #ifdef HAVE_SIGNAL diff --git a/src/mpid/ch4/netmod/ofi/init_addrxchg.c b/src/mpid/ch4/netmod/ofi/init_addrxchg.c index 7a1766df84e..a37e9bd1f62 100644 --- a/src/mpid/ch4/netmod/ofi/init_addrxchg.c +++ b/src/mpid/ch4/netmod/ofi/init_addrxchg.c @@ -131,9 +131,12 @@ int MPIDI_OFI_addr_exchange_root_ctx(void) (MPIDI_OFI_global.ctx[0].av, table, num_nodes, mapped_table, 0ULL, NULL), avmap); + if (mapped_table[0] == 0) { + MPIDI_OFI_global.lpid0 = node_roots[0]; + } for (int i = 0; i < num_nodes; i++) { MPIR_Assert(mapped_table[i] != FI_ADDR_NOTAVAIL); - MPIDI_OFI_AV(&MPIDIU_get_av(0, node_roots[i])).dest[0][0] = mapped_table[i]; + MPIDI_OFI_AV(MPIDIU_lpid_to_av(node_roots[i])).dest[0][0] = mapped_table[i]; } MPL_free(mapped_table); /* Then, allgather all address names using init_comm */ @@ -149,7 +152,7 @@ int MPIDI_OFI_addr_exchange_root_ctx(void) char *addrname = (char *) table + recv_bc_len 
* rank_map[i]; MPIDI_OFI_CALL(fi_av_insert(MPIDI_OFI_global.ctx[0].av, addrname, 1, &addr, 0ULL, NULL), avmap); - MPIDI_OFI_AV(&MPIDIU_get_av(0, i)).dest[0][0] = addr; + MPIDI_OFI_AV(MPIDIU_lpid_to_av(i)).dest[0][0] = addr; } } mpi_errno = MPIDU_bc_table_destroy(); @@ -161,9 +164,12 @@ int MPIDI_OFI_addr_exchange_root_ctx(void) MPIDI_OFI_CALL(fi_av_insert (MPIDI_OFI_global.ctx[0].av, table, size, mapped_table, 0ULL, NULL), avmap); + if (mapped_table[0] == 0) { + MPIDI_OFI_global.lpid0 = 0; + } for (int i = 0; i < size; i++) { MPIR_Assert(mapped_table[i] != FI_ADDR_NOTAVAIL); - MPIDI_OFI_AV(&MPIDIU_get_av(0, i)).dest[0][0] = mapped_table[i]; + MPIDI_OFI_AV(MPIDIU_lpid_to_av(i)).dest[0][0] = mapped_table[i]; } MPL_free(mapped_table); mpi_errno = MPIDU_bc_table_destroy(); @@ -173,7 +179,7 @@ int MPIDI_OFI_addr_exchange_root_ctx(void) /* check */ if (MPIDI_OFI_ENABLE_AV_TABLE) { for (int r = 0; r < size; r++) { - MPIDI_OFI_addr_t *av ATTRIBUTE((unused)) = &MPIDI_OFI_AV(&MPIDIU_get_av(0, r)); + MPIDI_OFI_addr_t *av ATTRIBUTE((unused)) = &MPIDI_OFI_AV(MPIDIU_lpid_to_av(r)); MPIR_Assert(av->dest[0][0] == get_root_av_table_index(r)); } } @@ -192,7 +198,7 @@ int MPIDI_OFI_addr_exchange_root_ctx(void) /* Macros to reduce clutter, so we can focus on the ordering logics. * Note: they are not perfectly wrapped, but tolerable since only used here. 
*/ #define GET_AV_AND_ADDRNAMES(rank) \ - MPIDI_OFI_addr_t *av ATTRIBUTE((unused)) = &MPIDI_OFI_AV(&MPIDIU_get_av(0, rank)); \ + MPIDI_OFI_addr_t *av ATTRIBUTE((unused)) = &MPIDI_OFI_AV(MPIDIU_lpid_to_av(rank)); \ char *r_names = all_names + rank * max_vcis * num_nics * name_len; #define DO_AV_INSERT(ctx_idx, nic, vci) \ @@ -346,7 +352,7 @@ int MPIDI_OFI_addr_exchange_all_ctx(void) #if MPIDI_CH4_MAX_VCIS > 1 if (MPIDI_OFI_ENABLE_AV_TABLE) { for (int r = 0; r < size; r++) { - MPIDI_OFI_addr_t *av ATTRIBUTE((unused)) = &MPIDI_OFI_AV(&MPIDIU_get_av(0, r)); + MPIDI_OFI_addr_t *av ATTRIBUTE((unused)) = &MPIDI_OFI_AV(MPIDIU_lpid_to_av(r)); for (int nic = 0; nic < num_nics; nic++) { for (int vci = 0; vci < NUM_VCIS_FOR_RANK(r); vci++) { MPIR_Assert(av->dest[nic][vci] == get_av_table_index(r, nic, vci, diff --git a/src/mpid/ch4/netmod/ofi/ofi_impl.h b/src/mpid/ch4/netmod/ofi/ofi_impl.h index a3ad267eee4..556a2e4110d 100644 --- a/src/mpid/ch4/netmod/ofi/ofi_impl.h +++ b/src/mpid/ch4/netmod/ofi/ofi_impl.h @@ -31,11 +31,6 @@ ATTRIBUTE((unused)); #define MPIDI_OFI_DT(dt) ((dt)->dev.netmod.ofi) #define MPIDI_OFI_OP(op) ((op)->dev.netmod.ofi) #define MPIDI_OFI_COMM(comm) ((comm)->dev.ch4.netmod.ofi) -#define MPIDI_OFI_COMM_TO_INDEX(comm,rank) \ - MPIDIU_comm_rank_to_pid(comm, rank, NULL, NULL) -#define MPIDI_OFI_TO_PHYS(avtid, lpid, _nic) \ - MPIDI_OFI_AV(&MPIDIU_get_av((avtid), (lpid))).dest[_nic][0] - #define MPIDI_OFI_WIN(win) ((win)->dev.netmod.ofi) #define MPIDI_OFI_NIC_NAME(nic) (MPIDI_OFI_global.prov_use[nic] ? 
\ diff --git a/src/mpid/ch4/netmod/ofi/ofi_init.c b/src/mpid/ch4/netmod/ofi/ofi_init.c index 634d3b7facb..651110c5bda 100644 --- a/src/mpid/ch4/netmod/ofi/ofi_init.c +++ b/src/mpid/ch4/netmod/ofi/ofi_init.c @@ -719,6 +719,9 @@ int MPIDI_OFI_init_local(int *tag_bits) mpi_errno = ofi_pvar_init(); MPIR_ERR_CHECK(mpi_errno); + /* A way to tell which av is empty */ + MPIDI_OFI_global.lpid0 = MPIR_LPID_INVALID; + /* -------------------------------- */ /* Set up the libfabric provider(s) */ /* -------------------------------- */ @@ -943,7 +946,7 @@ static int flush_send(int dst, int nic, int vci, MPIDI_OFI_dynamic_process_reque { int mpi_errno = MPI_SUCCESS; - fi_addr_t addr = MPIDI_OFI_av_to_phys(&MPIDIU_get_av(0, dst), nic, vci); + fi_addr_t addr = MPIDI_OFI_av_to_phys(MPIDIU_lpid_to_av(dst), nic, vci); static int data = 0; uint64_t match_bits = MPIDI_OFI_init_sendtag(MPIDI_OFI_FLUSH_CONTEXT_ID, 0, MPIDI_OFI_FLUSH_TAG); @@ -974,7 +977,7 @@ static int flush_recv(int src, int nic, int vci, MPIDI_OFI_dynamic_process_reque { int mpi_errno = MPI_SUCCESS; - fi_addr_t addr = MPIDI_OFI_av_to_phys(&MPIDIU_get_av(0, src), nic, vci); + fi_addr_t addr = MPIDI_OFI_av_to_phys(MPIDIU_lpid_to_av(src), nic, vci); uint64_t mask_bits = 0; uint64_t match_bits = MPIDI_OFI_init_sendtag(MPIDI_OFI_FLUSH_CONTEXT_ID, 0, MPIDI_OFI_FLUSH_TAG); @@ -1555,10 +1558,10 @@ static int try_open_shared_av(struct fid_domain *domain, struct fid_av **p_av, i /* directly references the mapped fi_addr_t array instead */ fi_addr_t *mapped_table = (fi_addr_t *) av_attr.map_addr; for (int i = 0; i < MPIR_Process.size; i++) { - MPIDI_OFI_AV(&MPIDIU_get_av(0, i)).dest[nic][0] = mapped_table[i]; + MPIDI_OFI_AV(MPIDIU_lpid_to_av(i)).dest[nic][0] = mapped_table[i]; MPL_DBG_MSG_FMT(MPIDI_CH4_DBG_MAP, VERBOSE, (MPL_DBG_FDEST, " grank mapped to: rank=%d, av=%p, dest=%" PRIu64, - i, (void *) &MPIDIU_get_av(0, i), mapped_table[i])); + i, (void *) MPIDIU_lpid_to_av(i), mapped_table[i])); } ret = 1; } diff --git 
a/src/mpid/ch4/netmod/ofi/ofi_proc.h b/src/mpid/ch4/netmod/ofi/ofi_proc.h index b23e6ec531d..c7ab1f2fb7f 100644 --- a/src/mpid/ch4/netmod/ofi/ofi_proc.h +++ b/src/mpid/ch4/netmod/ofi/ofi_proc.h @@ -20,20 +20,4 @@ MPL_STATIC_INLINE_PREFIX int MPIDI_NM_rank_is_local(int rank, MPIR_Comm * comm) return ret; } -MPL_STATIC_INLINE_PREFIX int MPIDI_NM_comm_get_gpid(MPIR_Comm * comm_ptr, - int idx, uint64_t * gpid_ptr, bool is_remote) -{ - int avtid = 0, lpid = 0; - if (comm_ptr->comm_kind == MPIR_COMM_KIND__INTRACOMM) - MPIDIU_comm_rank_to_pid(comm_ptr, idx, &lpid, &avtid); - else if (is_remote) - MPIDIU_comm_rank_to_pid(comm_ptr, idx, &lpid, &avtid); - else { - MPIDIU_comm_rank_to_pid_local(comm_ptr, idx, &lpid, &avtid); - } - - *gpid_ptr = MPIDIU_GPID_CREATE(avtid, lpid); - return MPI_SUCCESS; -} - #endif /* OFI_PROC_H_INCLUDED */ diff --git a/src/mpid/ch4/netmod/ofi/ofi_spawn.c b/src/mpid/ch4/netmod/ofi/ofi_spawn.c index 20adc54b3b1..187e1db4b2c 100644 --- a/src/mpid/ch4/netmod/ofi/ofi_spawn.c +++ b/src/mpid/ch4/netmod/ofi/ofi_spawn.c @@ -7,7 +7,11 @@ #include "ofi_impl.h" #include "ofi_noinline.h" -int MPIDI_OFI_dynamic_send(uint64_t remote_gpid, int tag, const void *buf, int size, int timeout) +/* NOTE: all these functions assume the caller to enter VCI-0 critical section */ + +static int cancel_dynamic_request(MPIDI_OFI_dynamic_process_request_t * dynamic_req, bool is_send); + +int MPIDI_OFI_dynamic_send(MPIR_Lpid remote_lpid, int tag, const void *buf, int size, int timeout) { int mpi_errno = MPI_SUCCESS; @@ -16,11 +20,7 @@ int MPIDI_OFI_dynamic_send(uint64_t remote_gpid, int tag, const void *buf, int s int nic = 0; /* dynamic process only use nic 0 */ int vci = 0; /* dynamic process only use vci 0 */ int ctx_idx = 0; - int avtid = MPIDIU_GPID_GET_AVTID(remote_gpid); - int lpid = MPIDIU_GPID_GET_LPID(remote_gpid); - fi_addr_t remote_addr = MPIDI_OFI_av_to_phys(&MPIDIU_get_av(avtid, lpid), nic, vci); - - MPID_THREAD_CS_ENTER(VCI, MPIDI_VCI(vci).lock); + fi_addr_t 
remote_addr = MPIDI_OFI_av_to_phys(MPIDIU_lpid_to_av_slow(remote_lpid), nic, vci); MPIDI_OFI_dynamic_process_request_t req; req.done = 0; @@ -52,24 +52,13 @@ int MPIDI_OFI_dynamic_send(uint64_t remote_gpid, int tag, const void *buf, int s if (!req.done) { /* time out, let's cancel the request */ - int rc; - rc = fi_cancel((fid_t) MPIDI_OFI_global.ctx[ctx_idx].tx, (void *) &req.context); - if (rc && rc != -FI_ENOENT) { - MPIR_ERR_CHKANDJUMP2(rc < 0, mpi_errno, MPI_ERR_OTHER, "**ofid_cancel", - "**ofid_cancel %s %s", MPIDI_OFI_DEFAULT_NIC_NAME, - fi_strerror(-rc)); - - } - while (!req.done) { - mpi_errno = MPIDI_OFI_progress_uninlined(vci); - MPIR_ERR_CHECK(mpi_errno); - } + mpi_errno = cancel_dynamic_request(&req, true); + MPIR_ERR_CHECK(mpi_errno); mpi_errno = MPIX_ERR_TIMEOUT; } fn_exit: - MPID_THREAD_CS_EXIT(VCI, MPIDI_VCI(vci).lock); return mpi_errno; fn_fail: goto fn_exit; @@ -91,8 +80,6 @@ int MPIDI_OFI_dynamic_recv(int tag, void *buf, int size, int timeout) match_bits = MPIDI_OFI_init_recvtag(&mask_bits, 0, MPI_ANY_SOURCE, tag); match_bits |= MPIDI_OFI_DYNPROC_SEND; - MPID_THREAD_CS_ENTER(VCI, MPIDI_VCI(vci).lock); - MPL_time_t time_start, time_now; double time_gap; MPL_wtime(&time_start); @@ -109,120 +96,128 @@ int MPIDI_OFI_dynamic_recv(int tag, void *buf, int size, int timeout) if (!req.done) { /* time out, let's cancel the request */ - int rc; - rc = fi_cancel((fid_t) MPIDI_OFI_global.ctx[ctx_idx].rx, (void *) &req.context); - if (rc && rc != -FI_ENOENT) { - MPIR_ERR_CHKANDJUMP2(rc < 0, mpi_errno, MPI_ERR_OTHER, "**ofid_cancel", - "**ofid_cancel %s %s", MPIDI_OFI_DEFAULT_NIC_NAME, - fi_strerror(-rc)); - - } - while (!req.done) { - mpi_errno = MPIDI_OFI_progress_uninlined(vci); - MPIR_ERR_CHECK(mpi_errno); - } + mpi_errno = cancel_dynamic_request(&req, false); + MPIR_ERR_CHECK(mpi_errno); mpi_errno = MPIX_ERR_TIMEOUT; } fn_exit: - MPID_THREAD_CS_EXIT(VCI, MPIDI_VCI(vci).lock); return mpi_errno; fn_fail: goto fn_exit; } -/* the following functions are 
"proc" functions, but because they are only used during dynamic - * process spawning, having them here provides better context */ - -int MPIDI_OFI_upids_to_gpids(int size, int *remote_upid_size, char *remote_upids, - uint64_t * remote_gpids) +int MPIDI_OFI_dynamic_sendrecv(MPIR_Lpid remote_lpid, int tag, + const void *send_buf, int send_size, void *recv_buf, int recv_size, + int timeout) { - int i, mpi_errno = MPI_SUCCESS; - int *new_avt_procs; - char **new_upids; - int n_new_procs = 0; - int n_avts; - char *curr_upid; - int nic = 0; - int ctx_idx = MPIDI_OFI_get_ctx_index(0, nic); - - MPIR_CHKLMEM_DECL(2); - - MPIR_CHKLMEM_MALLOC(new_avt_procs, int *, sizeof(int) * size, mpi_errno, "new_avt_procs", - MPL_MEM_ADDRESS); - MPIR_CHKLMEM_MALLOC(new_upids, char **, sizeof(char *) * size, mpi_errno, "new_upids", - MPL_MEM_ADDRESS); + int mpi_errno = MPI_SUCCESS; - n_avts = MPIDIU_get_n_avts(); + /* NOTE: dynamic_sendrecv is always called inside CS of vci 0 */ + int vci = 0; + int nic = 0; + int ctx_idx = 0; +#ifdef MPICH_DEBUG_MUTEX + MPID_THREAD_ASSERT_IN_CS(VCI, (*(MPID_Thread_mutex_t *) MPIR_Request_mem[vci].lock)); +#endif + + MPIDI_av_entry_t *av = MPIDIU_lpid_to_av_slow(remote_lpid); + fi_addr_t remote_addr = MPIDI_OFI_av_to_phys(av, nic, vci); + + MPIDI_OFI_dynamic_process_request_t send_req; + send_req.done = 0; + send_req.event_id = MPIDI_OFI_EVENT_DYNPROC_DONE; + + if (send_size > 0) { + uint64_t match_bits = MPIDI_OFI_DYNPROC_SEND | tag; + if (MPIDI_OFI_ENABLE_DATA) { + MPIDI_OFI_CALL_RETRY(fi_tsenddata(MPIDI_OFI_global.ctx[ctx_idx].tx, + send_buf, send_size, NULL, 0, + remote_addr, match_bits, (void *) &send_req.context), + vci, tsenddata); + } else { + MPIDI_OFI_CALL_RETRY(fi_tsend(MPIDI_OFI_global.ctx[ctx_idx].tx, + send_buf, send_size, NULL, + remote_addr, match_bits, (void *) &send_req.context), + vci, tsend); + } + } else { + send_req.done = 1; + } - curr_upid = remote_upids; - for (i = 0; i < size; i++) { - int j, k; - char tbladdr[FI_NAME_MAX]; - int 
found = 0; - size_t sz = 0; + MPIDI_OFI_dynamic_process_request_t recv_req; + recv_req.done = 0; + recv_req.event_id = MPIDI_OFI_EVENT_DYNPROC_DONE; + + if (recv_size > 0) { + uint64_t mask_bits = 0; + uint64_t match_bits = MPIDI_OFI_DYNPROC_SEND | tag; + MPIDI_OFI_CALL_RETRY(fi_trecv(MPIDI_OFI_global.ctx[ctx_idx].rx, + recv_buf, recv_size, NULL, + remote_addr, match_bits, mask_bits, &recv_req.context), + vci, trecv); + } else { + recv_req.done = 1; + } - char *hostname = curr_upid; - int hostname_len = strlen(hostname); - char *addrname = hostname + hostname_len + 1; - int addrname_len = remote_upid_size[i] - hostname_len - 1; + MPL_time_t time_start; + MPL_wtime(&time_start); + while (!send_req.done || !recv_req.done) { + mpi_errno = MPIDI_OFI_progress_uninlined(vci); + MPIR_ERR_CHECK(mpi_errno); - for (k = 0; k < n_avts; k++) { - if (MPIDIU_get_av_table(k) == NULL) { - continue; - } - for (j = 0; j < MPIDIU_get_av_table(k)->size; j++) { - sz = MPIDI_OFI_global.addrnamelen; - MPIDI_OFI_VCI_CALL(fi_av_lookup(MPIDI_OFI_global.ctx[ctx_idx].av, - MPIDI_OFI_TO_PHYS(k, j, nic), &tbladdr, &sz), 0, - avlookup); - if (sz == addrname_len && !memcmp(tbladdr, addrname, addrname_len)) { - remote_gpids[i] = MPIDIU_GPID_CREATE(k, j); - found = 1; - break; + if (timeout > 0) { + MPL_time_t time_now; + double time_gap; + MPL_wtime(&time_now); + MPL_wtime_diff(&time_start, &time_now, &time_gap); + if (time_gap > (double) timeout) { + /* timed out, cancel the operations */ + if (!send_req.done) { + mpi_errno = cancel_dynamic_request(&send_req, true); + MPIR_ERR_CHECK(mpi_errno); } - } - if (found) { + if (!recv_req.done) { + mpi_errno = cancel_dynamic_request(&recv_req, false); + MPIR_ERR_CHECK(mpi_errno); + } + + mpi_errno = MPIX_ERR_TIMEOUT; break; } } - - if (!found) { - new_avt_procs[n_new_procs] = i; - new_upids[n_new_procs] = curr_upid; - n_new_procs++; - } - curr_upid += remote_upid_size[i]; } - /* create new av_table, insert processes */ - if (n_new_procs > 0) { - int 
avtid; - mpi_errno = MPIDIU_new_avt(n_new_procs, &avtid); - MPIR_ERR_CHECK(mpi_errno); - - for (i = 0; i < n_new_procs; i++) { - char *hostname = new_upids[i]; - char *addrname = hostname + strlen(hostname) + 1; + fn_exit: + return mpi_errno; + fn_fail: + goto fn_exit; +} - fi_addr_t addr; - MPIDI_OFI_VCI_CALL(fi_av_insert(MPIDI_OFI_global.ctx[ctx_idx].av, addrname, - 1, &addr, 0ULL, NULL), 0, avmap); - MPIR_Assert(addr != FI_ADDR_NOTAVAIL); - MPIDI_OFI_AV(&MPIDIU_get_av(avtid, i)).dest[nic][0] = addr; +static int cancel_dynamic_request(MPIDI_OFI_dynamic_process_request_t * dynamic_req, bool is_send) +{ + int mpi_errno = MPI_SUCCESS; - int node_id; - mpi_errno = MPIR_nodeid_lookup(hostname, &node_id); - MPIR_ERR_CHECK(mpi_errno); - MPIDIU_get_av(avtid, i).node_id = node_id; + struct fid_ep *ep; + if (is_send) { + ep = MPIDI_OFI_global.ctx[0].tx; + } else { + ep = MPIDI_OFI_global.ctx[0].rx; + } + int rc; + rc = fi_cancel((fid_t) ep, (void *) &dynamic_req->context); + if (rc && rc != -FI_ENOENT) { + MPIR_ERR_CHKANDJUMP2(rc < 0, mpi_errno, MPI_ERR_OTHER, "**ofid_cancel", + "**ofid_cancel %s %s", MPIDI_OFI_DEFAULT_NIC_NAME, fi_strerror(-rc)); - remote_gpids[new_avt_procs[i]] = MPIDIU_GPID_CREATE(avtid, i); - } + } + while (!dynamic_req->done) { + mpi_errno = MPIDI_OFI_progress_uninlined(0); + MPIR_ERR_CHECK(mpi_errno); } fn_exit: - MPIR_CHKLMEM_FREEALL(); return mpi_errno; fn_fail: goto fn_exit; @@ -266,8 +261,8 @@ int MPIDI_OFI_get_local_upids(MPIR_Comm * comm, int **local_upid_size, char **lo size_t sz = MPIDI_OFI_global.addrnamelen;; MPIDI_OFI_addr_t *av = &MPIDI_OFI_AV(MPIDIU_comm_rank_to_av(comm, i)); - MPIDI_OFI_VCI_CALL(fi_av_lookup(MPIDI_OFI_global.ctx[ctx_idx].av, av->dest[nic][0], - temp_buf + idx, &sz), 0, avlookup); + MPIDI_OFI_CALL(fi_av_lookup(MPIDI_OFI_global.ctx[ctx_idx].av, av->dest[nic][0], + temp_buf + idx, &sz), avlookup); idx += (int) sz; (*local_upid_size)[i] = upid_len; @@ -282,3 +277,47 @@ int MPIDI_OFI_get_local_upids(MPIR_Comm * comm, int 
**local_upid_size, char **lo MPIR_CHKPMEM_REAP(); goto fn_exit; } + +int MPIDI_OFI_insert_upid(MPIR_Lpid lpid, const char *upid, int upid_len) +{ + int mpi_errno = MPI_SUCCESS; + + const char *hostname = upid; + MPIDI_av_entry_t *av = MPIDIU_lpid_to_av_slow(lpid); + + bool do_insert = false; + if (lpid & MPIR_LPID_DYNAMIC_MASK) { + do_insert = true; + } else if (MPIDI_OFI_AV(av).dest[0][0] == 0 && lpid != MPIDI_OFI_global.lpid0) { + MPIDI_av_entry_t *dynamic_av = MPIDIU_find_dynamic_av(upid, upid_len); + if (dynamic_av) { + /* just copy it over */ + MPIDI_OFI_AV(av).dest[0][0] = MPIDI_OFI_AV(dynamic_av).dest[0][0]; + } else { + do_insert = true; + } + + /* set node_id */ + int node_id; + mpi_errno = MPIR_nodeid_lookup(hostname, &node_id); + MPIR_ERR_CHECK(mpi_errno); + av->node_id = node_id; + } + + if (do_insert) { + const char *addrname = hostname + strlen(hostname) + 1; + /* new entry */ + MPIDI_OFI_CALL(fi_av_insert(MPIDI_OFI_global.ctx[0].av, addrname, + 1, &MPIDI_OFI_AV(av).dest[0][0], 0ULL, NULL), avmap); + MPIR_Assert(MPIDI_OFI_AV(av).dest[0][0] != FI_ADDR_NOTAVAIL); + } + + if (MPIDI_OFI_AV(av).dest[0][0] == 0) { + MPIDI_OFI_global.lpid0 = lpid; + } + + fn_exit: + return mpi_errno; + fn_fail: + goto fn_exit; +} diff --git a/src/mpid/ch4/netmod/ofi/ofi_types.h b/src/mpid/ch4/netmod/ofi/ofi_types.h index 9b1309fd0e5..cdfe66c5394 100644 --- a/src/mpid/ch4/netmod/ofi/ofi_types.h +++ b/src/mpid/ch4/netmod/ofi/ofi_types.h @@ -524,6 +524,9 @@ typedef struct { size_t addrnamelen; /* OFI uses the same name length within a provider. */ char pname[MPI_MAX_PROCESSOR_NAME]; int port_name_tag_mask[MPIR_MAX_CONTEXT_MASK]; + /* To support dynamic av tables, we need a way to tell which entries are empty. + * ch4 av tables are initialize to 0s. Thus we need know which "0" is valid. 
*/ + MPIR_Lpid lpid0; /* Capability settings */ #ifdef MPIDI_OFI_ENABLE_RUNTIME_CHECKS diff --git a/src/mpid/ch4/netmod/ucx/ucx_impl.h b/src/mpid/ch4/netmod/ucx/ucx_impl.h index d204383ac5b..e5d952e9ba9 100644 --- a/src/mpid/ch4/netmod/ucx/ucx_impl.h +++ b/src/mpid/ch4/netmod/ucx/ucx_impl.h @@ -19,7 +19,6 @@ #define MPIDI_UCX_COMM(comm) ((comm)->dev.ch4.netmod.ucx) #define MPIDI_UCX_REQ(req) ((req)->dev.ch4.netmod.ucx) -#define COMM_TO_INDEX(comm,rank) MPIDIU_comm_rank_to_pid(comm, rank, NULL, NULL) #define MPIDI_UCX_COMM_TO_EP(comm,rank,vci_src,vci_dst) \ MPIDI_UCX_AV(MPIDIU_comm_rank_to_av(comm, rank)).dest[vci_src][vci_dst] #define MPIDI_UCX_AV_TO_EP(av,vci_src,vci_dst) MPIDI_UCX_AV((av)).dest[vci_src][vci_dst] diff --git a/src/mpid/ch4/netmod/ucx/ucx_init.c b/src/mpid/ch4/netmod/ucx/ucx_init.c index fd7698bbebf..ad4c08c8363 100644 --- a/src/mpid/ch4/netmod/ucx/ucx_init.c +++ b/src/mpid/ch4/netmod/ucx/ucx_init.c @@ -102,9 +102,9 @@ static int initial_address_exchange(void) ep_params.address = (ucp_address_t *) ((char *) table + i * recv_bc_len); ucx_status = ucp_ep_create(MPIDI_UCX_global.ctx[0].worker, &ep_params, - &MPIDI_UCX_AV(&MPIDIU_get_av(0, node_roots[i])).dest[0][0]); + &MPIDI_UCX_AV(MPIDIU_lpid_to_av(node_roots[i])).dest[0][0]); MPIDI_UCX_CHK_STATUS(ucx_status); - MPIDIU_upidhash_add(ep_params.address, recv_bc_len, 0, node_roots[i]); + MPIDIU_upidhash_add(ep_params.address, recv_bc_len, node_roots[i]); } mpi_errno = MPIDU_bc_allgather(init_comm, MPIDI_UCX_global.ctx[0].if_address, (int) MPIDI_UCX_global.ctx[0].addrname_len, FALSE, @@ -117,9 +117,9 @@ static int initial_address_exchange(void) ep_params.field_mask = UCP_EP_PARAM_FIELD_REMOTE_ADDRESS; ep_params.address = (ucp_address_t *) ((char *) table + rank_map[i] * recv_bc_len); ucx_status = ucp_ep_create(MPIDI_UCX_global.ctx[0].worker, &ep_params, - &MPIDI_UCX_AV(&MPIDIU_get_av(0, i)).dest[0][0]); + &MPIDI_UCX_AV(MPIDIU_lpid_to_av(i)).dest[0][0]); MPIDI_UCX_CHK_STATUS(ucx_status); - 
MPIDIU_upidhash_add(ep_params.address, recv_bc_len, 0, i); + MPIDIU_upidhash_add(ep_params.address, recv_bc_len, i); } } mpi_errno = MPIDU_bc_table_destroy(); @@ -130,9 +130,9 @@ static int initial_address_exchange(void) ep_params.address = (ucp_address_t *) ((char *) table + i * recv_bc_len); ucx_status = ucp_ep_create(MPIDI_UCX_global.ctx[0].worker, &ep_params, - &MPIDI_UCX_AV(&MPIDIU_get_av(0, i)).dest[0][0]); + &MPIDI_UCX_AV(MPIDIU_lpid_to_av(i)).dest[0][0]); MPIDI_UCX_CHK_STATUS(ucx_status); - MPIDIU_upidhash_add(ep_params.address, recv_bc_len, 0, i); + MPIDIU_upidhash_add(ep_params.address, recv_bc_len, i); } mpi_errno = MPIDU_bc_table_destroy(); MPIR_ERR_CHECK(mpi_errno); @@ -180,7 +180,7 @@ static int all_vcis_address_exchange(void) ucp_ep_params_t ep_params; for (int vci_local = 0; vci_local < num_vcis; vci_local++) { for (int r = 0; r < size; r++) { - MPIDI_UCX_addr_t *av = &MPIDI_UCX_AV(&MPIDIU_get_av(0, r)); + MPIDI_UCX_addr_t *av = &MPIDI_UCX_AV(MPIDIU_lpid_to_av(r)); for (int vci_remote = 0; vci_remote < num_vcis; vci_remote++) { if (vci_local == 0 && vci_remote == 0) { /* don't overwrite existing addr, or bad things will happen */ @@ -369,7 +369,7 @@ int MPIDI_UCX_mpi_finalize_hook(void) int p = 0; for (int i = 0; i < MPIR_Process.size; i++) { - MPIDI_UCX_addr_t *av = &MPIDI_UCX_AV(&MPIDIU_get_av(0, i)); + MPIDI_UCX_addr_t *av = &MPIDI_UCX_AV(MPIDIU_lpid_to_av(i)); for (int vci_local = 0; vci_local < MPIDI_UCX_global.num_vcis; vci_local++) { for (int vci_remote = 0; vci_remote < MPIDI_UCX_global.num_vcis; vci_remote++) { ucp_request = ucp_disconnect_nb(av->dest[vci_local][vci_remote]); diff --git a/src/mpid/ch4/netmod/ucx/ucx_proc.h b/src/mpid/ch4/netmod/ucx/ucx_proc.h index 066670c014a..b8481ffd6a6 100644 --- a/src/mpid/ch4/netmod/ucx/ucx_proc.h +++ b/src/mpid/ch4/netmod/ucx/ucx_proc.h @@ -19,21 +19,4 @@ MPL_STATIC_INLINE_PREFIX int MPIDI_NM_rank_is_local(int rank, MPIR_Comm * comm) return ret; } -MPL_STATIC_INLINE_PREFIX int 
MPIDI_NM_comm_get_gpid(MPIR_Comm * comm_ptr, - int idx, uint64_t * gpid_ptr, bool is_remote) -{ - int avtid = 0, lpid = 0; - if (comm_ptr->comm_kind == MPIR_COMM_KIND__INTRACOMM) { - MPIDIU_comm_rank_to_pid(comm_ptr, idx, &lpid, &avtid); - } else if (is_remote) { - MPIDIU_comm_rank_to_pid(comm_ptr, idx, &lpid, &avtid); - } else { - MPIDIU_comm_rank_to_pid_local(comm_ptr, idx, &lpid, &avtid); - } - - *gpid_ptr = MPIDIU_GPID_CREATE(avtid, lpid); - return MPI_SUCCESS; - -} - #endif /* UCX_PROC_H_INCLUDED */ diff --git a/src/mpid/ch4/netmod/ucx/ucx_spawn.c b/src/mpid/ch4/netmod/ucx/ucx_spawn.c index 05e888d5639..5cb4de4ce76 100644 --- a/src/mpid/ch4/netmod/ucx/ucx_spawn.c +++ b/src/mpid/ch4/netmod/ucx/ucx_spawn.c @@ -20,18 +20,14 @@ static void dynamic_recv_cb(void *request, ucs_status_t status, *done = true; } -int MPIDI_UCX_dynamic_send(uint64_t remote_gpid, int tag, const void *buf, int size, int timeout) +int MPIDI_UCX_dynamic_send(MPIR_Lpid remote_lpid, int tag, const void *buf, int size, int timeout) { int mpi_errno = MPI_SUCCESS; uint64_t ucx_tag = MPIDI_UCX_DYNPROC_MASK + tag; int vci = 0; - MPID_THREAD_CS_ENTER(VCI, MPIDI_VCI(vci).lock); - - int avtid = MPIDIU_GPID_GET_AVTID(remote_gpid); - int lpid = MPIDIU_GPID_GET_LPID(remote_gpid); - ucp_ep_h ep = MPIDI_UCX_AV_TO_EP(&MPIDIU_get_av(avtid, lpid), vci, vci); + ucp_ep_h ep = MPIDI_UCX_AV_TO_EP(MPIDIU_lpid_to_av(remote_lpid), vci, vci); bool done = false; ucp_request_param_t param = { @@ -68,7 +64,6 @@ int MPIDI_UCX_dynamic_send(uint64_t remote_gpid, int tag, const void *buf, int s } fn_exit: - MPID_THREAD_CS_EXIT(VCI, MPIDI_VCI(vci).lock); return mpi_errno; } @@ -80,8 +75,6 @@ int MPIDI_UCX_dynamic_recv(int tag, void *buf, int size, int timeout) uint64_t tag_mask = 0xffffffffffffffff; int vci = 0; - MPID_THREAD_CS_ENTER(VCI, MPIDI_VCI(vci).lock); - bool done = false; ucp_request_param_t param = { .op_attr_mask = UCP_OP_ATTR_FIELD_CALLBACK | UCP_OP_ATTR_FIELD_USER_DATA, @@ -117,7 +110,93 @@ int 
MPIDI_UCX_dynamic_recv(int tag, void *buf, int size, int timeout) } fn_exit: - MPID_THREAD_CS_EXIT(VCI, MPIDI_VCI(vci).lock); + return mpi_errno; +} + +int MPIDI_UCX_dynamic_sendrecv(MPIR_Lpid remote_lpid, int tag, + const void *send_buf, int send_size, void *recv_buf, int recv_size, + int timeout) +{ + int mpi_errno = MPI_SUCCESS; + + /* NOTE: dynamic_sendrecv is always called inside CS of vci 0 */ + int vci = 0; +#ifdef MPICH_DEBUG_MUTEX + MPID_THREAD_ASSERT_IN_CS(VCI, (*(MPID_Thread_mutex_t *) MPIR_Request_mem[vci].lock)); +#endif + + uint64_t ucx_tag = MPIDI_UCX_DYNPROC_MASK + tag; + uint64_t tag_mask = 0xffffffffffffffff; /* for recv */ + MPIDI_av_entry_t *av = MPIDIU_lpid_to_av_slow(remote_lpid); + ucp_ep_h ep = MPIDI_UCX_AV_TO_EP(av, vci, vci); + + ucs_status_ptr_t status = UCS_OK; + + /* send */ + bool send_done = false; + if (send_size > 0) { + ucp_request_param_t send_param = { + .op_attr_mask = UCP_OP_ATTR_FIELD_CALLBACK | UCP_OP_ATTR_FIELD_USER_DATA, + .cb.send = dynamic_send_cb, + .user_data = &send_done, + }; + + status = ucp_tag_send_nbx(ep, send_buf, send_size, ucx_tag, &send_param); + if (status == UCS_OK) { + send_done = true; + } else if (UCS_PTR_IS_ERR(status)) { + /* FIXME: better error */ + mpi_errno = MPI_ERR_PORT; + goto fn_exit; + } + } else { + send_done = true; + } + + /* recv */ + bool recv_done = false; + if (recv_size > 0) { + ucp_request_param_t recv_param = { + .op_attr_mask = UCP_OP_ATTR_FIELD_CALLBACK | UCP_OP_ATTR_FIELD_USER_DATA, + .cb.recv = dynamic_recv_cb, + .user_data = &recv_done, + }; + + status = ucp_tag_recv_nbx(MPIDI_UCX_global.ctx[vci].worker, recv_buf, recv_size, + ucx_tag, tag_mask, &recv_param); + if (status == UCS_OK) { + recv_done = true; + } else if (UCS_PTR_IS_ERR(status)) { + /* FIXME: better error */ + mpi_errno = MPI_ERR_PORT; + goto fn_exit; + } + } else { + recv_done = true; + } + + /* wait */ + MPL_time_t time_start; + MPL_wtime(&time_start); + while (!send_done || !recv_done) { + 
ucp_worker_progress(MPIDI_UCX_global.ctx[vci].worker); + + if (timeout > 0) { + MPL_time_t time_now; + double time_gap; + MPL_wtime(&time_now); + MPL_wtime_diff(&time_start, &time_now, &time_gap); + if (time_gap > (double) timeout) { + mpi_errno = MPIX_ERR_TIMEOUT; + break; + } + } + } + + fn_exit: + if (status != UCS_OK) { + ucp_request_release(status); + } return mpi_errno; } @@ -147,58 +226,41 @@ int MPIDI_UCX_get_local_upids(MPIR_Comm * comm, int **local_upid_size, char **lo goto fn_exit; } -int MPIDI_UCX_upids_to_gpids(int size, int *remote_upid_size, char *remote_upids, - uint64_t * remote_gpids) +int MPIDI_UCX_insert_upid(MPIR_Lpid lpid, const char *upid, int upid_len) { int mpi_errno = MPI_SUCCESS; - - int n_new_procs = 0; - int *new_avt_procs; - char **new_upids; - int vci = 0; - MPIR_CHKLMEM_DECL(2); - - MPIR_CHKLMEM_MALLOC(new_avt_procs, int *, sizeof(int) * size, mpi_errno, "new_avt_procs", - MPL_MEM_ADDRESS); - MPIR_CHKLMEM_MALLOC(new_upids, char **, sizeof(char *) * size, mpi_errno, "new_upids", - MPL_MEM_ADDRESS); - - char *curr_upid = remote_upids; - for (int i = 0; i < size; i++) { - MPIDI_upid_hash *t = MPIDIU_upidhash_find(curr_upid, remote_upid_size[i]); - if (t) { - remote_gpids[i] = MPIDIU_GPID_CREATE(t->avtid, t->lpid); + MPIDI_av_entry_t *av = MPIDIU_lpid_to_av_slow(lpid); + + bool is_dynamic = (lpid & MPIR_LPID_DYNAMIC_MASK); + bool do_insert = false; + if (is_dynamic) { + do_insert = true; + } else if (!MPIDI_UCX_AV(av).dest[0][0]) { + MPIDI_av_entry_t *dynamic_av = MPIDIU_find_dynamic_av(upid, upid_len); + if (dynamic_av) { + /* just copy it over */ + MPIDI_UCX_AV(av).dest[0][0] = MPIDI_UCX_AV(dynamic_av).dest[0][0]; } else { - new_avt_procs[n_new_procs] = i; - new_upids[n_new_procs] = curr_upid; - n_new_procs++; - + do_insert = true; } - curr_upid += remote_upid_size[i]; } - /* create new av_table, insert processes */ - if (n_new_procs > 0) { - int avtid; - mpi_errno = MPIDIU_new_avt(n_new_procs, &avtid); - MPIR_ERR_CHECK(mpi_errno); - - 
for (int i = 0; i < n_new_procs; i++) { - ucp_ep_params_t ep_params; - ucs_status_t ucx_status; - ep_params.field_mask = UCP_EP_PARAM_FIELD_REMOTE_ADDRESS; - ep_params.address = (ucp_address_t *) new_upids[i]; - ucx_status = ucp_ep_create(MPIDI_UCX_global.ctx[vci].worker, &ep_params, - &MPIDI_UCX_AV(&MPIDIU_get_av(avtid, i)).dest[0][0]); - MPIDI_UCX_CHK_STATUS(ucx_status); - MPIDIU_upidhash_add(new_upids[i], remote_upid_size[new_avt_procs[i]], avtid, i); - - remote_gpids[new_avt_procs[i]] = MPIDIU_GPID_CREATE(avtid, i); - } + if (do_insert) { + /* new entry */ + ucp_ep_params_t ep_params; + ucs_status_t ucx_status; + ep_params.field_mask = UCP_EP_PARAM_FIELD_REMOTE_ADDRESS; + ep_params.address = (ucp_address_t *) upid; + ucx_status = ucp_ep_create(MPIDI_UCX_global.ctx[0].worker, &ep_params, + &MPIDI_UCX_AV(av).dest[0][0]); + MPIDI_UCX_CHK_STATUS(ucx_status); + } + + if (!is_dynamic) { + MPIDIU_upidhash_add(upid, upid_len, lpid); } fn_exit: - MPIR_CHKLMEM_FREEALL(); return mpi_errno; fn_fail: goto fn_exit; diff --git a/src/mpid/ch4/shm/ipc/gpu/gpu_post.c b/src/mpid/ch4/shm/ipc/gpu/gpu_post.c index 55bfe3ca03e..b2ab09ec9cc 100644 --- a/src/mpid/ch4/shm/ipc/gpu/gpu_post.c +++ b/src/mpid/ch4/shm/ipc/gpu/gpu_post.c @@ -370,7 +370,7 @@ int MPIDI_GPU_get_ipc_attr(const void *buf, MPI_Aint count, MPI_Datatype datatyp ipc_attr->ipc_type = MPIDI_IPCI_TYPE__GPU; if (remote_rank != MPI_PROC_NULL) { - remote_rank = MPIDI_GPUI_global.local_ranks[MPIDIU_rank_to_lpid(remote_rank, comm)]; + remote_rank = MPIDI_GPUI_global.local_ranks[MPIDIU_get_grank(remote_rank, comm)]; } ipc_attr->u.gpu.remote_rank = remote_rank; diff --git a/src/mpid/ch4/shm/posix/posix_am.h b/src/mpid/ch4/shm/posix/posix_am.h index 5ddc2a45ec1..a0f3d10ebe6 100644 --- a/src/mpid/ch4/shm/posix/posix_am.h +++ b/src/mpid/ch4/shm/posix/posix_am.h @@ -85,7 +85,7 @@ MPL_STATIC_INLINE_PREFIX int MPIDI_POSIX_am_isend(int rank, { int mpi_errno = MPI_SUCCESS; MPIDI_POSIX_am_header_t msg_hdr; - const int grank = 
MPIDIU_rank_to_lpid(rank, comm); + const int grank = MPIDIU_get_grank(rank, comm); MPIR_FUNC_ENTER; @@ -180,7 +180,7 @@ MPL_STATIC_INLINE_PREFIX int MPIDI_POSIX_am_send_hdr(int rank, MPIR_Comm * comm, { int mpi_errno = MPI_SUCCESS; MPIDI_POSIX_am_header_t msg_hdr; - const int grank = MPIDIU_rank_to_lpid(rank, comm); + const int grank = MPIDIU_get_grank(rank, comm); MPIR_FUNC_ENTER; diff --git a/src/mpid/ch4/shm/posix/posix_pre.h b/src/mpid/ch4/shm/posix/posix_pre.h index 1357eed5895..98ddad5cf17 100644 --- a/src/mpid/ch4/shm/posix/posix_pre.h +++ b/src/mpid/ch4/shm/posix/posix_pre.h @@ -120,7 +120,7 @@ do { \ #define MPIDI_POSIX_EAGER_RECV_POSTED_HOOK(request,rank,communicator)\ do { \ - int grank_ = ((rank) >= 0) ? MPIDIU_rank_to_lpid((rank), (communicator)) : (rank); \ + int grank_ = ((rank) >= 0) ? MPIDIU_get_grank((rank), (communicator)) : (rank); \ (request)->dev.ch4.am.shm_am.posix.eager_recv_posted_hook_grank = grank_; \ MPIDI_POSIX_eager_recv_posted_hook(grank_); \ } while (0) diff --git a/src/mpid/ch4/shm/posix/posix_send.h b/src/mpid/ch4/shm/posix/posix_send.h index 3b66e77b716..cda37c20d07 100644 --- a/src/mpid/ch4/shm/posix/posix_send.h +++ b/src/mpid/ch4/shm/posix/posix_send.h @@ -64,7 +64,7 @@ MPL_STATIC_INLINE_PREFIX int MPIDI_POSIX_mpi_isend(const void *buf, MPI_Aint cou am_hdr.data_sz = data_sz; am_hdr.rndv_hdr_sz = 0; - int grank = MPIDIU_rank_to_lpid(rank, comm); + int grank = MPIDIU_get_grank(rank, comm); MPI_Aint bytes_sent; int rc = MPIDI_POSIX_eager_send(grank, &msg_hdr, &am_hdr, sizeof(am_hdr), buf, count, datatype, 0, vci_src, vci_dst, &bytes_sent); diff --git a/src/mpid/ch4/src/ch4_comm.c b/src/mpid/ch4/src/ch4_comm.c index 808d6f6e21b..c84bc1ea89f 100644 --- a/src/mpid/ch4/src/ch4_comm.c +++ b/src/mpid/ch4/src/ch4_comm.c @@ -118,68 +118,17 @@ int MPIDI_Comm_split_type(MPIR_Comm * user_comm_ptr, int split_type, int key, MP /* --END ERROR HANDLING-- */ } -static void mlut_update_avt_reference(int size, MPIDI_gpid_t * gpid, bool is_release) 
-{ - int n_avts = MPIDIU_get_n_avts(); - int *uniq_avtids = (int *) MPL_calloc(n_avts, sizeof(int), MPL_MEM_ADDRESS); - for (int i = 0; i < size; i++) { - if (uniq_avtids[gpid[i].avtid] == 0) { - uniq_avtids[gpid[i].avtid] = 1; - if (is_release) { - MPIDIU_avt_release_ref(gpid[i].avtid); - } else { - MPIDIU_avt_add_ref(gpid[i].avtid); - } - } - } - MPL_free(uniq_avtids); -} - int MPID_Comm_commit_pre_hook(MPIR_Comm * comm) { int mpi_errno; MPIR_FUNC_ENTER; - if (comm == MPIR_Process.comm_world) { - MPIDI_COMM(comm, map).mode = MPIDI_RANK_MAP_DIRECT_INTRA; - MPIDI_COMM(comm, map).avtid = 0; - MPIDI_COMM(comm, map).size = MPIR_Process.size; - MPIDI_COMM(comm, local_map).mode = MPIDI_RANK_MAP_NONE; - MPIDIU_avt_add_ref(0); + MPIR_Assert(comm->local_group); + MPIR_Assert(comm->comm_kind == MPIR_COMM_KIND__INTRACOMM || comm->remote_group); + if (comm == MPIR_Process.comm_world) { mpi_errno = MPIDI_world_pre_init(); MPIR_ERR_CHECK(mpi_errno); - } else if (comm == MPIR_Process.comm_self) { - MPIDI_COMM(comm, map).mode = MPIDI_RANK_MAP_OFFSET_INTRA; - MPIDI_COMM(comm, map).avtid = 0; - MPIDI_COMM(comm, map).size = 1; - MPIDI_COMM(comm, map).reg.offset = MPIR_Process.rank; - MPIDI_COMM(comm, local_map).mode = MPIDI_RANK_MAP_NONE; - MPIDIU_avt_add_ref(0); - } else { - MPIDI_comm_create_rank_map(comm); - /* add ref to avts */ - switch (MPIDI_COMM(comm, map).mode) { - case MPIDI_RANK_MAP_NONE: - break; - case MPIDI_RANK_MAP_MLUT: - mlut_update_avt_reference(MPIDI_COMM(comm, map).size, - MPIDI_COMM(comm, map).irreg.mlut.gpid, false); - break; - default: - MPIDIU_avt_add_ref(MPIDI_COMM(comm, map).avtid); - } - - switch (MPIDI_COMM(comm, local_map).mode) { - case MPIDI_RANK_MAP_NONE: - break; - case MPIDI_RANK_MAP_MLUT: - mlut_update_avt_reference(MPIDI_COMM(comm, local_map).size, - MPIDI_COMM(comm, local_map).irreg.mlut.gpid, false); - break; - default: - MPIDIU_avt_add_ref(MPIDI_COMM(comm, local_map).avtid); - } } MPIDI_COMM(comm, multi_leads_comm) = NULL; @@ -306,46 +255,6 @@ 
int MPID_Comm_free_hook(MPIR_Comm * comm) MPL_free(MPIDI_COMM(comm, allreduce_comp_info)); } - - - /* release ref to avts */ - switch (MPIDI_COMM(comm, map).mode) { - case MPIDI_RANK_MAP_NONE: - break; - case MPIDI_RANK_MAP_MLUT: - mlut_update_avt_reference(MPIDI_COMM(comm, map).size, - MPIDI_COMM(comm, map).irreg.mlut.gpid, true); - break; - default: - MPIDIU_avt_release_ref(MPIDI_COMM(comm, map).avtid); - } - - switch (MPIDI_COMM(comm, local_map).mode) { - case MPIDI_RANK_MAP_NONE: - break; - case MPIDI_RANK_MAP_MLUT: - mlut_update_avt_reference(MPIDI_COMM(comm, local_map).size, - MPIDI_COMM(comm, local_map).irreg.mlut.gpid, true); - break; - default: - MPIDIU_avt_release_ref(MPIDI_COMM(comm, local_map).avtid); - } - - if (MPIDI_COMM(comm, map).mode == MPIDI_RANK_MAP_LUT - || MPIDI_COMM(comm, map).mode == MPIDI_RANK_MAP_LUT_INTRA) { - MPIDIU_release_lut(MPIDI_COMM(comm, map).irreg.lut.t); - } - if (MPIDI_COMM(comm, local_map).mode == MPIDI_RANK_MAP_LUT - || MPIDI_COMM(comm, local_map).mode == MPIDI_RANK_MAP_LUT_INTRA) { - MPIDIU_release_lut(MPIDI_COMM(comm, local_map).irreg.lut.t); - } - if (MPIDI_COMM(comm, map).mode == MPIDI_RANK_MAP_MLUT) { - MPIDIU_release_mlut(MPIDI_COMM(comm, map).irreg.mlut.t); - } - if (MPIDI_COMM(comm, local_map).mode == MPIDI_RANK_MAP_MLUT) { - MPIDIU_release_mlut(MPIDI_COMM(comm, local_map).irreg.mlut.t); - } - mpi_errno = MPIDI_NM_mpi_comm_free_hook(comm); MPIR_ERR_CHECK(mpi_errno); #ifndef MPIDI_CH4_DIRECT_NETMOD @@ -390,323 +299,390 @@ int MPID_Comm_set_hints(MPIR_Comm * comm_ptr, MPIR_Info * info_ptr) goto fn_exit; } -int MPID_Intercomm_exchange_map(MPIR_Comm * local_comm, int local_leader, MPIR_Comm * peer_comm, - int remote_leader, int *remote_size, uint64_t ** remote_gpids, - int *is_low_group) +/* Stages of forming inter communicator: + * 0. establish leader communication - get dynamic_av via PMI, peer_comm, or connect/accept. + * 1. leader exchange data. + * 2. leader broadcast over local_comm. 
+ */ +static int leader_exchange(MPIR_Comm * local_comm, MPIR_Lpid remote_lpid, int tag, + int context_id, int *remote_data_size_out, void **remote_data_out, + int timeout); +static int prepare_local_lpids(MPIR_Comm * local_comm, MPIR_Lpid ** lpids_out, + int *num_worlds_out, int **worlds_out); +static void convert_local_lpids(int local_size, MPIR_Lpid * lpids, int num_worlds, int *worlds); +static int prepare_local_data(int local_size, int context_id, MPIR_Lpid * lpids, + int num_worlds, int *world_idx_array, + int *upid_sizes, char *upids, int *data_size_out, void **data_out); +static int extract_remote_data(void *remote_data, int *remote_size_out, + int *remote_context_id_out, MPIR_Lpid ** remote_lpids_out, + int **remote_upid_sizes_out, char **remote_upids_out); + +int MPID_Intercomm_exchange(MPIR_Comm * local_comm, int local_leader, + MPIR_Comm * peer_comm, int remote_leader, int tag, + int context_id, int *remote_context_id_out, + int *remote_size_out, MPIR_Lpid ** remote_lpids_out, int timeout) { int mpi_errno = MPI_SUCCESS; - int i; - int avtid = 0, lpid = -1; - int local_avtid = 0, remote_avtid = 0; - int local_size_send = 0, remote_size_recv = 0; - int cts_tag = 0; - int pure_intracomm = 1; - int local_size = 0; - uint64_t *local_gpids = NULL; - int *local_upid_size = NULL, *remote_upid_size = NULL; - int upid_send_size = 0, upid_recv_size = 0; - char *local_upids = NULL, *remote_upids = NULL; - - /* - * CH4 only cares about GPID. 
UPID extraction and exchange should be done - * by netmod - */ MPIR_FUNC_ENTER; - MPIR_CHKPMEM_DECL(1); - MPIR_CHKLMEM_DECL(5); - - cts_tag = 0 | MPIR_TAG_COLL_BIT; - local_size = local_comm->local_size; + bool is_local_leader = (local_comm->rank == local_leader); + struct bcast_data_t { + int mpi_errno; + int remote_data_size; + }; + struct bcast_data_t bcast_data; + + /* Stage 1: exchange between leaders */ + int remote_data_size = 0; + void *remote_data = NULL; + if (is_local_leader) { + MPID_THREAD_CS_ENTER(VCI, MPIDI_VCI(0).lock); + MPIR_Lpid remote_lpid = MPIR_comm_rank_to_lpid(peer_comm, remote_leader); + mpi_errno = leader_exchange(local_comm, remote_lpid, tag, context_id, + &remote_data_size, &remote_data, timeout); + MPID_THREAD_CS_EXIT(VCI, MPIDI_VCI(0).lock); + } - /* - * Stage 1: UPID exchange and GPID conversion in leaders - */ - if (local_comm->rank == local_leader) { - /* We need to check all processes in local group to decide there - * is no dynamic spawned process. */ - for (i = 0; i < local_size; i++) { - MPIDIU_comm_rank_to_pid(local_comm, i, &lpid, &local_avtid); - if (local_avtid > 0) { - pure_intracomm = 0; - break; - } - } - if (pure_intracomm) { - /* check if remote leader is dynamic spawned process */ - MPIDIU_comm_rank_to_pid(peer_comm, remote_leader, &lpid, &remote_avtid); - if (remote_avtid > 0) - pure_intracomm = 0; - } - local_size_send = local_size; - if (!pure_intracomm) { - /* embedded dynamic process info in size */ - local_size_send |= MPIDI_DYNPROC_MASK; - } + /* Stage 2: Broadcast inside local_group */ + if (is_local_leader) { + bcast_data.mpi_errno = mpi_errno; + bcast_data.remote_data_size = remote_data_size; + } + mpi_errno = MPIR_Bcast_impl(&bcast_data, 2, MPI_INT, local_leader, local_comm, MPIR_ERR_NONE); + MPIR_ERR_CHECK(mpi_errno); - MPL_DBG_MSG_FMT(MPIDI_CH4_DBG_COMM, VERBOSE, - (MPL_DBG_FDEST, "rank %d sendrecv to rank %d", - peer_comm->rank, remote_leader)); - mpi_errno = MPIC_Sendrecv(&local_size_send, 1, MPI_INT, - 
remote_leader, cts_tag, - &remote_size_recv, 1, MPI_INT, - remote_leader, cts_tag, peer_comm, MPI_STATUS_IGNORE, - MPIR_ERR_NONE); + /* error checking of previous leader exchange */ + if (is_local_leader) { + mpi_errno = bcast_data.mpi_errno; MPIR_ERR_CHECK(mpi_errno); + } else { + MPIR_ERR_CHKANDJUMP(bcast_data.mpi_errno, mpi_errno, MPI_ERR_PORT, "**spawn"); + remote_data_size = bcast_data.remote_data_size; + } - if (remote_size_recv & MPIDI_DYNPROC_MASK) - pure_intracomm = 0; - (*remote_size) = remote_size_recv & (~MPIDI_DYNPROC_MASK); - - MPL_DBG_MSG_FMT(MPIDI_CH4_DBG_COMM, VERBOSE, - (MPL_DBG_FDEST, "local size = %d, remote size = %d, pure_intracomm = %d", - local_size, *remote_size, pure_intracomm)); - - MPIR_CHKPMEM_MALLOC((*remote_gpids), uint64_t *, (*remote_size) * sizeof(uint64_t), - mpi_errno, "remote_gpids", MPL_MEM_ADDRESS); - MPIR_CHKLMEM_MALLOC(local_gpids, uint64_t *, local_size * sizeof(uint64_t), - mpi_errno, "local_gpids", MPL_MEM_ADDRESS); - for (i = 0; i < local_size; i++) { - MPIDIU_comm_rank_to_pid(local_comm, i, &lpid, &avtid); - local_gpids[i] = MPIDIU_GPID_CREATE(avtid, lpid); - } + /* bcast remote data */ + if (!is_local_leader) { + remote_data = MPL_malloc(remote_data_size, MPL_MEM_OTHER); + MPIR_ERR_CHKANDJUMP(!remote_data, mpi_errno, MPI_ERR_OTHER, "**nomem"); + } - /* TODO: optimizations -- - * if local_size is 1, we can skip send and local bcast; - * if remote_size is 1, we can skip recv. 
- */ - MPL_DBG_MSG_FMT(MPIDI_CH4_DBG_COMM, VERBOSE, - (MPL_DBG_FDEST, "Intercomm map exchange stage 1: leaders")); - if (!pure_intracomm) { - /* Stage 1.1 UPID exchange between leaders */ - MPIR_CHKLMEM_MALLOC(remote_upid_size, int *, (*remote_size) * sizeof(int), - mpi_errno, "remote_upid_size", MPL_MEM_ADDRESS); - - mpi_errno = MPIDI_NM_get_local_upids(local_comm, &local_upid_size, &local_upids); - MPIR_ERR_CHECK(mpi_errno); - mpi_errno = MPIC_Sendrecv(local_upid_size, local_size, MPI_INT, - remote_leader, cts_tag, - remote_upid_size, *remote_size, MPI_INT, - remote_leader, cts_tag, - peer_comm, MPI_STATUS_IGNORE, MPIR_ERR_NONE); - MPIR_ERR_CHECK(mpi_errno); - upid_send_size = 0; - for (i = 0; i < local_size; i++) - upid_send_size += local_upid_size[i]; - upid_recv_size = 0; - for (i = 0; i < *remote_size; i++) - upid_recv_size += remote_upid_size[i]; - MPIR_CHKLMEM_MALLOC(remote_upids, char *, upid_recv_size * sizeof(char), - mpi_errno, "remote_upids", MPL_MEM_ADDRESS); - mpi_errno = MPIC_Sendrecv(local_upids, upid_send_size, MPI_BYTE, - remote_leader, cts_tag, - remote_upids, upid_recv_size, MPI_BYTE, - remote_leader, cts_tag, - peer_comm, MPI_STATUS_IGNORE, MPIR_ERR_NONE); - MPIR_ERR_CHECK(mpi_errno); + mpi_errno = MPIR_Bcast_impl(remote_data, remote_data_size, MPI_BYTE, local_leader, local_comm, + MPIR_ERR_NONE); + MPIR_ERR_CHECK(mpi_errno); - /* Stage 1.2 convert remote UPID to GPID and get GPID for local group */ - MPIDIU_upids_to_gpids(*remote_size, remote_upid_size, remote_upids, *remote_gpids); - } else { - /* Stage 1.1f only exchange GPIDS if no dynamic process involved */ - mpi_errno = MPIC_Sendrecv(local_gpids, local_size, MPI_UINT64_T, - remote_leader, cts_tag, - *remote_gpids, *remote_size, MPI_UINT64_T, - remote_leader, cts_tag, - peer_comm, MPI_STATUS_IGNORE, MPIR_ERR_NONE); - MPIR_ERR_CHECK(mpi_errno); - } - /* Stage 1.3 check if local/remote groups are disjoint */ - - /* - * Error checking for this routine requires care. 
Because this - * routine is collective over two different sets of processes, - * it is relatively easy for the user to try to create an - * intercommunicator from two overlapping groups of processes. - * This is made more likely by inconsistencies in the MPI-1 - * specification (clarified in MPI-2) that seemed to allow - * the groups to overlap. Because of that, we first check that the - * groups are in fact disjoint before performing any collective - * operations. - */ + /* Stage 3: Each process extract data (if necessary: add worlds, convert lpids) */ + MPIR_Lpid *remote_lpids; + int *remote_upid_sizes; + char *remote_upids; + MPID_THREAD_CS_ENTER(VCI, MPIDI_VCI(0).lock); + /* need be inside CS because we are potentially introducing new worlds */ + mpi_errno = extract_remote_data(remote_data, remote_size_out, remote_context_id_out, + &remote_lpids, &remote_upid_sizes, &remote_upids); + MPID_THREAD_CS_EXIT(VCI, MPIDI_VCI(0).lock); + MPIR_ERR_CHECK(mpi_errno); #ifdef HAVE_ERROR_CHECKING - { - MPID_BEGIN_ERROR_CHECKS; - { - /* Now that we have both the local and remote processes, - * check for any overlap */ - mpi_errno = MPIDI_check_disjoint_gpids(local_gpids, local_size, - *remote_gpids, *remote_size); - MPIR_ERR_CHECK(mpi_errno); - } - MPID_END_ERROR_CHECKS; - } -#endif /* HAVE_ERROR_CHECKING */ - - /* - * Make an arbitrary decision about which group of process is - * the low group. 
The LEADERS do this by comparing the - * local process ids of the 0th member of the two groups - * GPID itself is not enough for determine is_low_group because both - * local group is always smaller than remote - */ - if (pure_intracomm) { - *is_low_group = local_gpids[0] < (*remote_gpids)[0]; - } else { - if (local_upid_size[0] == remote_upid_size[0]) { - *is_low_group = memcmp(local_upids, remote_upids, local_upid_size[0]); - MPIR_Assert(*is_low_group != 0); - if (*is_low_group < 0) - *is_low_group = 0; - else - *is_low_group = 1; - } else { - *is_low_group = local_upid_size[0] < remote_upid_size[0]; - } - } - - /* At this point, we're done with the local lpids; they'll - * be freed with the other local memory on exit */ - local_gpids = NULL; + /* Now that we have both the local and remote processes, + * check for any overlap */ + MPIR_Lpid *local_lpids; + local_lpids = MPL_malloc(local_comm->local_size * sizeof(MPIR_Lpid), MPL_MEM_GROUP); + MPIR_ERR_CHKANDJUMP(!local_lpids, mpi_errno, MPI_ERR_OTHER, "**nomem"); + for (int i = 0; i < local_comm->local_size; i++) { + local_lpids[i] = MPIR_Group_rank_to_lpid(local_comm->local_group, i); } - /* - * Stage 2. 
Bcast UPID to non-leaders (intra-group) - */ - MPL_DBG_MSG_FMT(MPIDI_CH4_DBG_COMM, VERBOSE, - (MPL_DBG_FDEST, "Intercomm map exchange stage 2: intra-group")); - mpi_errno = MPIDIU_Intercomm_map_bcast_intra(local_comm, local_leader, - remote_size, is_low_group, pure_intracomm, - remote_upid_size, remote_upids, remote_gpids); + mpi_errno = MPIDI_check_disjoint_lpids(local_lpids, local_comm->local_size, + remote_lpids, *remote_size_out); + MPL_free(local_lpids); + MPIR_ERR_CHECK(mpi_errno); +#endif + + /* insert upids */ + char *upid = remote_upids; + MPID_THREAD_CS_ENTER(VCI, MPIDI_VCI(0).lock); + for (int i = 0; i < *remote_size_out; i++) { + mpi_errno = MPIDI_NM_insert_upid(remote_lpids[i], upid, remote_upid_sizes[i]); + if (mpi_errno) { + break; + } + upid += remote_upid_sizes[i]; + } + MPID_THREAD_CS_EXIT(VCI, MPIDI_VCI(0).lock); MPIR_ERR_CHECK(mpi_errno); - MPIR_CHKPMEM_COMMIT(); + /* make a copy of remote_lpids (because it points to remote_data and it will freed) */ + *remote_lpids_out = MPL_malloc((*remote_size_out) * sizeof(MPIR_Lpid), MPL_MEM_GROUP); + MPIR_ERR_CHKANDJUMP(!(*remote_lpids_out), mpi_errno, MPI_ERR_OTHER, "**nomem"); + memcpy(*remote_lpids_out, remote_lpids, (*remote_size_out) * sizeof(MPIR_Lpid)); + + MPL_free(remote_data); + fn_exit: - MPL_free(local_upid_size); - MPL_free(local_upids); - MPIR_CHKLMEM_FREEALL(); MPIR_FUNC_EXIT; return mpi_errno; fn_fail: - MPIR_CHKPMEM_REAP(); - *remote_gpids = NULL; goto fn_exit; } -int MPIDIU_Intercomm_map_bcast_intra(MPIR_Comm * local_comm, int local_leader, int *remote_size, - int *is_low_group, int pure_intracomm, - int *remote_upid_size, char *remote_upids, - uint64_t ** remote_gpids) +/* Allocate and fill local lpids data. We assume remote will be from + * different worlds, so we need worlds info so remote can match worlds + * and convert lpids. 
+ */ +static int prepare_local_lpids(MPIR_Comm * local_comm, MPIR_Lpid ** lpids_out, + int *num_worlds_out, int **worlds_out) { int mpi_errno = MPI_SUCCESS; - int i; - int upid_recv_size = 0; - int map_info[4]; - int *_remote_upid_size = NULL; - char *_remote_upids = NULL; - MPIR_CHKPMEM_DECL(1); - MPIR_CHKLMEM_DECL(3); + int local_size = local_comm->local_size; - MPIR_FUNC_ENTER; + MPIR_Lpid *lpids; + lpids = MPL_malloc(local_size * sizeof(MPIR_Lpid), MPL_MEM_GROUP); + MPIR_ERR_CHKANDJUMP(!lpids, mpi_errno, MPI_ERR_OTHER, "**nomem"); - if (local_comm->rank == local_leader) { - if (!pure_intracomm) { - for (i = 0; i < (*remote_size); i++) { - upid_recv_size += remote_upid_size[i]; + /* a make-shift hash for world_idx's, consider typically only a few worlds (or just 0) + * It is OK to use static array here because the entire leader exchange will be + * under (VCI 0) critical section. + */ +#define MAX_WORLDS 100 + static int world_hash[MAX_WORLDS] = { 0 }; + int num_worlds = 0; + + for (int i = 0; i < local_size; i++) { + lpids[i] = MPIR_Group_rank_to_lpid(local_comm->local_group, i); + int world_idx = MPIR_LPID_WORLD_INDEX(lpids[i]); + + bool found = false; + for (int j = 0; j < num_worlds; j++) { + if (world_hash[j] == world_idx) { + found = true; + break; } } - map_info[0] = *remote_size; - map_info[1] = upid_recv_size; - map_info[2] = *is_low_group; - map_info[3] = pure_intracomm; - mpi_errno = - MPIR_Bcast_allcomm_auto(map_info, 4, MPI_INT, local_leader, local_comm, MPIR_ERR_NONE); - MPIR_ERR_CHECK(mpi_errno); - - if (!pure_intracomm) { - mpi_errno = MPIR_Bcast_allcomm_auto(remote_upid_size, *remote_size, MPI_INT, - local_leader, local_comm, MPIR_ERR_NONE); - MPIR_ERR_CHECK(mpi_errno); - mpi_errno = MPIR_Bcast_allcomm_auto(remote_upids, upid_recv_size, MPI_BYTE, - local_leader, local_comm, MPIR_ERR_NONE); - MPIR_ERR_CHECK(mpi_errno); - } else { - mpi_errno = MPIR_Bcast_allcomm_auto(*remote_gpids, *remote_size, MPI_UINT64_T, - local_leader, local_comm, 
MPIR_ERR_NONE); - } - } else { - mpi_errno = - MPIR_Bcast_allcomm_auto(map_info, 4, MPI_INT, local_leader, local_comm, MPIR_ERR_NONE); - MPIR_ERR_CHECK(mpi_errno); - *remote_size = map_info[0]; - upid_recv_size = map_info[1]; - *is_low_group = map_info[2]; - pure_intracomm = map_info[3]; - - MPIR_CHKPMEM_MALLOC((*remote_gpids), uint64_t *, (*remote_size) * sizeof(uint64_t), - mpi_errno, "remote_gpids", MPL_MEM_COMM); - if (!pure_intracomm) { - MPIR_CHKLMEM_MALLOC(_remote_upid_size, int *, (*remote_size) * sizeof(int), - mpi_errno, "_remote_upid_size", MPL_MEM_COMM); - mpi_errno = MPIR_Bcast_allcomm_auto(_remote_upid_size, *remote_size, MPI_INT, - local_leader, local_comm, MPIR_ERR_NONE); - MPIR_ERR_CHECK(mpi_errno); - MPIR_CHKLMEM_MALLOC(_remote_upids, char *, upid_recv_size * sizeof(char), - mpi_errno, "_remote_upids", MPL_MEM_COMM); - mpi_errno = MPIR_Bcast_allcomm_auto(_remote_upids, upid_recv_size, MPI_BYTE, - local_leader, local_comm, MPIR_ERR_NONE); - MPIR_ERR_CHECK(mpi_errno); - - MPIDIU_upids_to_gpids(*remote_size, _remote_upid_size, _remote_upids, *remote_gpids); - } else { - mpi_errno = MPIR_Bcast_allcomm_auto(*remote_gpids, *remote_size, MPI_UINT64_T, - local_leader, local_comm, MPIR_ERR_NONE); + if (!found) { + world_hash[num_worlds++] = world_idx; + MPIR_Assert(num_worlds < MAX_WORLDS); } } - MPIR_CHKPMEM_COMMIT(); fn_exit: - MPIR_CHKLMEM_FREEALL(); - MPIR_FUNC_EXIT; + *lpids_out = lpids; + *num_worlds_out = num_worlds; + *worlds_out = world_hash; return mpi_errno; fn_fail: - MPIR_CHKPMEM_REAP(); - *remote_gpids = NULL; goto fn_exit; } -int MPID_Create_intercomm_from_lpids(MPIR_Comm * newcomm_ptr, int size, const uint64_t lpids[]) +static void convert_local_lpids(int local_size, MPIR_Lpid * lpids, int num_worlds, int *worlds) +{ + for (int i = 0; i < local_size; i++) { + int world_idx = MPIR_LPID_WORLD_INDEX(lpids[i]); + int world_rank = MPIR_LPID_WORLD_RANK(lpids[i]); + int transit_world_idx = -1; + for (int j = 0; j < num_worlds; j++) { + if 
(worlds[j] == world_idx) { + transit_world_idx = j; + break; + } + } + MPIR_Assert(transit_world_idx >= 0); + lpids[i] = MPIR_LPID_FROM(transit_world_idx, world_rank); + } +} + +static int prepare_local_data(int local_size, int context_id, MPIR_Lpid * lpids, + int num_worlds, int *world_idx_array, + int *upid_sizes, char *upids, int *data_size_out, void **data_out) { - int mpi_errno = MPI_SUCCESS, i; + int mpi_errno = MPI_SUCCESS; + + /* layout: + * local_size + * context_id + * lpids[local_size] + * num_worlds + * namespace[num_worlds][MPIR_NAMESPACE_MAX] + * world_sizes[num_worlds] + * upid_sizes[local_size] + * upids[] + */ + int total_upid_size = 0; + for (int i = 0; i < local_size; i++) { + total_upid_size += upid_sizes[i]; + } + + int len = 0; + len += sizeof(int) * 2 + local_size * sizeof(MPIR_Lpid); + len += sizeof(int) + num_worlds * MPIR_NAMESPACE_MAX + num_worlds * sizeof(int); + len += local_size * sizeof(int); + len += total_upid_size; + + char *data = MPL_malloc(len, MPL_MEM_OTHER); + char *s = data; + + *(int *) (s) = local_size; + s += sizeof(int); + *(int *) (s) = context_id; + s += sizeof(int); + + memcpy(s, lpids, local_size * sizeof(MPIR_Lpid)); + s += local_size * sizeof(MPIR_Lpid); + + *(int *) (s) = num_worlds; + s += sizeof(int); + for (int i = 0; i < num_worlds; i++) { + strncpy(s, MPIR_Worlds[world_idx_array[i]].namespace, MPIR_NAMESPACE_MAX); + s += MPIR_NAMESPACE_MAX; + } + for (int i = 0; i < num_worlds; i++) { + *(int *) (s) = MPIR_Worlds[world_idx_array[i]].num_procs; + s += sizeof(int); + } + + memcpy(s, upid_sizes, local_size * sizeof(int)); + s += local_size * sizeof(int); + + memcpy(s, upids, total_upid_size); + + *data_size_out = len; + *data_out = data; + + return mpi_errno; +} + +/* NOTE: will add worlds and convert lpids if necessary */ +static int extract_remote_data(void *remote_data, int *remote_size_out, + int *remote_context_id_out, MPIR_Lpid ** remote_lpids_out, + int **remote_upid_sizes_out, char **remote_upids_out) +{ 
+ int mpi_errno = MPI_SUCCESS; + char *s = remote_data; + + *remote_size_out = *(int *) s; + s += sizeof(int); + int remote_size = *remote_size_out; + + *remote_context_id_out = *(int *) s; + s += sizeof(int); + + *remote_lpids_out = (void *) s; + s += remote_size * sizeof(MPIR_Lpid); + + int num_worlds = *(int *) s; + s += sizeof(int); + + char *p_worlds = s; + s += num_worlds * MPIR_NAMESPACE_MAX; + + int *p_world_sizes = (void *) s; + s += num_worlds * sizeof(int); + + *remote_upid_sizes_out = (void *) s; + s += remote_size * sizeof(int); + + *remote_upids_out = s; + + /* Find or add new worlds */ + int world_hash[MAX_WORLDS]; + for (int i = 0; i < num_worlds; i++) { + char *namespace = p_worlds + i * MPIR_NAMESPACE_MAX; + world_hash[i] = MPIR_find_world(namespace); + if (world_hash[i] == -1) { + world_hash[i] = MPIR_add_world(namespace, p_world_sizes[i]); + } + } + + /* convert remote lpids */ + for (int i = 0; i < remote_size; i++) { + MPIR_Lpid lpid = (*remote_lpids_out)[i]; + int world_idx = MPIR_LPID_WORLD_INDEX(lpid); + int world_rank = MPIR_LPID_WORLD_RANK(lpid); + (*remote_lpids_out)[i] = MPIR_LPID_FROM(world_hash[world_idx], world_rank); + } + + return mpi_errno; +} + +/* exchange data between leaders */ +static int leader_exchange(MPIR_Comm * local_comm, MPIR_Lpid remote_lpid, int tag, int context_id, + int *remote_data_size_out, void **remote_data_out, int timeout) +{ + int mpi_errno = MPI_SUCCESS; + MPIR_CHKLMEM_DECL(4); MPIR_FUNC_ENTER; - MPIDI_rank_map_mlut_t *mlut = NULL; - MPIDI_COMM(newcomm_ptr, map).mode = MPIDI_RANK_MAP_MLUT; - MPIDI_COMM(newcomm_ptr, map).avtid = -1; - mpi_errno = MPIDIU_alloc_mlut(&mlut, size); + /* I am the leader of local_comm, remote_lpid is the remote leader of remote_comm. + * + * 1. Send data sizes + * 2. 
Send data + * + * Future optimizations + * * Eager mode + * * Optionally skip upids exchange + */ + + /* local prepare */ + int local_size = local_comm->local_size; + MPIR_Lpid *local_lpids; + int num_local_worlds; + int *local_worlds; + mpi_errno = prepare_local_lpids(local_comm, &local_lpids, &num_local_worlds, &local_worlds); MPIR_ERR_CHECK(mpi_errno); - MPIDI_COMM(newcomm_ptr, map).size = size; - MPIDI_COMM(newcomm_ptr, map).irreg.mlut.t = mlut; - MPIDI_COMM(newcomm_ptr, map).irreg.mlut.gpid = mlut->gpid; + MPIR_CHKLMEM_ADD(local_lpids); - for (i = 0; i < size; i++) { - MPIDI_COMM(newcomm_ptr, map).irreg.mlut.gpid[i].avtid = MPIDIU_GPID_GET_AVTID(lpids[i]); - MPIDI_COMM(newcomm_ptr, map).irreg.mlut.gpid[i].lpid = MPIDIU_GPID_GET_LPID(lpids[i]); - MPL_DBG_MSG_FMT(MPIDI_CH4_DBG_MAP, VERBOSE, - (MPL_DBG_FDEST, " remote rank=%d, avtid=%d, lpid=%d", i, - MPIDI_COMM(newcomm_ptr, map).irreg.mlut.gpid[i].avtid, - MPIDI_COMM(newcomm_ptr, map).irreg.mlut.gpid[i].lpid)); - } + /* convert local world_idx to transit world_idx */ + convert_local_lpids(local_size, local_lpids, num_local_worlds, local_worlds); + + int *local_upid_sizes; + char *local_upids; + mpi_errno = MPIDI_NM_get_local_upids(local_comm, &local_upid_sizes, &local_upids); + MPIR_ERR_CHECK(mpi_errno); + MPIR_CHKLMEM_ADD(local_upid_sizes); + MPIR_CHKLMEM_ADD(local_upids); + + int local_data_size; + void *local_data; + mpi_errno = prepare_local_data(local_size, context_id, local_lpids, + num_local_worlds, local_worlds, local_upid_sizes, local_upids, + &local_data_size, &local_data); + MPIR_ERR_CHECK(mpi_errno); + MPIR_CHKLMEM_ADD(local_data); + + /* exchange */ + int remote_data_size; + void *remote_data; + mpi_errno = MPIDI_NM_dynamic_sendrecv(remote_lpid, tag, &local_data_size, sizeof(int), + &remote_data_size, sizeof(int), timeout); + MPIR_ERR_CHECK(mpi_errno); + + remote_data = MPL_malloc(remote_data_size, MPL_MEM_OTHER); + MPIR_ERR_CHKANDJUMP(!remote_data, mpi_errno, MPI_ERR_OTHER, "**nomem"); + + 
mpi_errno = MPIDI_NM_dynamic_sendrecv(remote_lpid, tag, local_data, local_data_size, + remote_data, remote_data_size, timeout); + MPIR_ERR_CHECK(mpi_errno); + + *remote_data_size_out = remote_data_size; + *remote_data_out = remote_data; fn_exit: + MPIR_CHKLMEM_FREEALL(); MPIR_FUNC_EXIT; return mpi_errno; fn_fail: goto fn_exit; } +/* ---- */ +int MPID_Create_intercomm_from_lpids(MPIR_Comm * newcomm_ptr, int size, const MPIR_Lpid lpids[]) +{ + int mpi_errno = MPI_SUCCESS; + + /* Assuming MPID_Intercomm_exchange already called, nothing to do here. */ + + return mpi_errno; +} + /* Create multi-leaders communicator */ /* Create a comm with rank 0 of each node. A comm with rank 1 of each node and so on. Since these * new comms do no overlap, it uses the same context id */ @@ -784,8 +760,9 @@ int MPIDI_Comm_create_multi_leaders(MPIR_Comm * comm) MPL_pof2(MPIDI_COMM(comm, multi_leads_comm)->local_size); MPIDI_COMM(comm, multi_leads_comm)->remote_size = num_external; - MPIR_Comm_map_irregular(MPIDI_COMM(comm, multi_leads_comm), comm, - external_procs, num_external, MPIR_COMM_MAP_DIR__L2L, NULL); + mpi_errno = MPIR_Group_incl_impl(comm->local_group, num_external, external_procs, + &MPIDI_COMM(comm, multi_leads_comm)->local_group); + MPIR_ERR_CHECK(mpi_errno); /* Notify device of communicator creation */ mpi_errno = MPID_Comm_commit_pre_hook(MPIDI_COMM(comm, multi_leads_comm)); @@ -801,8 +778,6 @@ int MPIDI_Comm_create_multi_leaders(MPIR_Comm * comm) mpi_errno = MPID_Comm_commit_post_hook(MPIDI_COMM(comm, multi_leads_comm)); if (mpi_errno) MPIR_ERR_CHECK(mpi_errno); - - MPIR_Comm_map_free(MPIDI_COMM(comm, multi_leads_comm)); } } diff --git a/src/mpid/ch4/src/ch4_comm.h b/src/mpid/ch4/src/ch4_comm.h index dfc9e18967a..45cb787459c 100644 --- a/src/mpid/ch4/src/ch4_comm.h +++ b/src/mpid/ch4/src/ch4_comm.h @@ -15,6 +15,9 @@ MPL_STATIC_INLINE_PREFIX int MPIDI_set_comm_hint_sender_vci(MPIR_Comm * comm, in MPL_STATIC_INLINE_PREFIX int MPIDI_set_comm_hint_receiver_vci(MPIR_Comm * 
comm, int type, int value); MPL_STATIC_INLINE_PREFIX int MPIDI_set_comm_hint_vci(MPIR_Comm * comm, int type, int value); +int MPIDI_Intercomm_exchange(MPIR_Comm * local_comm, int local_leader, MPIR_Lpid remote_lpid, + int tag, int context_id, int *remote_context_id_out, + int *remote_size_out, MPIR_Lpid ** remote_lpids_out); int MPIDI_Comm_create_multi_leaders(MPIR_Comm * comm); int MPIDI_Comm_create_multi_leader_subcomms(MPIR_Comm * comm, int num_leads); diff --git a/src/mpid/ch4/src/ch4_impl.h b/src/mpid/ch4/src/ch4_impl.h index 8991052f1a5..f8da365d187 100644 --- a/src/mpid/ch4/src/ch4_impl.h +++ b/src/mpid/ch4/src/ch4_impl.h @@ -13,10 +13,6 @@ #include "ch4_self.h" #include "ch4_vci.h" -int MPIDIU_Intercomm_map_bcast_intra(MPIR_Comm * local_comm, int local_leader, int *remote_size, - int *is_low_group, int pure_intracomm, - int *remote_upid_size, char *remote_upids, - uint64_t ** remote_gpids); int MPIDIG_get_context_index(uint64_t context_id); uint64_t MPIDIG_generate_win_id(MPIR_Comm * comm_ptr); @@ -378,16 +374,19 @@ MPL_STATIC_INLINE_PREFIX void MPIDIG_win_hash_clear(MPIR_Win * win) /* We assume this routine is never called with rank=MPI_PROC_NULL. 
*/ MPL_STATIC_INLINE_PREFIX int MPIDIU_valid_group_rank(MPIR_Comm * comm, int rank, MPIR_Group * grp) { - uint64_t gpid; + MPIR_Lpid lpid; int size = grp->size; int z; int ret; MPIR_FUNC_ENTER; - MPIDI_NM_comm_get_gpid(comm, rank, &gpid, FALSE); + lpid = MPIR_comm_rank_to_lpid(comm, rank); - for (z = 0; z < size && gpid != grp->lrank_to_lpid[z].lpid; ++z) { + for (z = 0; z < size; ++z) { + if (lpid == MPIR_Group_rank_to_lpid(grp, z)) { + break; + } } ret = (z < size); diff --git a/src/mpid/ch4/src/ch4_init.c b/src/mpid/ch4/src/ch4_init.c index 365a12b37ad..58af18a41ba 100644 --- a/src/mpid/ch4/src/ch4_init.c +++ b/src/mpid/ch4/src/ch4_init.c @@ -820,7 +820,7 @@ int MPID_Finalize(void) MPIDU_genq_private_pool_destroy(MPIDI_global.gpu_coll_pool); - MPIDIU_avt_destroy(); + MPIDIU_avt_finalize(); mpi_errno = MPIDU_Init_shm_finalize(); MPIR_ERR_CHECK(mpi_errno); @@ -1073,26 +1073,6 @@ int MPID_Free_mem(void *user_buf) goto fn_exit; } -int MPID_Comm_get_lpid(MPIR_Comm * comm_ptr, int idx, uint64_t * lpid_ptr, bool is_remote) -{ - int mpi_errno = MPI_SUCCESS; - int avtid = 0, lpid = 0; - MPIR_FUNC_ENTER; - - if (comm_ptr->comm_kind == MPIR_COMM_KIND__INTRACOMM) - MPIDIU_comm_rank_to_pid(comm_ptr, idx, &lpid, &avtid); - else if (is_remote) - MPIDIU_comm_rank_to_pid(comm_ptr, idx, &lpid, &avtid); - else { - MPIDIU_comm_rank_to_pid_local(comm_ptr, idx, &lpid, &avtid); - } - - *lpid_ptr = MPIDIU_GPID_CREATE(avtid, lpid); - - MPIR_FUNC_EXIT; - return mpi_errno; -} - int MPID_Get_node_id(MPIR_Comm * comm, int rank, int *id_p) { int mpi_errno = MPI_SUCCESS; diff --git a/src/mpid/ch4/src/ch4_proc.c b/src/mpid/ch4/src/ch4_proc.c index 56cb70b48c1..c9442caf662 100644 --- a/src/mpid/ch4/src/ch4_proc.c +++ b/src/mpid/ch4/src/ch4_proc.c @@ -102,8 +102,6 @@ int MPIDIU_new_avt(int size, int *avtid) } MPIDI_global.avt_mgr.av_tables[*avtid] = new_av_table; - MPIR_cc_set(&MPIDI_global.avt_mgr.av_tables[*avtid]->ref_count, 0); - MPIR_FUNC_EXIT; return mpi_errno; } @@ -124,32 +122,8 @@ int 
MPIDIU_free_avt(int avtid) return mpi_errno; } -int MPIDIU_avt_add_ref(int avtid) -{ - MPIR_FUNC_ENTER; - - MPL_DBG_MSG_FMT(MPIDI_CH4_DBG_GENERAL, VERBOSE, (MPL_DBG_FDEST, " incr avtid=%d", avtid)); - MPIR_cc_inc(&MPIDI_global.avt_mgr.av_tables[avtid]->ref_count); - - MPIR_FUNC_EXIT; - return MPI_SUCCESS; -} - -int MPIDIU_avt_release_ref(int avtid) -{ - int in_use; - - MPIR_FUNC_ENTER; - - MPL_DBG_MSG_FMT(MPIDI_CH4_DBG_GENERAL, VERBOSE, (MPL_DBG_FDEST, " decr avtid=%d", avtid)); - MPIR_cc_decr(&MPIDI_global.avt_mgr.av_tables[avtid]->ref_count, &in_use); - if (!in_use) { - MPIDIU_free_avt(avtid); - } - - MPIR_FUNC_EXIT; - return MPI_SUCCESS; -} +static void init_dynamic_av_table(void); +static void destroy_dynamic_av_table(void); int MPIDIU_avt_init(void) { @@ -175,7 +149,6 @@ int MPIDIU_avt_init(void) #endif MPIDI_global.avt_mgr.av_table0->size = size; - MPIR_cc_set(&MPIDI_global.avt_mgr.av_table0->ref_count, 1); for (int i = 0; i < size; i++) { MPIDI_global.avt_mgr.av_table0->table[i].is_local = @@ -185,21 +158,24 @@ int MPIDIU_avt_init(void) MPIDI_global.avt_mgr.av_tables[0] = MPIDI_global.avt_mgr.av_table0; + init_dynamic_av_table(); + MPIR_FUNC_EXIT; return mpi_errno; } -int MPIDIU_avt_destroy(void) +int MPIDIU_avt_finalize(void) { MPIR_FUNC_ENTER; for (int i = 0; i < MPIDI_global.avt_mgr.n_avts; i++) { if (MPIDI_global.avt_mgr.av_tables[i] != NULL) { - MPIDIU_avt_release_ref(i); - /*TODO: Check all references is cleared and the entry is set to NULL */ + MPIDIU_free_avt(i); } } + destroy_dynamic_av_table(); + MPL_free(MPIDI_global.avt_mgr.av_tables); memset(&MPIDI_global.avt_mgr, 0, sizeof(MPIDI_global.avt_mgr)); @@ -207,151 +183,158 @@ int MPIDIU_avt_destroy(void) return MPI_SUCCESS; } -#ifdef MPIDI_BUILD_CH4_UPID_HASH -/* Store the upid, avtid, lpid in a hash to support get_local_upids and upids_to_lupids */ -static MPIDI_upid_hash *upid_hash = NULL; - -void MPIDIU_upidhash_add(const void *upid, int upid_len, int avtid, int lpid) -{ - MPIDI_upid_hash *t; - t 
= MPL_malloc(sizeof(MPIDI_upid_hash), MPL_MEM_OTHER); - t->avtid = avtid; - t->lpid = lpid; - t->upid = MPL_malloc(upid_len, MPL_MEM_OTHER); - memcpy(t->upid, upid, upid_len); - t->upid_len = upid_len; - HASH_ADD_KEYPTR(hh, upid_hash, t->upid, upid_len, t, MPL_MEM_OTHER); - - MPIDIU_get_av(avtid, lpid).hash = t; - /* Do not free avt while we use upidhash - FIXME: improve it */ - MPIDIU_avt_add_ref(avtid); -} +#define MPIDIU_DYN_AV_TABLE MPIDI_global.avt_mgr.dynamic_av_table +#define MPIDIU_DYN_AV(idx) (MPIDI_av_entry_t *)((char *) MPIDI_global.avt_mgr.dynamic_av_table.table + (idx) * sizeof(MPIDI_av_entry_t)) -MPIDI_upid_hash *MPIDIU_upidhash_find(const void *upid, int upid_len) +static void init_dynamic_av_table(void) { - MPIDI_upid_hash *t; - HASH_FIND(hh, upid_hash, upid, upid_len, t); - return t; + /* allocate dynamic_av_table */ + int table_size = MPIDIU_DYNAMIC_AV_MAX * sizeof(MPIDI_av_entry_t); + MPIDIU_DYN_AV_TABLE.table = MPL_malloc(table_size, MPL_MEM_ADDRESS); + MPIDIU_DYN_AV_TABLE.size = 0; } -void MPIDIU_upidhash_free(void) +static void destroy_dynamic_av_table(void) { - MPIDI_upid_hash *cur, *tmp; - HASH_ITER(hh, upid_hash, cur, tmp) { - HASH_DEL(upid_hash, cur); - MPIDIU_avt_release_ref(cur->avtid); - MPL_free(cur->upid); - MPL_free(cur); - } + MPIR_Assert(MPIDIU_DYN_AV_TABLE.size == 0); + MPL_free(MPIDIU_DYN_AV_TABLE.table); } -#endif -/* convert upid to gpid by netmod. - * For ofi netmod, it inserts the address and fills an av entry. +/* NOTE: The following functions -- + * * MPIDIU_insert_dynamic_upid + * * MPIDIU_free_dynamic_lpid + * * MPIDIU_find_dynamic_av + * are thread-unsafe. Caller should enter (VCI-0) critical section. 
*/ -int MPIDIU_upids_to_gpids(int size, int *remote_upid_size, char *remote_upids, - uint64_t * remote_gpids) + +int MPIDIU_insert_dynamic_upid(MPIR_Lpid * lpid_out, const char *upid, int upid_len) { int mpi_errno = MPI_SUCCESS; - MPIR_FUNC_ENTER; - - MPID_THREAD_CS_ENTER(VCI, MPIDIU_THREAD_DYNPROC_MUTEX); - mpi_errno = MPIDI_NM_upids_to_gpids(size, remote_upid_size, remote_upids, remote_gpids); - MPIR_ERR_CHECK(mpi_errno); - fn_exit: - MPID_THREAD_CS_EXIT(VCI, MPIDIU_THREAD_DYNPROC_MUTEX); - MPIR_FUNC_EXIT; - return mpi_errno; - fn_fail: - goto fn_exit; -} + /* allocate idx from dynamic av table */ + int idx = MPIDIU_DYN_AV_TABLE.size; + for (int i = 0; i < MPIDIU_DYN_AV_TABLE.size; i++) { + if (MPIDIU_DYN_AV_TABLE.upids[i] == NULL) { + idx = i; + break; + } + } + if (idx == MPIDIU_DYN_AV_TABLE.size) { + MPIDIU_DYN_AV_TABLE.size++; + if (MPIDIU_DYN_AV_TABLE.size >= MPIDIU_DYNAMIC_AV_MAX) { + MPIR_ERR_SETANDJUMP(mpi_errno, MPI_ERR_OTHER, "**intern"); + } + } -int MPIDIU_alloc_lut(MPIDI_rank_map_lut_t ** lut, int size) -{ - int mpi_errno = MPI_SUCCESS; - MPIDI_rank_map_lut_t *new_lut = NULL; + /* copy the upid */ + char *upid_copy = MPL_malloc(upid_len, MPL_MEM_OTHER); + MPIR_ERR_CHKANDJUMP(!upid_copy, mpi_errno, MPI_ERR_OTHER, "**nomem"); + memcpy(upid_copy, upid, upid_len); - MPIR_FUNC_ENTER; + MPIDIU_DYN_AV_TABLE.upids[idx] = upid_copy; + MPIDIU_DYN_AV_TABLE.upid_sizes[idx] = upid_len; - new_lut = (MPIDI_rank_map_lut_t *) MPL_malloc(sizeof(MPIDI_rank_map_lut_t) - + size * sizeof(MPIDI_lpid_t), MPL_MEM_ADDRESS); - if (new_lut == NULL) { - *lut = NULL; - MPIR_ERR_SETANDJUMP(mpi_errno, MPI_ERR_OTHER, "**nomem"); - } + /* insert upid */ + *lpid_out = MPIR_LPID_DYNAMIC_MASK | idx; - MPIR_cc_set(&new_lut->ref_count, 1); - *lut = new_lut; + mpi_errno = MPIDI_NM_insert_upid(*lpid_out, upid, upid_len); + MPIR_ERR_CHECK(mpi_errno); - MPL_DBG_MSG_FMT(MPIDI_CH4_DBG_MEMORY, VERBOSE, - (MPL_DBG_FDEST, "alloc lut %p, size %lu, refcount=%d", - new_lut, size * 
sizeof(MPIDI_lpid_t), MPIR_cc_get(&new_lut->ref_count))); fn_exit: - MPIR_FUNC_EXIT; - return mpi_errno; + return MPI_SUCCESS; fn_fail: goto fn_exit; } -int MPIDIU_release_lut(MPIDI_rank_map_lut_t * lut) +int MPIDIU_free_dynamic_lpid(MPIR_Lpid lpid) { - int mpi_errno = MPI_SUCCESS; - int in_use = 0; + MPIR_Assert(lpid & MPIR_LPID_DYNAMIC_MASK); + int idx = lpid & (~MPIR_LPID_DYNAMIC_MASK); + MPIR_Assert(idx >= 0 && idx < MPIDIU_DYN_AV_TABLE.size); + + /* free the upid buffer */ + MPL_free((char *) MPIDIU_DYN_AV_TABLE.upids[idx]); + /* mark the av as free by setting upid to NULL and upid_size to 0 */ + MPIDIU_DYN_AV_TABLE.upids[idx] = NULL; + MPIDIU_DYN_AV_TABLE.upid_sizes[idx] = 0; + + /* if the last entry is empty, reduce size */ + while (MPIDIU_DYN_AV_TABLE.size > 0 && + MPIDIU_DYN_AV_TABLE.upids[MPIDIU_DYN_AV_TABLE.size - 1] == NULL) { + MPIDIU_DYN_AV_TABLE.size--; + } - MPIR_FUNC_ENTER; + return MPI_SUCCESS; +} - MPIR_cc_decr(&lut->ref_count, &in_use); - MPL_DBG_MSG_FMT(MPIDI_CH4_DBG_MEMORY, VERBOSE, (MPL_DBG_FDEST, "dec ref to lut %p", lut)); - if (!in_use) { - MPL_free(lut); - MPL_DBG_MSG_FMT(MPIDI_CH4_DBG_MEMORY, VERBOSE, (MPL_DBG_FDEST, "free lut %p", lut)); +MPIDI_av_entry_t *MPIDIU_find_dynamic_av(const char *upid, int upid_len) +{ + for (int i = 0; i < MPIDIU_DYN_AV_TABLE.size; i++) { + if (MPIDIU_DYN_AV_TABLE.upid_sizes[i] == upid_len && + memcmp(MPIDIU_DYN_AV_TABLE.upids[i], upid, upid_len) == 0) { + return MPIDIU_DYN_AV(i); + } } - MPIR_FUNC_EXIT; - return mpi_errno; + return NULL; } -int MPIDIU_alloc_mlut(MPIDI_rank_map_mlut_t ** mlut, int size) +/* this version handles dynamic av or av entries that are not allocated yet (e.g. 
new world) + */ +MPIDI_av_entry_t *MPIDIU_lpid_to_av_slow(MPIR_Lpid lpid) { - int mpi_errno = MPI_SUCCESS; - MPIDI_rank_map_mlut_t *new_mlut = NULL; - - MPIR_FUNC_ENTER; + if (lpid & MPIR_LPID_DYNAMIC_MASK) { + int idx = lpid & (~MPIR_LPID_DYNAMIC_MASK); + MPIR_Assert(idx >= 0 && idx < MPIDIU_DYN_AV_TABLE.size); + return &MPIDIU_DYN_AV_TABLE.table[idx]; + } else { + int world_idx = MPIR_LPID_WORLD_INDEX(lpid); + int world_rank = MPIR_LPID_WORLD_RANK(lpid); + + MPIR_Assert(world_rank < MPIR_Worlds[world_idx].num_procs); + + if (world_idx >= MPIDI_global.avt_mgr.n_avts) { + for (int i = MPIDI_global.avt_mgr.n_avts; i < world_idx + 1; i++) { + int avtid; + MPIDIU_new_avt(MPIR_Worlds[i].num_procs, &avtid); + MPIR_Assert(avtid == i); + } + } - new_mlut = (MPIDI_rank_map_mlut_t *) MPL_malloc(sizeof(MPIDI_rank_map_mlut_t) - + size * sizeof(MPIDI_gpid_t), MPL_MEM_ADDRESS); - if (new_mlut == NULL) { - *mlut = NULL; - MPIR_ERR_SETANDJUMP(mpi_errno, MPI_ERR_OTHER, "**nomem"); + return &MPIDI_global.avt_mgr.av_tables[world_idx]->table[world_rank]; } +} - MPIR_cc_set(&new_mlut->ref_count, 1); - *mlut = new_mlut; +#ifdef MPIDI_BUILD_CH4_UPID_HASH +/* Store the upid, avtid, lpid in a hash to support get_local_upids and insert_upid */ +static MPIDI_upid_hash *upid_hash = NULL; - MPL_DBG_MSG_FMT(MPIDI_CH4_DBG_MEMORY, VERBOSE, - (MPL_DBG_FDEST, "alloc mlut %p, size %lu, refcount=%d", - new_mlut, size * sizeof(MPIDI_gpid_t), MPIR_cc_get(&new_mlut->ref_count))); - fn_exit: - MPIR_FUNC_EXIT; - return mpi_errno; - fn_fail: - goto fn_exit; +void MPIDIU_upidhash_add(const void *upid, int upid_len, MPIR_Lpid lpid) +{ + MPIDI_upid_hash *t; + t = MPL_malloc(sizeof(MPIDI_upid_hash), MPL_MEM_OTHER); + t->lpid = lpid; + t->upid = MPL_malloc(upid_len, MPL_MEM_OTHER); + memcpy(t->upid, upid, upid_len); + t->upid_len = upid_len; + HASH_ADD_KEYPTR(hh, upid_hash, t->upid, upid_len, t, MPL_MEM_OTHER); + + MPIDIU_lpid_to_av(lpid)->hash = t; } -int MPIDIU_release_mlut(MPIDI_rank_map_mlut_t * mlut) 
+MPIDI_upid_hash *MPIDIU_upidhash_find(const void *upid, int upid_len) { - int mpi_errno = MPI_SUCCESS; - int in_use = 0; - - MPIR_FUNC_ENTER; + MPIDI_upid_hash *t; + HASH_FIND(hh, upid_hash, upid, upid_len, t); + return t; +} - MPIR_cc_decr(&mlut->ref_count, &in_use); - MPL_DBG_MSG_FMT(MPIDI_CH4_DBG_MEMORY, VERBOSE, (MPL_DBG_FDEST, "dec ref to mlut %p", mlut)); - if (!in_use) { - MPL_free(mlut); - MPL_DBG_MSG_FMT(MPIDI_CH4_DBG_MEMORY, VERBOSE, (MPL_DBG_FDEST, "free mlut %p", mlut)); +void MPIDIU_upidhash_free(void) +{ + MPIDI_upid_hash *cur, *tmp; + HASH_ITER(hh, upid_hash, cur, tmp) { + HASH_DEL(upid_hash, cur); + MPL_free(cur->upid); + MPL_free(cur); } - - MPIR_FUNC_EXIT; - return mpi_errno; } +#endif diff --git a/src/mpid/ch4/src/ch4_proc.h b/src/mpid/ch4/src/ch4_proc.h index 749afce18f5..706f67b5f7e 100644 --- a/src/mpid/ch4/src/ch4_proc.h +++ b/src/mpid/ch4/src/ch4_proc.h @@ -9,7 +9,7 @@ #include "ch4_types.h" /* There are 3 terms referencing processes: - * upid, or "unversal process id", is netmod layer address (addrname) + * upid, or "universal process id", is netmod layer address (addrname) * lpid, or "local process id", is av entry index in an ch4-layer table * gpid, or "global process id", is av table index plus av entry index * @@ -22,207 +22,29 @@ int MPIDIU_get_n_avts(void); int MPIDIU_get_avt_size(int avtid); int MPIDIU_new_avt(int size, int *avtid); int MPIDIU_free_avt(int avtid); -int MPIDIU_avt_add_ref(int avtid); -int MPIDIU_avt_release_ref(int avtid); int MPIDIU_avt_init(void); -int MPIDIU_avt_destroy(void); +int MPIDIU_avt_finalize(void); int MPIDIU_get_node_id(MPIR_Comm * comm, int rank, int *id_p); #ifdef MPIDI_BUILD_CH4_UPID_HASH -void MPIDIU_upidhash_add(const void *upid, int upid_len, int avtid, int lpid); +void MPIDIU_upidhash_add(const void *upid, int upid_len, MPIR_Lpid lpid); MPIDI_upid_hash *MPIDIU_upidhash_find(const void *upid, int upid_len); void MPIDIU_upidhash_free(void); #endif -int MPIDIU_upids_to_gpids(int size, int 
*remote_upid_size, char *remote_upids, - uint64_t * remote_gpids); -int MPIDIU_alloc_lut(MPIDI_rank_map_lut_t ** lut, int size); -int MPIDIU_release_lut(MPIDI_rank_map_lut_t * lut); -int MPIDIU_alloc_mlut(MPIDI_rank_map_mlut_t ** mlut, int size); -int MPIDIU_release_mlut(MPIDI_rank_map_mlut_t * mlut); -#define MPIDIU_lut_add_ref(lut) \ - do { \ - MPL_DBG_MSG_FMT(MPIDI_CH4_DBG_MEMORY, VERBOSE, (MPL_DBG_FDEST, "inc ref to lut %p", lut)); \ - MPIR_cc_inc(&(lut)->ref_count); \ - } while (0) - -#define MPIDIU_mlut_add_ref(mlut) \ - do { \ - MPL_DBG_MSG_FMT(MPIDI_CH4_DBG_MEMORY, VERBOSE, (MPL_DBG_FDEST, "inc ref to mlut %p", mlut)); \ - MPIR_cc_inc(&(mlut)->ref_count); \ - } while (0) - -MPL_STATIC_INLINE_PREFIX int MPIDIU_comm_rank_to_pid(MPIR_Comm * comm, int rank, int *idx, - int *avtid) -{ - MPIR_FUNC_ENTER; - - *avtid = 0; - *idx = 0; - - switch (MPIDI_COMM(comm, map).mode) { - case MPIDI_RANK_MAP_DIRECT: - *avtid = MPIDI_COMM(comm, map).avtid; - *idx = rank; - break; - case MPIDI_RANK_MAP_DIRECT_INTRA: - *idx = rank; - break; - case MPIDI_RANK_MAP_OFFSET: - *avtid = MPIDI_COMM(comm, map).avtid; - *idx = rank + MPIDI_COMM(comm, map).reg.offset; - break; - case MPIDI_RANK_MAP_OFFSET_INTRA: - *idx = rank + MPIDI_COMM(comm, map).reg.offset; - break; - case MPIDI_RANK_MAP_STRIDE: - *avtid = MPIDI_COMM(comm, map).avtid; - *idx = MPIDI_CALC_STRIDE_SIMPLE(rank, MPIDI_COMM(comm, map).reg.stride.stride, - MPIDI_COMM(comm, map).reg.stride.offset); - break; - case MPIDI_RANK_MAP_STRIDE_INTRA: - *idx = MPIDI_CALC_STRIDE_SIMPLE(rank, MPIDI_COMM(comm, map).reg.stride.stride, - MPIDI_COMM(comm, map).reg.stride.offset); - break; - case MPIDI_RANK_MAP_STRIDE_BLOCK: - *avtid = MPIDI_COMM(comm, map).avtid; - *idx = MPIDI_CALC_STRIDE(rank, MPIDI_COMM(comm, map).reg.stride.stride, - MPIDI_COMM(comm, map).reg.stride.blocksize, - MPIDI_COMM(comm, map).reg.stride.offset); - break; - case MPIDI_RANK_MAP_STRIDE_BLOCK_INTRA: - *idx = MPIDI_CALC_STRIDE(rank, MPIDI_COMM(comm, 
map).reg.stride.stride, - MPIDI_COMM(comm, map).reg.stride.blocksize, - MPIDI_COMM(comm, map).reg.stride.offset); - break; - case MPIDI_RANK_MAP_LUT: - *avtid = MPIDI_COMM(comm, map).avtid; - *idx = MPIDI_COMM(comm, map).irreg.lut.lpid[rank]; - break; - case MPIDI_RANK_MAP_LUT_INTRA: - *idx = MPIDI_COMM(comm, map).irreg.lut.lpid[rank]; - break; - case MPIDI_RANK_MAP_MLUT: - *idx = MPIDI_COMM(comm, map).irreg.mlut.gpid[rank].lpid; - *avtid = MPIDI_COMM(comm, map).irreg.mlut.gpid[rank].avtid; - break; - case MPIDI_RANK_MAP_NONE: - MPIR_Assert(0); - break; - } - MPL_DBG_MSG_FMT(MPIDI_CH4_DBG_MAP, VERBOSE, - (MPL_DBG_FDEST, " comm_to_pid: rank=%d, avtid=%d idx=%d", rank, *avtid, *idx)); - MPIR_FUNC_EXIT; - return *idx; -} MPL_STATIC_INLINE_PREFIX MPIDI_av_entry_t *MPIDIU_comm_rank_to_av(MPIR_Comm * comm, int rank) { MPIDI_av_entry_t *ret = NULL; MPIR_FUNC_ENTER; - int lpid; - switch (MPIDI_COMM(comm, map).mode) { - case MPIDI_RANK_MAP_DIRECT: - ret = &MPIDI_global.avt_mgr.av_tables[MPIDI_COMM(comm, map).avtid]->table[rank]; - break; - case MPIDI_RANK_MAP_DIRECT_INTRA: - ret = &MPIDI_global.avt_mgr.av_table0->table[rank]; - break; - case MPIDI_RANK_MAP_OFFSET: - ret = &MPIDI_global.avt_mgr.av_tables[MPIDI_COMM(comm, map).avtid] - ->table[rank + MPIDI_COMM(comm, map).reg.offset]; - break; - case MPIDI_RANK_MAP_OFFSET_INTRA: - ret = &MPIDI_global.avt_mgr.av_table0->table[rank + MPIDI_COMM(comm, map).reg.offset]; - break; - case MPIDI_RANK_MAP_STRIDE: - lpid = MPIDI_CALC_STRIDE_SIMPLE(rank, MPIDI_COMM(comm, map).reg.stride.stride, - MPIDI_COMM(comm, map).reg.stride.offset); - ret = &MPIDI_global.avt_mgr.av_tables[MPIDI_COMM(comm, map).avtid]->table[lpid]; - break; - case MPIDI_RANK_MAP_STRIDE_INTRA: - lpid = MPIDI_CALC_STRIDE_SIMPLE(rank, MPIDI_COMM(comm, map).reg.stride.stride, - MPIDI_COMM(comm, map).reg.stride.offset); - ret = &MPIDI_global.avt_mgr.av_table0->table[lpid]; - break; - case MPIDI_RANK_MAP_STRIDE_BLOCK: - lpid = MPIDI_CALC_STRIDE(rank, MPIDI_COMM(comm, 
map).reg.stride.stride, - MPIDI_COMM(comm, map).reg.stride.blocksize, - MPIDI_COMM(comm, map).reg.stride.offset); - ret = &MPIDI_global.avt_mgr.av_tables[MPIDI_COMM(comm, map).avtid]->table[lpid]; - break; - case MPIDI_RANK_MAP_STRIDE_BLOCK_INTRA: - lpid = MPIDI_CALC_STRIDE(rank, MPIDI_COMM(comm, map).reg.stride.stride, - MPIDI_COMM(comm, map).reg.stride.blocksize, - MPIDI_COMM(comm, map).reg.stride.offset); - ret = &MPIDI_global.avt_mgr.av_table0->table[lpid]; - break; - case MPIDI_RANK_MAP_LUT: - ret = &MPIDI_global.avt_mgr.av_tables[MPIDI_COMM(comm, map).avtid] - ->table[MPIDI_COMM(comm, map).irreg.lut.lpid[rank]]; - break; - case MPIDI_RANK_MAP_LUT_INTRA: - ret = - &MPIDI_global.avt_mgr.av_table0->table[MPIDI_COMM(comm, map).irreg.lut.lpid[rank]]; - break; - case MPIDI_RANK_MAP_MLUT: - ret = &MPIDI_global.avt_mgr.av_tables[MPIDI_COMM(comm, map).irreg.mlut.gpid[rank].avtid] - ->table[MPIDI_COMM(comm, map).irreg.mlut.gpid[rank].lpid]; - break; - case MPIDI_RANK_MAP_NONE: - MPIR_Assert(0); - break; - } + MPIR_Lpid lpid = MPIR_comm_rank_to_lpid(comm, rank); + int world_idx = MPIR_LPID_WORLD_INDEX(lpid); + int world_rank = MPIR_LPID_WORLD_RANK(lpid); - MPL_DBG_MSG_FMT(MPIDI_CH4_DBG_MAP, VERBOSE, - (MPL_DBG_FDEST, " comm_to_av_addr: rank=%d, av addr=%p", rank, (void *) ret)); - MPIR_FUNC_EXIT; - return ret; -} + ret = &MPIDI_global.avt_mgr.av_tables[world_idx]->table[world_rank]; -MPL_STATIC_INLINE_PREFIX int MPIDIU_comm_rank_to_pid_local(MPIR_Comm * comm, int rank, int *idx, - int *avtid) -{ - MPIR_FUNC_ENTER; - - *avtid = MPIDI_COMM(comm, local_map).avtid; - switch (MPIDI_COMM(comm, local_map).mode) { - case MPIDI_RANK_MAP_DIRECT: - case MPIDI_RANK_MAP_DIRECT_INTRA: - *idx = rank; - break; - case MPIDI_RANK_MAP_OFFSET: - case MPIDI_RANK_MAP_OFFSET_INTRA: - *idx = rank + MPIDI_COMM(comm, local_map).reg.offset; - break; - case MPIDI_RANK_MAP_STRIDE: - case MPIDI_RANK_MAP_STRIDE_INTRA: - *idx = MPIDI_CALC_STRIDE_SIMPLE(rank, MPIDI_COMM(comm, 
local_map).reg.stride.stride, - MPIDI_COMM(comm, local_map).reg.stride.offset); - break; - case MPIDI_RANK_MAP_STRIDE_BLOCK: - case MPIDI_RANK_MAP_STRIDE_BLOCK_INTRA: - *idx = MPIDI_CALC_STRIDE(rank, MPIDI_COMM(comm, local_map).reg.stride.stride, - MPIDI_COMM(comm, local_map).reg.stride.blocksize, - MPIDI_COMM(comm, local_map).reg.stride.offset); - break; - case MPIDI_RANK_MAP_LUT: - case MPIDI_RANK_MAP_LUT_INTRA: - *idx = MPIDI_COMM(comm, local_map).irreg.lut.lpid[rank]; - break; - case MPIDI_RANK_MAP_MLUT: - *idx = MPIDI_COMM(comm, local_map).irreg.mlut.gpid[rank].lpid; - *avtid = MPIDI_COMM(comm, local_map).irreg.mlut.gpid[rank].avtid; - break; - case MPIDI_RANK_MAP_NONE: - MPIR_Assert(0); - break; - } - MPL_DBG_MSG_FMT(MPIDI_CH4_DBG_MAP, VERBOSE, - (MPL_DBG_FDEST, " comm_to_pid_local: rank=%d, avtid=%d idx=%d", - rank, *avtid, *idx)); MPIR_FUNC_EXIT; - return *idx; + return ret; } MPL_STATIC_INLINE_PREFIX int MPIDIU_av_is_local(MPIDI_av_entry_t * av) @@ -238,21 +60,22 @@ MPL_STATIC_INLINE_PREFIX int MPIDIU_av_is_local(MPIDI_av_entry_t * av) return ret; } -MPL_STATIC_INLINE_PREFIX int MPIDIU_rank_to_lpid(int rank, MPIR_Comm * comm) +MPL_STATIC_INLINE_PREFIX int MPIDIU_get_grank(int rank, MPIR_Comm * comm) { - int ret; - MPIR_FUNC_ENTER; - - int avtid = 0, lpid = 0; - MPIDIU_comm_rank_to_pid(comm, rank, &lpid, &avtid); - if (avtid == 0) { - ret = lpid; + MPIR_Lpid lpid = MPIR_comm_rank_to_lpid(comm, rank); + if (MPIR_LPID_WORLD_INDEX(lpid) == 0) { + return (int) lpid; } else { - ret = -1; + return -1; } +} - MPIR_FUNC_EXIT; - return ret; +/* used in fast path where we know the lpid has a valid av, such as from a committed communicator */ +MPL_STATIC_INLINE_PREFIX MPIDI_av_entry_t *MPIDIU_lpid_to_av(MPIR_Lpid lpid) +{ + int world_idx = MPIR_LPID_WORLD_INDEX(lpid); + int world_rank = MPIR_LPID_WORLD_RANK(lpid); + return &MPIDI_global.avt_mgr.av_tables[world_idx]->table[world_rank]; } MPL_STATIC_INLINE_PREFIX int MPIDI_rank_is_local(int rank, MPIR_Comm * comm) @@ 
-262,7 +85,7 @@ MPL_STATIC_INLINE_PREFIX int MPIDI_rank_is_local(int rank, MPIR_Comm * comm) #ifdef MPIDI_CH4_DIRECT_NETMOD /* Ask the netmod for locality information. If it decided not to build it, - * it will call back up to the MPIDIU function to get the infomration. */ + * it will call back up to the MPIDIU function to get the information. */ ret = MPIDI_NM_rank_is_local(rank, comm); #else ret = MPIDIU_av_is_local(MPIDIU_comm_rank_to_av(comm, rank)); @@ -288,4 +111,10 @@ MPL_STATIC_INLINE_PREFIX int MPIDI_av_is_local(MPIDI_av_entry_t * av) return ret; } +int MPIDIU_insert_dynamic_upid(MPIR_Lpid * lpid_out, const char *upid, int upid_len); +int MPIDIU_free_dynamic_lpid(MPIR_Lpid lpid); +MPIDI_av_entry_t *MPIDIU_find_dynamic_av(const char *upid, int upid_len); +/* used in communicator creation paths when the av entry may not exist or inserted yet */ +MPIDI_av_entry_t *MPIDIU_lpid_to_av_slow(MPIR_Lpid lpid); + #endif /* CH4_PROC_H_INCLUDED */ diff --git a/src/mpid/ch4/src/ch4_spawn.c b/src/mpid/ch4/src/ch4_spawn.c index 6b59d171620..ba8a4ada024 100644 --- a/src/mpid/ch4/src/ch4_spawn.c +++ b/src/mpid/ch4/src/ch4_spawn.c @@ -273,87 +273,52 @@ int MPID_Close_port(const char *port_name) /* MPID_Comm_accept, MPID_Comm_connect */ -static int peer_intercomm_create(char *remote_addrname, int len, int tag, int timeout, - bool is_sender, MPIR_Comm ** newcomm); -static int dynamic_intercomm_create(const char *port_name, MPIR_Info * info, int root, - MPIR_Comm * comm_ptr, int timeout, bool is_sender, - MPIR_Comm ** newcomm); - -struct dynproc_conn_hdr { - int context_id; - int addrname_len; - char addrname[MPIDI_DYNPROC_NAME_MAX]; -}; - -static int peer_intercomm_create(char *remote_addrname, int len, int tag, - int timeout, bool is_sender, MPIR_Comm ** newcomm) +static int establish_peer_conn(char *remote_addrname, int remote_addrname_len, int tag, + int timeout, bool is_sender, MPIR_Lpid * remote_lpid_out) { int mpi_errno = MPI_SUCCESS; - int context_id, recvcontext_id; - 
uint64_t remote_gpid; + MPIR_Lpid remote_lpid = MPIR_LPID_INVALID; - mpi_errno = MPIR_Get_contextid_sparse(MPIR_Process.comm_self, &recvcontext_id, FALSE); - MPIR_ERR_CHECK(mpi_errno); + struct dynproc_conn_hdr { + int addrname_len; + char addrname[MPIDI_DYNPROC_NAME_MAX]; + } hdr; - struct dynproc_conn_hdr hdr; if (is_sender) { /* insert remote address */ - int addrname_len = len; - uint64_t *remote_gpids = &remote_gpid; - mpi_errno = MPIDIU_upids_to_gpids(1, &addrname_len, remote_addrname, remote_gpids); + mpi_errno = MPIDIU_insert_dynamic_upid(&remote_lpid, remote_addrname, remote_addrname_len); MPIR_ERR_CHECK(mpi_errno); - /* fill hdr with context_id and addrname */ - hdr.context_id = recvcontext_id; - - char *addrname; - int *addrname_size; - mpi_errno = MPIDI_NM_get_local_upids(MPIR_Process.comm_self, &addrname_size, &addrname); + /* get my addrname and send it to remote */ + char *my_addrname; + int *my_addrname_len; + mpi_errno = MPIDI_NM_get_local_upids(MPIR_Process.comm_self, + &my_addrname_len, &my_addrname); MPIR_ERR_CHECK(mpi_errno); - MPIR_Assert(addrname_size[0] <= MPIDI_DYNPROC_NAME_MAX); - memcpy(hdr.addrname, addrname, addrname_size[0]); - hdr.addrname_len = addrname_size[0]; - - /* send remote context_id + addrname */ + MPIR_Assert(my_addrname_len[0] <= MPIDI_DYNPROC_NAME_MAX); + memcpy(hdr.addrname, my_addrname, my_addrname_len[0]); + hdr.addrname_len = my_addrname_len[0]; + /* send it to remote */ int hdr_sz = sizeof(hdr) - MPIDI_DYNPROC_NAME_MAX + hdr.addrname_len; - mpi_errno = MPIDI_NM_dynamic_send(remote_gpid, tag, &hdr, hdr_sz, timeout); - MPL_free(addrname); - MPL_free(addrname_size); + mpi_errno = MPIDI_NM_dynamic_send(remote_lpid, tag, &hdr, hdr_sz, timeout); + MPL_free(my_addrname); + MPL_free(my_addrname_len); MPIR_ERR_CHECK(mpi_errno); - - mpi_errno = MPIDI_NM_dynamic_recv(tag, &hdr, sizeof(hdr), timeout); - MPIR_ERR_CHECK(mpi_errno); - context_id = hdr.context_id; } else { /* recv remote address */ mpi_errno = 
MPIDI_NM_dynamic_recv(tag, &hdr, sizeof(hdr), timeout); MPIR_ERR_CHECK(mpi_errno); - context_id = hdr.context_id; /* insert remote address */ - int addrname_len = hdr.addrname_len; - uint64_t *remote_gpids = &remote_gpid; - mpi_errno = MPIDIU_upids_to_gpids(1, &addrname_len, hdr.addrname, remote_gpids); - MPIR_ERR_CHECK(mpi_errno); - - /* send remote context_id */ - hdr.context_id = recvcontext_id; - mpi_errno = MPIDI_NM_dynamic_send(remote_gpid, tag, &hdr, sizeof(hdr.context_id), timeout); + mpi_errno = MPIDIU_insert_dynamic_upid(&remote_lpid, hdr.addrname, hdr.addrname_len); MPIR_ERR_CHECK(mpi_errno); } - /* create peer intercomm */ - mpi_errno = MPIR_peer_intercomm_create(context_id, recvcontext_id, - remote_gpid, is_sender, newcomm); - MPIR_ERR_CHECK(mpi_errno); - fn_exit: + *remote_lpid_out = remote_lpid; return mpi_errno; fn_fail: - if (recvcontext_id) { - MPIR_Free_contextid(recvcontext_id); - } goto fn_exit; } @@ -362,15 +327,13 @@ static int dynamic_intercomm_create(const char *port_name, MPIR_Info * info, int MPIR_Comm ** newcomm) { int mpi_errno = MPI_SUCCESS; - - MPIR_Comm *peer_intercomm = NULL; + MPIR_Lpid remote_lpid = MPIR_LPID_INVALID; + MPIR_Comm *peer_comm = NULL; int tag; - int bcast_ints[2]; /* used to bcast tag and errno */ if (comm_ptr->rank == root) { /* NOTE: do not goto fn_fail on error, or it will leave children hanging */ mpi_errno = get_tag_from_port(port_name, &tag); - if (mpi_errno) - goto bcast_tag_and_errno; + MPIR_ERR_CHECK(mpi_errno); char remote_addrname[MPIDI_DYNPROC_NAME_MAX]; char *addrname; @@ -379,43 +342,59 @@ static int dynamic_intercomm_create(const char *port_name, MPIR_Info * info, int addrname = remote_addrname; mpi_errno = get_conn_name_from_port(port_name, remote_addrname, MPIDI_DYNPROC_NAME_MAX, &len); - if (mpi_errno) - goto bcast_tag_and_errno; + MPIR_ERR_CHECK(mpi_errno); } else { - /* Use NULL for better error behavior */ addrname = NULL; len = 0; } - mpi_errno = peer_intercomm_create(addrname, len, tag, 
timeout, is_sender, &peer_intercomm); - bcast_tag_and_errno: - bcast_ints[0] = tag; - bcast_ints[1] = mpi_errno; - mpi_errno = MPIR_Bcast_allcomm_auto(bcast_ints, 2, MPI_INT, root, comm_ptr, MPIR_ERR_NONE); - MPIR_ERR_CHECK(mpi_errno); - mpi_errno = bcast_ints[1]; + MPID_THREAD_CS_ENTER(VCI, MPIDI_VCI(0).lock); + mpi_errno = establish_peer_conn(addrname, len, tag, timeout, is_sender, &remote_lpid); + MPID_THREAD_CS_EXIT(VCI, MPIDI_VCI(0).lock); MPIR_ERR_CHECK(mpi_errno); + + /* create peer intercomm - + * Since we will only use peer intercomm to call back MPID_Intercomm_exchange, which + * just need to extract remote_lpid from the peer_comm, we can cheat a bit here - just + * fill peer_comm->remote_group. + */ + peer_comm = (MPIR_Comm *) MPIR_Handle_obj_alloc(&MPIR_Comm_mem); + MPIR_ERR_CHKANDJUMP(!peer_comm, mpi_errno, MPI_ERR_OTHER, "**nomem"); + + peer_comm->comm_kind = MPIR_COMM_KIND__INTERCOMM; + peer_comm->remote_size = 1; + peer_comm->local_size = 1; + peer_comm->rank = 0; + peer_comm->local_group = NULL; + + MPIR_Group_create_stride(1, 0, NULL, remote_lpid, 1, &peer_comm->remote_group); + + fn_fail: + /* In case root fails, we bcast mpi_errno so other ranks will abort too */ + MPIR_Bcast_impl(&mpi_errno, 1, MPI_INT, root, comm_ptr, MPIR_ERR_NONE); } else { - mpi_errno = MPIR_Bcast_allcomm_auto(bcast_ints, 2, MPI_INT, root, comm_ptr, MPIR_ERR_NONE); - MPIR_ERR_CHECK(mpi_errno); - if (bcast_ints[1]) { - /* errno from root cannot be directly returned */ + int root_errno; + MPIR_Bcast_impl(&root_errno, 1, MPI_INT, root, comm_ptr, MPIR_ERR_NONE); + if (root_errno) { MPIR_ERR_SET(mpi_errno, MPI_ERR_PORT, "**comm_connect_fail"); - goto fn_fail; } - tag = bcast_ints[0]; } - mpi_errno = MPIR_Intercomm_create_impl(comm_ptr, root, peer_intercomm, 0, tag, newcomm); - MPIR_ERR_CHECK(mpi_errno); + if (mpi_errno == MPI_SUCCESS) { + mpi_errno = MPIR_Intercomm_create_timeout(comm_ptr, root, peer_comm, 0, tag, timeout, + newcomm); + } - fn_exit: - if (peer_intercomm) { - 
MPIR_Comm_free_impl(peer_intercomm); + if (comm_ptr->rank == root && peer_comm) { + /* destroy peer_comm */ + MPID_THREAD_CS_ENTER(VCI, MPIDI_VCI(0).lock); + MPIDIU_free_dynamic_lpid(remote_lpid); + MPIR_Group_release(peer_comm->remote_group); + MPIR_Handle_obj_free(&MPIR_Comm_mem, peer_comm); + MPID_THREAD_CS_EXIT(VCI, MPIDI_VCI(0).lock); + } + return mpi_errno; - fn_fail: - goto fn_exit; } int MPID_Comm_accept(const char *port_name, MPIR_Info * info, int root, MPIR_Comm * comm, diff --git a/src/mpid/ch4/src/ch4_types.h b/src/mpid/ch4/src/ch4_types.h index 88fbb5ac4ab..c8ea6c1cc09 100644 --- a/src/mpid/ch4/src/ch4_types.h +++ b/src/mpid/ch4/src/ch4_types.h @@ -185,22 +185,33 @@ typedef struct MPIDIG_acc_ack_msg_t { typedef MPIDIG_acc_ack_msg_t MPIDIG_get_acc_ack_msg_t; typedef struct { - MPIR_cc_t ref_count; int size; MPIDI_av_entry_t table[]; } MPIDI_av_table_t; +/* dynamic av is used for building inter communicators, such as MPID_Comm_connect/accept, + * when we need to temporarily establish communication between peer group leaders. + * Because the entries are expected to be released once the intercomm is committed, we expect + * the dynamic av table size to remain finite. + * We keep the upid along with the av entry to avoid later duplicate av insertion.
+ * */ +#define MPIDIU_DYNAMIC_AV_MAX 100 +typedef struct { + int size; + const char *upids[MPIDIU_DYNAMIC_AV_MAX]; + int upid_sizes[MPIDIU_DYNAMIC_AV_MAX]; + MPIDI_av_entry_t *table; +} MPIDI_dyn_av_table_t; + typedef struct { int max_n_avts; int n_avts; int n_free; MPIDI_av_table_t *av_table0; MPIDI_av_table_t **av_tables; + MPIDI_dyn_av_table_t dynamic_av_table; } MPIDIU_avt_manager; -#define MPIDIU_get_av_table(avtid) (MPIDI_global.avt_mgr.av_tables[(avtid)]) -#define MPIDIU_get_av(avtid, lpid) (MPIDI_global.avt_mgr.av_tables[(avtid)]->table[(lpid)]) - typedef struct { uint64_t key; void *value; diff --git a/src/mpid/ch4/src/ch4_vci.h b/src/mpid/ch4/src/ch4_vci.h index e60adac148d..aff146298ac 100644 --- a/src/mpid/ch4/src/ch4_vci.h +++ b/src/mpid/ch4/src/ch4_vci.h @@ -47,7 +47,7 @@ /* VCI hashing function (fast path) */ /* For consistent hashing, we may need differentiate between src and dst vci and whether - * it is being called from sender side or receiver side (consdier intercomm). We use an + * it is being called from sender side or receiver side (consider intercomm). We use an * integer flag to encode the information. 
* * The flag constants are designed as bit fields, so different hashing algorithm can easily @@ -71,7 +71,7 @@ MPL_STATIC_INLINE_PREFIX int MPIDI_hash_remote_vci(int raw_vci, MPIR_Comm * comm /* MPI_ANY_SOURCE, MPI_PROC_NULL, return a dummy, won't be used */ return 0; } else { - int grank = MPIDIU_rank_to_lpid(rank, comm_ptr); + int grank = MPIDIU_get_grank(rank, comm_ptr); MPIR_Assert(grank >= 0); return raw_vci % MPIDI_global.all_num_vcis[grank]; } diff --git a/src/mpid/ch4/src/ch4i_comm.c b/src/mpid/ch4/src/ch4i_comm.c index 7a8b5a97d9f..a723dbfc858 100644 --- a/src/mpid/ch4/src/ch4i_comm.c +++ b/src/mpid/ch4/src/ch4i_comm.c @@ -7,906 +7,6 @@ #include "mpidch4r.h" #include "ch4i_comm.h" -enum MPIDI_src_mapper_models { - MPIDI_SRC_MAPPER_IRREGULAR = 0, - MPIDI_SRC_MAPPER_DIRECT = 1, - MPIDI_SRC_MAPPER_OFFSET = 2, - MPIDI_SRC_MAPPER_STRIDE = 3 -}; - -static int map_size(MPIR_Comm_map_t map); -static int detect_regular_model(int *lpid, int size, int *offset, int *blocksize, int *stride); -static int src_comm_to_mlut(MPIDI_rank_map_t * src, MPIDI_rank_map_t * dest, int size, - int total_mapper_size, int mapper_offset); -static int src_mlut_to_mlut(MPIDI_rank_map_t * src, MPIDI_rank_map_t * dest, - MPIR_Comm_map_t * mapper, int total_mapper_size, int mapper_offset); -static int src_map_to_lut(MPIDI_rank_map_t * src, MPIDI_rank_map_t * dest, MPIR_Comm_map_t * mapper, - int total_mapper_size, int mapper_offset); -static void direct_of_src_rmap(MPIDI_rank_map_t * src, MPIDI_rank_map_t * dest, - MPIR_Comm_map_t * mapper); -static void offset_of_src_rmap(MPIDI_rank_map_t * src, MPIDI_rank_map_t * dest, - MPIR_Comm_map_t * mapper, int offset); -static void stride_of_src_rmap(MPIDI_rank_map_t * src, MPIDI_rank_map_t * dest, - MPIR_Comm_map_t * mapper, int stride, int blocksize, int offset); -static int check_convert_mlut_to_lut(MPIDI_rank_map_t * src); -static int check_convert_lut_to_regular(MPIDI_rank_map_t * src); -static int set_map(MPIDI_rank_map_t * src_rmap, 
MPIDI_rank_map_t * dest_rmap, - MPIR_Comm_map_t * mapper, int src_comm_size, int total_mapper_size, - int mapper_offset); - -static int map_size(MPIR_Comm_map_t map) -{ - int ret = 0; - MPIR_FUNC_ENTER; - - if (map.type == MPIR_COMM_MAP_TYPE__IRREGULAR) - ret = map.src_mapping_size; - else if (map.dir == MPIR_COMM_MAP_DIR__L2L || map.dir == MPIR_COMM_MAP_DIR__L2R) - ret = map.src_comm->local_size; - else - ret = map.src_comm->remote_size; - - MPIR_FUNC_EXIT; - return ret; -} - -static int detect_regular_model(int *lpid, int size, int *offset, int *blocksize, int *stride) -{ - int off = 0, bs = 0, st = 0; - int i; - int ret = MPIDI_SRC_MAPPER_IRREGULAR; - - MPIR_FUNC_ENTER; - - if (size == 0) { - ret = MPIDI_SRC_MAPPER_DIRECT; - goto fn_exit; - } - - off = lpid[0]; - MPL_DBG_MSG_FMT(MPIDI_CH4_DBG_MAP, VERBOSE, (MPL_DBG_FDEST, "\tdetect model: offset %d", off)); - - for (i = 0; i < size; i++) { - if (lpid[i] != i + off) { - break; - } - bs++; - } - - MPL_DBG_MSG_FMT(MPIDI_CH4_DBG_MAP, VERBOSE, - (MPL_DBG_FDEST, "\tdetect model: blocksize %d", bs)); - if (bs == size) { - if (off == 0) { - ret = MPIDI_SRC_MAPPER_DIRECT; - goto fn_exit; - } else { - *offset = off; - ret = MPIDI_SRC_MAPPER_OFFSET; - goto fn_exit; - } - } - - /* blocksize less than total size, try if this is stride */ - st = lpid[bs] - lpid[0]; - if (st < 0 || st <= bs) { - ret = MPIDI_SRC_MAPPER_IRREGULAR; - goto fn_exit; - } - MPL_DBG_MSG_FMT(MPIDI_CH4_DBG_MAP, VERBOSE, (MPL_DBG_FDEST, "\tdetect model: stride %d", st)); - for (i = bs; i < size; i++) { - if (lpid[i] != MPIDI_CALC_STRIDE(i, st, bs, off)) { - ret = MPIDI_SRC_MAPPER_IRREGULAR; - goto fn_exit; - } - } - *offset = off; - *blocksize = bs; - *stride = st; - ret = MPIDI_SRC_MAPPER_STRIDE; - - fn_exit: - MPIR_FUNC_EXIT; - return ret; -} - -static int src_comm_to_mlut(MPIDI_rank_map_t * src, MPIDI_rank_map_t * dest, int size, - int total_mapper_size, int mapper_offset) -{ - int mpi_errno = MPI_SUCCESS, i; - MPIDI_rank_map_mlut_t *mlut = NULL; - - 
MPIR_FUNC_ENTER; - - if (!mapper_offset) { - mpi_errno = MPIDIU_alloc_mlut(&mlut, total_mapper_size); - MPIR_ERR_CHECK(mpi_errno); - dest->size = total_mapper_size; - dest->mode = MPIDI_RANK_MAP_MLUT; - dest->avtid = -1; - dest->irreg.mlut.t = mlut; - dest->irreg.mlut.gpid = mlut->gpid; - } - - MPL_DBG_MSG_FMT(MPIDI_CH4_DBG_MAP, VERBOSE, - (MPL_DBG_FDEST, " source mode %d", (int) src->mode)); - MPL_DBG_MSG_FMT(MPIDI_CH4_DBG_MAP, VERBOSE, (MPL_DBG_FDEST, " size %d", size)); - switch (src->mode) { - case MPIDI_RANK_MAP_DIRECT: - case MPIDI_RANK_MAP_DIRECT_INTRA: - for (i = 0; i < size; i++) { - dest->irreg.mlut.gpid[i + mapper_offset].lpid = i; - dest->irreg.mlut.gpid[i + mapper_offset].avtid = src->avtid; - } - break; - case MPIDI_RANK_MAP_OFFSET: - case MPIDI_RANK_MAP_OFFSET_INTRA: - for (i = 0; i < size; i++) { - dest->irreg.mlut.gpid[i + mapper_offset].lpid = i + src->reg.offset; - dest->irreg.mlut.gpid[i + mapper_offset].avtid = src->avtid; - } - MPL_DBG_MSG_FMT(MPIDI_CH4_DBG_MAP, VERBOSE, - (MPL_DBG_FDEST, " source offset %d", src->reg.offset)); - break; - case MPIDI_RANK_MAP_STRIDE: - case MPIDI_RANK_MAP_STRIDE_INTRA: - for (i = 0; i < size; i++) { - dest->irreg.mlut.gpid[i + mapper_offset].lpid = MPIDI_CALC_STRIDE_SIMPLE(i, - src->reg. - stride.stride, - src->reg. - stride.offset); - dest->irreg.mlut.gpid[i + mapper_offset].avtid = src->avtid; - } - MPL_DBG_MSG_FMT(MPIDI_CH4_DBG_MAP, VERBOSE, - (MPL_DBG_FDEST, " source stride %d blocksize %d offset %d", - src->reg.stride.stride, src->reg.stride.blocksize, - src->reg.stride.offset)); - break; - case MPIDI_RANK_MAP_STRIDE_BLOCK: - case MPIDI_RANK_MAP_STRIDE_BLOCK_INTRA: - for (i = 0; i < size; i++) { - dest->irreg.mlut.gpid[i + mapper_offset].lpid = MPIDI_CALC_STRIDE(i, - src->reg.stride. - stride, - src->reg.stride. - blocksize, - src->reg.stride. 
- offset); - dest->irreg.mlut.gpid[i + mapper_offset].avtid = src->avtid; - } - MPL_DBG_MSG_FMT(MPIDI_CH4_DBG_MAP, VERBOSE, - (MPL_DBG_FDEST, " source stride %d blocksize %d offset %d", - src->reg.stride.stride, src->reg.stride.blocksize, - src->reg.stride.offset)); - break; - case MPIDI_RANK_MAP_LUT: - case MPIDI_RANK_MAP_LUT_INTRA: - for (i = 0; i < size; i++) { - dest->irreg.mlut.gpid[i + mapper_offset].lpid = src->irreg.lut.lpid[i]; - dest->irreg.mlut.gpid[i + mapper_offset].avtid = src->avtid; - } - break; - case MPIDI_RANK_MAP_MLUT: - for (i = 0; i < size; i++) { - dest->irreg.mlut.gpid[i + mapper_offset].lpid = src->irreg.mlut.gpid[i].lpid; - dest->irreg.mlut.gpid[i + mapper_offset].avtid = src->irreg.mlut.gpid[i].avtid; - } - break; - case MPIDI_RANK_MAP_NONE: - MPIR_Assert(0); - break; - } - - fn_exit: - MPIR_FUNC_EXIT; - return mpi_errno; - fn_fail: - goto fn_exit; -} - -static int src_mlut_to_mlut(MPIDI_rank_map_t * src, MPIDI_rank_map_t * dest, - MPIR_Comm_map_t * mapper, int total_mapper_size, int mapper_offset) -{ - int mpi_errno = MPI_SUCCESS, i; - int size = map_size(*mapper); - MPIDI_rank_map_mlut_t *mlut = NULL; - - MPIR_FUNC_ENTER; - - if (!mapper_offset) { - mpi_errno = MPIDIU_alloc_mlut(&mlut, total_mapper_size); - MPIR_ERR_CHECK(mpi_errno); - dest->size = total_mapper_size; - } - - dest->mode = src->mode; - dest->irreg.mlut.t = mlut; - dest->irreg.mlut.gpid = mlut->gpid; - for (i = 0; i < size; i++) { - dest->irreg.mlut.gpid[i + mapper_offset].avtid = - src->irreg.mlut.gpid[mapper->src_mapping[i]].avtid; - dest->irreg.mlut.gpid[i + mapper_offset].lpid = - src->irreg.mlut.gpid[mapper->src_mapping[i]].lpid; - } - fn_exit: - MPL_DBG_MSG_FMT(MPIDI_CH4_DBG_MAP, VERBOSE, - (MPL_DBG_FDEST, " src mode %d, dest mode %d", - (int) src->mode, (int) dest->mode)); - MPIR_FUNC_EXIT; - return mpi_errno; - fn_fail: - goto fn_exit; -} - -static int src_map_to_lut(MPIDI_rank_map_t * src, MPIDI_rank_map_t * dest, MPIR_Comm_map_t * mapper, - int total_mapper_size, 
int mapper_offset) -{ - int mpi_errno = MPI_SUCCESS, i; - int size = map_size(*mapper); - MPIDI_rank_map_lut_t *lut = NULL; - - MPIR_FUNC_ENTER; - - if (!mapper_offset) { - mpi_errno = MPIDIU_alloc_lut(&lut, total_mapper_size); - MPIR_ERR_CHECK(mpi_errno); - dest->size = total_mapper_size; - } - - MPL_DBG_MSG_FMT(MPIDI_CH4_DBG_MAP, VERBOSE, - (MPL_DBG_FDEST, " source mode %d", (int) src->mode)); - MPL_DBG_MSG_FMT(MPIDI_CH4_DBG_MAP, VERBOSE, - (MPL_DBG_FDEST, " size %d, mapper->src_mapping_size %d", - size, mapper->src_mapping_size)); - dest->mode = MPIDI_RANK_MAP_LUT; - dest->avtid = src->avtid; - dest->irreg.lut.t = lut; - dest->irreg.lut.lpid = lut->lpid; - switch (src->mode) { - case MPIDI_RANK_MAP_DIRECT: - case MPIDI_RANK_MAP_DIRECT_INTRA: - for (i = 0; i < size; i++) { - dest->irreg.lut.lpid[i + mapper_offset] = mapper->src_mapping[i]; - } - break; - case MPIDI_RANK_MAP_OFFSET: - case MPIDI_RANK_MAP_OFFSET_INTRA: - for (i = 0; i < size; i++) { - dest->irreg.lut.lpid[i + mapper_offset] = mapper->src_mapping[i] + src->reg.offset; - } - MPL_DBG_MSG_FMT(MPIDI_CH4_DBG_MAP, VERBOSE, - (MPL_DBG_FDEST, " source offset %d", src->reg.offset)); - break; - case MPIDI_RANK_MAP_STRIDE: - case MPIDI_RANK_MAP_STRIDE_INTRA: - for (i = 0; i < size; i++) { - dest->irreg.lut.lpid[i + mapper_offset] = - MPIDI_CALC_STRIDE_SIMPLE(mapper->src_mapping[i], src->reg.stride.stride, - src->reg.stride.offset); - } - MPL_DBG_MSG_FMT(MPIDI_CH4_DBG_MAP, VERBOSE, - (MPL_DBG_FDEST, " source stride %d blocksize %d offset %d", - src->reg.stride.stride, src->reg.stride.blocksize, - src->reg.stride.offset)); - break; - case MPIDI_RANK_MAP_STRIDE_BLOCK: - case MPIDI_RANK_MAP_STRIDE_BLOCK_INTRA: - for (i = 0; i < size; i++) { - dest->irreg.lut.lpid[i + mapper_offset] = MPIDI_CALC_STRIDE(mapper->src_mapping[i], - src->reg.stride.stride, - src->reg. 
- stride.blocksize, - src->reg.stride.offset); - } - MPL_DBG_MSG_FMT(MPIDI_CH4_DBG_MAP, VERBOSE, - (MPL_DBG_FDEST, " source stride %d blocksize %d offset %d", - src->reg.stride.stride, src->reg.stride.blocksize, - src->reg.stride.offset)); - break; - case MPIDI_RANK_MAP_LUT: - case MPIDI_RANK_MAP_LUT_INTRA: - for (i = 0; i < size; i++) { - dest->irreg.lut.lpid[i + mapper_offset] = - src->irreg.lut.lpid[mapper->src_mapping[i]]; - } - break; - default: - mpi_errno = 1; - MPL_DBG_MSG_FMT(MPIDI_CH4_DBG_MAP, VERBOSE, - (MPL_DBG_FDEST, " cannot convert mode %d to lut", (int) src->mode)); - goto fn_fail; - } - fn_exit: - MPIR_FUNC_EXIT; - return mpi_errno; - fn_fail: - goto fn_exit; -} - -static void direct_of_src_rmap(MPIDI_rank_map_t * src, MPIDI_rank_map_t * dest, - MPIR_Comm_map_t * mapper) -{ - MPIR_FUNC_ENTER; - dest->mode = src->mode; - if (mapper) { - dest->size = map_size(*mapper); - } else { - dest->size = src->size; - } - dest->avtid = src->avtid; - MPL_DBG_MSG_FMT(MPIDI_CH4_DBG_MAP, VERBOSE, - (MPL_DBG_FDEST, " source mode %d", (int) src->mode)); - switch (src->mode) { - case MPIDI_RANK_MAP_DIRECT: - case MPIDI_RANK_MAP_DIRECT_INTRA: - break; - case MPIDI_RANK_MAP_OFFSET: - case MPIDI_RANK_MAP_OFFSET_INTRA: - dest->reg.offset = src->reg.offset; - break; - case MPIDI_RANK_MAP_STRIDE: - case MPIDI_RANK_MAP_STRIDE_INTRA: - case MPIDI_RANK_MAP_STRIDE_BLOCK: - case MPIDI_RANK_MAP_STRIDE_BLOCK_INTRA: - dest->reg.stride.stride = src->reg.stride.stride; - dest->reg.stride.blocksize = src->reg.stride.blocksize; - dest->reg.stride.offset = src->reg.stride.offset; - break; - case MPIDI_RANK_MAP_LUT: - case MPIDI_RANK_MAP_LUT_INTRA: - dest->irreg.lut.t = src->irreg.lut.t; - dest->irreg.lut.lpid = src->irreg.lut.lpid; - MPIDIU_lut_add_ref(src->irreg.lut.t); - break; - case MPIDI_RANK_MAP_MLUT: - dest->irreg.mlut.t = src->irreg.mlut.t; - dest->irreg.mlut.gpid = src->irreg.mlut.gpid; - MPIDIU_mlut_add_ref(src->irreg.mlut.t); - break; - case MPIDI_RANK_MAP_NONE: - 
MPIR_Assert(0); - break; - } - MPIR_FUNC_EXIT; -} - -static void offset_of_src_rmap(MPIDI_rank_map_t * src, MPIDI_rank_map_t * dest, - MPIR_Comm_map_t * mapper, int offset) -{ - MPIR_FUNC_ENTER; - dest->avtid = src->avtid; - dest->size = map_size(*mapper); - MPL_DBG_MSG_FMT(MPIDI_CH4_DBG_MAP, VERBOSE, - (MPL_DBG_FDEST, " source mode %d", (int) src->mode)); - switch (src->mode) { - case MPIDI_RANK_MAP_DIRECT_INTRA: - dest->mode = MPIDI_RANK_MAP_OFFSET_INTRA; - dest->reg.offset = offset; - break; - case MPIDI_RANK_MAP_DIRECT: - dest->mode = MPIDI_RANK_MAP_OFFSET; - dest->reg.offset = offset; - break; - case MPIDI_RANK_MAP_OFFSET: - dest->mode = MPIDI_RANK_MAP_OFFSET; - dest->reg.offset = src->reg.offset + offset; - break; - case MPIDI_RANK_MAP_OFFSET_INTRA: - dest->mode = MPIDI_RANK_MAP_OFFSET_INTRA; - dest->reg.offset = src->reg.offset + offset; - break; - case MPIDI_RANK_MAP_STRIDE: - dest->mode = MPIDI_RANK_MAP_STRIDE; - dest->reg.stride.stride = src->reg.stride.stride; - dest->reg.stride.blocksize = src->reg.stride.blocksize; - dest->reg.stride.offset = src->reg.stride.offset + offset * src->reg.stride.stride; - break; - case MPIDI_RANK_MAP_STRIDE_INTRA: - dest->mode = MPIDI_RANK_MAP_STRIDE_INTRA; - dest->reg.stride.stride = src->reg.stride.stride; - dest->reg.stride.blocksize = src->reg.stride.blocksize; - dest->reg.stride.offset = src->reg.stride.offset + offset * src->reg.stride.stride; - break; - case MPIDI_RANK_MAP_STRIDE_BLOCK: - case MPIDI_RANK_MAP_STRIDE_BLOCK_INTRA: - src_map_to_lut(src, dest, mapper, mapper->src_mapping_size, 0); - break; - case MPIDI_RANK_MAP_LUT: - case MPIDI_RANK_MAP_LUT_INTRA: - dest->mode = src->mode; - dest->irreg.lut.t = src->irreg.lut.t; - dest->irreg.lut.lpid = &src->irreg.lut.lpid[offset]; - MPIDIU_lut_add_ref(src->irreg.lut.t); - break; - case MPIDI_RANK_MAP_MLUT: - dest->mode = src->mode; - dest->irreg.mlut.t = src->irreg.mlut.t; - dest->irreg.mlut.gpid = &src->irreg.mlut.gpid[offset]; - 
MPIDIU_mlut_add_ref(src->irreg.mlut.t); - break; - case MPIDI_RANK_MAP_NONE: - MPIR_Assert(0); - break; - } - MPIR_FUNC_EXIT; -} - -static void stride_of_src_rmap(MPIDI_rank_map_t * src, MPIDI_rank_map_t * dest, - MPIR_Comm_map_t * mapper, int stride, int blocksize, int offset) -{ - MPIR_FUNC_ENTER; - dest->avtid = src->avtid; - MPL_DBG_MSG_FMT(MPIDI_CH4_DBG_MAP, VERBOSE, - (MPL_DBG_FDEST, " source mode %d", (int) src->mode)); - switch (src->mode) { - case MPIDI_RANK_MAP_DIRECT_INTRA: - if (blocksize == 1) { - dest->mode = MPIDI_RANK_MAP_STRIDE_INTRA; - } else { - dest->mode = MPIDI_RANK_MAP_STRIDE_BLOCK_INTRA; - } - dest->size = map_size(*mapper); - dest->reg.stride.stride = stride; - dest->reg.stride.blocksize = blocksize; - dest->reg.stride.offset = offset; - MPIR_Assert(stride > 0); - MPIR_Assert(blocksize > 0); - break; - case MPIDI_RANK_MAP_DIRECT: - if (blocksize == 1) { - dest->mode = MPIDI_RANK_MAP_STRIDE; - } else { - dest->mode = MPIDI_RANK_MAP_STRIDE_BLOCK; - } - dest->size = map_size(*mapper); - dest->reg.stride.stride = stride; - dest->reg.stride.blocksize = blocksize; - dest->reg.stride.offset = offset; - MPIR_Assert(stride > 0); - MPIR_Assert(blocksize > 0); - break; - case MPIDI_RANK_MAP_OFFSET: - if (blocksize == 1) { - dest->mode = MPIDI_RANK_MAP_STRIDE; - } else { - dest->mode = MPIDI_RANK_MAP_STRIDE_BLOCK; - } - dest->size = map_size(*mapper); - dest->reg.stride.stride = stride; - dest->reg.stride.blocksize = blocksize; - dest->reg.stride.offset = offset + src->reg.offset; - break; - case MPIDI_RANK_MAP_OFFSET_INTRA: - if (blocksize == 1) { - dest->mode = MPIDI_RANK_MAP_STRIDE_INTRA; - } else { - dest->mode = MPIDI_RANK_MAP_STRIDE_BLOCK_INTRA; - } - dest->size = map_size(*mapper); - dest->reg.stride.stride = stride; - dest->reg.stride.blocksize = blocksize; - dest->reg.stride.offset = offset + src->reg.offset; - break; - case MPIDI_RANK_MAP_STRIDE: - if (blocksize == 1) { - dest->mode = MPIDI_RANK_MAP_STRIDE; - dest->reg.stride.stride = 
src->reg.stride.stride * stride; - dest->reg.stride.blocksize = blocksize; - dest->reg.stride.offset = src->reg.stride.stride * offset + src->reg.stride.offset; - } else { - src_map_to_lut(src, dest, mapper, mapper->src_mapping_size, 0); - } - break; - case MPIDI_RANK_MAP_STRIDE_INTRA: - if (blocksize == 1) { - dest->mode = MPIDI_RANK_MAP_STRIDE_INTRA; - dest->reg.stride.stride = src->reg.stride.stride * stride; - dest->reg.stride.blocksize = blocksize; - dest->reg.stride.offset = src->reg.stride.stride * offset + src->reg.stride.offset; - } else { - src_map_to_lut(src, dest, mapper, mapper->src_mapping_size, 0); - } - break; - case MPIDI_RANK_MAP_STRIDE_BLOCK: - case MPIDI_RANK_MAP_STRIDE_BLOCK_INTRA: - src_map_to_lut(src, dest, mapper, mapper->src_mapping_size, 0); - break; - case MPIDI_RANK_MAP_LUT: - case MPIDI_RANK_MAP_LUT_INTRA: - src_map_to_lut(src, dest, mapper, mapper->src_mapping_size, 0); - break; - case MPIDI_RANK_MAP_MLUT: - src_mlut_to_mlut(src, dest, mapper, mapper->src_mapping_size, 0); - break; - case MPIDI_RANK_MAP_NONE: - MPIR_Assert(0); - break; - } - MPIR_FUNC_EXIT; -} - -static int check_convert_mlut_to_lut(MPIDI_rank_map_t * src) -{ - int mpi_errno = MPI_SUCCESS, i; - int flag = 1; - int avtid; - MPIDI_rank_map_lut_t *lut = NULL; - - MPIR_FUNC_ENTER; - - if (src->mode != MPIDI_RANK_MAP_MLUT) { - goto fn_exit; - } - - /* check if all mlut item has the same avtid */ - avtid = src->irreg.mlut.gpid[0].avtid; - for (i = 1; i < src->size; i++) { - if (src->irreg.mlut.gpid[i].avtid != avtid) { - flag = 0; - break; - } - } - if (!flag) { /* multiple avtid */ - goto fn_exit; - } - - src->avtid = avtid; - if (avtid == 0) { - src->mode = MPIDI_RANK_MAP_LUT_INTRA; - } else { - src->mode = MPIDI_RANK_MAP_LUT; - } - mpi_errno = MPIDIU_alloc_lut(&lut, src->size); - MPIR_ERR_CHECK(mpi_errno); - for (i = 0; i < src->size; i++) { - lut->lpid[i] = src->irreg.mlut.gpid[i].lpid; - } - MPIDIU_release_mlut(src->irreg.mlut.t); - src->irreg.lut.t = lut; - 
src->irreg.lut.lpid = src->irreg.lut.t->lpid; - MPL_DBG_MSG_FMT(MPIDI_CH4_DBG_MAP, VERBOSE, (MPL_DBG_FDEST, " avtid %d", src->avtid)); - - fn_exit: - MPIR_FUNC_EXIT; - return mpi_errno; - fn_fail: - goto fn_exit; -} - -static int check_convert_lut_to_regular(MPIDI_rank_map_t * src) -{ - int mpi_errno = MPI_SUCCESS; - int mode_detected, offset, blocksize, stride; - MPIDI_rank_map_lut_t *lut = NULL; - - MPIR_FUNC_ENTER; - - if (src->mode != MPIDI_RANK_MAP_LUT && src->mode != MPIDI_RANK_MAP_LUT_INTRA) { - goto fn_exit; - } - - lut = src->irreg.lut.t; - mode_detected = detect_regular_model(src->irreg.lut.lpid, src->size, &offset, &blocksize, - &stride); - MPL_DBG_MSG_FMT(MPIDI_CH4_DBG_MAP, VERBOSE, - (MPL_DBG_FDEST, " detected mode: %d", mode_detected)); - - - switch (mode_detected) { - case MPIDI_SRC_MAPPER_DIRECT: - src->mode = MPIDI_RANK_MAP_DIRECT; - if (src->avtid == 0) { - src->mode = MPIDI_RANK_MAP_DIRECT_INTRA; - } - src->irreg.lut.t = NULL; - src->irreg.lut.lpid = NULL; - MPIDIU_release_lut(lut); - MPL_DBG_MSG_FMT(MPIDI_CH4_DBG_MAP, VERBOSE, - (MPL_DBG_FDEST, "\tlut to mode %d", (int) src->mode)); - break; - case MPIDI_SRC_MAPPER_OFFSET: - src->mode = MPIDI_RANK_MAP_OFFSET; - if (src->avtid == 0) { - src->mode = MPIDI_RANK_MAP_OFFSET_INTRA; - } - src->reg.offset = offset; - src->irreg.lut.t = NULL; - src->irreg.lut.lpid = NULL; - MPIDIU_release_lut(lut); - MPL_DBG_MSG_FMT(MPIDI_CH4_DBG_MAP, VERBOSE, - (MPL_DBG_FDEST, " lut to mode %d", (int) src->mode)); - MPL_DBG_MSG_FMT(MPIDI_CH4_DBG_MAP, VERBOSE, - (MPL_DBG_FDEST, "\toffset: %d", src->reg.offset)); - break; - case MPIDI_SRC_MAPPER_STRIDE: - if (blocksize == 1) { - src->mode = MPIDI_RANK_MAP_STRIDE; - if (src->avtid == 0) { - src->mode = MPIDI_RANK_MAP_STRIDE_INTRA; - } - } else { - src->mode = MPIDI_RANK_MAP_STRIDE_BLOCK; - if (src->avtid == 0) { - src->mode = MPIDI_RANK_MAP_STRIDE_BLOCK_INTRA; - } - } - src->reg.stride.stride = stride; - src->reg.stride.blocksize = blocksize; - src->reg.stride.offset = 
offset; - src->irreg.lut.t = NULL; - src->irreg.lut.lpid = NULL; - MPIDIU_release_lut(lut); - MPL_DBG_MSG_FMT(MPIDI_CH4_DBG_MAP, VERBOSE, - (MPL_DBG_FDEST, " lut to mode %d", (int) src->mode)); - MPL_DBG_MSG_FMT(MPIDI_CH4_DBG_MAP, VERBOSE, - (MPL_DBG_FDEST, "\toffset: %d", src->reg.stride.offset)); - MPL_DBG_MSG_FMT(MPIDI_CH4_DBG_MAP, VERBOSE, - (MPL_DBG_FDEST, "\tblocksize: %d", src->reg.stride.blocksize)); - MPL_DBG_MSG_FMT(MPIDI_CH4_DBG_MAP, VERBOSE, - (MPL_DBG_FDEST, "\tstride: %d", src->reg.stride.stride)); - break; - } - fn_exit: - MPIR_FUNC_EXIT; - return mpi_errno; -} - -static int set_map(MPIDI_rank_map_t * src_rmap, MPIDI_rank_map_t * dest_rmap, - MPIR_Comm_map_t * mapper, int src_comm_size, int total_mapper_size, - int mapper_offset) -{ - int mpi_errno = MPI_SUCCESS; - - MPIR_FUNC_ENTER; - - /* Simplest case: MAP_DUP, exact duplication of src_comm */ - if (mapper->type == MPIR_COMM_MAP_TYPE__DUP && src_comm_size == total_mapper_size) { - direct_of_src_rmap(src_rmap, dest_rmap, mapper); - goto fn_exit; - } - /* single src_comm, newcomm is smaller than src_comm, only one mapper */ - else if (mapper->type == MPIR_COMM_MAP_TYPE__IRREGULAR && - mapper->src_mapping_size == total_mapper_size) { - /* check if new comm has the same mapping as src_comm */ - /* detect src_mapping_offset for direct_to_direct and offset_to_offset */ - int mode_detected, offset = 0, blocksize, stride; - mode_detected = detect_regular_model(mapper->src_mapping, mapper->src_mapping_size, &offset, - &blocksize, &stride); - MPL_DBG_MSG_FMT(MPIDI_CH4_DBG_MAP, VERBOSE, - (MPL_DBG_FDEST, "\tdetected mode: %d", mode_detected)); - - switch (mode_detected) { - case MPIDI_SRC_MAPPER_DIRECT: - direct_of_src_rmap(src_rmap, dest_rmap, mapper); - break; - case MPIDI_SRC_MAPPER_OFFSET: - offset_of_src_rmap(src_rmap, dest_rmap, mapper, offset); - break; - case MPIDI_SRC_MAPPER_STRIDE: - stride_of_src_rmap(src_rmap, dest_rmap, mapper, stride, blocksize, offset); - break; - default: - if (src_rmap->mode 
== MPIDI_RANK_MAP_MLUT) { - src_mlut_to_mlut(src_rmap, dest_rmap, mapper, total_mapper_size, mapper_offset); - } else { - src_map_to_lut(src_rmap, dest_rmap, mapper, mapper->src_mapping_size, - mapper_offset); - } - } - goto fn_exit; - } - - /* more complex case: multiple mappers - * We always alloc lut (or mlut is src_rmap is mlut). We will check if a - * lut mapping can be converted to something simpler after all the mapper - * are processed - */ - - MPL_DBG_MSG_FMT(MPIDI_CH4_DBG_MAP, VERBOSE, (MPL_DBG_FDEST, " multiple mapper")); - if (mapper->type == MPIR_COMM_MAP_TYPE__DUP) { - MPL_DBG_MSG_FMT(MPIDI_CH4_DBG_MAP, VERBOSE, - (MPL_DBG_FDEST, " check map_size %d, src_comm_size %d", - map_size(*mapper), src_comm_size)); - src_comm_to_mlut(src_rmap, dest_rmap, src_comm_size, total_mapper_size, mapper_offset); - } else { /* mapper->type == MPIR_COMM_MAP_TYPE__IRREGULAR */ - src_mlut_to_mlut(src_rmap, dest_rmap, mapper, total_mapper_size, mapper_offset); - } - - fn_exit: - MPIR_FUNC_EXIT; - return mpi_errno; -} - -int MPIDI_comm_create_rank_map(MPIR_Comm * comm) -{ - int mpi_errno = MPI_SUCCESS; - MPIR_Comm_map_t *mapper; - MPIR_Comm *src_comm; - int total_mapper_size, mapper_offset; - - - MPIR_FUNC_ENTER; - - /* do some sanity checks */ - LL_FOREACH(comm->mapper_head, mapper) { - if (mapper->src_comm->comm_kind == MPIR_COMM_KIND__INTRACOMM) { - MPIR_Assert(mapper->dir == MPIR_COMM_MAP_DIR__L2L || - mapper->dir == MPIR_COMM_MAP_DIR__L2R); - } - - if (comm->comm_kind == MPIR_COMM_KIND__INTRACOMM) { - MPIR_Assert(mapper->dir == MPIR_COMM_MAP_DIR__L2L || - mapper->dir == MPIR_COMM_MAP_DIR__R2L); - } - } - - /* First, handle all the mappers that contribute to the local part - * of the comm */ - total_mapper_size = 0; - LL_FOREACH(comm->mapper_head, mapper) { - if (mapper->dir == MPIR_COMM_MAP_DIR__L2R || mapper->dir == MPIR_COMM_MAP_DIR__R2R) - continue; - - total_mapper_size += map_size(*mapper); - } - mapper_offset = 0; - LL_FOREACH(comm->mapper_head, mapper) { - 
src_comm = mapper->src_comm; - - if (mapper->dir == MPIR_COMM_MAP_DIR__L2R || mapper->dir == MPIR_COMM_MAP_DIR__R2R) - continue; - - if (mapper->dir == MPIR_COMM_MAP_DIR__L2L) { - if (src_comm->comm_kind == MPIR_COMM_KIND__INTRACOMM && - comm->comm_kind == MPIR_COMM_KIND__INTRACOMM) { - MPL_DBG_MSG_FMT(MPIDI_CH4_DBG_MAP, VERBOSE, - (MPL_DBG_FDEST, - " intra->intra, L2L, size=%d, total_mapper_size=%d, mapper_offset=%d", - src_comm->local_size, total_mapper_size, mapper_offset)); - set_map(&MPIDI_COMM(src_comm, map), &MPIDI_COMM(comm, map), mapper, - src_comm->local_size, total_mapper_size, mapper_offset); - } else if (src_comm->comm_kind == MPIR_COMM_KIND__INTRACOMM && - comm->comm_kind == MPIR_COMM_KIND__INTERCOMM) { - MPL_DBG_MSG_FMT(MPIDI_CH4_DBG_MAP, VERBOSE, - (MPL_DBG_FDEST, - " intra->inter, L2L, size=%d, total_mapper_size=%d, mapper_offset=%d", - src_comm->local_size, total_mapper_size, mapper_offset)); - set_map(&MPIDI_COMM(src_comm, map), &MPIDI_COMM(comm, local_map), mapper, - src_comm->local_size, total_mapper_size, mapper_offset); - } else if (src_comm->comm_kind == MPIR_COMM_KIND__INTERCOMM && - comm->comm_kind == MPIR_COMM_KIND__INTRACOMM) { - MPL_DBG_MSG_FMT(MPIDI_CH4_DBG_MAP, VERBOSE, - (MPL_DBG_FDEST, - " inter->intra, L2L, size=%d, total_mapper_size=%d, mapper_offset=%d", - src_comm->local_size, total_mapper_size, mapper_offset)); - set_map(&MPIDI_COMM(src_comm, local_map), &MPIDI_COMM(comm, map), mapper, - src_comm->local_size, total_mapper_size, mapper_offset); - } else { /* src_comm->comm_kind == MPIR_COMM_KIND__INTERCOMM && comm->comm_kind == MPIR_COMM_KIND__INTERCOMM */ - MPL_DBG_MSG_FMT(MPIDI_CH4_DBG_MAP, VERBOSE, - (MPL_DBG_FDEST, - " inter->inter, L2L, size=%d, total_mapper_size=%d, mapper_offset=%d", - src_comm->local_size, total_mapper_size, mapper_offset)); - set_map(&MPIDI_COMM(src_comm, local_map), &MPIDI_COMM(comm, local_map), - mapper, src_comm->local_size, total_mapper_size, mapper_offset); - } - } else { /* mapper->dir == 
MPIR_COMM_MAP_DIR__R2L */ - MPIR_Assert(src_comm->comm_kind == MPIR_COMM_KIND__INTERCOMM); - - if (comm->comm_kind == MPIR_COMM_KIND__INTRACOMM) { - MPL_DBG_MSG_FMT(MPIDI_CH4_DBG_MAP, VERBOSE, - (MPL_DBG_FDEST, - " ->intra, R2L, size=%d, total_mapper_size=%d, mapper_offset=%d", - src_comm->remote_size, total_mapper_size, mapper_offset)); - set_map(&MPIDI_COMM(src_comm, map), &MPIDI_COMM(comm, map), mapper, - src_comm->remote_size, total_mapper_size, mapper_offset); - } else { - MPL_DBG_MSG_FMT(MPIDI_CH4_DBG_MAP, VERBOSE, - (MPL_DBG_FDEST, - " ->inter, R2L, size=%d, total_mapper_size=%d, mapper_offset=%d", - src_comm->remote_size, total_mapper_size, mapper_offset)); - set_map(&MPIDI_COMM(src_comm, map), &MPIDI_COMM(comm, local_map), mapper, - src_comm->remote_size, total_mapper_size, mapper_offset); - } - } - - mapper_offset += map_size(*mapper); - } - - /* Next, handle all the mappers that contribute to the remote part - * of the comm (only valid for intercomms) - */ - total_mapper_size = 0; - LL_FOREACH(comm->mapper_head, mapper) { - if (mapper->dir == MPIR_COMM_MAP_DIR__L2L || mapper->dir == MPIR_COMM_MAP_DIR__R2L) - continue; - - total_mapper_size += map_size(*mapper); - } - mapper_offset = 0; - LL_FOREACH(comm->mapper_head, mapper) { - src_comm = mapper->src_comm; - - if (mapper->dir == MPIR_COMM_MAP_DIR__L2L || mapper->dir == MPIR_COMM_MAP_DIR__R2L) - continue; - - MPIR_Assert(comm->comm_kind == MPIR_COMM_KIND__INTERCOMM); - - if (mapper->dir == MPIR_COMM_MAP_DIR__L2R) { - if (src_comm->comm_kind == MPIR_COMM_KIND__INTRACOMM) { - MPL_DBG_MSG_FMT(MPIDI_CH4_DBG_MAP, VERBOSE, - (MPL_DBG_FDEST, - " intra->, L2R, size=%d, total_mapper_size=%d, mapper_offset=%d", - src_comm->local_size, total_mapper_size, mapper_offset)); - set_map(&MPIDI_COMM(src_comm, map), &MPIDI_COMM(comm, map), mapper, - src_comm->local_size, total_mapper_size, mapper_offset); - } else { /* src_comm->comm_kind == MPIR_COMM_KIND__INTERCOMM */ - MPL_DBG_MSG_FMT(MPIDI_CH4_DBG_MAP, VERBOSE, - 
(MPL_DBG_FDEST, - " inter->, L2R, size=%d, total_mapper_size=%d, mapper_offset=%d", - src_comm->local_size, total_mapper_size, mapper_offset)); - set_map(&MPIDI_COMM(src_comm, local_map), &MPIDI_COMM(comm, map), mapper, - src_comm->local_size, total_mapper_size, mapper_offset); - } - } else { /* mapper->dir == MPIR_COMM_MAP_DIR__R2R */ - MPIR_Assert(src_comm->comm_kind == MPIR_COMM_KIND__INTERCOMM); - MPL_DBG_MSG_FMT(MPIDI_CH4_DBG_MAP, VERBOSE, - (MPL_DBG_FDEST, - " inter->, R2R, size=%d, total_mapper_size=%d, mapper_offset=%d", - src_comm->remote_size, total_mapper_size, mapper_offset)); - set_map(&MPIDI_COMM(src_comm, map), &MPIDI_COMM(comm, map), mapper, - src_comm->remote_size, total_mapper_size, mapper_offset); - } - - mapper_offset += map_size(*mapper); - } - - /* check before finishing - * 1. if mlut can be converted to lut: all avtids are the same - * 2. if lut can be converted to regular modes: direct, offset, and more - */ - check_convert_mlut_to_lut(&MPIDI_COMM(comm, map)); - check_convert_lut_to_regular(&MPIDI_COMM(comm, map)); - if (comm->comm_kind == MPIR_COMM_KIND__INTERCOMM) { - check_convert_mlut_to_lut(&MPIDI_COMM(comm, local_map)); - check_convert_lut_to_regular(&MPIDI_COMM(comm, local_map)); - } - - if (comm->comm_kind == MPIR_COMM_KIND__INTERCOMM) { - /* setup the lut for the local_comm in the intercomm */ - if (comm->local_comm) { - MPL_DBG_MSG_FMT(MPIDI_CH4_DBG_MAP, VERBOSE, - (MPL_DBG_FDEST, "\t create local_comm using src_comm")); - direct_of_src_rmap(&MPIDI_COMM(comm, local_map), - &MPIDI_COMM(comm->local_comm, map), NULL); - - MPL_DBG_MSG_FMT(MPIDI_CH4_DBG_MEMORY, VERBOSE, - (MPL_DBG_FDEST, "create local_comm using src_comm")); - } - } - - if (comm->comm_kind == MPIR_COMM_KIND__INTRACOMM) { - MPIDI_COMM(comm, local_map).mode = MPIDI_RANK_MAP_NONE; - } -#ifdef MPL_USE_DBG_LOGGING - int rank_; - int avtid_, lpid_ = -1; - if (comm->remote_size < 16) { - for (rank_ = 0; rank_ < comm->remote_size; ++rank_) { - MPIDIU_comm_rank_to_pid(comm, 
rank_, &lpid_, &avtid_); - MPIDIU_comm_rank_to_av(comm, rank_); - } - } - if (comm->comm_kind == MPIR_COMM_KIND__INTERCOMM && comm->local_size < 16) { - for (rank_ = 0; rank_ < comm->local_size; ++rank_) { - MPIDIU_comm_rank_to_pid_local(comm, rank_, &lpid_, &avtid_); - } - } -#endif - - MPIR_FUNC_EXIT; - return mpi_errno; -} - /* number of leading zeros, from Hacker's Delight */ static int nlz(uint32_t x) { @@ -928,7 +28,7 @@ static uint64_t shrink(uint64_t x, int num_low_bits) return ((x >> 32) << num_low_bits) + (x & 0xffffffff); } -int MPIDI_check_disjoint_gpids(uint64_t gpids1[], int n1, uint64_t gpids2[], int n2) +int MPIDI_check_disjoint_lpids(MPIR_Lpid lpids1[], int n1, MPIR_Lpid lpids2[], int n2) { int mpi_errno = MPI_SUCCESS; uint32_t gpidmaskPrealloc[128]; @@ -944,12 +44,12 @@ int MPIDI_check_disjoint_gpids(uint64_t gpids1[], int n1, uint64_t gpids2[], int /* Find the max low-32-bit gpid */ uint64_t max_lpid = 0; for (int i = 0; i < n1; i++) { - uint64_t n = gpids1[i] & 0xffffffff; + uint64_t n = lpids1[i] & 0xffffffff; if (n > max_lpid) max_lpid = n; } for (int i = 0; i < n2; i++) { - uint64_t n = gpids2[i] & 0xffffffff; + uint64_t n = lpids2[i] & 0xffffffff; if (n > max_lpid) max_lpid = n; } @@ -958,12 +58,12 @@ int MPIDI_check_disjoint_gpids(uint64_t gpids1[], int n1, uint64_t gpids2[], int uint64_t max_gpid = 0; for (int i = 0; i < n1; i++) { - uint64_t n = shrink(gpids1[i], num_low_bits); + uint64_t n = shrink(lpids1[i], num_low_bits); if (n > max_gpid) max_gpid = n; } for (int i = 0; i < n2; i++) { - uint64_t n = shrink(gpids2[i], num_low_bits); + uint64_t n = shrink(lpids2[i], num_low_bits); if (n > max_gpid) max_gpid = n; } @@ -981,7 +81,7 @@ int MPIDI_check_disjoint_gpids(uint64_t gpids1[], int n1, uint64_t gpids2[], int /* Set the bits for the first array */ for (int i = 0; i < n1; i++) { - uint64_t n = shrink(gpids1[i], num_low_bits); + uint64_t n = shrink(lpids1[i], num_low_bits); int idx = n / 32; int bit = n % 32; gpidmask[idx] = 
gpidmask[idx] | (1 << bit); @@ -990,12 +90,12 @@ int MPIDI_check_disjoint_gpids(uint64_t gpids1[], int n1, uint64_t gpids2[], int /* Look for any duplicates in the second array */ for (int i = 0; i < n2; i++) { - uint64_t n = shrink(gpids2[i], num_low_bits); + uint64_t n = shrink(lpids2[i], num_low_bits); int idx = n / 32; int bit = n % 32; if (gpidmask[idx] & (1 << bit)) { MPIR_ERR_SET1(mpi_errno, MPI_ERR_COMM, - "**dupprocesses", "**dupprocesses %d", gpids2[i]); + "**dupprocesses", "**dupprocesses %d", (int) lpids2[i]); goto fn_fail; } /* Add a check on duplicates *within* group 2 */ diff --git a/src/mpid/ch4/src/ch4i_comm.h b/src/mpid/ch4/src/ch4i_comm.h index 823d945ded4..4966798b6a3 100644 --- a/src/mpid/ch4/src/ch4i_comm.h +++ b/src/mpid/ch4/src/ch4i_comm.h @@ -8,7 +8,6 @@ #include "ch4_types.h" -int MPIDI_comm_create_rank_map(MPIR_Comm * comm); -int MPIDI_check_disjoint_gpids(uint64_t gpids1[], int n1, uint64_t gpids2[], int n2); +int MPIDI_check_disjoint_lpids(MPIR_Lpid lpids1[], int n1, MPIR_Lpid lpids2[], int n2); #endif /* CH4I_COMM_H_INCLUDED */ diff --git a/src/mpid/ch4/src/init_comm.c b/src/mpid/ch4/src/init_comm.c index e546337bd6f..09b1729c284 100644 --- a/src/mpid/ch4/src/init_comm.c +++ b/src/mpid/ch4/src/init_comm.c @@ -21,7 +21,6 @@ int MPIDI_create_init_comm(MPIR_Comm ** comm) int node_roots_comm_size = MPIR_Process.num_nodes; int node_roots_comm_rank = MPIR_Process.node_map[world_rank]; MPIR_Comm *init_comm = NULL; - MPIDI_rank_map_lut_t *lut = NULL; mpi_errno = MPIR_Comm_create(&init_comm); MPIR_ERR_CHECK(mpi_errno); @@ -33,22 +32,22 @@ int MPIDI_create_init_comm(MPIR_Comm ** comm) init_comm->remote_size = node_roots_comm_size; init_comm->local_size = node_roots_comm_size; init_comm->coll.pof2 = MPL_pof2(node_roots_comm_size); - MPIDI_COMM(init_comm, map).mode = MPIDI_RANK_MAP_LUT_INTRA; - mpi_errno = MPIDIU_alloc_lut(&lut, node_roots_comm_size); - MPIR_ERR_CHECK(mpi_errno); - MPIDI_COMM(init_comm, map).size = node_roots_comm_size; - 
MPIDI_COMM(init_comm, map).avtid = 0; - MPIDI_COMM(init_comm, map).irreg.lut.t = lut; - MPIDI_COMM(init_comm, map).irreg.lut.lpid = lut->lpid; - MPIDI_COMM(init_comm, local_map).mode = MPIDI_RANK_MAP_NONE; + + MPIR_Lpid *map; + map = MPL_malloc(node_roots_comm_size * sizeof(MPIR_Lpid), MPL_MEM_GROUP); + MPIR_ERR_CHKANDJUMP(!map, mpi_errno, MPI_ERR_OTHER, "**nomem"); for (i = 0; i < node_roots_comm_size; ++i) { - lut->lpid[i] = MPIR_Process.node_root_map[i]; + map[i] = MPIR_Process.node_root_map[i]; } + mpi_errno = MPIR_Group_create_map(node_roots_comm_size, node_roots_comm_rank, NULL, + map, &init_comm->local_group); + MPIR_ERR_CHECK(mpi_errno); + mpi_errno = MPIDIG_init_comm(init_comm); MPIR_ERR_CHECK(mpi_errno); /* hacky, consider a separate MPIDI_{NM,SHM}_init_comm_hook - * to initialize the init_comm, e.g. to eliminate potential - * runtime features for stability during init */ + * to initialize the init_comm, e.g. to eliminate potential + * runtime features for stability during init */ mpi_errno = MPIDI_NM_mpi_comm_commit_pre_hook(init_comm); MPIR_ERR_CHECK(mpi_errno); @@ -66,8 +65,8 @@ void MPIDI_destroy_init_comm(MPIR_Comm ** comm_ptr) MPIR_Comm *comm = NULL; if (*comm_ptr != NULL) { comm = *comm_ptr; - MPIDIU_release_lut(MPIDI_COMM(comm, map).irreg.lut.t); MPIDIG_destroy_comm(comm); + MPIR_Group_release(comm->local_group); MPIR_Object_release_ref(comm, &in_use); MPIR_Assertp(in_use == 0); MPII_COMML_FORGET(comm); diff --git a/src/mpid/ch4/src/mpidig_win.h b/src/mpid/ch4/src/mpidig_win.h index 6353bf7def3..63054ea36f9 100644 --- a/src/mpid/ch4/src/mpidig_win.h +++ b/src/mpid/ch4/src/mpidig_win.h @@ -562,18 +562,9 @@ MPL_STATIC_INLINE_PREFIX int MPIDIG_win_shared_query_part(MPIR_Win * win, int ra *disp_unit = 0; *((void **) baseptr) = NULL; } else { - int shm_rank = -1; /* find shm_rank in node_comm. Q: can we rely on comm_ptr->intranode_table? 
*/ - int avtid, idx; - MPIDIU_comm_rank_to_pid(win->comm_ptr, rank, &idx, &avtid); - for (int i = 0; i < win->comm_ptr->node_comm->local_size; i++) { - int tmp_avtid, tmp_idx; - MPIDIU_comm_rank_to_pid(win->comm_ptr->node_comm, i, &tmp_idx, &tmp_avtid); - if (tmp_avtid == avtid && tmp_idx == idx) { - shm_rank = i; - break; - } - } + MPIR_Lpid lpid = MPIR_comm_rank_to_lpid(win->comm_ptr, rank); + int shm_rank = MPIR_Group_lpid_to_rank(win->comm_ptr->node_comm->local_group, lpid); MPIR_Assert(shm_rank >= 0); MPIDIG_win_shared_info_t *shared_table = MPIDIG_WIN(win, shared_table); diff --git a/src/mpid/common/hcoll/hcoll_rte.c b/src/mpid/common/hcoll/hcoll_rte.c index 0db52cd6226..aa055d6a022 100644 --- a/src/mpid/common/hcoll/hcoll_rte.c +++ b/src/mpid/common/hcoll/hcoll_rte.c @@ -301,11 +301,9 @@ static void coll_handle_complete(void *handle) static int world_rank(rte_grp_handle_t grp_h, rte_ec_handle_t ec) { -#ifdef MPIDCH4_H_INCLUDED - return MPIDIU_rank_to_lpid(ec.rank, (MPIR_Comm *) grp_h); -#else - return ((struct MPIDI_VC *) ec.handle)->pg_rank; -#endif + MPIR_Lpid lpid = MPIR_comm_rank_to_lpid((MPIR_Comm *) grp_h, ec.rank); + MPIR_Assert(MPIR_LPID_WORLD_INDEX(lpid) == 0); + return MPIR_LPID_WORLD_RANK(lpid); } #if HCOLL_API >= HCOLL_VERSION(3,6) diff --git a/src/util/mpir_pmi.c b/src/util/mpir_pmi.c index 9aff4e38dfa..d2b9eae8e5d 100644 --- a/src/util/mpir_pmi.c +++ b/src/util/mpir_pmi.c @@ -168,6 +168,9 @@ int MPIR_pmi_init(void) pmi_connected = true; } + int world_idx = MPIR_add_world(pmi_kvs_name, size); + MPIR_Assertp(world_idx == 0); + MPIR_Process.has_parent = has_parent; MPIR_Process.rank = rank; MPIR_Process.size = size; diff --git a/test/mpi/group/Makefile.am b/test/mpi/group/Makefile.am index d647c9d377a..993dab99371 100644 --- a/test/mpi/group/Makefile.am +++ b/test/mpi/group/Makefile.am @@ -16,7 +16,3 @@ noinst_PROGRAMS = \ groupcreate \ gtranks \ groupnullincl - -# glpid is a whitebox test that uses mpiimpl.h; it is unlikely to build with the -# 
current build system setup -#EXTRA_PROGRAMS = glpid diff --git a/test/mpi/group/glpid.c b/test/mpi/group/glpid.c deleted file mode 100644 index 06238aeb942..00000000000 --- a/test/mpi/group/glpid.c +++ /dev/null @@ -1,44 +0,0 @@ -/* - * Copyright (C) by Argonne National Laboratory - * See COPYRIGHT in top-level directory - */ - -#include -#include "mpi.h" -#include "mpiimpl.h" - -int main(int argc, char *argv[]) -{ - MPIR_Group group, *group_ptr = &group; - int i; - - MPI_Init(&argc, &argv); - - /* Setup a sample group */ - group.handle = 1; - group.ref_count = 1; - group.size = 4; - group.rank = 0; - group.idx_of_first_lpid = -1; - group.lrank_to_lpid = (MPII_Group_pmap_t *) - MPL_malloc(group.size * sizeof(MPII_Group_pmap_t), MPL_MEM_OTHER); - for (i = 0; i < group.size; i++) { - group.lrank_to_lpid[i].lrank = i; - group.lrank_to_lpid[i].lpid = group.size - i - 1; - group.lrank_to_lpid[i].next_lpid = -1; - group.lrank_to_lpid[i].flag = 0; - } - - /* Set up the group lpid list */ - MPII_Group_setup_lpid_list(group_ptr); - - /* Print the group structure */ - printf("Index of first lpid = %d\n", group.idx_of_first_lpid); - for (i = 0; i < group.size; i++) { - printf("lrank_to_lpid[%d].next_lpid = %d, .lpid = %d\n", - i, group.lrank_to_lpid[i].next_lpid, group.lrank_to_lpid[i].lpid); - } - - MPI_Finalize(); - return 0; -} diff --git a/test/mpi/include/multi_tests.c b/test/mpi/include/multi_tests.c index 96341d20abf..d7ccfc0c66b 100644 --- a/test/mpi/include/multi_tests.c +++ b/test/mpi/include/multi_tests.c @@ -204,7 +204,7 @@ static void cleanup_cvars(void) { for (int i = 0; i < num_cvars; i++) { if (cvar_list[i].num_enums > 0) { - for (int j = 0; j < num_cvars; j++) { + for (int j = 0; j < cvar_list[i].num_enums; j++) { free(cvar_list[i].enum_list[j]); } } diff --git a/test/mpi/runtests b/test/mpi/runtests index d39f72a1a79..45fc3aaf894 100755 --- a/test/mpi/runtests +++ b/test/mpi/runtests @@ -765,6 +765,22 @@ sub run_mpitests { } } close($in); + { + my @inline; + 
while (<$out>) { + print " $_" if $g_opt{verbose}; + push @inline, $_; + } + if (@inline) { + my $runtime = 0; + my $test_opt = {name=>"run_mpitests", np=>$np, dir=>".", args=>[], envs=>[] }; + RunPreMsg($test_opt); + print "run_mpitests: stray output in finalize\n"; + show_failed_test_detail($test_opt, \@inline); + RunTestFailed($test_opt, join('', @inline), $runtime); + RunPostMsg($test_opt); + } + } close($out); waitpid($pid, 0); # TODO: check $? if ($flag_aborted) {