Skip to content

Commit a15d221

Browse files
committed
SQUASHME: osc/ucx needs to exchange shared memory information
Signed-off-by: Joseph Schuchart <[email protected]>
1 parent 74239b6 commit a15d221

File tree

2 files changed

+33
-0
lines changed

2 files changed

+33
-0
lines changed

ompi/mca/osc/ucx/osc_ucx.h

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -116,6 +116,19 @@ typedef struct ompi_osc_ucx_mem_ranges {
116116
uint64_t tail;
117117
} ompi_osc_ucx_mem_ranges_t;
118118

119+
/**
120+
* Structure to hold information about shared memory regions.
121+
* We store the rank, its address, and the size of the window region.
122+
* We don't store the disp_unit here, as that is stored elsewhere already.
123+
*/
124+
struct ompi_osc_ucx_shmem_info_s {
125+
int peer; /* rank of the peer this information belongs to */
126+
char *addr; /* address of the shared memory region */
127+
size_t size; /* size of the shared memory region */
128+
};
129+
130+
typedef struct ompi_osc_ucx_shmem_info_s ompi_osc_ucx_shmem_info_t;
131+
119132
typedef struct ompi_osc_ucx_module {
120133
ompi_osc_base_module_t super;
121134
struct ompi_communicator_t *comm;
@@ -128,6 +141,7 @@ typedef struct ompi_osc_ucx_module {
128141
* disp unit size; if disp_unit == -1, then we
129142
* need to look at disp_units */
130143
ptrdiff_t *disp_units;
144+
ompi_osc_ucx_shmem_info_t *shmem_info; /* shared memory info */
131145

132146
ompi_osc_ucx_state_t state; /* remote accessible flags */
133147
ompi_osc_local_dynamic_win_info_t local_dynamic_win_info[OMPI_OSC_UCX_ATTACH_MAX];

ompi/mca/osc/ucx/osc_ucx_component.c

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -679,6 +679,13 @@ static int component_select(struct ompi_win_t *win, void **base, size_t size, pt
679679
module->acc_single_intrinsic = check_config_value_bool ("acc_single_intrinsic", info);
680680
module->skip_sync_check = false;
681681

682+
/**
683+
* TODO: we need to collect the shared memory information from all processes
684+
* on the same node. This includes the size and base address, which need
685+
* to be passed to ucp_rkey_ptr().
686+
*/
687+
module->shmem_info = NULL;
688+
682689
/* share everyone's displacement units. Only do an allgather if
683690
strictly necessary, since it requires O(p) state. */
684691
values[0] = disp_unit;
@@ -847,6 +854,18 @@ static int component_select(struct ompi_win_t *win, void **base, size_t size, pt
847854

848855
module->size = module->sizes[ompi_comm_rank(module->comm)];
849856
*base = (void *)module->shmem_addrs[ompi_comm_rank(module->comm)];
857+
} else {
858+
/* non-shared memory: exchange sizes and addresses so they can be queried for shared memory */
859+
for (i = 0; i < comm_size; i++) {
860+
ompi_proc_t *peer = ompi_comm_peer_lookup(module->comm, i);
861+
peer->
862+
if (ompi_comm_peer_lookup(module->comm, i) == NULL) {
863+
OSC_UCX_ERROR("Failed to lookup peer %d in communicator %s", i, ompi_comm_print_cid(module->comm));
864+
ret = OMPI_ERR_COMM_FAILURE;
865+
goto error;
866+
}
867+
}
868+
850869
}
851870

852871
void **mem_base = base;

0 commit comments

Comments
 (0)