Skip to content

Commit caa091d

Browse files
committed
SQUASHME: osc/ucx needs to exchange shared memory information
Signed-off-by: Joseph Schuchart <[email protected]>
1 parent 15d14a0 commit caa091d

File tree

2 files changed

+33
-0
lines changed

2 files changed

+33
-0
lines changed

ompi/mca/osc/ucx/osc_ucx.h

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -116,6 +116,19 @@ typedef struct ompi_osc_ucx_mem_ranges {
116116
uint64_t tail;
117117
} ompi_osc_ucx_mem_ranges_t;
118118

119+
/**
120+
* Structure to hold information about shared memory regions.
121+
* We store the peer's rank, its address, and the size of its window region.
122+
* We don't store the disp_unit here, as that is stored elsewhere already.
123+
*/
124+
struct ompi_osc_ucx_shmem_info_s {
125+
int peer; /* rank of the peer this information belongs to */
126+
char *addr; /* base address of the peer's shared memory region */
127+
size_t size; /* size of the peer's shared memory region (presumably in bytes -- TODO confirm) */
128+
};
129+
130+
typedef struct ompi_osc_ucx_shmem_info_s ompi_osc_ucx_shmem_info_t;
131+
119132
typedef struct ompi_osc_ucx_module {
120133
ompi_osc_base_module_t super;
121134
struct ompi_communicator_t *comm;
@@ -128,6 +141,7 @@ typedef struct ompi_osc_ucx_module {
128141
* disp unit size; if disp_unit == -1, then we
129142
* need to look at disp_units */
130143
ptrdiff_t *disp_units;
144+
ompi_osc_ucx_shmem_info_t *shmem_info; /* shared memory info */
131145

132146
ompi_osc_ucx_state_t state; /* remote accessible flags */
133147
ompi_osc_local_dynamic_win_info_t local_dynamic_win_info[OMPI_OSC_UCX_ATTACH_MAX];

ompi/mca/osc/ucx/osc_ucx_component.c

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -676,6 +676,13 @@ static int component_select(struct ompi_win_t *win, void **base, size_t size, pt
676676
module->acc_single_intrinsic = check_config_value_bool ("acc_single_intrinsic", info);
677677
module->skip_sync_check = false;
678678

679+
/**
680+
* TODO: we need to collect the shared memory information from all processes
681+
* on the same node. This includes the size and base address, which need
682+
* to be passed to ucp_rkey_ptr().
683+
*/
684+
module->shmem_info = NULL;
685+
679686
/* share everyone's displacement units. Only do an allgather if
680687
strictly necessary, since it requires O(p) state. */
681688
values[0] = disp_unit;
@@ -844,6 +851,18 @@ static int component_select(struct ompi_win_t *win, void **base, size_t size, pt
844851

845852
module->size = module->sizes[ompi_comm_rank(module->comm)];
846853
*base = (void *)module->shmem_addrs[ompi_comm_rank(module->comm)];
854+
} else {
855+
/* non-shared memory: exchange sizes and addresses so they can be queried for shared memory */
856+
for (i = 0; i < comm_size; i++) {
857+
ompi_proc_t *peer = ompi_comm_peer_lookup(module->comm, i);
858+
peer->
859+
if (ompi_comm_peer_lookup(module->comm, i) == NULL) {
860+
OSC_UCX_ERROR("Failed to lookup peer %d in communicator %s", i, ompi_comm_print_cid(module->comm));
861+
ret = OMPI_ERR_COMM_FAILURE;
862+
goto error;
863+
}
864+
}
865+
847866
}
848867

849868
void **mem_base = base;

0 commit comments

Comments
 (0)