Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

oshmem/shmem: Allocate and exchange base segment address beforehand #13029

Merged
merged 1 commit into from
Jan 23, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
147 changes: 142 additions & 5 deletions oshmem/mca/memheap/base/memheap_base_select.c
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,8 @@
#include "oshmem/mca/sshmem/base/base.h"
#include "ompi/util/timings.h"

#include <sys/mman.h>

mca_memheap_base_config_t mca_memheap_base_config = {
.device_nic_mem_seg_size = 0
};
Expand Down Expand Up @@ -107,6 +109,136 @@ static size_t _memheap_size(void)
return (size_t) memheap_align(oshmem_shmem_info_env.symmetric_heap_size);
}

static void *memheap_mmap_get(void *hint, size_t size)
{
void *addr;

addr = mmap(hint, size,
PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
if (addr == MAP_FAILED) {
return NULL;
}

return addr;
}

static int memheap_exchange_base_address(size_t size, void **address)
{
int nprocs = oshmem_num_procs();
int need_sync = (*address == NULL);
void *base = NULL;
void *ptr = NULL;
int rc, i;
void **bases;

bases = calloc(nprocs, sizeof(*bases));
if (NULL == bases) {
return OSHMEM_ERROR;
}

if (oshmem_my_proc_id() == 0) {
ptr = memheap_mmap_get(NULL, size);
base = ptr;
}

rc = oshmem_shmem_bcast(&base, sizeof(base), 0);
if (OSHMEM_SUCCESS != rc) {
MEMHEAP_ERROR("Failed to exchange allocated vma for base segment "
"(error %d)", rc);
goto out;
}

if (oshmem_my_proc_id() != 0) {
ptr = memheap_mmap_get(base, size);
}

MEMHEAP_VERBOSE(100, "#%d: exchange base address: base %p: %s",
oshmem_my_proc_id(), base,
(base == ptr)? "ok" : "unavailable");

*address = base;
if (need_sync) {
/* They all succeed or fail to allow fallback */
rc = oshmem_shmem_allgather(&ptr, bases, sizeof(ptr));
if (OSHMEM_SUCCESS != rc) {
MEMHEAP_ERROR("Failed to exchange selected vma for base segment "
"(error %d)", rc);
goto out;
}

for (i = 0; i < nprocs; i++) {
if ((NULL == bases[i]) || (bases[i] != base)) {
*address = NULL;
break;
}
}
} else if (ptr != base) {
/* Any failure terminates the rank and others start teardown */
rc = OSHMEM_ERROR;
}

out:
if (((OSHMEM_SUCCESS != rc) || (*address == NULL)) && (ptr != NULL)) {
(void)munmap(ptr, size);
}

free(bases);
return rc;
}


/*
* The returned mca_sshmem_base_start_address value is reserved by using
* mmap() for the expected size.
*/
static int memheap_base_segment_setup(size_t size)
{
int rc;

if ((mca_sshmem_base_start_address == (void *)UINTPTR_MAX) ||
(mca_sshmem_base_start_address == NULL)) {
if (UINTPTR_MAX == 0xFFFFFFFF) {
/**
* if 32 bit we set sshmem_base_start_adress to 0
* to let OS allocate segment automatically
*/
mca_sshmem_base_start_address = NULL;
return OSHMEM_SUCCESS;
}

rc = memheap_exchange_base_address(size, &mca_sshmem_base_start_address);
if (OSHMEM_SUCCESS != rc) {
MEMHEAP_ERROR("Failed to setup base segment address (error %d)", rc);
return rc;
}

if (NULL != mca_sshmem_base_start_address) {
goto done; /* Region is reserved */
}

#if defined(__aarch64__)
mca_sshmem_base_start_address = (void*)0xAB0000000000;
#else
mca_sshmem_base_start_address = (void*)0xFF000000;
#endif
}

if (mca_sshmem_base_start_address != memheap_mmap_get(
mca_sshmem_base_start_address, size)) {
MEMHEAP_ERROR("Failed to create segment address %p/%zu",
mca_sshmem_base_start_address, size);
return OSHMEM_ERROR;
}

done:
if (oshmem_my_proc_id() == 0) {
MEMHEAP_VERBOSE(10, "Using symmetric segment address %p/%zu",
mca_sshmem_base_start_address, size);
}

return OSHMEM_SUCCESS;
}

static memheap_context_t* _memheap_create(void)
{
int rc = OSHMEM_SUCCESS;
Expand All @@ -124,13 +256,18 @@ static memheap_context_t* _memheap_create(void)

OPAL_TIMING_ENV_NEXT(timing, "_memheap_size()");

/* Inititialize symmetric area */
if (OSHMEM_SUCCESS == rc) {
rc = mca_memheap_base_alloc_init(&mca_memheap_base_map,
user_size + MEMHEAP_BASE_PRIVATE_SIZE, 0,
"regular_mem");
/* Locate and reserve symmetric area */
rc = memheap_base_segment_setup(user_size + MEMHEAP_BASE_PRIVATE_SIZE);
if (OSHMEM_SUCCESS != rc) {
MEMHEAP_ERROR("Failed to negotiate base segment addres");
return NULL;
}

/* Initialize symmetric area */
rc = mca_memheap_base_alloc_init(&mca_memheap_base_map,
user_size + MEMHEAP_BASE_PRIVATE_SIZE, 0,
"regular_mem");

OPAL_TIMING_ENV_NEXT(timing, "mca_memheap_base_alloc_init()");

/* Initialize atomic symmetric area */
Expand Down
44 changes: 32 additions & 12 deletions oshmem/mca/memheap/base/memheap_base_static.c
Original file line number Diff line number Diff line change
Expand Up @@ -23,13 +23,17 @@
#include <pthread.h>

static int _check_perms(const char *perm);
static int _check_non_static_segment(const map_segment_t *mem_segs,
int n_segment,
const void *start, const void *end);
static int _check_address(void *start, void **end);
static int _check_pathname(uint64_t inode, const char *pathname);

int mca_memheap_base_static_init(mca_memheap_map_t *map)
{
/* read and parse segments from /proc/self/maps */
int ret = OSHMEM_SUCCESS;
int n_segments = map->n_segments;
uint64_t total_mem = 0;
void* start;
void* end;
Expand All @@ -52,14 +56,6 @@ int mca_memheap_base_static_init(mca_memheap_map_t *map)
return OSHMEM_ERROR;
}

#ifdef __linux__
extern unsigned _end;
if (mca_sshmem_base_start_address < (uintptr_t)&_end) {
MEMHEAP_VERBOSE(1, "sshmem base start address is inside data region"
" (%p < %p)", mca_sshmem_base_start_address, &_end);
}
#endif

while (NULL != fgets(line, sizeof(line), fp)) {
if (3 > sscanf(line,
"%llx-%llx %s %llx %s %llx %s",
Expand All @@ -75,6 +71,12 @@ int mca_memheap_base_static_init(mca_memheap_map_t *map)
goto out;
}

if (OSHMEM_ERROR == _check_non_static_segment(
map->mem_segs, n_segments,
start, end)) {
continue;
}

if (OSHMEM_ERROR == _check_address(start, &end))
continue;

Expand Down Expand Up @@ -136,6 +138,26 @@ static int _check_perms(const char *perms)
return OSHMEM_ERROR;
}

static int _check_non_static_segment(const map_segment_t *mem_segs,
int n_segment,
const void *start, const void *end)
{
int i;

for (i = 0; i < n_segment; i++) {
if ((start <= mem_segs[i].super.va_base) &&
(mem_segs[i].super.va_base < end)) {
MEMHEAP_VERBOSE(100,
"non static segment: %p-%p already exists as %p-%p",
start, end, mem_segs[i].super.va_base,
mem_segs[i].super.va_end);
return OSHMEM_ERROR;
}
}

return OSHMEM_SUCCESS;
}

static int _check_address(void *start, void **end)
{
/* FIXME Linux specific code */
Expand All @@ -146,11 +168,9 @@ static int _check_address(void *start, void **end)
/**
* SGI shmem only supports globals&static in main program.
* It does not support them in shared objects or in dlopen()
* (Clarified on PGAS 2011 tutorial)
* (Clarified on PGAS 2011 tutorial).
*
* So ignored any maps that start higher then process _end
* FIXME: make sure we do not register symmetric heap twice
* if we decide to allow shared objects
* So ignored any maps that start higher then process _end.
*/
if ((uintptr_t)start > data_end) {
MEMHEAP_VERBOSE(100,
Expand Down
12 changes: 1 addition & 11 deletions oshmem/mca/sshmem/base/sshmem_base_open.c
Original file line number Diff line number Diff line change
Expand Up @@ -31,17 +31,7 @@
* globals
*/

/**
* if 32 bit we set sshmem_base_start_adress to 0
* to let OS allocate segment automatically
*/
#if UINTPTR_MAX == 0xFFFFFFFF
void *mca_sshmem_base_start_address = (void*)0;
#elif defined(__aarch64__)
void* mca_sshmem_base_start_address = (void*)0xAB0000000000;
#else
void* mca_sshmem_base_start_address = (void*)0xFF000000;
#endif
void *mca_sshmem_base_start_address = UINTPTR_MAX;

char * mca_sshmem_base_backing_file_dir = NULL;

Expand Down
1 change: 1 addition & 0 deletions oshmem/mca/sshmem/sysv/sshmem_sysv_module.c
Original file line number Diff line number Diff line change
Expand Up @@ -169,6 +169,7 @@ segment_create(map_segment_t *ds_buf,
}

/* Attach to the segment */
(void)munmap(mca_sshmem_base_start_address, size);
addr = shmat(shmid, (void *) mca_sshmem_base_start_address, 0);
if (addr == (void *) -1L) {
opal_show_help("help-oshmem-sshmem.txt",
Expand Down
9 changes: 9 additions & 0 deletions oshmem/runtime/oshmem_shmem_exchange.c
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,15 @@
#include "oshmem/runtime/runtime.h"
#include "oshmem/runtime/params.h"

int oshmem_shmem_bcast(void *buf, int elem_size, int root)
{
int rc;

rc = PMPI_Bcast(buf, elem_size, MPI_BYTE, root, oshmem_comm_world);

return rc;
}

int oshmem_shmem_allgather(void *send_buf, void *rcv_buf, int elem_size)
{
int rc;
Expand Down
5 changes: 5 additions & 0 deletions oshmem/runtime/runtime.h
Original file line number Diff line number Diff line change
Expand Up @@ -127,6 +127,11 @@ int oshmem_shmem_finalize(void);
*/
OSHMEM_DECLSPEC int oshmem_shmem_abort(int errcode);

/**
* Broadcast between all PEs
*/
OSHMEM_DECLSPEC int oshmem_shmem_bcast(void *buf, int elem_size, int root);

/**
* Allgather between all PEs
*/
Expand Down
Loading