diff --git a/ompi/mca/mtl/ofi/Makefile.am b/ompi/mca/mtl/ofi/Makefile.am index 5842abd3018..7baad1b211f 100644 --- a/ompi/mca/mtl/ofi/Makefile.am +++ b/ompi/mca/mtl/ofi/Makefile.am @@ -9,7 +9,7 @@ # and Technology (RIST). All rights reserved. # Copyright (c) 2020 Triad National Security, LLC. All rights # reserved. -# Copyright (c) 2022 Amazon.com, Inc. or its affiliates. All Rights reserved. +# Copyright (c) 2022-2025 Amazon.com, Inc. or its affiliates. All Rights reserved. # Copyright (c) 2025 Jeffrey M. Squyres. All rights reserved. # $COPYRIGHT$ # @@ -48,6 +48,7 @@ mtl_ofi_sources = \ mtl_ofi_component.c \ mtl_ofi_endpoint.h \ mtl_ofi_endpoint.c \ + mtl_ofi_mr.c \ mtl_ofi_request.h \ mtl_ofi_types.h \ mtl_ofi_opt.h \ diff --git a/ompi/mca/mtl/ofi/mtl_ofi.h b/ompi/mca/mtl/ofi/mtl_ofi.h index aae756b0518..323477d74f4 100644 --- a/ompi/mca/mtl/ofi/mtl_ofi.h +++ b/ompi/mca/mtl/ofi/mtl_ofi.h @@ -4,8 +4,7 @@ * reserved. * Copyright (c) 2019-2024 Triad National Security, LLC. All rights * reserved. - * Copyright (c) 2018-2023 Amazon.com, Inc. or its affiliates. All Rights reserved. - * reserved. + * Copyright (c) 2018-2025 Amazon.com, Inc. or its affiliates. All Rights reserved. * Copyright (c) 2021 Cisco Systems, Inc. All rights reserved * Copyright (c) 2021 The University of Tennessee and The University * of Tennessee Research Foundation. 
All rights
@@ -73,6 +72,8 @@ extern int ompi_mtl_ofi_del_comm(struct mca_mtl_base_module_t *mtl,
 
 int ompi_mtl_ofi_progress_no_inline(void);
 
+int ompi_mtl_ofi_rcache_init(void);
+
 #if OPAL_HAVE_THREAD_LOCAL
 extern opal_thread_local int ompi_mtl_ofi_per_thread_ctx;
 #endif
@@ -291,78 +292,53 @@ ompi_mtl_ofi_set_mr_null(ompi_mtl_ofi_request_t *ofi_req) {
 static
 int ompi_mtl_ofi_register_buffer(struct opal_convertor_t *convertor,
                                  ompi_mtl_ofi_request_t *ofi_req,
-                                 void* buffer) {
+                                 void* buffer)
+{
+    int ret;
+    uint32_t cache_flags = 0;
+
     ofi_req->mr = NULL;
 
     if (ofi_req->length <= 0 || NULL == buffer) {
         return OMPI_SUCCESS;
     }
 
-#if OPAL_OFI_HAVE_FI_MR_IFACE
-
-    if ((convertor->flags & CONVERTOR_ACCELERATOR) && ompi_mtl_ofi.hmem_needs_reg) {
-        /* Register buffer */
-        int ret;
-        struct fi_mr_attr attr = {0};
-        struct iovec iov = {0};
-
-        iov.iov_base = buffer;
-        iov.iov_len = ofi_req->length;
-        attr.mr_iov = &iov;
-        attr.iov_count = 1;
-        attr.access = FI_SEND | FI_RECV;
-        attr.offset = 0;
-        attr.context = NULL;
-        if (false == ompi_mtl_base_selected_component->accelerator_support) {
-            goto reg;
-        } else if (0 == strcmp(opal_accelerator_base_selected_component.base_version.mca_component_name, "cuda")) {
-            attr.iface = FI_HMEM_CUDA;
-            opal_accelerator.get_device(&attr.device.cuda);
-#if OPAL_OFI_HAVE_FI_HMEM_ROCR
-        } else if (0 == strcmp(opal_accelerator_base_selected_component.base_version.mca_component_name, "rocm")) {
-            attr.iface = FI_HMEM_ROCR;
-            opal_accelerator.get_device(&attr.device.cuda);
-#endif
-#if OPAL_OFI_HAVE_FI_HMEM_ZE
-        } else if (0 == strcmp(opal_accelerator_base_selected_component.base_version.mca_component_name, "ze")) {
-            attr.iface = FI_HMEM_ZE;
-            opal_accelerator.get_device(&attr.device.ze);
-#endif
-        } else {
-            return OPAL_ERROR;
-        }
-reg:
-        ret = fi_mr_regattr(ompi_mtl_ofi.domain, &attr, 0, &ofi_req->mr);
-
-        if (ret) {
-            opal_show_help("help-mtl-ofi.txt", "Buffer Memory Registration Failed", true,
-                           opal_accelerator_base_selected_component.base_version.mca_component_name,
-                           buffer, ofi_req->length,
-                           fi_strerror(-ret), ret);
-            ofi_req->mr = NULL;
-            return OMPI_ERROR;
-        }
+    if (! ((convertor->flags & CONVERTOR_ACCELERATOR) && ompi_mtl_ofi.hmem_needs_reg)) {
+        return OMPI_SUCCESS;
     }
-#endif
-
-    return OMPI_SUCCESS;
+
+    /* The registration cache exists only if ompi_mtl_ofi_rcache_init()
+     * succeeded; fail this request rather than dereference a NULL module.
+     */
+    if (NULL == ompi_mtl_ofi.rcache) {
+        return OMPI_ERROR;
+    }
+
+    /* note - the cache access flags are a little broken, because rcache doesn't
+     * understand send/recv requirements. Since this rcache is only used in the
+     * MTL, that isn't a problem and we fix it in the underlying register call.
+     */
+    ret = ompi_mtl_ofi.rcache->rcache_register(ompi_mtl_ofi.rcache, buffer, ofi_req->length,
+                                               cache_flags, MCA_RCACHE_ACCESS_ANY,
+                                               (mca_rcache_base_registration_t **) &ofi_req->mr);
+    if (OMPI_SUCCESS != ret) {
+        /* make sure the deregister path never sees a stale handle */
+        ofi_req->mr = NULL;
+    }
+
+    return ret;
 }
 
 /** Deregister buffer */
 __opal_attribute_always_inline__ static inline int
 ompi_mtl_ofi_deregister_buffer(ompi_mtl_ofi_request_t *ofi_req) {
     if (ofi_req->mr) {
-        int ret;
-        ret = fi_close(&ofi_req->mr->fid);
-        if (ret) {
-            opal_show_help("help-mtl-ofi.txt", "OFI call fail", true,
-                           "fi_close",
-                           ompi_process_info.nodename, __FILE__, __LINE__,
-                           fi_strerror(-ret), ofi_req->mr->fid);
-            return OMPI_ERROR;
-        }
+        (void)ompi_mtl_ofi.rcache->rcache_deregister(ompi_mtl_ofi.rcache, &ofi_req->mr->base);
+        /* forget the handle so the same registration cannot be released
+         * twice through this request */
         ofi_req->mr = NULL;
     }
+
     return OMPI_SUCCESS;
 }
 
diff --git a/ompi/mca/mtl/ofi/mtl_ofi_component.c b/ompi/mca/mtl/ofi/mtl_ofi_component.c
index 256dd483fc0..049ff4cf8c8 100644
--- a/ompi/mca/mtl/ofi/mtl_ofi_component.c
+++ b/ompi/mca/mtl/ofi/mtl_ofi_component.c
@@ -5,7 +5,7 @@
  * Copyright (c) 2014-2021 Cisco Systems, Inc. All rights reserved
  * Copyright (c) 2015-2016 Los Alamos National Security, LLC. All rights
  * reserved.
- * Copyright (c) 2018-2022 Amazon.com, Inc. or its affiliates. All Rights reserved.
+ * Copyright (c) 2018-2025 Amazon.com, Inc. or its affiliates. All Rights reserved.
  * Copyright (c) 2020-2023 Triad National Security, LLC. All rights
  * reserved.
* $COPYRIGHT$ @@ -823,20 +823,17 @@ ompi_mtl_ofi_component_init(bool enable_progress_threads, } } else { *accelerator_support = true; - ompi_mtl_ofi.hmem_needs_reg = true; - /* - * Workaround for the fact that the CXI provider actually doesn't need for accelerator memory to be registered - * for local buffers, but if one does do so using fi_mr_regattr, one actually needs to manage the - * requested_key field in the fi_mr_attr attr argument, and the OFI MTL doesn't track which requested_keys - * have already been registered. So just set a flag to disable local registration. Note the OFI BTL doesn't - * have a problem here since it uses fi_mr_regattr only within the context of an rcache, and manages the - * requested_key field in this way. - */ - if ((NULL != strstr(prov->fabric_attr->prov_name, "cxi")) || - (NULL != strstr(prov->fabric_attr->prov_name, "CXI")) ) { - ompi_mtl_ofi.hmem_needs_reg = false; - } + /* Only explicitly register domain buffers if the provider requires it. + For example, CXI does not require it but EFA does require it. 
*/
+            if ((prov->domain_attr->mr_mode & FI_MR_HMEM) != 0) {
+                ompi_mtl_ofi.hmem_needs_reg = true;
+                opal_output_verbose(50, opal_common_ofi.output,
+                                    "Support for device buffers enabled with explicit registration");
+            } else {
+                opal_output_verbose(50, opal_common_ofi.output,
+                                    "Support for device buffers enabled with implicit registration");
+            }
         }
     }
 #else
     opal_output_verbose(50, opal_common_ofi.output,
@@ -844,6 +841,12 @@ ompi_mtl_ofi_component_init(bool enable_progress_threads,
                         __FILE__, __LINE__);
 #endif
 
+    if (ompi_mtl_ofi.hmem_needs_reg) {
+        /* NOTE(review): the result is ignored; if this fails the cache
+         * pointer stays NULL while hmem_needs_reg remains true */
+        ompi_mtl_ofi_rcache_init();
+    }
+
     /**
      * Select the format of the OFI tag
      */
@@ -1177,6 +1180,11 @@ ompi_mtl_ofi_finalize(struct mca_mtl_base_module_t *mtl)
 {
     ssize_t ret;
 
+    if (NULL != ompi_mtl_ofi.rcache) {
+        mca_rcache_base_module_destroy(ompi_mtl_ofi.rcache);
+        ompi_mtl_ofi.rcache = NULL;
+    }
+
     opal_progress_unregister(ompi_mtl_ofi_progress_no_inline);
 
     /* Close all the OFI objects */
diff --git a/ompi/mca/mtl/ofi/mtl_ofi_mr.c b/ompi/mca/mtl/ofi/mtl_ofi_mr.c
new file mode 100644
index 00000000000..2f39a98ba23
--- /dev/null
+++ b/ompi/mca/mtl/ofi/mtl_ofi_mr.c
@@ -0,0 +1,169 @@
+/*
+ * Copyright (c) 2025 Amazon.com, Inc. or its affiliates. All Rights reserved.
+ *
+ * $COPYRIGHT$
+ *
+ * Additional copyrights may follow
+ *
+ * $HEADER$
+ */
+
+#include "opal_config.h"
+
+#include "mtl_ofi.h"
+
+/**
+ * Registration callback handed to the rcache: registers the range
+ * [base, base + size) with the OFI domain for send and receive.
+ *
+ * @param reg_data  callback context (not used by this function)
+ * @param base      start of the buffer to register
+ * @param size      length of the buffer in bytes
+ * @param reg       registration to fill in, treated as an ompi_mtl_ofi_reg_t
+ *
+ * @return OPAL_SUCCESS on success; the negative result of check_addr() if
+ *         the address lookup fails; OPAL_ERROR if the selected accelerator
+ *         component is not handled below; OPAL_ERR_OUT_OF_RESOURCE if
+ *         fi_mr_regattr() fails.
+ */
+static int
+ompi_mtl_ofi_reg_mem(void *reg_data, void *base, size_t size,
+                     mca_rcache_base_registration_t *reg)
+{
+    int ret;
+    struct fi_mr_attr attr = {0};
+    struct iovec iov = {0};
+    ompi_mtl_ofi_reg_t *mtl_reg = (ompi_mtl_ofi_reg_t *)reg;
+
+    iov.iov_base = base;
+    iov.iov_len = size;
+    attr.mr_iov = &iov;
+    attr.iov_count = 1;
+    attr.access = FI_SEND | FI_RECV;
+    attr.offset = 0;
+    attr.context = NULL;
+
+#if OPAL_OFI_HAVE_FI_MR_IFACE
+    if (OPAL_LIKELY(NULL != base)) {
+        /* only the lookup below needs these; keeping them in this block
+         * avoids unused variables when FI_MR_IFACE is not available */
+        int dev_id;
+        uint64_t flags;
+
+        ret = opal_accelerator.check_addr(base, &dev_id, &flags);
+        if (ret < 0) {
+            return ret;
+        } else if (ret > 0 ) {
+            /* NOTE(review): the device is taken from get_device(), not from
+             * the dev_id reported by check_addr() - confirm that is intended */
+            if (0 == strcmp(opal_accelerator_base_selected_component.base_version.mca_component_name, "cuda")) {
+                attr.iface = FI_HMEM_CUDA;
+                opal_accelerator.get_device(&attr.device.cuda);
+#if OPAL_OFI_HAVE_FI_HMEM_ROCR
+            } else if (0 == strcmp(opal_accelerator_base_selected_component.base_version.mca_component_name, "rocm")) {
+                attr.iface = FI_HMEM_ROCR;
+                opal_accelerator.get_device(&attr.device.cuda);
+#endif
+#if OPAL_OFI_HAVE_FI_HMEM_ZE
+            } else if (0 == strcmp(opal_accelerator_base_selected_component.base_version.mca_component_name, "ze")) {
+                attr.iface = FI_HMEM_ZE;
+                opal_accelerator.get_device(&attr.device.ze);
+#endif
+            } else {
+                return OPAL_ERROR;
+            }
+        }
+    }
+#endif
+
+    ret = fi_mr_regattr(ompi_mtl_ofi.domain, &attr, 0, &mtl_reg->ofi_mr);
+    if (0 != ret) {
+        opal_show_help("help-mtl-ofi.txt", "Buffer Memory Registration Failed", true,
+                       opal_accelerator_base_selected_component.base_version.mca_component_name,
+                       base, size, fi_strerror(-ret), ret);
+        mtl_reg->ofi_mr = NULL;
+        mtl_reg->mem_desc = NULL;
+        return OPAL_ERR_OUT_OF_RESOURCE;
+    }
+
+    mtl_reg->mem_desc = fi_mr_desc(mtl_reg->ofi_mr);
+
+    return OPAL_SUCCESS;
+}
+
+
+/**
+ * Deregistration callback handed to the rcache: closes the OFI memory
+ * region held by the registration, if there is one.
+ *
+ * @param reg_data  callback context (not used by this function)
+ * @param reg       registration to release, treated as an ompi_mtl_ofi_reg_t
+ *
+ * @return OPAL_SUCCESS, or OPAL_ERROR if fi_close() fails.
+ */
+static int
+ompi_mtl_ofi_dereg_mem(void *reg_data, mca_rcache_base_registration_t *reg)
+{
+    ompi_mtl_ofi_reg_t *mtl_reg = (ompi_mtl_ofi_reg_t *)reg;
+    int ret;
+
+    if (mtl_reg->ofi_mr != NULL) {
+        ret = fi_close(&mtl_reg->ofi_mr->fid);
+        if (0 != ret) {
+            opal_output_verbose(1, opal_common_ofi.output,
+                                "%s: error unpinning memory mr=%p: %s",
+                                __func__, (void *)mtl_reg->ofi_mr,
+                                fi_strerror(-ret));
+            return OPAL_ERROR;
+        }
+
+        /* the region is closed; do not keep pointers into it */
+        mtl_reg->ofi_mr = NULL;
+        mtl_reg->mem_desc = NULL;
+    }
+
+    return OPAL_SUCCESS;
+}
+
+
+/**
+ * Create the MTL registration cache (ompi_mtl_ofi.rcache).  Does nothing
+ * if the cache already exists.
+ *
+ * @return OMPI_SUCCESS if the cache exists on return;
+ *         OMPI_ERR_OUT_OF_RESOURCE if the cache name cannot be allocated;
+ *         OMPI_ERROR if the rcache module cannot be created.
+ */
+int
+ompi_mtl_ofi_rcache_init(void)
+{
+    mca_rcache_base_resources_t rcache_resources = {0};
+    char *tmp;
+
+    if (NULL != ompi_mtl_ofi.rcache) {
+        return OMPI_SUCCESS;
+    }
+
+    tmp = strdup("mtl-ofi");
+    if (NULL == tmp) {
+        return OMPI_ERR_OUT_OF_RESOURCE;
+    }
+
+    rcache_resources.cache_name = tmp;
+    rcache_resources.reg_data = NULL;
+    rcache_resources.sizeof_reg = sizeof(ompi_mtl_ofi_reg_t);
+    rcache_resources.register_mem = ompi_mtl_ofi_reg_mem;
+    rcache_resources.deregister_mem = ompi_mtl_ofi_dereg_mem;
+
+    ompi_mtl_ofi.rcache = mca_rcache_base_module_create("grdma", &ompi_mtl_ofi, &rcache_resources);
+    free(tmp);
+
+    if (NULL == ompi_mtl_ofi.rcache) {
+        /* something went horribly wrong */
+        opal_output_verbose(1, opal_common_ofi.output,
+                            "creating rcache failed");
+        return OMPI_ERROR;
+    }
+
+    return OMPI_SUCCESS;
+}
diff --git a/ompi/mca/mtl/ofi/mtl_ofi_request.h b/ompi/mca/mtl/ofi/mtl_ofi_request.h
index cb746f341db..74355f7bca0 100644
--- a/ompi/mca/mtl/ofi/mtl_ofi_request.h
+++ b/ompi/mca/mtl/ofi/mtl_ofi_request.h
@@ -2,6 +2,7 @@
  * Copyright (c) 2013-2016 Intel, Inc. All rights reserved
  * Copyright (c) 2017 Los Alamos National Security, LLC. All rights
  * reserved.
+ * Copyright (c) 2025 Amazon.com, Inc. or its affiliates. All Rights reserved.
* * $COPYRIGHT$ * @@ -25,6 +26,7 @@ typedef enum { OMPI_MTL_OFI_PROBE } ompi_mtl_ofi_request_type_t; +struct ompi_mtl_ofi_reg_t; struct ompi_mtl_ofi_request_t; struct ompi_mtl_ofi_request_t { @@ -89,8 +91,9 @@ struct ompi_mtl_ofi_request_t { struct mca_mtl_request_t *mrecv_req; /** Stores reference to memory region from registration */ - /* Set to NULL if memory not registered or if non accelerator buffer */ - struct fid_mr *mr; + + /* Set to NULL if memory not registered */ + struct ompi_mtl_ofi_reg_t *mr; }; typedef struct ompi_mtl_ofi_request_t ompi_mtl_ofi_request_t; diff --git a/ompi/mca/mtl/ofi/mtl_ofi_types.h b/ompi/mca/mtl/ofi/mtl_ofi_types.h index a925f0ec28e..b89cc2c274e 100644 --- a/ompi/mca/mtl/ofi/mtl_ofi_types.h +++ b/ompi/mca/mtl/ofi/mtl_ofi_types.h @@ -4,6 +4,7 @@ * Copyright (c) 2015 Cisco Systems, Inc. All rights reserved. * Copyright (c) 2022-2023 Triad National Security, LLC. All rights * reserved. + * Copyright (c) 2025 Amazon.com, Inc. or its affiliates. All Rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -16,6 +17,9 @@ #include "mtl_ofi.h" +#include "opal/mca/rcache/base/base.h" + + BEGIN_C_DECLS /** @@ -102,6 +106,8 @@ typedef struct mca_mtl_ofi_module_t { bool has_posted_initial_buffer; bool hmem_needs_reg; + /** registration cache */ + mca_rcache_base_module_t *rcache; } mca_mtl_ofi_module_t; extern mca_mtl_ofi_module_t ompi_mtl_ofi; @@ -116,6 +122,14 @@ typedef enum { OFI_SCALABLE_EP, } mca_mtl_ofi_ep_type; +struct ompi_mtl_ofi_reg_t { + mca_rcache_base_registration_t base; /* kept first: the MTL casts between this type and mca_rcache_base_registration_t */ + struct fid_mr *ofi_mr; /* memory region filled in by fi_mr_regattr(); set to NULL when that call fails */ + void *mem_desc; /* value of fi_mr_desc() for ofi_mr */ +}; +typedef struct ompi_mtl_ofi_reg_t ompi_mtl_ofi_reg_t; + + /* * Define upper limit for number of events read from a CQ. * Setting this to 100 as this was deemed optimal from empirical data. 
diff --git a/opal/mca/btl/ofi/btl_ofi.h b/opal/mca/btl/ofi/btl_ofi.h index 0019065ecfe..e12d490b390 100644 --- a/opal/mca/btl/ofi/btl_ofi.h +++ b/opal/mca/btl/ofi/btl_ofi.h @@ -13,8 +13,7 @@ * Copyright (c) 2015-2018 Los Alamos National Security, LLC. All rights * reserved. * Copyright (c) 2018-2019 Intel, Inc. All rights reserved. - * Copyright (c) 2020 Amazon.com, Inc. or its affiliates. - * All Rights reserved. + * Copyright (c) 2020-2025 Amazon.com, Inc. or its affiliates. All Rights reserved. * Copyright (c) 2022 Triad National Security, LLC. All rights * reserved. * $COPYRIGHT$ @@ -140,9 +139,6 @@ struct mca_btl_ofi_module_t { /** registration cache */ mca_rcache_base_module_t *rcache; - /* If the underlying OFI provider has its own cache, we want to bypass - * rcache registration */ - bool bypass_cache; }; typedef struct mca_btl_ofi_module_t mca_btl_ofi_module_t; diff --git a/opal/mca/btl/ofi/btl_ofi_component.c b/opal/mca/btl/ofi/btl_ofi_component.c index 6785dcc74a0..bafa29f6c9d 100644 --- a/opal/mca/btl/ofi/btl_ofi_component.c +++ b/opal/mca/btl/ofi/btl_ofi_component.c @@ -14,7 +14,7 @@ * reserved. * Copyright (c) 2018-2019 Intel, Inc. All rights reserved. * - * Copyright (c) 2018-2021 Amazon.com, Inc. or its affiliates. All Rights reserved. + * Copyright (c) 2018-2025 Amazon.com, Inc. or its affiliates. All Rights reserved. * Copyright (c) 2020-2023 Triad National Security, LLC. All rights * reserved. * $COPYRIGHT$ @@ -657,7 +657,6 @@ static int mca_btl_ofi_init_device(struct fi_info *info) module->outstanding_rdma = 0; module->use_virt_addr = false; module->use_fi_mr_bind = false; - module->bypass_cache = false; #if defined(FI_HMEM) if (ofi_info->caps & FI_HMEM) { @@ -674,13 +673,6 @@ static int mca_btl_ofi_init_device(struct fi_info *info) module->use_fi_mr_bind = true; } - /* Currently there is no API to query whether the libfabric provider - * uses an underlying registration cache. 
For now, just check for known - * providers that use registration caching. */ - if (!strncasecmp(info->fabric_attr->prov_name, "efa", 3)) { - module->bypass_cache = true; - } - /* create endpoint list */ OBJ_CONSTRUCT(&module->endpoints, opal_list_t); OBJ_CONSTRUCT(&module->module_lock, opal_mutex_t); diff --git a/opal/mca/btl/ofi/btl_ofi_module.c b/opal/mca/btl/ofi/btl_ofi_module.c index e213d5b1865..58844bee018 100644 --- a/opal/mca/btl/ofi/btl_ofi_module.c +++ b/opal/mca/btl/ofi/btl_ofi_module.c @@ -14,7 +14,7 @@ * reserved. * Copyright (c) 2018 Intel, Inc, All rights reserved * - * Copyright (c) 2018 Amazon.com, Inc. or its affiliates. All Rights reserved. + * Copyright (c) 2018-2025 Amazon.com, Inc. or its affiliates. All Rights reserved. * Copyright (c) 2020 Google, LLC. All rights reserved. * Copyright (c) 2022-2024 Triad National Security, LLC. All rights * reserved. @@ -148,6 +148,20 @@ void mca_btl_ofi_rcache_init(mca_btl_ofi_module_t *module) if (!module->initialized) { mca_rcache_base_resources_t rcache_resources; char *tmp; + int ret; + + /* this must be called during the single-threaded part of the code and + * before Libfabric configures its memory monitors. Easiest to do + * that before domain open. Silently ignore not-supported errors, + * as they are not critical to program correctness, but only + * indicate that Libfabric will have to pick a different, possibly + * less optimal, monitor. 
*/ + ret = opal_common_ofi_export_memory_monitor(); + if (0 != ret && -FI_ENOSYS != ret) { + opal_output_verbose(1, opal_common_ofi.output, + "Failed to inject Libfabric memory monitor: %s", + fi_strerror(-ret)); + } (void) opal_asprintf(&tmp, "ofi.%s", module->linux_device_name); @@ -198,9 +212,6 @@ mca_btl_ofi_register_mem(struct mca_btl_base_module_t *btl, int access_flags = flags & MCA_BTL_REG_FLAG_ACCESS_ANY; int rc; uint32_t cache_flags = 0; - if (ofi_module->bypass_cache) { - cache_flags |= MCA_RCACHE_FLAGS_CACHE_BYPASS; - } rc = ofi_module->rcache->rcache_register(ofi_module->rcache, base, size, cache_flags, access_flags, (mca_rcache_base_registration_t **) ®);