Skip to content

Commit

Permalink
prov/efa: Make efa_hmem_info a global variable
Browse files Browse the repository at this point in the history
Currently efa_hmem_info is part of efa_domain and is created for every
efa domain. hmem_info initialization involves several operations, such as
device memory allocation/free and a trial ibv_reg_mr, which are expensive
and can potentially cause more memory usage. Make efa_hmem_info
a global variable and initialize it only once per process.
Remove p2p_disabled_by_user and p2p_required_by_impl from
efa_hmem_info since they are only used for ep level operations.

Signed-off-by: Jessie Yang <[email protected]>
  • Loading branch information
jiaxiyan authored and shijin-aws committed Oct 30, 2024
1 parent f16e8be commit 5ae97ef
Show file tree
Hide file tree
Showing 14 changed files with 96 additions and 140 deletions.
7 changes: 0 additions & 7 deletions prov/efa/src/efa_domain.c
Original file line number Diff line number Diff line change
Expand Up @@ -297,13 +297,6 @@ int efa_domain_open(struct fid_fabric *fabric_fid, struct fi_info *info,
goto err_free;
}

err = efa_domain_hmem_info_init_all(efa_domain);
if (err) {
ret = err;
EFA_WARN(FI_LOG_DOMAIN, "Failed to check hmem support status. err: %d\n", ret);
goto err_free;
}

dlist_insert_tail(&efa_domain->list_entry, &g_efa_domain_list);
return 0;

Expand Down
1 change: 0 additions & 1 deletion prov/efa/src/efa_domain.h
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,6 @@ struct efa_domain {
struct ofi_mr_cache *cache;
struct efa_qp **qp_table;
size_t qp_table_sz_m1;
struct efa_hmem_info hmem_info[OFI_HMEM_MAX];
size_t mtu_size;
size_t addrlen;
bool mr_local;
Expand Down
70 changes: 26 additions & 44 deletions prov/efa/src/efa_hmem.c
Original file line number Diff line number Diff line change
Expand Up @@ -5,16 +5,18 @@
#include "efa_hmem.h"
#include "rdm/efa_rdm_pkt_type.h"

struct efa_hmem_info g_efa_hmem_info[OFI_HMEM_MAX];

#if HAVE_CUDA || HAVE_NEURON
static size_t efa_max_eager_msg_size_with_largest_header(struct efa_domain *efa_domain) {
static size_t efa_max_eager_msg_size_with_largest_header() {
int mtu_size;

mtu_size = efa_domain->device->rdm_info->ep_attr->max_msg_size;
mtu_size = g_device_list[0].rdm_info->ep_attr->max_msg_size;

return mtu_size - efa_rdm_pkt_type_get_max_hdr_size();
}
#else
static size_t efa_max_eager_msg_size_with_largest_header(struct efa_domain *efa_domain) {
static size_t efa_max_eager_msg_size_with_largest_header() {
return 0;
}
#endif
Expand All @@ -23,14 +25,13 @@ static size_t efa_max_eager_msg_size_with_largest_header(struct efa_domain *efa_
* @brief Initialize the various protocol thresholds tracked in efa_hmem_info
* according to the given FI_HMEM interface.
*
* @param[in,out] efa_domain Pointer to struct efa_domain
* @param[in] iface The FI_HMEM interface to initialize
*
* @return 0
*/
static int efa_domain_hmem_info_init_protocol_thresholds(struct efa_domain *efa_domain, enum fi_hmem_iface iface)
static int efa_domain_hmem_info_init_protocol_thresholds(enum fi_hmem_iface iface)
{
struct efa_hmem_info *info = &efa_domain->hmem_info[iface];
struct efa_hmem_info *info = &g_efa_hmem_info[iface];
size_t tmp_value;

/* Fall back to FI_HMEM_SYSTEM initialization logic when p2p is
Expand All @@ -53,8 +54,8 @@ static int efa_domain_hmem_info_init_protocol_thresholds(struct efa_domain *efa_
case FI_HMEM_CUDA:
info->runt_size = EFA_DEFAULT_RUNT_SIZE;
info->max_medium_msg_size = 0;
info->min_read_msg_size = efa_max_eager_msg_size_with_largest_header(efa_domain) + 1;
info->min_read_write_size = efa_max_eager_msg_size_with_largest_header(efa_domain) + 1;
info->min_read_msg_size = efa_max_eager_msg_size_with_largest_header() + 1;
info->min_read_write_size = efa_max_eager_msg_size_with_largest_header() + 1;
fi_param_get_size_t(&efa_prov, "runt_size", &info->runt_size);
fi_param_get_size_t(&efa_prov, "inter_min_read_message_size", &info->min_read_msg_size);
fi_param_get_size_t(&efa_prov, "inter_min_read_write_size", &info->min_read_write_size);
Expand All @@ -68,8 +69,8 @@ static int efa_domain_hmem_info_init_protocol_thresholds(struct efa_domain *efa_
case FI_HMEM_NEURON:
info->runt_size = EFA_NEURON_RUNT_SIZE;
info->max_medium_msg_size = 0;
info->min_read_msg_size = efa_max_eager_msg_size_with_largest_header(efa_domain) + 1;
info->min_read_write_size = efa_max_eager_msg_size_with_largest_header(efa_domain) + 1;
info->min_read_msg_size = efa_max_eager_msg_size_with_largest_header() + 1;
info->min_read_write_size = efa_max_eager_msg_size_with_largest_header() + 1;
fi_param_get_size_t(&efa_prov, "runt_size", &info->runt_size);
fi_param_get_size_t(&efa_prov, "inter_min_read_message_size", &info->min_read_msg_size);
fi_param_get_size_t(&efa_prov, "inter_min_read_write_size", &info->min_read_write_size);
Expand Down Expand Up @@ -105,7 +106,7 @@ static int efa_domain_hmem_info_init_protocol_thresholds(struct efa_domain *efa_
return 0;
}

static inline void efa_domain_hmem_info_check_p2p_support_cuda(struct efa_hmem_info *info) {
static inline void efa_hmem_info_check_p2p_support_cuda(struct efa_hmem_info *info) {
#if HAVE_CUDA
cudaError_t cuda_ret;
void *ptr = NULL;
Expand Down Expand Up @@ -168,7 +169,7 @@ static inline void efa_domain_hmem_info_check_p2p_support_cuda(struct efa_hmem_i
return;
}

static inline void efa_domain_hmem_info_check_p2p_support_neuron(struct efa_hmem_info *info) {
static inline void efa_hmem_info_check_p2p_support_neuron(struct efa_hmem_info *info) {
#if HAVE_NEURON
struct ibv_mr *ibv_mr = NULL;
int ibv_access = IBV_ACCESS_LOCAL_WRITE;
Expand Down Expand Up @@ -239,13 +240,12 @@ static inline void efa_domain_hmem_info_check_p2p_support_neuron(struct efa_hmem
/**
* @brief Initialize the efa_hmem_info state for iface
*
* @param[in,out] efa_domain Pointer to struct efa_domain
* @param[in] iface HMEM interface
*/
static void
efa_domain_hmem_info_init_iface(struct efa_domain *efa_domain, enum fi_hmem_iface iface)
efa_hmem_info_init_iface(enum fi_hmem_iface iface)
{
struct efa_hmem_info *info = &efa_domain->hmem_info[iface];
struct efa_hmem_info *info = &g_efa_hmem_info[iface];

if (!ofi_hmem_is_initialized(iface)) {
EFA_INFO(FI_LOG_DOMAIN, "%s is not initialized\n",
Expand All @@ -262,41 +262,27 @@ efa_domain_hmem_info_init_iface(struct efa_domain *efa_domain, enum fi_hmem_ifac
}

info->initialized = true;
info->p2p_disabled_by_user = (iface == FI_HMEM_SYSTEM) ? false : ofi_hmem_p2p_disabled();

if (iface == FI_HMEM_SYNAPSEAI || iface == FI_HMEM_SYSTEM) {
info->p2p_supported_by_device = true;
} else if (info->p2p_disabled_by_user) {
} else if (ofi_hmem_p2p_disabled()) {
info->p2p_supported_by_device = false;
} else {
if (iface == FI_HMEM_CUDA)
efa_domain_hmem_info_check_p2p_support_cuda(info);
efa_hmem_info_check_p2p_support_cuda(info);
if (iface == FI_HMEM_NEURON)
efa_domain_hmem_info_check_p2p_support_neuron(info);
efa_hmem_info_check_p2p_support_neuron(info);
if (!info->p2p_supported_by_device)
EFA_INFO(FI_LOG_DOMAIN, "%s P2P support is not available.\n", fi_tostr(&iface, FI_TYPE_HMEM_IFACE));
}

info->p2p_required_by_impl = true;
/* If user is using libfabric API 1.18 or later, by default EFA
* provider is permitted to use CUDA library to support CUDA
* memory, therefore p2p is not required.
*/
if (iface == FI_HMEM_CUDA &&
FI_VERSION_GE(efa_domain->util_domain.fabric->fabric_fid.api_version, FI_VERSION(1, 18)))
info->p2p_required_by_impl = !hmem_ops[iface].initialized;
if (iface == FI_HMEM_SYSTEM)
info->p2p_required_by_impl = false;

efa_domain_hmem_info_init_protocol_thresholds(efa_domain, iface);
efa_domain_hmem_info_init_protocol_thresholds(iface);
}

/**
* @brief Validate an FI_OPT_FI_HMEM_P2P (FI_OPT_ENDPOINT) option for a
* specified HMEM interface.
* This function does not record the option; p2p_disabled_by_user was removed from efa_hmem_info.
*
* @param[in,out] domain The efa_domain struct which contains an efa_hmem_info array
* @param[in] iface The fi_hmem_iface enum of the FI_HMEM interface to validate
* @param[in] p2p_opt The P2P option to validate
*
Expand All @@ -305,9 +291,9 @@ efa_domain_hmem_info_init_iface(struct efa_domain *efa_domain, enum fi_hmem_ifac
* -FI_ENODATA if the given HMEM interface was not initialized
* -FI_EINVAL if p2p_opt is not a valid FI_OPT_FI_HMEM_P2P option
*/
int efa_domain_hmem_validate_p2p_opt(struct efa_domain *efa_domain, enum fi_hmem_iface iface, int p2p_opt)
int efa_hmem_validate_p2p_opt(enum fi_hmem_iface iface, int p2p_opt, uint32_t api_version)
{
struct efa_hmem_info *info = &efa_domain->hmem_info[iface];
struct efa_hmem_info *info = &g_efa_hmem_info[iface];

if (OFI_UNLIKELY(!info->initialized))
return -FI_ENODATA;
Expand All @@ -317,7 +303,6 @@ int efa_domain_hmem_validate_p2p_opt(struct efa_domain *efa_domain, enum fi_hmem
if (OFI_UNLIKELY(ofi_hmem_p2p_disabled()) || !info->p2p_supported_by_device)
return -FI_EOPNOTSUPP;

info->p2p_disabled_by_user = false;
return 0;
/*
* According to fi_setopt() document:
Expand All @@ -334,14 +319,13 @@ int efa_domain_hmem_validate_p2p_opt(struct efa_domain *efa_domain, enum fi_hmem
if (OFI_UNLIKELY(ofi_hmem_p2p_disabled()))
return -FI_EOPNOTSUPP;

info->p2p_disabled_by_user = false;
return 0;

case FI_HMEM_P2P_DISABLED:
if (info->p2p_required_by_impl)
/* return -FI_EOPNOTSUPP if p2p is required by implementation */
if (iface != FI_HMEM_CUDA || FI_VERSION_LT(api_version, FI_VERSION(1, 18)))
return -FI_EOPNOTSUPP;

info->p2p_disabled_by_user = true;
return 0;
}

Expand All @@ -354,23 +338,21 @@ int efa_domain_hmem_validate_p2p_opt(struct efa_domain *efa_domain, enum fi_hmem
* struct will be used to determine which efa transfer
* protocol should be selected.
*
* @param[in,out] efa_domain Pointer to struct efa_domain to be initialized
*
* @return 0 on success
* negative libfabric error code on an unexpected error
*/
int efa_domain_hmem_info_init_all(struct efa_domain *efa_domain)
int efa_hmem_info_initialize()
{
int ret = 0, i = 0;

if(g_device_cnt <= 0) {
return -FI_ENODEV;
}

memset(efa_domain->hmem_info, 0, OFI_HMEM_MAX * sizeof(struct efa_hmem_info));
memset(g_efa_hmem_info, 0, OFI_HMEM_MAX * sizeof(struct efa_hmem_info));

EFA_HMEM_IFACE_FOREACH(i) {
efa_domain_hmem_info_init_iface(efa_domain, efa_hmem_ifaces[i]);
efa_hmem_info_init_iface(efa_hmem_ifaces[i]);
}

return ret;
Expand Down
8 changes: 4 additions & 4 deletions prov/efa/src/efa_hmem.h
Original file line number Diff line number Diff line change
Expand Up @@ -23,8 +23,6 @@ static const enum fi_hmem_iface efa_hmem_ifaces[] = {

struct efa_hmem_info {
bool initialized; /* do we support it at all */
bool p2p_disabled_by_user; /* Did the user disable p2p via FI_OPT_FI_HMEM_P2P? */
bool p2p_required_by_impl; /* Is p2p required for this interface? */
bool p2p_supported_by_device; /* do we support p2p with this device */

size_t max_medium_msg_size;
Expand All @@ -33,10 +31,12 @@ struct efa_hmem_info {
size_t min_read_write_size;
};

extern struct efa_hmem_info g_efa_hmem_info[OFI_HMEM_MAX];

struct efa_domain;

int efa_domain_hmem_validate_p2p_opt(struct efa_domain *efa_domain, enum fi_hmem_iface iface, int p2p_opt);
int efa_domain_hmem_info_init_all(struct efa_domain *efa_domain);
int efa_hmem_validate_p2p_opt(enum fi_hmem_iface iface, int p2p_opt, uint32_t api_version);
int efa_hmem_info_initialize();

/**
* @brief Copy data from a hmem device to a system buffer
Expand Down
4 changes: 2 additions & 2 deletions prov/efa/src/efa_mr.c
Original file line number Diff line number Diff line change
Expand Up @@ -192,7 +192,7 @@ static int efa_mr_hmem_setup(struct efa_mr *efa_mr,
}

if (efa_mr->domain->util_domain.info_domain_caps & FI_HMEM) {
if (efa_mr->domain->hmem_info[attr->iface].initialized) {
if (g_efa_hmem_info[attr->iface].initialized) {
efa_mr->peer.iface = attr->iface;
} else {
EFA_WARN(FI_LOG_MR,
Expand Down Expand Up @@ -813,7 +813,7 @@ static int efa_mr_reg_impl(struct efa_mr *efa_mr, uint64_t flags, const void *at
* For FI_HMEM_CUDA iface when p2p is unavailable, skip ibv_reg_mr() and
* generate proprietary mr_fid key.
*/
if (mr_attr.iface == FI_HMEM_CUDA && !efa_mr->domain->hmem_info[FI_HMEM_CUDA].p2p_supported_by_device) {
if (mr_attr.iface == FI_HMEM_CUDA && !g_efa_hmem_info[FI_HMEM_CUDA].p2p_supported_by_device) {
efa_mr->mr_fid.key = efa_mr_cuda_non_p2p_keygen();
} else {
efa_mr->ibv_mr = efa_mr_reg_ibv_mr(efa_mr, &mr_attr, fi_ibv_access, flags);
Expand Down
4 changes: 4 additions & 0 deletions prov/efa/src/efa_prov.c
Original file line number Diff line number Diff line change
Expand Up @@ -164,6 +164,10 @@ EFA_INI
if (err)
goto err_free;

err = efa_hmem_info_initialize();
if (err)
goto err_free;

dlist_init(&g_efa_domain_list);

return &efa_prov;
Expand Down
2 changes: 1 addition & 1 deletion prov/efa/src/rdm/efa_rdm_ep.h
Original file line number Diff line number Diff line change
Expand Up @@ -307,7 +307,7 @@ int efa_rdm_ep_use_p2p(struct efa_rdm_ep *efa_rdm_ep, struct efa_mr *efa_mr)
if (!efa_mr || efa_mr->peer.iface == FI_HMEM_SYSTEM)
return 1;

if (efa_rdm_ep_domain(efa_rdm_ep)->hmem_info[efa_mr->peer.iface].p2p_supported_by_device)
if (g_efa_hmem_info[efa_mr->peer.iface].p2p_supported_by_device)
return (efa_rdm_ep->hmem_p2p_opt != FI_HMEM_P2P_DISABLED);

if (efa_rdm_ep->hmem_p2p_opt == FI_HMEM_P2P_REQUIRED) {
Expand Down
36 changes: 22 additions & 14 deletions prov/efa/src/rdm/efa_rdm_ep_fiops.c
Original file line number Diff line number Diff line change
Expand Up @@ -434,7 +434,6 @@ static inline
void efa_rdm_ep_set_use_zcpy_rx(struct efa_rdm_ep *ep)
{
enum fi_hmem_iface iface;
struct efa_hmem_info *hmem_info;
uint64_t unsupported_caps = FI_DIRECTED_RECV | FI_TAGGED | FI_ATOMIC;

ep->use_zcpy_rx = true;
Expand Down Expand Up @@ -482,11 +481,11 @@ void efa_rdm_ep_set_use_zcpy_rx(struct efa_rdm_ep *ep)
}

/* Zero-copy receive requires P2P support. Disable it if any initialized HMEM iface does not support P2P. */
for (iface = FI_HMEM_SYSTEM; iface < OFI_HMEM_MAX; ++iface) {
hmem_info = &ep->base_ep.domain->hmem_info[iface];
if (hmem_info->initialized &&
!hmem_info->p2p_disabled_by_user &&
!hmem_info->p2p_supported_by_device) {
EFA_HMEM_IFACE_FOREACH(iface) {
if (g_efa_hmem_info[iface].initialized &&
!ofi_hmem_p2p_disabled() &&
ep->hmem_p2p_opt != FI_HMEM_P2P_DISABLED &&
!g_efa_hmem_info[iface].p2p_supported_by_device) {
EFA_INFO(FI_LOG_EP_CTRL,
"%s does not support P2P, zero-copy receive "
"protocol will be disabled\n",
Expand Down Expand Up @@ -530,6 +529,7 @@ int efa_rdm_ep_open(struct fid_domain *domain, struct fi_info *info,
struct efa_domain *efa_domain = NULL;
struct efa_rdm_ep *efa_rdm_ep = NULL;
int ret, retv, i;
enum fi_hmem_iface iface;

efa_rdm_ep = calloc(1, sizeof(*efa_rdm_ep));
if (!efa_rdm_ep)
Expand Down Expand Up @@ -606,6 +606,7 @@ int efa_rdm_ep_open(struct fid_domain *domain, struct fi_info *info,

efa_rdm_ep_init_linked_lists(efa_rdm_ep);

efa_rdm_ep->cuda_api_permitted = (FI_VERSION_GE(info->fabric_attr->api_version, FI_VERSION(1, 18)));
/* Set hmem_p2p_opt */
efa_rdm_ep->hmem_p2p_opt = FI_HMEM_P2P_DISABLED;

Expand All @@ -615,16 +616,21 @@ int efa_rdm_ep_open(struct fid_domain *domain, struct fi_info *info,
* tighter requirements for the default p2p opt
*/
EFA_HMEM_IFACE_FOREACH_NON_SYSTEM(i) {
if (efa_rdm_ep->base_ep.domain->hmem_info[efa_hmem_ifaces[i]].initialized &&
efa_rdm_ep->base_ep.domain->hmem_info[efa_hmem_ifaces[i]].p2p_supported_by_device) {
efa_rdm_ep->hmem_p2p_opt = efa_rdm_ep->base_ep.domain->hmem_info[efa_hmem_ifaces[i]].p2p_required_by_impl
? FI_HMEM_P2P_REQUIRED
: FI_HMEM_P2P_PREFERRED;
iface = efa_hmem_ifaces[i];
if (g_efa_hmem_info[iface].initialized &&
g_efa_hmem_info[iface].p2p_supported_by_device) {
/* If user is using libfabric API 1.18 or later, by default EFA
* provider is permitted to use CUDA library to support CUDA
* memory, therefore p2p is not required.
*/
efa_rdm_ep->hmem_p2p_opt =
(iface == FI_HMEM_CUDA && efa_rdm_ep->cuda_api_permitted) ?
FI_HMEM_P2P_PREFERRED :
FI_HMEM_P2P_REQUIRED;
break;
}
}

efa_rdm_ep->cuda_api_permitted = (FI_VERSION_GE(info->fabric_attr->api_version, FI_VERSION(1, 18)));
efa_rdm_ep->sendrecv_in_order_aligned_128_bytes = false;
efa_rdm_ep->write_in_order_aligned_128_bytes = false;

Expand Down Expand Up @@ -1413,7 +1419,9 @@ static int efa_rdm_ep_set_fi_hmem_p2p_opt(struct efa_rdm_ep *efa_rdm_ep, int opt
* tighter restrictions on valid p2p options.
*/
EFA_HMEM_IFACE_FOREACH_NON_SYSTEM(i) {
err = efa_domain_hmem_validate_p2p_opt(efa_rdm_ep_domain(efa_rdm_ep), efa_hmem_ifaces[i], opt);
err = efa_hmem_validate_p2p_opt(
efa_hmem_ifaces[i], opt,
efa_rdm_ep->base_ep.info->fabric_attr->api_version);
if (err == -FI_ENODATA)
continue;

Expand Down Expand Up @@ -1449,7 +1457,7 @@ static int efa_rdm_ep_set_cuda_api_permitted(struct efa_rdm_ep *ep, bool cuda_ap
/* CUDA memory can be supported by using either peer to peer or CUDA API. If neither is
* available, we cannot support CUDA memory
*/
if (!efa_rdm_ep_domain(ep)->hmem_info[FI_HMEM_CUDA].p2p_supported_by_device)
if (!g_efa_hmem_info[FI_HMEM_CUDA].p2p_supported_by_device)
return -FI_EOPNOTSUPP;

ep->cuda_api_permitted = false;
Expand Down
Loading

0 comments on commit 5ae97ef

Please sign in to comment.