diff --git a/include/ofi_util.h b/include/ofi_util.h index 911a69893ba..6e70d3e8333 100644 --- a/include/ofi_util.h +++ b/include/ofi_util.h @@ -1117,6 +1117,9 @@ int ofi_check_rx_attr(const struct fi_provider *prov, int ofi_check_tx_attr(const struct fi_provider *prov, const struct fi_tx_attr *prov_attr, const struct fi_tx_attr *user_attr, uint64_t info_mode); +int ofi_check_hmem_attr(const struct fi_provider *prov, + const struct fi_hmem_attr *prov_attr, + const struct fi_info *user_info); int ofi_check_attr_subset(const struct fi_provider *prov, uint64_t base_caps, uint64_t requested_caps); int ofi_prov_check_info(const struct util_prov *util_prov, diff --git a/include/rdma/fabric.h b/include/rdma/fabric.h index 420d2eacc05..2de93d0a7fb 100644 --- a/include/rdma/fabric.h +++ b/include/rdma/fabric.h @@ -360,6 +360,22 @@ enum { FI_TC_NETWORK_CTRL, }; +enum fi_hmem_iface { + FI_HMEM_SYSTEM = 0, + FI_HMEM_CUDA, + FI_HMEM_ROCR, + FI_HMEM_ZE, + FI_HMEM_NEURON, + FI_HMEM_SYNAPSEAI, +}; + +enum fi_hmem_attr_opt { /* how a provider should treat one HMEM attribute */ + FI_HMEM_ATTR_UNSPEC = 0, /* left to the provider implementation */ + FI_HMEM_ATTR_REQUIRED, /* must be used; operations that cannot honor it fail */ + FI_HMEM_ATTR_PREFERRED, /* use if available, otherwise the provider may pick another path */ + FI_HMEM_ATTR_DISABLED, /* should not be used */ +}; + static inline uint32_t fi_tc_dscp_set(uint8_t dscp) { return ((uint32_t) dscp) | FI_TC_DSCP; @@ -465,6 +481,14 @@ struct fi_fabric_attr { uint32_t api_version; }; +struct fi_hmem_attr { /* per-interface HMEM attributes, chained through next */ + enum fi_hmem_iface iface; /* interface this entry describes */ + enum fi_hmem_attr_opt api_permitted; /* may libfabric make device-specific API calls */ + enum fi_hmem_attr_opt use_p2p; /* peer-to-peer FI_HMEM transfers */ + enum fi_hmem_attr_opt use_dev_reg_copy; /* optimized device memcpy, e.g. GDR copy */ + struct fi_hmem_attr *next; /* next entry, or NULL at the end of the list */ +}; + struct fi_info { struct fi_info *next; uint64_t caps; @@ -481,6 +505,7 @@ struct fi_info { struct fi_domain_attr *domain_attr; struct fi_fabric_attr *fabric_attr; struct fid_nic *nic; + struct fi_hmem_attr *hmem_attr; /* optional list of HMEM attributes */ }; struct fi_device_attr { @@ -771,6 +796,7 @@ enum fi_type { FI_TYPE_MR_ATTR, FI_TYPE_CNTR_ATTR, FI_TYPE_CQ_ERR_ENTRY, + FI_TYPE_HMEM_ATTR, }; char *fi_tostr(const void *data, enum fi_type datatype); diff --git a/include/rdma/fi_domain.h b/include/rdma/fi_domain.h index 
548e4b6ad3e..87b27fce118 100644 --- a/include/rdma/fi_domain.h +++ b/include/rdma/fi_domain.h @@ -128,15 +128,6 @@ struct fid_mr { uint64_t key; }; -enum fi_hmem_iface { - FI_HMEM_SYSTEM = 0, - FI_HMEM_CUDA, - FI_HMEM_ROCR, - FI_HMEM_ZE, - FI_HMEM_NEURON, - FI_HMEM_SYNAPSEAI, -}; - static inline int fi_hmem_ze_device(int driver_index, int device_index) { return driver_index << 16 | device_index; diff --git a/man/fabric.7.md b/man/fabric.7.md index a25190a46b0..5dedea377d3 100644 --- a/man/fabric.7.md +++ b/man/fabric.7.md @@ -455,6 +455,11 @@ Added new fields to the following attributes: *fi_domain_attr* : Added max_group_id +*fi_info* +: The fi_info structure was expanded to reference a new fabric object, + fi_hmem_attr. When available, the fi_hmem_attr references a new set of + attributes related to heterogeneous memory. + # SEE ALSO [`fi_info`(1)](fi_info.1.html), diff --git a/man/fi_fabric.3.md b/man/fi_fabric.3.md index 0d899ff9137..419cb68261f 100644 --- a/man/fi_fabric.3.md +++ b/man/fi_fabric.3.md @@ -177,6 +177,9 @@ datatype or field value. *FI_TYPE_LOG_SUBSYS* : enum fi_log_subsys +*FI_TYPE_HMEM_ATTR* +: struct fi_hmem_attr + fi_tostr() will return a pointer to an internal libfabric buffer that should not be modified, and will be overwritten the next time fi_tostr() is invoked. fi_tostr() is not thread safe. diff --git a/man/fi_getinfo.3.md b/man/fi_getinfo.3.md index 6219792257e..b4099278e78 100644 --- a/man/fi_getinfo.3.md +++ b/man/fi_getinfo.3.md @@ -143,6 +143,7 @@ struct fi_info { struct fi_domain_attr *domain_attr; struct fi_fabric_attr *fabric_attr; struct fid_nic *nic; + struct fi_hmem_attr *hmem_attr; }; ``` @@ -249,6 +250,73 @@ struct fi_info { closely associated with a hardware NIC. See [`fi_nic`(3)](fi_nic.3.html) for details. +*hmem_attr - heterogeneous memory attributes* +: Optionally supplied HMEM attributes. HMEM attributes may be + specified and returned as part of fi_getinfo. 
When provided as + hints, requested values of struct fi_hmem_attr should be set. On + output, the actual HMEM attributes that can be provided will be + returned. + +## HMEM ATTRIBUTES + +```c +enum fi_hmem_attr_opt { + FI_HMEM_ATTR_UNSPEC, + FI_HMEM_ATTR_REQUIRED, + FI_HMEM_ATTR_PREFERRED, + FI_HMEM_ATTR_DISABLED +}; + +struct fi_hmem_attr { + enum fi_hmem_iface iface; + enum fi_hmem_attr_opt api_permitted; + enum fi_hmem_attr_opt use_p2p; + enum fi_hmem_attr_opt use_dev_reg_copy; + struct fi_hmem_attr *next; +}; +``` +- *fi_hmem_attr_opt - int* +: Defines how the provider should handle HMEM attributes for an interface. + By default, the provider will choose whether to use the attributes + (FI_HMEM_ATTR_UNSPEC). + Valid values defined in fabric.h are: + * FI_HMEM_ATTR_UNSPEC: The attribute may be used by the provider + and is subject to the provider implementation. + * FI_HMEM_ATTR_REQUIRED: The attribute must be used for this interface; + operations that cannot be performed will be reported as failing. + * FI_HMEM_ATTR_PREFERRED: The attribute should be used by the + provider if available, but the provider may choose another implementation + if it is unavailable. + * FI_HMEM_ATTR_DISABLED: The attribute should not be used. + +- *iface* + +Indicates the software interface used by the application; for details see +[`fi_mr`(3)](fi_mr.3.html) + +- *api_permitted* + +Controls whether libfabric is allowed to make device-specific API calls. +By default, libfabric is permitted to call device-specific APIs (e.g. the CUDA API). +If the user wishes to prohibit libfabric from making such calls, this can be +achieved by setting this field to FI_HMEM_ATTR_DISABLED. +The setopt option FI_OPT_CUDA_API_PERMITTED for endpoint takes precedence +over this attribute when api_permitted is not disabled. + +- *use_p2p* + +Controls whether peer to peer FI_HMEM transfers should be used. 
+The FI_OPT_FI_HMEM_P2P setopt option discussed in +[`fi_endpoint`(3)](fi_endpoint.3.html) takes precedence over this attribute. + +- *use_dev_reg_copy* + +Controls whether optimized memcpy for device memory is used, e.g. GDR copy. + +- *next* + +Pointer to the next fi_hmem_attr if using multiple non-system iface. + # CAPABILITIES Interface capabilities are obtained by OR-ing the following flags diff --git a/man/fi_info.1.md b/man/fi_info.1.md index 36bf07b3bc8..1143770a53b 100644 --- a/man/fi_info.1.md +++ b/man/fi_info.1.md @@ -216,6 +216,11 @@ fi_info: speed: 0 state: FI_LINK_UP network_type: InfiniBand + fi_hmem_attr: + iface: FI_HMEM_SYSTEM + api_permitted: FI_HMEM_ATTR_UNSPEC + use_p2p: FI_HMEM_ATTR_UNSPEC + use_dev_reg_copy: FI_HMEM_ATTR_UNSPEC ``` To see libfabric related environment variables `-e` option. diff --git a/prov/util/src/util_attr.c b/prov/util/src/util_attr.c index 634af1e5e82..b7e8c5a29e5 100644 --- a/prov/util/src/util_attr.c +++ b/prov/util/src/util_attr.c @@ -1002,6 +1002,83 @@ int ofi_check_tx_attr(const struct fi_provider *prov, return 0; } +static bool ofi_compare_hmem_attr_opt(enum fi_hmem_attr_opt prov_opt, /* true when the provider setting can satisfy the user's request */ + enum fi_hmem_attr_opt user_opt) +{ + switch (user_opt) { + case FI_HMEM_ATTR_UNSPEC: /* no user preference: every provider setting matches */ + return true; + case FI_HMEM_ATTR_REQUIRED: + case FI_HMEM_ATTR_PREFERRED: /* NOTE(review): handled exactly like REQUIRED; confirm a mere preference should be able to reject a provider */ + return prov_opt != FI_HMEM_ATTR_DISABLED; /* rejected only when the provider disables the attribute */ + case FI_HMEM_ATTR_DISABLED: + return prov_opt != FI_HMEM_ATTR_REQUIRED; /* rejected only when the provider insists on the attribute */ + default: /* value outside enum fi_hmem_attr_opt */ + return false; + } +} + +static int /* 0 when a provider entry with user_attr's iface accepts all three options, -FI_ENODATA otherwise */ +ofi_validate_hmem_attr_compat(const struct fi_provider *prov, + const struct fi_hmem_attr *prov_attr_head, + const struct fi_hmem_attr *user_attr) +{ + const struct fi_hmem_attr *prov_attr = prov_attr_head; + + while (prov_attr) { /* scan the provider list for the entry with the same iface */ + if (prov_attr->iface == user_attr->iface) { + if (!ofi_compare_hmem_attr_opt( + prov_attr->api_permitted, + user_attr->api_permitted)) { + FI_INFO(prov, FI_LOG_CORE, + "api_permitted option not supported\n"); + return -FI_ENODATA; + } + + if (!ofi_compare_hmem_attr_opt( + 
prov_attr->use_p2p, + user_attr->use_p2p)) { + FI_INFO(prov, FI_LOG_CORE, + "use_p2p option not supported\n"); + return -FI_ENODATA; + } + + if (!ofi_compare_hmem_attr_opt( + prov_attr->use_dev_reg_copy, + user_attr->use_dev_reg_copy)) { + FI_INFO(prov, FI_LOG_CORE, + "use_dev_reg_copy option not supported\n"); + return -FI_ENODATA; + } + + return 0; + } + prov_attr = prov_attr->next; + } + + return -FI_ENODATA; +} + +int ofi_check_hmem_attr(const struct fi_provider *prov, + const struct fi_hmem_attr *prov_attr, + const struct fi_info *user_info) +{ + const struct fi_hmem_attr *user_attr = user_info->hmem_attr; + + if (!(user_info->caps & FI_HMEM)) { + FI_INFO(prov, FI_LOG_CORE, "FI_HMEM not set\n"); + return -FI_ENODATA; + } + + while (user_attr) { + if (ofi_validate_hmem_attr_compat(prov, prov_attr, user_attr) < 0) + return -FI_ENODATA; + user_attr = user_attr->next; + } + + return 0; +} + /* Use if there are multiple fi_info in the provider: * check provider's info */ int ofi_prov_check_info(const struct util_prov *util_prov, @@ -1152,6 +1229,13 @@ int ofi_check_info(const struct util_prov *util_prov, if (ret) return ret; } + + if (user_info->hmem_attr) { + ret = ofi_check_hmem_attr(prov, prov_info->hmem_attr, user_info); + if (ret) + return ret; + } + return 0; } @@ -1271,6 +1355,26 @@ static void fi_alter_tx_attr(struct fi_tx_attr *attr, attr->rma_iov_limit = hints->rma_iov_limit; } +/* Overlay the non-zero fields of the first hints entry onto attr; zero-valued + * hint fields (FI_HMEM_SYSTEM, FI_HMEM_ATTR_UNSPEC) leave attr unchanged. */ +static void fi_alter_hmem_attr(struct fi_hmem_attr *attr, + const struct fi_hmem_attr *hints) +{ + if (!attr || !hints) /* the info may carry no hmem_attr at all */ + return; + + if (hints->iface) + attr->iface = hints->iface; + if (hints->api_permitted) + attr->api_permitted = hints->api_permitted; + if (hints->use_p2p) + attr->use_p2p = hints->use_p2p; + if (hints->use_dev_reg_copy) + attr->use_dev_reg_copy = hints->use_dev_reg_copy; + /* attr->next is deliberately untouched: hints->next is caller-owned and + * fi_freeinfo() frees every node reachable from info->hmem_attr. */ +} + static uint64_t ofi_get_info_caps(const struct fi_info *prov_info, const struct fi_info *user_info, uint32_t api_version) @@ -1336,5 +1440,6 @@ void
ofi_alter_info(struct fi_info *info, const struct fi_info *hints, info->caps); fi_alter_tx_attr(info->tx_attr, hints ? hints->tx_attr : NULL, info->caps); + fi_alter_hmem_attr(info->hmem_attr, hints ? hints->hmem_attr : NULL); } } diff --git a/src/abi_1_0.c b/src/abi_1_0.c index afc17802f3d..02c488ffc44 100644 --- a/src/abi_1_0.c +++ b/src/abi_1_0.c @@ -281,6 +281,14 @@ struct fi_domain_attr_1_7 { size_t max_ep_auth_key; }; +struct fi_hmem_attr_1_7 { + enum fi_hmem_iface iface; + enum fi_hmem_attr_opt api_permitted; + enum fi_hmem_attr_opt use_p2p; + enum fi_hmem_attr_opt use_dev_reg_copy; + struct fi_hmem_attr *next; +}; + #define fi_tx_attr_1_7 fi_tx_attr_1_3 #define fi_rx_attr_1_7 fi_rx_attr_1_3 #define fi_ep_attr_1_7 fi_ep_attr_1_3 @@ -303,6 +311,7 @@ struct fi_info_1_7 { struct fi_domain_attr_1_7 *domain_attr; struct fi_fabric_attr_1_7 *fabric_attr; struct fid_nic_1_7 *nic; + struct fi_hmem_attr_1_7 *hmem_attr; }; #define ofi_dup_attr(dst, src) \ diff --git a/src/fabric.c b/src/fabric.c index b1a735638bb..598a1fb9f90 100644 --- a/src/fabric.c +++ b/src/fabric.c @@ -1041,6 +1041,7 @@ __attribute__((visibility ("default"),EXTERNALLY_VISIBLE)) void DEFAULT_SYMVER_PRE(fi_freeinfo)(struct fi_info *info) { struct fi_info *next; + struct fi_hmem_attr *next_hmem_attr; for (; info; info = next) { next = info->next; @@ -1067,6 +1068,12 @@ void DEFAULT_SYMVER_PRE(fi_freeinfo)(struct fi_info *info) FI_CHECK_OP(info->nic->fid.ops, struct fi_ops, close)) { fi_close(&info->nic->fid); } + if (info->hmem_attr) { + for (; info->hmem_attr; info->hmem_attr = next_hmem_attr) { + next_hmem_attr = info->hmem_attr->next; + free(info->hmem_attr); + } + } free(info); } } @@ -1303,6 +1310,29 @@ static int ofi_layering_ok(const struct fi_provider *provider, return !strcasecmp(provider->name, prov_name); } +/* Deep-copy a fi_hmem_attr list. Returns NULL for an empty list or when any + * node cannot be allocated; on failure every node copied so far is freed. */ +static struct fi_hmem_attr *dup_hmem_attr(const struct fi_hmem_attr *hmem_attr) +{ + struct fi_hmem_attr *dup; + + if (!hmem_attr) + return NULL; + +
dup = mem_dup(hmem_attr, sizeof(*hmem_attr)); + if (!dup) + return NULL; + + dup->next = dup_hmem_attr(hmem_attr->next); + if (hmem_attr->next && !dup->next) { + /* tail copy failed: fail as a whole instead of returning a short list */ + free(dup); + return NULL; + } + + return dup; +} + __attribute__((visibility ("default"),EXTERNALLY_VISIBLE)) int DEFAULT_SYMVER_PRE(fi_getinfo)(uint32_t version, const char *node, const char *service, uint64_t flags, @@ -1423,7 +1453,7 @@ struct fi_info *ofi_allocinfo_internal(void) info->ep_attr = calloc(1, sizeof(*info->ep_attr)); info->domain_attr = calloc(1, sizeof(*info->domain_attr)); info->fabric_attr = calloc(1, sizeof(*info->fabric_attr)); - if (!info->tx_attr|| !info->rx_attr || !info->ep_attr || + if (!info->tx_attr || !info->rx_attr || !info->ep_attr || !info->domain_attr || !info->fabric_attr) goto err; @@ -1454,6 +1484,7 @@ struct fi_info *DEFAULT_SYMVER_PRE(fi_dupinfo)(const struct fi_info *info) dup->ep_attr = NULL; dup->domain_attr = NULL; dup->fabric_attr = NULL; + dup->hmem_attr = NULL; dup->next = NULL; if (info->src_addr != NULL) { @@ -1533,6 +1564,12 @@ struct fi_info *DEFAULT_SYMVER_PRE(fi_dupinfo)(const struct fi_info *info) goto fail; } + if (info->hmem_attr) { + dup->hmem_attr = dup_hmem_attr(info->hmem_attr); + if (dup->hmem_attr == NULL) + goto fail; + } + return dup; fail: diff --git a/src/fi_tostr.c b/src/fi_tostr.c index 910dfd1214b..6bbf7388f5c 100644 --- a/src/fi_tostr.c +++ b/src/fi_tostr.c @@ -294,6 +294,36 @@ ofi_tostr_addr(char *buf, size_t len, uint32_t addr_format, void *addr) ofi_straddr(p, &addrlen, addr_format, addr); } +static void +ofi_tostr_hmem_iface(char *buf, size_t len, enum fi_hmem_iface iface) +{ + switch (iface) { + CASEENUMSTRN(FI_HMEM_SYSTEM, len); + CASEENUMSTRN(FI_HMEM_CUDA, len); + CASEENUMSTRN(FI_HMEM_ROCR, len); + CASEENUMSTRN(FI_HMEM_ZE, len); + CASEENUMSTRN(FI_HMEM_NEURON, len); + CASEENUMSTRN(FI_HMEM_SYNAPSEAI, len); + default: + ofi_strncatf(buf, len, "Unknown"); + break; + } +} + +static void +ofi_tostr_hmem_attr_opt(char *buf, size_t len, enum fi_hmem_attr_opt hmem_attr_opt) +{ + switch (hmem_attr_opt) { + 
CASEENUMSTRN(FI_HMEM_ATTR_UNSPEC, len); + CASEENUMSTRN(FI_HMEM_ATTR_REQUIRED, len); + CASEENUMSTRN(FI_HMEM_ATTR_PREFERRED, len); + CASEENUMSTRN(FI_HMEM_ATTR_DISABLED, len); + default: + ofi_strncatf(buf, len, "Unknown"); + break; + } +} + static void ofi_tostr_tx_attr(char *buf, size_t len, const struct fi_tx_attr *attr, const char *prefix) @@ -560,6 +590,36 @@ ofi_tostr_fabric_attr(char *buf, size_t len, const struct fi_fabric_attr *attr, FI_MAJOR(attr->api_version), FI_MINOR(attr->api_version)); } +static void /* append a printable dump of an fi_hmem_attr list to buf; each line starts with prefix */ +ofi_tostr_hmem_attr(char *buf, size_t len, const struct fi_hmem_attr *attr, + const char *prefix) +{ + const struct fi_hmem_attr *next_hmem_attr = attr; /* cursor over the list */ + + if (!attr) { + ofi_strncatf(buf, len, "%sfi_hmem_attr: (null)\n", prefix); + return; + } + + ofi_strncatf(buf, len, "%sfi_hmem_attr:\n", prefix); + + while (next_hmem_attr != NULL) { /* print the entry under the cursor, not the list head */ + ofi_strncatf(buf, len, "%s%siface: ", prefix, TAB); + ofi_tostr_hmem_iface(buf, len, next_hmem_attr->iface); + ofi_strncatf(buf, len, "\n"); + ofi_strncatf(buf, len, "%s%s%sapi_permitted: ", prefix, TAB, TAB); + ofi_tostr_hmem_attr_opt(buf, len, next_hmem_attr->api_permitted); + ofi_strncatf(buf, len, "\n"); + ofi_strncatf(buf, len, "%s%s%suse_p2p: ", prefix, TAB, TAB); + ofi_tostr_hmem_attr_opt(buf, len, next_hmem_attr->use_p2p); + ofi_strncatf(buf, len, "\n"); + ofi_strncatf(buf, len, "%s%s%suse_dev_reg_copy: ", prefix, TAB, TAB); + ofi_tostr_hmem_attr_opt(buf, len, next_hmem_attr->use_dev_reg_copy); + ofi_strncatf(buf, len, "\n"); + next_hmem_attr = next_hmem_attr->next; + } +} + static void ofi_tostr_info(char *buf, size_t len, const struct fi_info *info) { ofi_strncatf(buf, len, "fi_info:\n"); @@ -592,6 +652,7 @@ static void ofi_tostr_info(char *buf, size_t len, const struct fi_info *info) ofi_tostr_domain_attr(buf, len, info->domain_attr, TAB); ofi_tostr_fabric_attr(buf, len, info->fabric_attr, TAB); ofi_tostr_fid(TAB "nic: ", buf, len, &info->nic->fid); + ofi_tostr_hmem_attr(buf, len, info->hmem_attr, TAB); } static void ofi_tostr_atomic_type(char *buf, 
size_t len, enum fi_datatype type) @@ -707,22 +768,6 @@ static void ofi_tostr_cq_event_flags(char *buf, size_t len, uint64_t flags) ofi_remove_comma(buf); } -static void -ofi_tostr_hmem_iface(char *buf, size_t len, enum fi_hmem_iface iface) -{ - switch (iface) { - CASEENUMSTRN(FI_HMEM_SYSTEM, len); - CASEENUMSTRN(FI_HMEM_CUDA, len); - CASEENUMSTRN(FI_HMEM_ROCR, len); - CASEENUMSTRN(FI_HMEM_ZE, len); - CASEENUMSTRN(FI_HMEM_NEURON, len); - CASEENUMSTRN(FI_HMEM_SYNAPSEAI, len); - default: - ofi_strncatf(buf, len, "Unknown"); - break; - } -} - static void ofi_tostr_cq_format(char *buf, size_t len, enum fi_cq_format cq_format) { @@ -1037,6 +1082,9 @@ char *DEFAULT_SYMVER_PRE(fi_tostr_r)(char *buf, size_t len, case FI_TYPE_CQ_ERR_ENTRY: ofi_tostr_cq_err_entry(buf, len, data); break; + case FI_TYPE_HMEM_ATTR: + ofi_tostr_hmem_attr(buf, len, data, ""); + break; /* without this the case falls into default and appends "Unknown type" */ default: ofi_strncatf(buf, len, "Unknown type"); break;