fabric: Add fi_hmem_attr to fi_info #10400
```diff
@@ -360,6 +360,22 @@ enum {
 	FI_TC_NETWORK_CTRL,
 };

+enum fi_hmem_iface {
+	FI_HMEM_SYSTEM = 0,
+	FI_HMEM_CUDA,
+	FI_HMEM_ROCR,
+	FI_HMEM_ZE,
+	FI_HMEM_NEURON,
+	FI_HMEM_SYNAPSEAI,
+};
+
+enum fi_hmem_attr_opt {
+	FI_HMEM_ATTR_UNSPEC = 0,
+	FI_HMEM_ATTR_REQUIRED,
+	FI_HMEM_ATTR_PREFERRED,
+	FI_HMEM_ATTR_DISABLED,
+};
+
 static inline uint32_t fi_tc_dscp_set(uint8_t dscp)
 {
 	return ((uint32_t) dscp) | FI_TC_DSCP;
```
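A point of semantics that comes up in the review below: `FI_HMEM_ATTR_UNSPEC` gives each option a third, "no preference" state that a plain bool cannot express. A minimal provider-side sketch (hypothetical helper, not part of the patch) of how such a tri-state could collapse into a concrete decision:

```c
#include <stdbool.h>

/* Hypothetical helper (illustration only): collapse a tri-state option
 * into a concrete on/off decision, falling back to the provider default
 * when the application left the field unspecified. */
static bool hmem_opt_enabled(enum fi_hmem_attr_opt opt, bool prov_default)
{
	switch (opt) {
	case FI_HMEM_ATTR_REQUIRED:
	case FI_HMEM_ATTR_PREFERRED:
		return true;
	case FI_HMEM_ATTR_DISABLED:
		return false;
	case FI_HMEM_ATTR_UNSPEC:
	default:
		return prov_default;	/* application expressed no preference */
	}
}
```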
```diff
@@ -465,6 +481,14 @@ struct fi_fabric_attr {
 	uint32_t api_version;
 };

+struct fi_hmem_attr {
+	enum fi_hmem_iface iface;
+	enum fi_hmem_attr_opt api_permitted;
+	enum fi_hmem_attr_opt use_p2p;
+	enum fi_hmem_attr_opt use_dev_reg_copy;
+	struct fi_hmem_attr *next;
+};
+
 struct fi_info {
 	struct fi_info *next;
 	uint64_t caps;
```
Comment (on `api_permitted`): This is acting as a bool.

Reply: These fields will be both input and output. Since a boolean can represent either false or an unspecified value, FI_HMEM_ATTR_UNSPEC is added to differentiate them.

Reply: Either the provider can call the GPU API or not; there's not a third state here. These settings are dictating to the provider how it must implement data transfers to/from GPU buffers. For some providers, it means the provider cannot support GPU buffers at all. (There is no PCI peer-to-peer support for TCP or shmem.) There's a significant difference between these variables and other attribute values.

Reply: I think that is part of our goal for putting it in fi_info: it can help with provider filtering. For a provider that doesn't support PCIe peer-to-peer, like shmem, we can use such a configuration to filter it out early at the fi_info level. You remember we have had challenges toggling shmem on/off inside the EFA provider without using environment variables. You believed shm usage is a data-transfer-level decision, so making it an ep-level setopt() makes more sense. The option can be either a general […]. We currently use […]. Making such a toggle as early as the fi_info level can resolve that challenge.

Reply: I agree on this point. If possible, we can consider alternatives that move them to the appropriate attribute groups (like domain/tx/rx/ep_attr) separately.

Reply: I agree having this information up front is useful. I disagree that these are extended attributes; they're something else. Environment variables are the closest thing to what this intends to capture. Imagine two upper libraries (e.g. MPI and NCCL) calling libfabric. These libraries could drive a provider in different directions: NCCL says "no, don't use CUDA", but MPI says "go ahead and use CUDA". That doesn't work. There are global settings at play here, not per-domain or per-endpoint settings. These settings may not even be per provider: NCCL might be using ucx but MPI verbs, yet the restrictions from NCCL need to carry over to both providers.

Reply: This is exactly the problem we are solving. Considering resource management and other factors, making libfabric use CUDA calls in both the control and data-transfer interfaces may cause unexpected risks and overhead for a NCCL application, but we don't have that concern for an MPI application.

Reply: Can you explain the software stack here to me? I know NCCL can use UCX/OFI for network offload via the plugins. Can it also use MPI (via a NCCL-MPI plugin?) for the same purpose? I learned NCCL already warns users that using NCCL and CUDA-aware MPI together is not safe: https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/mpi.html#inter-gpu-communication-with-cuda-aware-mpi

Reply: I'm referring to a single process using BOTH NCCL and MPI, or a single process accessing libfabric through more than one middleware (NCCL and DAOS, MPI and DAOS, etc.). The point is that these settings aren't per domain or per provider but are global to the process. That is, they are settings which apply to the environment as a whole.

Reply: NCCL doesn't want users to make CUDA-aware MPI calls in the same process because that is already unsafe. So I don't see a reason we cannot let MPI allow libfabric to use CUDA while NCCL does not allow it.
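As a concrete reading of the thread above, here is a sketch of the application-side flow this proposal would enable. Everything besides the patched field names is illustrative; in particular, it assumes fi_allocinfo() would allocate the new hmem_attr substructure the way it does the other attribute structs:

```c
#include <rdma/fabric.h>

/* Illustrative NCCL-style consumer: forbid provider-internal CUDA API
 * calls but require device P2P, letting fi_getinfo() filter out
 * providers (e.g. shmem) that cannot satisfy the restriction. */
static struct fi_info *get_restricted_info(void)
{
	struct fi_info *hints, *info = NULL;

	hints = fi_allocinfo();
	if (!hints)
		return NULL;

	hints->caps = FI_MSG | FI_HMEM;
	hints->hmem_attr->iface = FI_HMEM_CUDA;	/* assumes hmem_attr is allocated */
	hints->hmem_attr->api_permitted = FI_HMEM_ATTR_DISABLED;
	hints->hmem_attr->use_p2p = FI_HMEM_ATTR_REQUIRED;

	if (fi_getinfo(FI_VERSION(FI_MAJOR_VERSION, FI_MINOR_VERSION),
		       NULL, NULL, 0, hints, &info))
		info = NULL;
	fi_freeinfo(hints);
	return info;
}
```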
Comment (on `struct fi_hmem_attr`): I think these are more along the lines of a configuration than an attribute. The app is wanting to configure the provider behavior here, versus discovering provider capabilities. I want to avoid an embedded linked list; it adds complexity, and it's unlikely that a system will realistically have more than one type of hmem installed anyway. My first thought is this should somehow be linked to memory registration, since that's ultimately where the provider is making decisions on how to perform a data transfer. The above configuration settings are basically letting the provider know what sort of optimizations it can perform when a data buffer is located in hmem. Maybe there should be a more involved set of APIs to query and configure memory-registration-related functionality; I can see the MR cache being part of this.

Reply: We were debating that internally. If we are confident there wouldn't be more than one hmem type, we can get rid of the linked-list structure. We started from this implementation for the flexibility of supporting multiple hmem ifaces, and we are open to feedback on that.

Reply: We prefer to add it to fi_info for the following reasons: […]

Reply: These settings are configurations (restrictions) specified by the application, not the provider. They differ from the fi_info attributes in that regard.

Reply: The application may not care about the implementation details and can leave these configurations unspecified; then it is up to the provider to choose whether to use them and return them in fi_info.

Reply: The values make no sense for a provider to return: REQUIRED, PREFERRED, etc. These are application restrictions on the provider implementation. If the app doesn't specify a restriction, it doesn't care what the provider does. Except that an admin might... Some restrictions don't even make sense for some providers, except to disable the provider completely. There is a much larger set of restrictions that an application may need to set: HMEM settings, MR cache controls, use of CPU atomics, use of shared memory, eager message sizes, receive-side buffering for unexpected messages... Whether these restrictions are per provider, per endpoint, per domain, or global is unknown. struct fi_info isn't the place for this. Consider that the proposal has a linked list of these restrictions as input into fi_getinfo(). What is a provider supposed to do with this list? Pick the one it likes? Apply all of them? How does it resolve conflicts? What does a provider do if it uses shared memory for local communication, where a setting doesn't apply? Conceptually, the application or an administrator is programming the provider implementation through some sort of configuration mechanism. That has typically been done using environment variables, or setopt() when done programmatically. We don't want to link all these configuration values off of fi_info.

Reply: We always prefer a programmatic way to do such configuration instead of environment variables. I explained why […]

Reply: There are always ad hoc or provider-specific configurations that you don't want to bring into an API. What we are achieving is to address the common pain points in the FI_HMEM interface that all providers may share, by introducing incremental changes to the interface. Among the 3 attributes we are introducing, […]
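For contrast with the fi_info approach, the programmatic per-endpoint mechanism mentioned in the thread already exists today: P2P behavior can be set through fi_setopt() using the FI_OPT_FI_HMEM_P2P endpoint option documented in fi_endpoint(3). A sketch, where `ep` is assumed to be an already-opened endpoint:

```c
#include <rdma/fabric.h>
#include <rdma/fi_endpoint.h>

/* Sketch of the existing endpoint-level knob: restrict one endpoint to
 * device P2P transfers, instead of passing a process-wide fi_info hint. */
static int require_p2p(struct fid_ep *ep)
{
	int opt = FI_HMEM_P2P_REQUIRED;

	return fi_setopt(&ep->fid, FI_OPT_ENDPOINT, FI_OPT_FI_HMEM_P2P,
			 &opt, sizeof(opt));
}
```

The trade-off the thread is circling is visible here: setopt() applies after an endpoint exists, so it cannot influence provider selection, which is what the proposed fi_info hints are for.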
```diff
@@ -481,6 +505,7 @@ struct fi_info {
 	struct fi_domain_attr *domain_attr;
 	struct fi_fabric_attr *fabric_attr;
 	struct fid_nic *nic;
+	struct fi_hmem_attr *hmem_attr;
 };

 struct fi_device_attr {
```
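Since the fields are described above as both input and output, a consumer could inspect the returned list to see what each provider settled on. A hypothetical sketch:

```c
#include <stdio.h>
#include <rdma/fabric.h>

/* Illustration only: report each provider's negotiated CUDA handling
 * from the fi_info list returned by fi_getinfo(). */
static void print_cuda_handling(const struct fi_info *info)
{
	for (const struct fi_info *cur = info; cur; cur = cur->next) {
		if (cur->hmem_attr && cur->hmem_attr->iface == FI_HMEM_CUDA)
			printf("%s: use_p2p=%d api_permitted=%d\n",
			       cur->fabric_attr->prov_name,
			       cur->hmem_attr->use_p2p,
			       cur->hmem_attr->api_permitted);
	}
}
```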
```diff
@@ -771,6 +796,7 @@ enum fi_type {
 	FI_TYPE_MR_ATTR,
 	FI_TYPE_CNTR_ATTR,
 	FI_TYPE_CQ_ERR_ENTRY,
+	FI_TYPE_HMEM_ATTR,
 };

 char *fi_tostr(const void *data, enum fi_type datatype);
```
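Assuming the patch also wires FI_TYPE_HMEM_ATTR into fi_tostr()'s dispatch, the new struct would be printable the same way the other attribute structs are dumped for debugging:

```c
/* Sketch: dump the hmem attributes of a returned fi_info entry. */
if (info->hmem_attr)
	printf("%s\n", fi_tostr(info->hmem_attr, FI_TYPE_HMEM_ATTR));
```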
The second file in the diff updates the prior-ABI compatibility definitions:
```diff
@@ -281,6 +281,14 @@ struct fi_domain_attr_1_7 {
 	size_t max_ep_auth_key;
 };

+struct fi_hmem_attr_1_7 {
+	enum fi_hmem_iface iface;
+	enum fi_hmem_attr_opt api_permitted;
+	enum fi_hmem_attr_opt use_p2p;
+	enum fi_hmem_attr_opt use_dev_reg_copy;
+	struct fi_hmem_attr *next;
+};
+
 #define fi_tx_attr_1_7 fi_tx_attr_1_3
 #define fi_rx_attr_1_7 fi_rx_attr_1_3
 #define fi_ep_attr_1_7 fi_ep_attr_1_3
```
```diff
@@ -303,6 +311,7 @@ struct fi_info_1_7 {
 	struct fi_domain_attr_1_7 *domain_attr;
 	struct fi_fabric_attr_1_7 *fabric_attr;
 	struct fid_nic_1_7 *nic;
+	struct fi_hmem_attr_1_7 *hmem_attr;
 };

 #define ofi_dup_attr(dst, src) \
```

Comment: Changes to this file need to be dropped. These are definitions for the prior ABI structures.

Reply: Ack.
Comment (on `enum fi_hmem_attr_opt`): You use the word 'opt' here because there are setopt calls to change some of these values. But these strike me more as configuration settings than attributes; see the discussion above.