From 0cf58aee304e1c2dd6de6a9365af224ce45d68ea Mon Sep 17 00:00:00 2001 From: Philip Marshall Date: Thu, 14 Sep 2023 15:47:19 -0700 Subject: [PATCH 01/16] src: Implement SHMEM_REQUIRE_MULTIRAIL_ENV environment variable --- src/shmem_env_defs.h | 2 ++ src/transport_ofi.c | 60 +++++++++++++++++++++++++++++++++++++++++--- 2 files changed, 58 insertions(+), 4 deletions(-) diff --git a/src/shmem_env_defs.h b/src/shmem_env_defs.h index 78bb4f40..796a6942 100644 --- a/src/shmem_env_defs.h +++ b/src/shmem_env_defs.h @@ -107,6 +107,8 @@ SHMEM_INTERNAL_ENV_DEF(OFI_STX_ALLOCATOR, string, "round-robin", SHMEM_INTERNAL_ "Algorithm for allocating STX resources to contexts") SHMEM_INTERNAL_ENV_DEF(OFI_STX_DISABLE_PRIVATE, bool, false, SHMEM_INTERNAL_ENV_CAT_TRANSPORT, "Disallow private contexts from having exclusive STX access") +SHMEM_INTERNAL_ENV_DEF(REQUIRE_MULTIRAIL_ENV, bool, false, SHMEM_INTERNAL_ENV_CAT_TRANSPORT, + "Error out of program if multi-NIC environment not detected") #endif #ifdef USE_UCX diff --git a/src/transport_ofi.c b/src/transport_ofi.c index e5ae8baa..5b4cd8c1 100644 --- a/src/transport_ofi.c +++ b/src/transport_ofi.c @@ -1424,23 +1424,75 @@ int query_for_fabric(struct fabric_info *info) if (info->fabric_name != NULL || info->domain_name != NULL) { struct fi_info *cur_fabric; - info->p_info = NULL; + //if (shmem_internal_params.USE_MULTIRAIL) { + int num_nics = 0; + struct fi_info *fallback = NULL; + struct fi_info *filtered_fabrics_list_head = NULL; + struct fi_info *filtered_fabrics_last_added = NULL; + struct fi_info *multirail_fabric_list_head = NULL; + struct fi_info *multirail_fabric_last_added = NULL; + + if (info->fabric_name != NULL || info->domain_name != NULL) { + struct fi_info *cur_fabric; for (cur_fabric = info->fabrics; cur_fabric; cur_fabric = cur_fabric->next) { if (info->fabric_name == NULL || fnmatch(info->fabric_name, cur_fabric->fabric_attr->name, 0) == 0) { if (info->domain_name == NULL || fnmatch(info->domain_name, cur_fabric->domain_attr->name, 0) == 0) { - info->p_info = cur_fabric; - break; + if (!filtered_fabrics_list_head) filtered_fabrics_list_head = cur_fabric; + if (filtered_fabrics_last_added) filtered_fabrics_last_added->next = cur_fabric; + filtered_fabrics_last_added = cur_fabric; } } } + filtered_fabrics_last_added->next = NULL; } else { - info->p_info = info->fabrics; + filtered_fabrics_list_head = info->fabrics; + } + //TODO: Should probably check that filtered_fabrics_list_head is not NULL, otherwise error + + struct fi_info *cur_fabric; + + info->p_info = NULL; + + for (cur_fabric = filtered_fabrics_list_head; cur_fabric; cur_fabric = cur_fabric->next) { + if (cur_fabric->nic && !nic_already_used(cur_fabric->nic, multirail_fabric_list_head, num_nics)) { + num_nics += 1; + if (!multirail_fabric_list_head) multirail_fabric_list_head = cur_fabric; + if (multirail_fabric_last_added) multirail_fabric_last_added->next = cur_fabric; + multirail_fabric_last_added = cur_fabric; + } } + if (num_nics == 0) { + if (shmem_internal_params.REQUIRE_MULTIRAIL_ENV) + RAISE_ERROR_MSG("Unable to detect multiple NICs\n"); + else + DEBUG_MSG("Unable to detect multiple NICs\n"); + info->p_info = fallback; + } + else { + int idx = 0; + struct fi_info **prov_list = (struct fi_info **) malloc(num_nics * sizeof(struct fi_info *)); + for (cur_fabric = multirail_fabric_list_head; cur_fabric; cur_fabric = cur_fabric->next) { + prov_list[idx++] = cur_fabric; + } + qsort(prov_list, num_nics, sizeof(struct fi_info *), compare_nic_names); + info->p_info = 
prov_list[shmem_internal_my_pe % num_nics]; + //Correct to this once SHMEM_TEAM_SHARED/SHMEMX_TEAM_HOST fixed: + //info->p_info = prov_list[shmem_team_translate_pe(SHMEM_TEAM_WORLD, shmem_internal_my_pe, SHMEMX_TEAM_HOST) % num_nics]; + } + DEBUG_MSG("Number of unique NICs detected: %d\n", num_nics); + + // TODO: Do we want to allow a user to explicitly request not to use multi-NIC functionality? If so, this complicates + // the usage of SHMEM_REQUIRE_MULTIRAIL_ENV (which would likely be renamed to SHMEM_USE_MULTIRAIL). Could not be simple bool, + // would need to be tri-state (unset/default, true, or false): + // unset/default: Attempt to use multiple NICs, but if not possible, use single NIC + // true: If multiple NICs not detected, raise error + // false: Do not attempt to utilize multiple NICs (ie. assign all PEs the same provider for info->p_info) + if (NULL == info->p_info) { RAISE_WARN_MSG("OFI transport, no valid fabric (prov=%s, fabric=%s, domain=%s)\n", info->prov_name != NULL ? info->prov_name : "", From a0526b60203c7e56ddf487c10a1b85eab0abc4f7 Mon Sep 17 00:00:00 2001 From: Philip Marshall Date: Mon, 25 Sep 2023 14:19:46 -0700 Subject: [PATCH 02/16] src: Assign fallback fabric --- src/transport_ofi.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transport_ofi.c b/src/transport_ofi.c index 5b4cd8c1..cacd01f0 100644 --- a/src/transport_ofi.c +++ b/src/transport_ofi.c @@ -1458,6 +1458,7 @@ int query_for_fabric(struct fabric_info *info) info->p_info = NULL; for (cur_fabric = filtered_fabrics_list_head; cur_fabric; cur_fabric = cur_fabric->next) { + if (!fallback) fallback = cur_fabric; if (cur_fabric->nic && !nic_already_used(cur_fabric->nic, multirail_fabric_list_head, num_nics)) { num_nics += 1; if (!multirail_fabric_list_head) multirail_fabric_list_head = cur_fabric; @@ -1484,7 +1485,6 @@ int query_for_fabric(struct fabric_info *info) //Correct to this once SHMEM_TEAM_SHARED/SHMEMX_TEAM_HOST fixed: //info->p_info = prov_list[shmem_team_translate_pe(SHMEM_TEAM_WORLD, shmem_internal_my_pe, SHMEMX_TEAM_HOST) % num_nics]; } - DEBUG_MSG("Number of unique NICs detected: %d\n", num_nics); // TODO: Do we want to allow a user to explicitly request not to use multi-NIC functionality? If so, this complicates // the usage of SHMEM_REQUIRE_MULTIRAIL_ENV (which would likely be renamed to SHMEM_USE_MULTIRAIL). Could not be simple bool, From 3d2809d5b86646611ad2e07d034dcebd2bc6ca4b Mon Sep 17 00:00:00 2001 From: Philip Marshall Date: Thu, 19 Oct 2023 07:02:35 -0700 Subject: [PATCH 03/16] src: Begin work on adding hwloc support for NIC assignment --- configure.ac | 7 +++ src/transport_ofi.c | 111 ++++++++++++++++++++++++++++++++++++++++++-- 2 files changed, 114 insertions(+), 4 deletions(-) diff --git a/configure.ac b/configure.ac index 39197dda..4ac9bc64 100755 --- a/configure.ac +++ b/configure.ac @@ -332,6 +332,13 @@ AC_ARG_ENABLE([ofi-hmem], AS_IF([test "$enable_ofi_hmem" = "yes"], [AC_DEFINE([USE_FI_HMEM], [1], [If defined, the OFI transport will enable FI_HMEM.])]) +AC_ARG_ENABLE([hwloc], + [AC_HELP_STRING([--enable-hwloc], + [Use hwloc to optimize assignment of NICs to PEs in multi-NIC environments. 
(default: disabled)])]) +AS_IF([test "$enable_hwloc" = "yes"], + [AC_DEFINE([USE_HWLOC], [1], [If defined, hwloc will be used to assign NICs based on CPU affinity.])]) + + PKG_INSTALLDIR() dnl check for programs diff --git a/src/transport_ofi.c b/src/transport_ofi.c index cacd01f0..82a65029 100644 --- a/src/transport_ofi.c +++ b/src/transport_ofi.c @@ -1333,6 +1333,103 @@ int allocate_fabric_resources(struct fabric_info *info) return ret; } +#ifdef USE_HWLOC +struct fi_info *assign_nic_with_hwloc(struct fi_info *fabric, struct fi_info **provs, size_t num_nics) { + hwloc_topology_t topology; + hwloc_bitmap_t bindset; + + int ret = hwloc_topology_init(&topology); + if (ret < 0) { + RAISE_ERROR_MSG("hwloc_topology_init failed (%s)\n", strerror(errno)); + } + + ret = hwloc_topology_set_io_types_filter(topology, HWLOC_TYPE_FILTER_KEEP_ALL); + if (ret < 0) { + RAISE_ERROR_MSG("hwloc_topology_set_io_types_filter failed (%s)\n", strerror(errno)); + } + + ret = hwloc_topology_load(topology); + if (ret < 0) { + RAISE_ERROR_MSG("hwloc_topology_load (%s)\n", strerror(errno)); + } + + //hwloc_obj_t root_obj = hwloc_get_root_obj(topology); + bindset = hwloc_bitmap_alloc(); + + ret = hwloc_get_proc_cpubind(topology, getpid(), bindset, HWLOC_CPUBIND_PROCESS); + if (ret < 0) { + RAISE_ERROR_MSG("hwloc_get_proc_cpubind failed (%s)\n", strerror(errno)); + } + + // Identify which provider entries correspond to NICs with an affinity to the calling process + struct fi_info *close_provs = NULL; + struct fi_info *last_added = NULL; + size_t num_close_nics = 0; + for (size_t i = 0; i < num_nics; i++) { + struct fi_info *cur_prov = provs[i]; + if (cur_prov->nic->bus_attr->bus_type != FI_BUS_PCI) continue; + + struct fi_pci_attr pci = cur_prov->nic->bus_attr->attr.pci; + hwloc_obj_t io_device = hwloc_get_pcidev_by_busid(topology, pci.domain_id, pci.bus_id, pci.device_id, pci.function_id); + if (!io_device) RAISE_ERROR_MSG("hwloc_get_pcidev_by_busid failed\n"); + hwloc_obj_t first_non_io = hwloc_get_non_io_ancestor_obj(topology, io_device); + if (!first_non_io) RAISE_ERROR_MSG("hwloc_get_non_io_ancestor_obj failed\n"); + + if (hwloc_bitmap_isincluded(bindset, first_non_io->cpuset) || + hwloc_bitmap_isincluded(first_non_io->cpuset, bindset)) { + struct fi_info *dup = fi_dupinfo(cur_prov); + if (!close_provs) close_provs = dup; + if (last_added) last_added->next = dup; + last_added = dup; + num_close_nics++; + } + } + DEBUG_MSG("num_close_nics = %zu\n", num_close_nics); + last_added->next = NULL; + if (!close_provs) RAISE_ERROR_MSG("cannot find any valid network providers\n"); + + int idx = 0; + struct fi_info **prov_list = (struct fi_info **) malloc(num_close_nics * sizeof(struct fi_info *)); + for (struct fi_info *cur_fabric = close_provs; cur_fabric; cur_fabric = cur_fabric->next) { + prov_list[idx++] = cur_fabric; + } + + //return prov_list[shmem_team_my_pe(SHMEMX_TEAM_NODE) % num_close_nics]; + return prov_list[shmem_internal_my_pe % num_close_nics]; +} +#endif + +static int compare_nic_names(const void *f1, const void *f2) +{ + const struct fi_info **fabric1 = (const struct fi_info **) f1; + const struct fi_info **fabric2 = (const struct fi_info **) f2; + return strcmp((*fabric1)->nic->device_attr->name, (*fabric2)->nic->device_attr->name); +} + +bool nic_already_used(struct fid_nic *nic, struct fi_info *fabrics, int num_nics) +{ + struct fi_info *cur_fabric = fabrics; + for (int i = 0; i < num_nics; i++) { + if (nic->bus_attr->bus_type == FI_BUS_PCI && + cur_fabric->nic->bus_attr->bus_type == FI_BUS_PCI) { + 
struct fi_pci_attr nic_pci = nic->bus_attr->attr.pci; + struct fi_pci_attr cur_fabric_pci = cur_fabric->nic->bus_attr->attr.pci; + if (nic_pci.domain_id == cur_fabric_pci.domain_id && nic_pci.bus_id == cur_fabric_pci.bus_id && + nic_pci.device_id == cur_fabric_pci.device_id && nic_pci.function_id == cur_fabric_pci.function_id) { + return true; + } + } else { + if (strcmp(nic->device_attr->name, cur_fabric->nic->device_attr->name) == 0) { + return true; + } + } + + cur_fabric = cur_fabric->next; + } + + return false; +} + static inline int query_for_fabric(struct fabric_info *info) { @@ -1442,7 +1539,7 @@ int query_for_fabric(struct fabric_info *info) fnmatch(info->domain_name, cur_fabric->domain_attr->name, 0) == 0) { if (!filtered_fabrics_list_head) filtered_fabrics_list_head = cur_fabric; if (filtered_fabrics_last_added) filtered_fabrics_last_added->next = cur_fabric; - filtered_fabrics_last_added = cur_fabric; + filtered_fabrics_last_added = cur_fabric; } } } @@ -1481,10 +1578,16 @@ int query_for_fabric(struct fabric_info *info) prov_list[idx++] = cur_fabric; } qsort(prov_list, num_nics, sizeof(struct fi_info *), compare_nic_names); - info->p_info = prov_list[shmem_internal_my_pe % num_nics]; - //Correct to this once SHMEM_TEAM_SHARED/SHMEMX_TEAM_HOST fixed: - //info->p_info = prov_list[shmem_team_translate_pe(SHMEM_TEAM_WORLD, shmem_internal_my_pe, SHMEMX_TEAM_HOST) % num_nics]; +#ifdef USE_HWLOC + DEBUG_MSG("[%d]: local pe = %d\n", shmem_internal_my_pe, shmem_team_my_pe(SHMEMX_TEAM_NODE)); + info->p_info = assign_nic_with_hwloc(info->p_info, prov_list, num_nics); + //info->p_info = prov_list[shmem_team_my_pe(SHMEMX_TEAM_NODE) % num_nics]; +#else + //Round-robin assignment of NICs to PEs + info->p_info = prov_list[shmem_team_my_pe(SHMEMX_TEAM_NODE) % num_nics]; +#endif } + DEBUG_MSG("provider: %s\n", info->p_info->domain_attr->name); // TODO: Do we want to allow a user to explicitly request not to use multi-NIC functionality? If so, this complicates // the usage of SHMEM_REQUIRE_MULTIRAIL_ENV (which would likely be renamed to SHMEM_USE_MULTIRAIL). Could not be simple bool, From 95635a0942bca9b947d5a48f1fc65101fab782ef Mon Sep 17 00:00:00 2001 From: Philip Marshall Date: Mon, 30 Oct 2023 14:32:41 -0700 Subject: [PATCH 04/16] config: Add scripts for hwloc configuration --- autogen.sh | 2 +- config/opal_functions.m4 | 6 ++++++ configure.ac | 7 ------- 3 files changed, 7 insertions(+), 8 deletions(-) diff --git a/autogen.sh b/autogen.sh index 87c8d962..c6f6794e 100755 --- a/autogen.sh +++ b/autogen.sh @@ -18,4 +18,4 @@ done echo -e "\n" >> ./man/Makefile.am -autoreconf -vif +autoreconf -vif -I config/ diff --git a/config/opal_functions.m4 b/config/opal_functions.m4 index 080cef6e..150e72b5 100644 --- a/config/opal_functions.m4 +++ b/config/opal_functions.m4 @@ -253,6 +253,12 @@ dnl ####################################################################### dnl ####################################################################### dnl ####################################################################### +m4_copy([OAC_FLAGS_APPEND_MOVE], [OPAL_FLAGS_APPEND_MOVE]) + +dnl ####################################################################### +dnl ####################################################################### +dnl ####################################################################### + # Macro that serves as an alternative to using `which `. 
It is # preferable to simply using `which ` because backticks (`) (aka # backquotes) invoke a sub-shell which may source a "noisy" diff --git a/configure.ac b/configure.ac index 4ac9bc64..39197dda 100755 --- a/configure.ac +++ b/configure.ac @@ -332,13 +332,6 @@ AC_ARG_ENABLE([ofi-hmem], AS_IF([test "$enable_ofi_hmem" = "yes"], [AC_DEFINE([USE_FI_HMEM], [1], [If defined, the OFI transport will enable FI_HMEM.])]) -AC_ARG_ENABLE([hwloc], - [AC_HELP_STRING([--enable-hwloc], - [Use hwloc to optimize assignment of NICs to PEs in multi-NIC environments. (default: disabled)])]) -AS_IF([test "$enable_hwloc" = "yes"], - [AC_DEFINE([USE_HWLOC], [1], [If defined, hwloc will be used to assign NICs based on CPU affinity.])]) - - PKG_INSTALLDIR() dnl check for programs From 5af3699d21932f489a9f623ab50f6ff41a75d6e7 Mon Sep 17 00:00:00 2001 From: Philip Marshall Date: Fri, 17 Nov 2023 15:00:18 -0800 Subject: [PATCH 05/16] src: Fix NIC detection when process has affinity to multiple sockets --- src/transport_ofi.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/src/transport_ofi.c b/src/transport_ofi.c index 82a65029..e577fc46 100644 --- a/src/transport_ofi.c +++ b/src/transport_ofi.c @@ -1353,10 +1353,8 @@ struct fi_info *assign_nic_with_hwloc(struct fi_info *fabric, struct fi_info **p RAISE_ERROR_MSG("hwloc_topology_load (%s)\n", strerror(errno)); } - //hwloc_obj_t root_obj = hwloc_get_root_obj(topology); bindset = hwloc_bitmap_alloc(); - - ret = hwloc_get_proc_cpubind(topology, getpid(), bindset, HWLOC_CPUBIND_PROCESS); + ret = hwloc_get_proc_last_cpu_location(topology, getpid(), bindset, HWLOC_CPUBIND_PROCESS); if (ret < 0) { RAISE_ERROR_MSG("hwloc_get_proc_cpubind failed (%s)\n", strerror(errno)); } @@ -1385,8 +1383,9 @@ struct fi_info *assign_nic_with_hwloc(struct fi_info *fabric, struct fi_info **p } } DEBUG_MSG("num_close_nics = %zu\n", num_close_nics); - last_added->next = NULL; + if (!close_provs) RAISE_ERROR_MSG("cannot find any valid network providers\n"); + last_added->next = NULL; int idx = 0; struct fi_info **prov_list = (struct fi_info **) malloc(num_close_nics * sizeof(struct fi_info *)); @@ -1579,12 +1578,13 @@ int query_for_fabric(struct fabric_info *info) } qsort(prov_list, num_nics, sizeof(struct fi_info *), compare_nic_names); #ifdef USE_HWLOC - DEBUG_MSG("[%d]: local pe = %d\n", shmem_internal_my_pe, shmem_team_my_pe(SHMEMX_TEAM_NODE)); + //DEBUG_MSG("[%d]: local_pe = %d\n", shmem_internal_my_pe, shmem_team_my_pe(SHMEMX_TEAM_NODE)); info->p_info = assign_nic_with_hwloc(info->p_info, prov_list, num_nics); //info->p_info = prov_list[shmem_team_my_pe(SHMEMX_TEAM_NODE) % num_nics]; #else //Round-robin assignment of NICs to PEs - info->p_info = prov_list[shmem_team_my_pe(SHMEMX_TEAM_NODE) % num_nics]; + //info->p_info = prov_list[shmem_team_my_pe(SHMEMX_TEAM_NODE) % num_nics]; + info->p_info = prov_list[shmem_internal_my_pe % num_nics]; #endif } DEBUG_MSG("provider: %s\n", info->p_info->domain_attr->name); From 571ebc36cf3514b97949b187ca750daa53c04883 Mon Sep 17 00:00:00 2001 From: Philip Marshall Date: Tue, 21 Nov 2023 08:59:25 -0800 Subject: [PATCH 06/16] src: Continue work on hwloc implementation --- src/transport_ofi.c | 33 +++++++++++++++++++++++++++++++-- 1 file changed, 31 insertions(+), 2 deletions(-) diff --git a/src/transport_ofi.c b/src/transport_ofi.c index e577fc46..76ff687a 100644 --- a/src/transport_ofi.c +++ b/src/transport_ofi.c @@ -1350,14 +1350,42 @@ struct fi_info *assign_nic_with_hwloc(struct fi_info *fabric, struct fi_info **p ret = 
hwloc_topology_load(topology); if (ret < 0) { - RAISE_ERROR_MSG("hwloc_topology_load (%s)\n", strerror(errno)); + RAISE_ERROR_MSG("hwloc_topology_load failed (%s)\n", strerror(errno)); } bindset = hwloc_bitmap_alloc(); + +#define HWLOC_ENFORCE_SINGLE_SOCKET 1 +#ifdef HWLOC_ENFORCE_SINGLE_SOCKET + // If a process has affinity to multiple sockets, do we really want to only use cores from a single socket? + // (May be better for communication but less so for computation if not using all cores originally assigned by job launcher) + + ret = hwloc_get_proc_cpubind(topology, getpid(), bindset, HWLOC_CPUBIND_PROCESS); + + int depth = hwloc_get_type_depth(topology, HWLOC_OBJ_PACKAGE); + if (depth == HWLOC_TYPE_DEPTH_UNKNOWN) RAISE_ERROR_MSG("hwloc_get_type_depth failed (no objects of type HWLOC_OBJ_PACKAGE detected)\n"); + + unsigned int num_sockets = hwloc_get_nbobjs_by_depth(topology, depth); + if (num_sockets == 0) RAISE_ERROR_MSG("hwloc_get_nbobjs_by_depth failed (could not detect sockets on system)\n"); + for (unsigned int i = 0; i < num_sockets; i++) { + hwloc_bitmap_t bindset_socket = hwloc_bitmap_alloc(); + hwloc_obj_t socket = hwloc_get_obj_by_depth(topology, depth, i); + if (!socket) RAISE_ERROR_MSG("hwloc_get_obj_by_depth failed (no object at (depth=%d,index=%u))\n", depth, i); + + hwloc_bitmap_and(bindset_socket, bindset, socket->cpuset); + int num_processor_units_socket = hwloc_get_nbobjs_inside_cpuset_by_type(topology, bindset_socket, HWLOC_OBJ_PU); + if (num_processor_units_socket < 0) RAISE_ERROR_MSG("hwloc_get_nbobjs_inside_cpuset_by_type failed (multiple levels of objects of type HWLOC_OBJ_PU)\n"); + // TODO: Which PUs to assign process to? + + hwloc_bitmap_free(bindset_socket); + } +#else + // Use cores on all sockets assigned by job launcher, but assign NIC to based on affinity of last CPU the process ran on ret = hwloc_get_proc_last_cpu_location(topology, getpid(), bindset, HWLOC_CPUBIND_PROCESS); if (ret < 0) { - RAISE_ERROR_MSG("hwloc_get_proc_cpubind failed (%s)\n", strerror(errno)); + RAISE_ERROR_MSG("hwloc_get_proc_last_cpu_location failed (%s)\n", strerror(errno)); } +#endif // Identify which provider entries correspond to NICs with an affinity to the calling process struct fi_info *close_provs = NULL; @@ -1393,6 +1421,7 @@ struct fi_info *assign_nic_with_hwloc(struct fi_info *fabric, struct fi_info **p prov_list[idx++] = cur_fabric; } + hwloc_bitmap_free(bindset); //return prov_list[shmem_team_my_pe(SHMEMX_TEAM_NODE) % num_close_nics]; return prov_list[shmem_internal_my_pe % num_close_nics]; } From 1758803750929c641f0e667c92486df1bc3972a9 Mon Sep 17 00:00:00 2001 From: Philip Marshall Date: Tue, 21 Nov 2023 13:19:46 -0800 Subject: [PATCH 07/16] src: Add process re-pinning if process has affinity to multiple sockets --- src/transport_ofi.c | 56 +++++---------------------------------------- 1 file changed, 6 insertions(+), 50 deletions(-) diff --git a/src/transport_ofi.c b/src/transport_ofi.c index 76ff687a..13dd93aa 100644 --- a/src/transport_ofi.c +++ b/src/transport_ofi.c @@ -1335,57 +1335,13 @@ int allocate_fabric_resources(struct fabric_info *info) #ifdef USE_HWLOC struct fi_info *assign_nic_with_hwloc(struct fi_info *fabric, struct fi_info **provs, size_t num_nics) { - hwloc_topology_t topology; - hwloc_bitmap_t bindset; - - int ret = hwloc_topology_init(&topology); - if (ret < 0) { - RAISE_ERROR_MSG("hwloc_topology_init failed (%s)\n", strerror(errno)); - } - - ret = hwloc_topology_set_io_types_filter(topology, HWLOC_TYPE_FILTER_KEEP_ALL); - if (ret < 0) { - 
RAISE_ERROR_MSG("hwloc_topology_set_io_types_filter failed (%s)\n", strerror(errno)); - } - - ret = hwloc_topology_load(topology); - if (ret < 0) { - RAISE_ERROR_MSG("hwloc_topology_load failed (%s)\n", strerror(errno)); - } - - bindset = hwloc_bitmap_alloc(); - -#define HWLOC_ENFORCE_SINGLE_SOCKET 1 -#ifdef HWLOC_ENFORCE_SINGLE_SOCKET - // If a process has affinity to multiple sockets, do we really want to only use cores from a single socket? - // (May be better for communication but less so for computation if not using all cores originally assigned by job launcher) - - ret = hwloc_get_proc_cpubind(topology, getpid(), bindset, HWLOC_CPUBIND_PROCESS); - - int depth = hwloc_get_type_depth(topology, HWLOC_OBJ_PACKAGE); - if (depth == HWLOC_TYPE_DEPTH_UNKNOWN) RAISE_ERROR_MSG("hwloc_get_type_depth failed (no objects of type HWLOC_OBJ_PACKAGE detected)\n"); - - unsigned int num_sockets = hwloc_get_nbobjs_by_depth(topology, depth); - if (num_sockets == 0) RAISE_ERROR_MSG("hwloc_get_nbobjs_by_depth failed (could not detect sockets on system)\n"); - for (unsigned int i = 0; i < num_sockets; i++) { - hwloc_bitmap_t bindset_socket = hwloc_bitmap_alloc(); - hwloc_obj_t socket = hwloc_get_obj_by_depth(topology, depth, i); - if (!socket) RAISE_ERROR_MSG("hwloc_get_obj_by_depth failed (no object at (depth=%d,index=%u))\n", depth, i); - - hwloc_bitmap_and(bindset_socket, bindset, socket->cpuset); - int num_processor_units_socket = hwloc_get_nbobjs_inside_cpuset_by_type(topology, bindset_socket, HWLOC_OBJ_PU); - if (num_processor_units_socket < 0) RAISE_ERROR_MSG("hwloc_get_nbobjs_inside_cpuset_by_type failed (multiple levels of objects of type HWLOC_OBJ_PU)\n"); - // TODO: Which PUs to assign process to? + int ret = 0; + hwloc_bitmap_t bindset = hwloc_bitmap_alloc(); - hwloc_bitmap_free(bindset_socket); - } -#else - // Use cores on all sockets assigned by job launcher, but assign NIC to based on affinity of last CPU the process ran on - ret = hwloc_get_proc_last_cpu_location(topology, getpid(), bindset, HWLOC_CPUBIND_PROCESS); + ret = hwloc_get_proc_cpubind(shmem_topology, getpid(), bindset, HWLOC_CPUBIND_PROCESS); if (ret < 0) { - RAISE_ERROR_MSG("hwloc_get_proc_last_cpu_location failed (%s)\n", strerror(errno)); + RAISE_ERROR_MSG("hwloc_get_proc_cpubind failed (%s)\n", strerror(errno)); } -#endif // Identify which provider entries correspond to NICs with an affinity to the calling process struct fi_info *close_provs = NULL; @@ -1396,9 +1352,9 @@ struct fi_info *assign_nic_with_hwloc(struct fi_info *fabric, struct fi_info **p if (cur_prov->nic->bus_attr->bus_type != FI_BUS_PCI) continue; struct fi_pci_attr pci = cur_prov->nic->bus_attr->attr.pci; - hwloc_obj_t io_device = hwloc_get_pcidev_by_busid(topology, pci.domain_id, pci.bus_id, pci.device_id, pci.function_id); + hwloc_obj_t io_device = hwloc_get_pcidev_by_busid(shmem_topology, pci.domain_id, pci.bus_id, pci.device_id, pci.function_id); if (!io_device) RAISE_ERROR_MSG("hwloc_get_pcidev_by_busid failed\n"); - hwloc_obj_t first_non_io = hwloc_get_non_io_ancestor_obj(topology, io_device); + hwloc_obj_t first_non_io = hwloc_get_non_io_ancestor_obj(shmem_topology, io_device); if (!first_non_io) RAISE_ERROR_MSG("hwloc_get_non_io_ancestor_obj failed\n"); if (hwloc_bitmap_isincluded(bindset, first_non_io->cpuset) || From 7b07a8a21cbd09730427e17cf2ef86464493c2d4 Mon Sep 17 00:00:00 2001 From: Philip Marshall Date: Mon, 27 Nov 2023 09:25:14 -0800 Subject: [PATCH 08/16] src: hwloc repinning cleanup --- src/transport_ofi.c | 4 ++-- 1 file changed, 2 
insertions(+), 2 deletions(-) diff --git a/src/transport_ofi.c b/src/transport_ofi.c index 13dd93aa..9d95deca 100644 --- a/src/transport_ofi.c +++ b/src/transport_ofi.c @@ -1338,9 +1338,9 @@ struct fi_info *assign_nic_with_hwloc(struct fi_info *fabric, struct fi_info **p int ret = 0; hwloc_bitmap_t bindset = hwloc_bitmap_alloc(); - ret = hwloc_get_proc_cpubind(shmem_topology, getpid(), bindset, HWLOC_CPUBIND_PROCESS); + ret = hwloc_get_proc_last_cpu_location(shmem_topology, getpid(), bindset, HWLOC_CPUBIND_PROCESS); if (ret < 0) { - RAISE_ERROR_MSG("hwloc_get_proc_cpubind failed (%s)\n", strerror(errno)); + RAISE_ERROR_MSG("hwloc_get_proc_last_cpu_location failed (%s)\n", strerror(errno)); } // Identify which provider entries correspond to NICs with an affinity to the calling process From 5607078fd160d45393fbd8758fd1440f78c2b403 Mon Sep 17 00:00:00 2001 From: Philip Marshall Date: Tue, 28 Nov 2023 14:41:45 -0800 Subject: [PATCH 09/16] misc: Additional cleanup --- autogen.sh | 2 +- src/shmem_env_defs.h | 4 ++-- src/transport_ofi.c | 26 +++++--------------------- 3 files changed, 8 insertions(+), 24 deletions(-) diff --git a/autogen.sh b/autogen.sh index c6f6794e..87c8d962 100755 --- a/autogen.sh +++ b/autogen.sh @@ -18,4 +18,4 @@ done echo -e "\n" >> ./man/Makefile.am -autoreconf -vif -I config/ +autoreconf -vif diff --git a/src/shmem_env_defs.h b/src/shmem_env_defs.h index 796a6942..cbf8567c 100644 --- a/src/shmem_env_defs.h +++ b/src/shmem_env_defs.h @@ -107,8 +107,8 @@ SHMEM_INTERNAL_ENV_DEF(OFI_STX_ALLOCATOR, string, "round-robin", SHMEM_INTERNAL_ "Algorithm for allocating STX resources to contexts") SHMEM_INTERNAL_ENV_DEF(OFI_STX_DISABLE_PRIVATE, bool, false, SHMEM_INTERNAL_ENV_CAT_TRANSPORT, "Disallow private contexts from having exclusive STX access") -SHMEM_INTERNAL_ENV_DEF(REQUIRE_MULTIRAIL_ENV, bool, false, SHMEM_INTERNAL_ENV_CAT_TRANSPORT, - "Error out of program if multi-NIC environment not detected") +SHMEM_INTERNAL_ENV_DEF(DISABLE_MULTIRAIL, bool, false, SHMEM_INTERNAL_ENV_CAT_TRANSPORT, + "Disable usage of multirail functionality") #endif #ifdef USE_UCX diff --git a/src/transport_ofi.c b/src/transport_ofi.c index 9d95deca..ada34ce7 100644 --- a/src/transport_ofi.c +++ b/src/transport_ofi.c @@ -1366,7 +1366,7 @@ struct fi_info *assign_nic_with_hwloc(struct fi_info *fabric, struct fi_info **p num_close_nics++; } } - DEBUG_MSG("num_close_nics = %zu\n", num_close_nics); + DEBUG_MSG("Num. NICs w/ affinity to process: %zu\n", num_close_nics); if (!close_provs) RAISE_ERROR_MSG("cannot find any valid network providers\n"); last_added->next = NULL; @@ -1502,10 +1502,6 @@ int query_for_fabric(struct fabric_info *info) /* If the user supplied a fabric or domain name, use it to select the * fabric. Otherwise, select the first fabric in the list. 
*/ - if (info->fabric_name != NULL || info->domain_name != NULL) { - struct fi_info *cur_fabric; - - //if (shmem_internal_params.USE_MULTIRAIL) { int num_nics = 0; struct fi_info *fallback = NULL; struct fi_info *filtered_fabrics_list_head = NULL; @@ -1532,7 +1528,6 @@ int query_for_fabric(struct fabric_info *info) else { filtered_fabrics_list_head = info->fabrics; } - //TODO: Should probably check that filtered_fabrics_list_head is not NULL, otherwise error struct fi_info *cur_fabric; @@ -1548,11 +1543,8 @@ int query_for_fabric(struct fabric_info *info) } } - if (num_nics == 0) { - if (shmem_internal_params.REQUIRE_MULTIRAIL_ENV) - RAISE_ERROR_MSG("Unable to detect multiple NICs\n"); - else - DEBUG_MSG("Unable to detect multiple NICs\n"); + DEBUG_MSG("Total num. NICs detected: %d\n", num_nics); + if ((num_nics == 0) || (shmem_internal_params.DISABLE_MULTIRAIL)) { info->p_info = fallback; } else { @@ -1562,24 +1554,15 @@ int query_for_fabric(struct fabric_info *info) prov_list[idx++] = cur_fabric; } qsort(prov_list, num_nics, sizeof(struct fi_info *), compare_nic_names); -#ifdef USE_HWLOC //DEBUG_MSG("[%d]: local_pe = %d\n", shmem_internal_my_pe, shmem_team_my_pe(SHMEMX_TEAM_NODE)); +#ifdef USE_HWLOC info->p_info = assign_nic_with_hwloc(info->p_info, prov_list, num_nics); - //info->p_info = prov_list[shmem_team_my_pe(SHMEMX_TEAM_NODE) % num_nics]; #else //Round-robin assignment of NICs to PEs //info->p_info = prov_list[shmem_team_my_pe(SHMEMX_TEAM_NODE) % num_nics]; info->p_info = prov_list[shmem_internal_my_pe % num_nics]; #endif } - DEBUG_MSG("provider: %s\n", info->p_info->domain_attr->name); - - // TODO: Do we want to allow a user to explicitly request not to use multi-NIC functionality? If so, this complicates - // the usage of SHMEM_REQUIRE_MULTIRAIL_ENV (which would likely be renamed to SHMEM_USE_MULTIRAIL). Could not be simple bool, - // would need to be tri-state (unset/default, true, or false): - // unset/default: Attempt to use multiple NICs, but if not possible, use single NIC - // true: If multiple NICs not detected, raise error - // false: Do not attempt to utilize multiple NICs (ie. assign all PEs the same provider for info->p_info) if (NULL == info->p_info) { RAISE_WARN_MSG("OFI transport, no valid fabric (prov=%s, fabric=%s, domain=%s)\n", @@ -1588,6 +1571,7 @@ int query_for_fabric(struct fabric_info *info) info->domain_name != NULL ? info->domain_name : ""); return ret; } + DEBUG_MSG("provider: %s\n", info->p_info->domain_attr->name); if (info->p_info->ep_attr->max_msg_size > 0) { shmem_transport_ofi_max_msg_size = info->p_info->ep_attr->max_msg_size; From 583de8bf8d45aeaa07bbfa28727415abbc56835a Mon Sep 17 00:00:00 2001 From: Philip Marshall Date: Wed, 29 Nov 2023 15:00:47 -0800 Subject: [PATCH 10/16] src: Free lists allocated during fabric selection --- src/transport_ofi.c | 20 ++++++++++++++++---- 1 file changed, 16 insertions(+), 4 deletions(-) diff --git a/src/transport_ofi.c b/src/transport_ofi.c index ada34ce7..4d8a9c6e 100644 --- a/src/transport_ofi.c +++ b/src/transport_ofi.c @@ -1368,7 +1368,14 @@ struct fi_info *assign_nic_with_hwloc(struct fi_info *fabric, struct fi_info **p } DEBUG_MSG("Num. 
NICs w/ affinity to process: %zu\n", num_close_nics); - if (!close_provs) RAISE_ERROR_MSG("cannot find any valid network providers\n"); + if (!close_provs) { + RAISE_WARN_MSG("Could not detect any NICs with affinity to the process\n"); + + /* If no 'close' NICs, select from list of all NICs using round-robin assignment */ + //return provs[shmem_team_my_pe(SHMEMX_TEAM_NODE) % num_nics]; + return provs[shmem_internal_my_pe % num_nics]; + } + last_added->next = NULL; int idx = 0; @@ -1378,8 +1385,12 @@ struct fi_info *assign_nic_with_hwloc(struct fi_info *fabric, struct fi_info **p } hwloc_bitmap_free(bindset); - //return prov_list[shmem_team_my_pe(SHMEMX_TEAM_NODE) % num_close_nics]; - return prov_list[shmem_internal_my_pe % num_close_nics]; + + //struct fi_info *provider = prov_list[shmem_team_my_pe(SHMEMX_TEAM_NODE) % num_close_nics]; + struct fi_info *provider = prov_list[shmem_internal_my_pe % num_close_nics]; + free(prov_list); + + return provider; } #endif @@ -1558,10 +1569,11 @@ int query_for_fabric(struct fabric_info *info) #ifdef USE_HWLOC info->p_info = assign_nic_with_hwloc(info->p_info, prov_list, num_nics); #else - //Round-robin assignment of NICs to PEs + /* Round-robin assignment of NICs to PEs */ //info->p_info = prov_list[shmem_team_my_pe(SHMEMX_TEAM_NODE) % num_nics]; info->p_info = prov_list[shmem_internal_my_pe % num_nics]; #endif + free(prov_list); } if (NULL == info->p_info) { From 222e62aa99d7daccee2d634a1c80e780a95bbc4c Mon Sep 17 00:00:00 2001 From: Philip Marshall Date: Wed, 29 Nov 2023 15:15:54 -0800 Subject: [PATCH 11/16] ci: Add '--with-hwloc=no' flag to sos_config of several CI test cases --- .github/workflows/ci.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 17e21f7f..2582b23c 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -486,7 +486,7 @@ jobs: - uses: actions/checkout@v2 - name: Install dependencies run: | - sudo apt-get install -y gfortran mpich libmpich-dev + sudo apt-get install -y gfortran mpich libmpich-dev libhwloc-dev sudo sysctl -w kernel.yama.ptrace_scope=0 sudo sysctl -w kernel.randomize_va_space=0 From 7989f34890722570f12c0c9475496f51f6ec688e Mon Sep 17 00:00:00 2001 From: Philip Marshall Date: Thu, 30 Nov 2023 08:20:55 -0800 Subject: [PATCH 12/16] ci: Debug failing UCX CI test case --- .github/workflows/ci.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 2582b23c..ab24e4d6 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -475,7 +475,7 @@ jobs: ucx_version: v1.9.0 xpmem_version: master sos_config: [--enable-pmi-simple --disable-fortran --with-hwloc=no, - --enable-pmi-mpi CC=mpicc --disable-fortran, + --enable-pmi-mpi CC=mpicc --disable-fortran --with-hwloc=no, --with-cma --enable-error-checking --enable-profiling --enable-pmi-simple --disable-fortran --with-hwloc=no, --with-xpmem --enable-error-checking --enable-pmi-simple --with-hwloc=no] @@ -486,7 +486,7 @@ jobs: - uses: actions/checkout@v2 - name: Install dependencies run: | - sudo apt-get install -y gfortran mpich libmpich-dev libhwloc-dev + sudo apt-get install -y gfortran mpich libmpich-dev sudo sysctl -w kernel.yama.ptrace_scope=0 sudo sysctl -w kernel.randomize_va_space=0 From f46ae95ae7777fc8e382a2447eeb9c574cde01d3 Mon Sep 17 00:00:00 2001 From: Philip Marshall Date: Thu, 30 Nov 2023 13:09:37 -0800 Subject: [PATCH 13/16] config: If hwloc installation not specified, don't error if 
hwloc build cannot be found --- .github/workflows/ci.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index ab24e4d6..17e21f7f 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -475,7 +475,7 @@ jobs: ucx_version: v1.9.0 xpmem_version: master sos_config: [--enable-pmi-simple --disable-fortran --with-hwloc=no, - --enable-pmi-mpi CC=mpicc --disable-fortran --with-hwloc=no, + --enable-pmi-mpi CC=mpicc --disable-fortran, --with-cma --enable-error-checking --enable-profiling --enable-pmi-simple --disable-fortran --with-hwloc=no, --with-xpmem --enable-error-checking --enable-pmi-simple --with-hwloc=no] From d331a77f698e6fd63d9240f9c213f39f42e85515 Mon Sep 17 00:00:00 2001 From: Philip Marshall Date: Tue, 9 Jan 2024 13:41:04 -0800 Subject: [PATCH 14/16] src: Implement Dave's feedback --- src/transport_ofi.c | 77 +++++++++++++++++++++++++-------------------- 1 file changed, 43 insertions(+), 34 deletions(-) diff --git a/src/transport_ofi.c b/src/transport_ofi.c index 4d8a9c6e..91d3b188 100644 --- a/src/transport_ofi.c +++ b/src/transport_ofi.c @@ -1338,9 +1338,10 @@ struct fi_info *assign_nic_with_hwloc(struct fi_info *fabric, struct fi_info **p int ret = 0; hwloc_bitmap_t bindset = hwloc_bitmap_alloc(); - ret = hwloc_get_proc_last_cpu_location(shmem_topology, getpid(), bindset, HWLOC_CPUBIND_PROCESS); + ret = hwloc_get_proc_last_cpu_location(shmem_internal_topology, getpid(), bindset, HWLOC_CPUBIND_PROCESS); if (ret < 0) { - RAISE_ERROR_MSG("hwloc_get_proc_last_cpu_location failed (%s)\n", strerror(errno)); + RAISE_WARN_MSG("hwloc_get_proc_last_cpu_location failed (%s)\n", strerror(errno)); + return provs[shmem_internal_my_pe % num_nics]; } // Identify which provider entries correspond to NICs with an affinity to the calling process @@ -1352,10 +1353,16 @@ struct fi_info *assign_nic_with_hwloc(struct fi_info *fabric, struct fi_info **p if (cur_prov->nic->bus_attr->bus_type != FI_BUS_PCI) continue; struct fi_pci_attr pci = cur_prov->nic->bus_attr->attr.pci; - hwloc_obj_t io_device = hwloc_get_pcidev_by_busid(shmem_topology, pci.domain_id, pci.bus_id, pci.device_id, pci.function_id); - if (!io_device) RAISE_ERROR_MSG("hwloc_get_pcidev_by_busid failed\n"); - hwloc_obj_t first_non_io = hwloc_get_non_io_ancestor_obj(shmem_topology, io_device); - if (!first_non_io) RAISE_ERROR_MSG("hwloc_get_non_io_ancestor_obj failed\n"); + hwloc_obj_t io_device = hwloc_get_pcidev_by_busid(shmem_internal_topology, pci.domain_id, pci.bus_id, pci.device_id, pci.function_id); + if (!io_device) { + RAISE_WARN_MSG("hwloc_get_pcidev_by_busid failed\n"); + return provs[shmem_internal_my_pe % num_nics]; + }; + hwloc_obj_t first_non_io = hwloc_get_non_io_ancestor_obj(shmem_internal_topology, io_device); + if (!first_non_io) { + RAISE_WARN_MSG("hwloc_get_non_io_ancestor_obj failed\n"); + return provs[shmem_internal_my_pe % num_nics]; + } if (hwloc_bitmap_isincluded(bindset, first_non_io->cpuset) || hwloc_bitmap_isincluded(first_non_io->cpuset, bindset)) { @@ -1372,7 +1379,6 @@ struct fi_info *assign_nic_with_hwloc(struct fi_info *fabric, struct fi_info **p RAISE_WARN_MSG("Could not detect any NICs with affinity to the process\n"); /* If no 'close' NICs, select from list of all NICs using round-robin assignment */ - //return provs[shmem_team_my_pe(SHMEMX_TEAM_NODE) % num_nics]; return provs[shmem_internal_my_pe % num_nics]; } @@ -1386,7 +1392,6 @@ struct fi_info *assign_nic_with_hwloc(struct fi_info *fabric, struct fi_info **p 
hwloc_bitmap_free(bindset); - //struct fi_info *provider = prov_list[shmem_team_my_pe(SHMEMX_TEAM_NODE) % num_close_nics]; struct fi_info *provider = prov_list[shmem_internal_my_pe % num_close_nics]; free(prov_list); @@ -1512,7 +1517,8 @@ int query_for_fabric(struct fabric_info *info) info->prov_name != NULL ? info->prov_name : ""); /* If the user supplied a fabric or domain name, use it to select the - * fabric. Otherwise, select the first fabric in the list. */ + * fabrics that may be chosen. Otherwise, consider all available + * fabrics */ int num_nics = 0; struct fi_info *fallback = NULL; struct fi_info *filtered_fabrics_list_head = NULL; @@ -1544,38 +1550,41 @@ int query_for_fabric(struct fabric_info *info) info->p_info = NULL; - for (cur_fabric = filtered_fabrics_list_head; cur_fabric; cur_fabric = cur_fabric->next) { - if (!fallback) fallback = cur_fabric; - if (cur_fabric->nic && !nic_already_used(cur_fabric->nic, multirail_fabric_list_head, num_nics)) { - num_nics += 1; - if (!multirail_fabric_list_head) multirail_fabric_list_head = cur_fabric; - if (multirail_fabric_last_added) multirail_fabric_last_added->next = cur_fabric; - multirail_fabric_last_added = cur_fabric; - } - } - - DEBUG_MSG("Total num. NICs detected: %d\n", num_nics); - if ((num_nics == 0) || (shmem_internal_params.DISABLE_MULTIRAIL)) { - info->p_info = fallback; + if (shmem_internal_params.DISABLE_MULTIRAIL) { + info->p_info = filtered_fabrics_list_head; } else { - int idx = 0; - struct fi_info **prov_list = (struct fi_info **) malloc(num_nics * sizeof(struct fi_info *)); - for (cur_fabric = multirail_fabric_list_head; cur_fabric; cur_fabric = cur_fabric->next) { - prov_list[idx++] = cur_fabric; + /* Generate a linked list of all fabrics with a non-null nic value */ + for (cur_fabric = filtered_fabrics_list_head; cur_fabric; cur_fabric = cur_fabric->next) { + if (!fallback) fallback = cur_fabric; + if (cur_fabric->nic && !nic_already_used(cur_fabric->nic, multirail_fabric_list_head, num_nics)) { + num_nics += 1; + if (!multirail_fabric_list_head) multirail_fabric_list_head = cur_fabric; + if (multirail_fabric_last_added) multirail_fabric_last_added->next = cur_fabric; + multirail_fabric_last_added = cur_fabric; + } } - qsort(prov_list, num_nics, sizeof(struct fi_info *), compare_nic_names); - //DEBUG_MSG("[%d]: local_pe = %d\n", shmem_internal_my_pe, shmem_team_my_pe(SHMEMX_TEAM_NODE)); + + DEBUG_MSG("Total num. NICs detected: %d\n", num_nics); + if (num_nics == 0) { + info->p_info = fallback; + } + else { + int idx = 0; + struct fi_info **prov_list = (struct fi_info **) malloc(num_nics * sizeof(struct fi_info *)); + for (cur_fabric = multirail_fabric_list_head; cur_fabric; cur_fabric = cur_fabric->next) { + prov_list[idx++] = cur_fabric; + } + qsort(prov_list, num_nics, sizeof(struct fi_info *), compare_nic_names); #ifdef USE_HWLOC - info->p_info = assign_nic_with_hwloc(info->p_info, prov_list, num_nics); + info->p_info = assign_nic_with_hwloc(info->p_info, prov_list, num_nics); #else - /* Round-robin assignment of NICs to PEs */ - //info->p_info = prov_list[shmem_team_my_pe(SHMEMX_TEAM_NODE) % num_nics]; - info->p_info = prov_list[shmem_internal_my_pe % num_nics]; + /* Round-robin assignment of NICs to PEs */ + info->p_info = prov_list[shmem_internal_my_pe % num_nics]; #endif - free(prov_list); + free(prov_list); + } } - if (NULL == info->p_info) { RAISE_WARN_MSG("OFI transport, no valid fabric (prov=%s, fabric=%s, domain=%s)\n", info->prov_name != NULL ? 
info->prov_name : "", From 3d7d02d8d44bda8deea15775e9f48a9cf79924a1 Mon Sep 17 00:00:00 2001 From: Philip Marshall Date: Thu, 15 Feb 2024 13:10:48 -0800 Subject: [PATCH 15/16] Remove rebase duplicate code in opal_functions.m4 --- config/opal_functions.m4 | 6 ------ 1 file changed, 6 deletions(-) diff --git a/config/opal_functions.m4 b/config/opal_functions.m4 index 150e72b5..080cef6e 100644 --- a/config/opal_functions.m4 +++ b/config/opal_functions.m4 @@ -253,12 +253,6 @@ dnl ####################################################################### dnl ####################################################################### dnl ####################################################################### -m4_copy([OAC_FLAGS_APPEND_MOVE], [OPAL_FLAGS_APPEND_MOVE]) - -dnl ####################################################################### -dnl ####################################################################### -dnl ####################################################################### - # Macro that serves as an alternative to using `which `. It is # preferable to simply using `which ` because backticks (`) (aka # backquotes) invoke a sub-shell which may source a "noisy" From 4162370b478701c1411bf7662941b2f029bf72ae Mon Sep 17 00:00:00 2001 From: Philip Marshall Date: Wed, 3 Apr 2024 13:01:15 -0700 Subject: [PATCH 16/16] src: Minor cleanup changes --- README | 4 ++++ src/shmem_env_defs.h | 2 +- src/transport_ofi.c | 43 ++++++++++++++++++++++--------------------- 3 files changed, 27 insertions(+), 22 deletions(-) diff --git a/README b/README index 51a3c361..143bf242 100644 --- a/README +++ b/README @@ -259,6 +259,10 @@ options. compute node is determined by its unique hostname, and the number of STXs available on a compute node is provided by the libfabric library. + SHMEM_OFI_DISABLE_MULTIRAIL (default: off) + Disable multirail functionality. Enabling this will restrict all + communications to occur over a single NIC per system. 
+ Team Environment variables: SHMEM_TEAMS_MAX (default: 10) diff --git a/src/shmem_env_defs.h b/src/shmem_env_defs.h index cbf8567c..6118509d 100644 --- a/src/shmem_env_defs.h +++ b/src/shmem_env_defs.h @@ -107,7 +107,7 @@ SHMEM_INTERNAL_ENV_DEF(OFI_STX_ALLOCATOR, string, "round-robin", SHMEM_INTERNAL_ "Algorithm for allocating STX resources to contexts") SHMEM_INTERNAL_ENV_DEF(OFI_STX_DISABLE_PRIVATE, bool, false, SHMEM_INTERNAL_ENV_CAT_TRANSPORT, "Disallow private contexts from having exclusive STX access") -SHMEM_INTERNAL_ENV_DEF(DISABLE_MULTIRAIL, bool, false, SHMEM_INTERNAL_ENV_CAT_TRANSPORT, +SHMEM_INTERNAL_ENV_DEF(OFI_DISABLE_MULTIRAIL, bool, false, SHMEM_INTERNAL_ENV_CAT_TRANSPORT, "Disable usage of multirail functionality") #endif diff --git a/src/transport_ofi.c b/src/transport_ofi.c index 91d3b188..d7d784c4 100644 --- a/src/transport_ofi.c +++ b/src/transport_ofi.c @@ -1521,10 +1521,10 @@ int query_for_fabric(struct fabric_info *info) * fabrics */ int num_nics = 0; struct fi_info *fallback = NULL; - struct fi_info *filtered_fabrics_list_head = NULL; - struct fi_info *filtered_fabrics_last_added = NULL; + struct fi_info *fabrics_list_head = NULL; + struct fi_info *fabrics_list_tail = NULL; struct fi_info *multirail_fabric_list_head = NULL; - struct fi_info *multirail_fabric_last_added = NULL; + struct fi_info *multirail_fabric_list_tail = NULL; if (info->fabric_name != NULL || info->domain_name != NULL) { struct fi_info *cur_fabric; @@ -1534,52 +1534,53 @@ int query_for_fabric(struct fabric_info *info) fnmatch(info->fabric_name, cur_fabric->fabric_attr->name, 0) == 0) { if (info->domain_name == NULL || fnmatch(info->domain_name, cur_fabric->domain_attr->name, 0) == 0) { - if (!filtered_fabrics_list_head) filtered_fabrics_list_head = cur_fabric; - if (filtered_fabrics_last_added) filtered_fabrics_last_added->next = cur_fabric; - filtered_fabrics_last_added = cur_fabric; + if (!fabrics_list_head) fabrics_list_head = cur_fabric; + if (fabrics_list_tail) fabrics_list_tail->next = cur_fabric; + fabrics_list_tail = cur_fabric; } } } - filtered_fabrics_last_added->next = NULL; + fabrics_list_tail->next = NULL; } else { - filtered_fabrics_list_head = info->fabrics; + fabrics_list_head = info->fabrics; } - struct fi_info *cur_fabric; - info->p_info = NULL; - if (shmem_internal_params.DISABLE_MULTIRAIL) { - info->p_info = filtered_fabrics_list_head; + if (shmem_internal_params.OFI_DISABLE_MULTIRAIL) { + info->p_info = fabrics_list_head; } else { /* Generate a linked list of all fabrics with a non-null nic value */ - for (cur_fabric = filtered_fabrics_list_head; cur_fabric; cur_fabric = cur_fabric->next) { + for (struct fi_info *cur_fabric = fabrics_list_head; cur_fabric; cur_fabric = cur_fabric->next) { if (!fallback) fallback = cur_fabric; if (cur_fabric->nic && !nic_already_used(cur_fabric->nic, multirail_fabric_list_head, num_nics)) { num_nics += 1; if (!multirail_fabric_list_head) multirail_fabric_list_head = cur_fabric; - if (multirail_fabric_last_added) multirail_fabric_last_added->next = cur_fabric; - multirail_fabric_last_added = cur_fabric; + if (multirail_fabric_list_tail) multirail_fabric_list_tail->next = cur_fabric; + multirail_fabric_list_tail = cur_fabric; } } - DEBUG_MSG("Total num. 
NICs detected: %d\n", num_nics); if (num_nics == 0) { info->p_info = fallback; } else { int idx = 0; struct fi_info **prov_list = (struct fi_info **) malloc(num_nics * sizeof(struct fi_info *)); - for (cur_fabric = multirail_fabric_list_head; cur_fabric; cur_fabric = cur_fabric->next) { + for (struct fi_info *cur_fabric = multirail_fabric_list_head; cur_fabric; cur_fabric = cur_fabric->next) { prov_list[idx++] = cur_fabric; } qsort(prov_list, num_nics, sizeof(struct fi_info *), compare_nic_names); #ifdef USE_HWLOC info->p_info = assign_nic_with_hwloc(info->p_info, prov_list, num_nics); #else - /* Round-robin assignment of NICs to PEs */ + /* Round-robin assignment of NICs to PEs + * FIXME: A more suitable indexing value would be + * shmem_team_my_pe(SHMEM_TEAM_NODE) % num_nics, but it is too early in initialization to + * do that here. We would also want to replace the similar occurrences in the + * assign_nic_with_hwloc function. */ info->p_info = prov_list[shmem_internal_my_pe % num_nics]; #endif free(prov_list); @@ -1592,7 +1593,6 @@ int query_for_fabric(struct fabric_info *info) info->domain_name != NULL ? info->domain_name : ""); return ret; } - DEBUG_MSG("provider: %s\n", info->p_info->domain_attr->name); if (info->p_info->ep_attr->max_msg_size > 0) { shmem_transport_ofi_max_msg_size = info->p_info->ep_attr->max_msg_size; @@ -1630,7 +1630,7 @@ int query_for_fabric(struct fabric_info *info) #endif DEBUG_MSG("OFI provider: %s, fabric: %s, domain: %s, mr_mode: 0x%x\n" - RAISE_PE_PREFIX "max_inject: %zu, max_msg: %zu, stx: %s, stx_max: %ld\n", + RAISE_PE_PREFIX "max_inject: %zu, max_msg: %zu, stx: %s, stx_max: %ld, num_nics: %d\n", info->p_info->fabric_attr->prov_name, info->p_info->fabric_attr->name, info->p_info->domain_attr->name, info->p_info->domain_attr->mr_mode, @@ -1638,7 +1638,8 @@ int query_for_fabric(struct fabric_info *info) shmem_transport_ofi_max_buffered_send, shmem_transport_ofi_max_msg_size, info->p_info->domain_attr->max_ep_stx_ctx == 0 ? "no" : "yes", - shmem_transport_ofi_stx_max); + shmem_transport_ofi_stx_max, + num_nics); return ret; }
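
For readers following the series, the selection policy the patches converge on is: filter the fi_info entries by the requested fabric/domain names, drop duplicate NICs (libfabric reports one entry per provider/endpoint combination per device), sort the unique NICs by device name so every PE sees the same ordering, and then pick one NIC per PE -- either by hwloc CPU affinity or, as a fallback, round-robin on the PE index. The standalone sketch below is not part of the patches; it substitutes plain device-name strings for struct fi_info entries, and the NIC names and PE count are made up for illustration only.

    /* Standalone sketch of the dedup / sort / round-robin NIC selection
     * used as the non-hwloc fallback in this series. Device names and the
     * PE count are illustrative, not taken from any real system. */
    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>

    static int compare_names(const void *a, const void *b)
    {
        const char * const *n1 = (const char * const *) a;
        const char * const *n2 = (const char * const *) b;
        return strcmp(*n1, *n2);
    }

    /* Analogue of nic_already_used(): has this device name been seen? */
    static int name_already_used(const char *name, const char **used, int num_used)
    {
        for (int i = 0; i < num_used; i++)
            if (strcmp(name, used[i]) == 0)
                return 1;
        return 0;
    }

    int main(void)
    {
        /* Raw provider list; duplicates appear because libfabric returns
         * multiple fi_info entries that reference the same physical NIC. */
        const char *raw[] = { "cxi1", "cxi0", "cxi1", "cxi0", "cxi2" };
        const char *nics[8];
        int num_nics = 0;

        for (size_t i = 0; i < sizeof(raw) / sizeof(raw[0]); i++)
            if (!name_already_used(raw[i], nics, num_nics))
                nics[num_nics++] = raw[i];

        /* Sort by device name so every PE computes the same ordering,
         * then assign round-robin by PE index. */
        qsort(nics, num_nics, sizeof(nics[0]), compare_names);

        int npes = 4; /* illustrative number of PEs on the node */
        for (int pe = 0; pe < npes; pe++)
            printf("PE %d -> %s\n", pe, nics[pe % num_nics]);

        return 0;
    }

The sort step matters because each PE filters and deduplicates its own copy of the provider list independently; a deterministic ordering is what lets the simple modulo assignment spread PEs evenly across the available NICs without any communication during initialization.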