diff --git a/src/ucp/core/ucp_worker.c b/src/ucp/core/ucp_worker.c
index 202bf767965..d69ab532d61 100644
--- a/src/ucp/core/ucp_worker.c
+++ b/src/ucp/core/ucp_worker.c
@@ -1629,6 +1629,7 @@ ucs_status_t ucp_worker_create(ucp_context_h context,
     int num_mm_units;
     int num_sys_devices;
     int xx;
+    int yy;
     ucs_mm_unit_t *mm_units;
     ucs_sys_device_t *sys_devices;
 
@@ -1702,12 +1703,21 @@ ucs_status_t ucp_worker_create(ucp_context_h context,
     for (xx = 0; xx < num_mm_units; xx++) {
         ucs_debug("mm_unit = %s", mm_units[xx].fpath);
     }
-    ucs_sys_free_mm_units(mm_units);
 
     ucs_sys_get_sys_devices(&sys_devices, &num_sys_devices);
     for (xx = 0; xx < num_sys_devices; xx++) {
         ucs_debug("sys_unit = %s", sys_devices[xx].fpath);
     }
+
+    for (xx = 0; xx < num_mm_units; xx++) {
+        for (yy = 0; yy < num_sys_devices; yy++) {
+            ucs_sys_dev_dist_enum_t distance;
+            ucs_sys_get_dev_distance(&sys_devices[yy],
+                                     &mm_units[xx],
+                                     &distance);
+        }
+    }
+    ucs_sys_free_mm_units(mm_units);
     ucs_sys_free_sys_devices(sys_devices);
 
     /* Create statistics */
diff --git a/src/ucs/config/types.h b/src/ucs/config/types.h
index cdce240d57c..5f08fc5c397 100644
--- a/src/ucs/config/types.h
+++ b/src/ucs/config/types.h
@@ -114,6 +114,18 @@ typedef enum {
 } ucs_sys_device_enum_t;
 
 
+/**
+ * PCIe distance categories; each one names what has to be traversed to reach the peer
+ */
+typedef enum {
+    UCS_SYS_DEV_DIST_PIX = 0, /* traverse 1 PCIe switch */
+    UCS_SYS_DEV_DIST_PXB, /* traverse >1 PCIe switches */
+    UCS_SYS_DEV_DIST_PHB, /* traverse host bridge */
+    UCS_SYS_DEV_DIST_NODE, /* traverse host bridge in the same numa node */
+    UCS_SYS_DEV_DIST_SYS /* traverse CPU interconnect (like QPI) */
+} ucs_sys_dev_dist_enum_t;
+
+
 /**
  * Structure type for array configuration. Should be used inside the configuration
  * structure declaration.
@@ -150,7 +162,9 @@ typedef struct ucs_mm_unit {
     ucs_mm_unit_enum_t mm_unit_type; /**< Type of memory unit */
     unsigned int id; /**< Index of the unit */
     unsigned int bus_id; /**< bus ID of of the device if applicable*/
+    unsigned int numa_node; /**< NUMA node associated with the device*/
     char fpath[UCS_FPATH_MAX_LEN];
+    char rpath[UCS_FPATH_MAX_LEN];
 } ucs_mm_unit_t;
 
 /**
@@ -160,8 +174,10 @@ typedef struct ucs_mm_unit {
 typedef struct ucs_sys_device {
     ucs_sys_device_enum_t sys_dev_type; /**< Type of system device*/
     unsigned int id; /**< Index of the unit */
-    unsigned int bus_id; /**< bus ID of of the device if applicable*/
+    unsigned int bus_id; /**< bus ID of the device*/
+    unsigned int numa_node; /**< NUMA node associated with the device*/
     char fpath[UCS_FPATH_MAX_LEN];
+    char rpath[UCS_FPATH_MAX_LEN];
 } ucs_sys_device_t;
 
 #endif /* TYPES_H_ */
diff --git a/src/ucs/sys/sys.c b/src/ucs/sys/sys.c
index 1941931fa7c..30d97adcf5c 100644
--- a/src/ucs/sys/sys.c
+++ b/src/ucs/sys/sys.c
@@ -43,6 +43,8 @@
 #define UCS_DEFAULT_MEM_FREE 640000
 #define UCS_PROCESS_SMAPS_FILE "/proc/self/smaps"
 
+#define UCS_MAX_PATH_DEPTH 64
+
 static char *ucs_mm_unit_paths[] = {
     [UCS_MM_UNIT_CPU] = "/sys/devices/system/node",
     [UCS_MM_UNIT_CUDA] = "/sys/bus/pci/drivers/nvidia",
@@ -1150,6 +1152,52 @@ static ucs_status_t ucs_release_paths(char *fpaths)
     return UCS_OK;
 }
 
+/*
+ * Read the NUMA node of a PCI device from sysfs.
+ *
+ * Expects path_name to be in the form /sys/bus/pci/drivers/nvidia/0000:86:00.0
+ * The function appends `numa_node` and returns the contents of, say,
+ * /sys/bus/pci/drivers/nvidia/0000:86:00.0/numa_node
+ *
+ * Returns the parsed value, or -1 if the path does not fit in
+ * UCS_FPATH_MAX_LEN or the file cannot be opened or read.
+ */
+static int ucs_get_numa_node(char *path_name)
+{
+    char name[UCS_FPATH_MAX_LEN];
+    char buf[32];
+    ssize_t nread;
+    int len;
+    int fd;
+
+    len = snprintf(name, sizeof(name), "%s/numa_node", path_name);
+    if ((len < 0) || ((size_t)len >= sizeof(name))) {
+        ucs_error("numa_node path too long for %s", path_name);
+        return -1;
+    }
+
+    fd = open(name, O_RDONLY);
+    if (fd < 0) {
+        ucs_error("open %m name = %s", name);
+        return -1;
+    }
+
+    /* sysfs attributes report a nominal st_size rather than the content
+     * length, so read into a fixed buffer and NUL-terminate what was read
+     * before parsing it */
+    nread = read(fd, buf, sizeof(buf) - 1);
+    if (nread <= 0) {
+        ucs_error("read returned %d name = %s", (int)nread, name);
+        close(fd);
+        return -1;
+    }
+    close(fd);
+
+    buf[nread] = '\0';
+
+    return atoi(buf);
+}
+
 static int ucs_get_bus_id(char *name)
 {
     char delim[] = ":";
@@ -1191,6 +1239,63 @@ static int ucs_get_bus_id(char *name)
     return bus_id;
 }
 
+/*
+ * Resolve the sysfs PCI bus directory of a device.
+ *
+ * Expects name to be a PCI address of the form 0000:86:00.0. Its
+ * <domain>:<bus> prefix (0000:86) is appended to /sys/class/pci_bus/ and the
+ * canonical form of that path is stored in rpath, which must be able to hold
+ * UCS_FPATH_MAX_LEN bytes.
+ *
+ * Returns 0 on success. Returns -1 and leaves rpath as an empty string on
+ * failure.
+ */
+static int ucs_get_rpath(char *name, char *rpath)
+{
+    char tmp_path[UCS_FPATH_MAX_LEN];
+    char *resolved;
+    char *sep;
+    int prefix_len;
+    int len;
+
+    rpath[0] = '\0';
+
+    sep = strchr(name, ':');
+    if (NULL == sep) {
+        ucs_error("unexpected PCI address format: %s", name);
+        return -1;
+    }
+
+    /* keep everything up to the second ':' (or the whole name if absent) */
+    sep        = strchr(sep + 1, ':');
+    prefix_len = (NULL == sep) ? (int)strlen(name) : (int)(sep - name);
+
+    len = snprintf(tmp_path, sizeof(tmp_path), "/sys/class/pci_bus/%.*s",
+                   prefix_len, name);
+    if ((len < 0) || ((size_t)len >= sizeof(tmp_path))) {
+        ucs_error("pci_bus path too long for %s", name);
+        return -1;
+    }
+
+    /* let realpath() allocate the result: a caller-supplied buffer has to
+     * hold PATH_MAX bytes, which rpath is not known to provide */
+    resolved = realpath(tmp_path, NULL);
+    if (NULL == resolved) {
+        ucs_error("realpath %m");
+        return -1;
+    }
+
+    len = snprintf(rpath, UCS_FPATH_MAX_LEN, "%s", resolved);
+    free(resolved);
+    if ((len < 0) || (len >= UCS_FPATH_MAX_LEN)) {
+        ucs_error("resolved pci_bus path too long for %s", name);
+        rpath[0] = '\0';
+        return -1;
+    }
+
+    return 0;
+}
+
 ucs_status_t ucs_sys_get_mm_units(ucs_mm_unit_t **mm_units, int *num_units)
 {
     int num_mm_units[UCS_MM_UNIT_LAST];
@@ -1228,11 +1333,20 @@ ucs_status_t ucs_sys_get_mm_units(ucs_mm_unit_t **mm_units, int *num_units)
 
     for (mm_idx = UCS_MM_UNIT_CPU; mm_idx < UCS_MM_UNIT_LAST; mm_idx++) {
         for (i = 0; i < num_mm_units[mm_idx]; i++) {
+
             strcpy(mm_unit_p->fpath, ucs_mm_unit_paths[mm_idx]);
             src = (char *) mm_fpaths[mm_idx] + (i * UCS_FPATH_MAX_LEN);
             strcat(mm_unit_p->fpath, "/");
             strcat(mm_unit_p->fpath, src);
+
+            if (mm_idx != UCS_MM_UNIT_CPU) {
+                ucs_get_rpath(src, mm_unit_p->rpath);
+            } else {
+                strcpy(mm_unit_p->rpath, mm_unit_p->fpath);
+            }
+
             mm_unit_p->bus_id = (mm_idx == UCS_MM_UNIT_CPU) ? -1 : ucs_get_bus_id(src);
+            mm_unit_p->numa_node = (mm_idx == UCS_MM_UNIT_CPU) ? i : ucs_get_numa_node(mm_unit_p->fpath);
             mm_unit_p->id = mm_unit_idx++;
             mm_unit_p->mm_unit_type = mm_idx;
             mm_unit_p = mm_unit_p + 1;
@@ -1295,7 +1409,9 @@ ucs_status_t ucs_sys_get_sys_devices(ucs_sys_device_t **sys_devices, int *num_un
             src = (char *) sys_fpaths[sys_idx] + (i * UCS_FPATH_MAX_LEN);
             strcat(sys_dev_p->fpath, "/");
             strcat(sys_dev_p->fpath, src);
+            ucs_get_rpath(src, sys_dev_p->rpath);
             sys_dev_p->bus_id = ucs_get_bus_id(src);
+            sys_dev_p->numa_node = ucs_get_numa_node(sys_dev_p->fpath); /* TODO: handle numa_node = -1 */
             sys_dev_p->id = sys_dev_idx++;
             sys_dev_p->sys_dev_type = sys_idx;
             sys_dev_p = sys_dev_p + 1;
@@ -1317,12 +1433,134 @@ ucs_status_t ucs_sys_free_sys_devices(ucs_sys_device_t *sys_devices)
     return UCS_OK;
 }
 
+/*
+ * Compare two paths component by component.
+ *
+ * rdepth1 and rdepth2 receive the number of '/' characters in name1 and
+ * name2; variation_position receives the number of leading components
+ * (the text following each '/') that are identical in both paths, i.e. the
+ * index of the first component that differs.
+ */
+ucs_status_t ucs_get_path_info(char *name1, char *name2,
+                               int *rdepth1, int *rdepth2, int *variation_position)
+{
+    const char *p1;
+    const char *p2;
+    size_t len1;
+    size_t len2;
+    int depth1 = 0;
+    int depth2 = 0;
+    int depth_variation = 0;
+    int min_depth;
+
+    for (p1 = name1; *p1 != '\0'; p1++) {
+        if (*p1 == '/') {
+            depth1++;
+        }
+    }
+
+    for (p2 = name2; *p2 != '\0'; p2++) {
+        if (*p2 == '/') {
+            depth2++;
+        }
+    }
+
+    min_depth = depth1 < depth2 ? depth1 : depth2;
+
+    /* Walk both paths in lock step. Each component is delimited by the next
+     * '/' or by the end of the string, so the last component is compared
+     * without looking past the end of either path. */
+    p1 = strchr(name1, '/');
+    p2 = strchr(name2, '/');
+    while (depth_variation < min_depth) {
+        p1++;
+        p2++;
+        len1 = strcspn(p1, "/");
+        len2 = strcspn(p2, "/");
+        if ((len1 != len2) || (0 != memcmp(p1, p2, len1))) {
+            break;
+        }
+
+        depth_variation++;
+        p1 += len1;
+        p2 += len2;
+    }
+
+    *rdepth1 = depth1;
+    *rdepth2 = depth2;
+    *variation_position = depth_variation;
+
+    return UCS_OK;
+}
+
+ucs_status_t ucs_sys_get_dev_distance(ucs_sys_device_t *sys_device,
+                                      ucs_mm_unit_t *mm_unit,
+                                      ucs_sys_dev_dist_enum_t *distance)
+{
+    int same_numa_node = (mm_unit->numa_node == sys_device->numa_node);
+    int rdepth1, rdepth2, variation_position;
+
+    /* Memory unit types that are not classified below fall back to the
+     * farthest category, so *distance is always written before it is read */
+    *distance = UCS_SYS_DEV_DIST_SYS;
+
+    if (UCS_MM_UNIT_CPU == mm_unit->mm_unit_type) {
+        *distance = same_numa_node ? UCS_SYS_DEV_DIST_PIX : UCS_SYS_DEV_DIST_SYS;
+    } else if (UCS_MM_UNIT_CUDA == mm_unit->mm_unit_type) {
+        if (!same_numa_node) {
+            /*
+             * Crossing QPI
+             * /sys/devices/pci0000:00/{0000:00:02.0}/0000:03:00.0/0000:04:0c.0/pci_bus/0000:07
+             * /sys/devices/pci0000:80/{0000:80:02.0}/0000:82:00.0/0000:83:04.0/pci_bus/0000:84
+             */
+            *distance = UCS_SYS_DEV_DIST_SYS;
+        } else {
+            ucs_get_path_info(sys_device->rpath, mm_unit->rpath,
+                              &rdepth1, &rdepth2, &variation_position);
+            if ((rdepth1 == rdepth2) && (variation_position == (rdepth1 - 3))) {
+                /*
+                 * Crossing a single switch
+                 * /sys/devices/pci0000:00/0000:00:02.0/0000:03:00.0/{0000:04:0c.0}/pci_bus/0000:07
+                 * /sys/devices/pci0000:00/0000:00:02.0/0000:03:00.0/{0000:04:04.0}/pci_bus/0000:05
+                 * => depths should match but depth - 3 differs
+                 */
+                *distance = UCS_SYS_DEV_DIST_PIX;
+            } else if (variation_position == 3) {
+                /*
+                 * Crossing Host Bridge
+                 * /sys/devices/pci0000:00/{0000:00:02.0}/0000:03:00.0/0000:04:0c.0/pci_bus/0000:07
+                 * /sys/devices/pci0000:00/{0000:00:03.0}/0000:08:00.0/0000:09:0c.0/pci_bus/0000:0c
+                 *
+                 * /sys/devices/pci0000:80/{0000:80:03.0}/0000:82:00.0/0000:83:10.0/pci_bus/0000:85
+                 * /sys/devices/pci0000:80/{0000:80:01.0}/pci_bus/0000:81
+                 *
+                 * => depths may or may not match but depth 4 varies
+                 */
+                *distance = UCS_SYS_DEV_DIST_PHB;
+            } else {
+                /* If not the above 2 cases, then assume multiple switches */
+                /*
+                 * TODO: Differentiate between NODE/PXB cases
+                 * Crossing multiple switches
+                 * /sys/devices/pci0000:00/0000:00:02.0/0000:03:00.0/0000:04:0c.0/pci_bus/0000:07
+                 * /sys/devices/pci0000:00/0000:00:02.0/0000:03:00.0/0000:05:04.0/pci_bus/0000:08
+                 */
+                *distance = UCS_SYS_DEV_DIST_NODE;
+            }
+        }
+    }
+
+    ucs_debug("distance between %s %s = %d", mm_unit->rpath, sys_device->rpath, *distance);
+
+    return UCS_OK;
+}
+
 int ucs_get_cpu_mm_index(void *ptr, ucs_mm_unit_t *mm_units, int num_units)
 {
     int mm_index = -1;
 
     get_mempolicy(&mm_index, NULL, 0, ptr, MPOL_F_NODE | MPOL_F_ADDR);
-    printf("mm_index = %d\n", mm_index);
+    ucs_debug("ptr = %p mm_index = %d", ptr, mm_index);
 
     /* this is a shortcut that works only if numa node 0, 1, ... n
      * occupy the first n indices of mm_unit array
diff --git a/src/ucs/sys/sys.h b/src/ucs/sys/sys.h
index e5ea32a3e77..2741be86b97 100644
--- a/src/ucs/sys/sys.h
+++ b/src/ucs/sys/sys.h
@@ -382,6 +382,7 @@ ucs_status_t ucs_sys_get_mm_units(ucs_mm_unit_t **mm_units, int *num_mm_units);
  */
 ucs_status_t ucs_sys_free_mm_units(ucs_mm_unit_t *mm_units);
 
+
 /**
  * Detect system devices such as HCAs, GPUs, and other PCIe devices
  * and return a list of such devices
@@ -401,6 +402,18 @@ ucs_status_t ucs_sys_get_sys_devices(ucs_sys_device_t **sys_devices, int *num_sy
  */
 ucs_status_t ucs_sys_free_sys_devices(ucs_sys_device_t *sys_devices);
 
+
+/**
+ * Return relative distance between a memory unit and a system device
+ *
+ * @param [in] sys_device System device structure such as GPU/HCA
+ * @param [in] mm_unit Memory Unit structure like CPU/GPU memory
+ * @param [out] distance Abstract distance between mm_unit and sys_device
+ * @return UCS_OK or error in case of failure.
+ */
+ucs_status_t ucs_sys_get_dev_distance(ucs_sys_device_t *sys_device, ucs_mm_unit_t *mm_unit,
+                                      ucs_sys_dev_dist_enum_t *distance);
+
 END_C_DECLS
 
 #endif