Skip to content

Commit

Permalink
fix(tree): avoid sign comparison issues (aws#575)
Browse files Browse the repository at this point in the history
Signed-off-by: Nicholas Sielicki <[email protected]>
  • Loading branch information
aws-nslick committed Sep 30, 2024
1 parent b4702c5 commit 105142c
Show file tree
Hide file tree
Showing 11 changed files with 42 additions and 41 deletions.
4 changes: 2 additions & 2 deletions include/nccl_ofi.h
Original file line number Diff line number Diff line change
Expand Up @@ -70,7 +70,7 @@ extern "C" {
#define NCCL_OFI_MAX_SEND_REQUESTS (NCCL_OFI_MAX_REQUESTS * NCCL_OFI_MAX_RECVS)

/* Flush read size (bytes) */
#define NCCL_OFI_FLUSH_SIZE 4
#define NCCL_OFI_FLUSH_SIZE (4ULL)

/* Initial number of entries in the MR cache of a device */
#define NCCL_OFI_MR_CACHE_INIT_SIZE 128
Expand Down Expand Up @@ -135,7 +135,7 @@ extern int domain_per_thread;
extern float net_latency;

/* Size of system memory pages */
extern long system_page_size;
extern size_t system_page_size;

struct nccl_net_ofi_plugin;
struct nccl_net_ofi_device;
Expand Down
2 changes: 1 addition & 1 deletion include/nccl_ofi_idpool.h
Original file line number Diff line number Diff line change
Expand Up @@ -73,7 +73,7 @@ int nccl_ofi_idpool_allocate_id(nccl_ofi_idpool_t *idpool);
* @return 0 on success
* non-zero on error
*/
int nccl_ofi_idpool_free_id(nccl_ofi_idpool_t *idpool, int id);
int nccl_ofi_idpool_free_id(nccl_ofi_idpool_t *idpool, size_t id);

/*
* @brief Release pool of IDs and free resources
Expand Down
17 changes: 10 additions & 7 deletions src/nccl_ofi_idpool.c
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
#include "config.h"

#include <errno.h>
#include <stdbool.h>
#include <stdlib.h>

#include "nccl_ofi_idpool.h"
Expand Down Expand Up @@ -103,7 +104,9 @@ int nccl_ofi_idpool_allocate_id(nccl_ofi_idpool_t *idpool)
nccl_net_ofi_mutex_lock(&idpool->lock);

int entry_index = 0;
int id = -1;

bool found = false;
size_t id = 0;
for (size_t i = 0; i < num_long_elements; i++) {
entry_index = __builtin_ffsll(idpool->ids[i]);
if (0 != entry_index) {
Expand All @@ -113,14 +116,15 @@ int nccl_ofi_idpool_allocate_id(nccl_ofi_idpool_t *idpool)
idpool->ids[i] &= ~(1ULL << (entry_index - 1));

/* Store the ID we found */
id = (int)((i * sizeof(uint64_t) * 8) + entry_index - 1);
id = (size_t)((i * sizeof(uint64_t) * 8) + entry_index - 1);
found = true;
break;
}
}

nccl_net_ofi_mutex_unlock(&idpool->lock);

if (-1 == id || id >= idpool->size) {
if (!found || id >= idpool->size) {
NCCL_OFI_WARN("No IDs available (max: %lu)", idpool->size);
return -ENOMEM;
}
Expand All @@ -142,10 +146,9 @@ int nccl_ofi_idpool_allocate_id(nccl_ofi_idpool_t *idpool)
* @return 0 on success
* non-zero on error
*/
int nccl_ofi_idpool_free_id(nccl_ofi_idpool_t *idpool, int id)
int nccl_ofi_idpool_free_id(nccl_ofi_idpool_t *idpool, size_t id)
{
assert(NULL != idpool);
assert(id >= 0);

if (0 == idpool->size) {
NCCL_OFI_WARN("Cannot free an ID from a 0-sized pool");
Expand All @@ -158,7 +161,7 @@ int nccl_ofi_idpool_free_id(nccl_ofi_idpool_t *idpool, int id)
}

if (OFI_UNLIKELY(id >= idpool->size)) {
NCCL_OFI_WARN("ID value %d out of range (max: %lu)", id, idpool->size);
NCCL_OFI_WARN("ID value %lu out of range (max: %lu)", id, idpool->size);
return -EINVAL;
}

Expand All @@ -169,7 +172,7 @@ int nccl_ofi_idpool_free_id(nccl_ofi_idpool_t *idpool, int id)

/* Check if bit is 1 already */
if (idpool->ids[i] & (1ULL << entry_index)) {
NCCL_OFI_WARN("Attempted to free an ID that's not in use (%d)", id);
NCCL_OFI_WARN("Attempted to free an ID that's not in use (%lu)", id);

nccl_net_ofi_mutex_unlock(&idpool->lock);
return -ENOTSUP;
Expand Down
11 changes: 6 additions & 5 deletions src/nccl_ofi_net.c
Original file line number Diff line number Diff line change
Expand Up @@ -68,7 +68,7 @@ int domain_per_thread = 0;
float net_latency = .0;

/* Size of a memory page */
long system_page_size = -1;
size_t system_page_size = 0;

/*
* @brief Allocate memory region for memory registration
Expand Down Expand Up @@ -146,12 +146,13 @@ int nccl_net_ofi_create_plugin(nccl_net_ofi_plugin_t **plugin_p)
NCCL_OFI_INFO(NCCL_INIT | NCCL_NET, "Using Libfabric version %u.%u", FI_MAJOR(fab_version),
FI_MINOR(fab_version));

system_page_size = sysconf(_SC_PAGESIZE);
if (OFI_UNLIKELY(system_page_size == -1)) {
long int system_page_size_sysconf = sysconf(_SC_PAGESIZE);
if (OFI_UNLIKELY(system_page_size_sysconf == -1)) {
NCCL_OFI_WARN("Failed to get system page size (%d %s)", errno, strerror(errno));
ret = -ENOTSUP;
goto exit;
}
system_page_size = (size_t)system_page_size_sysconf;
assert(NCCL_OFI_IS_POWER_OF_TWO(system_page_size));
assert(system_page_size > 0);

Expand Down Expand Up @@ -610,7 +611,7 @@ static int nccl_net_ofi_plugin_assign_device(nccl_net_ofi_plugin_t *plugin,
size_t device_index,
nccl_net_ofi_device_t *device)
{
if (device_index < 0 || device_index >= plugin->p_num_devs) {
if (device_index >= plugin->p_num_devs) {
return -ENOSPC;
}

Expand All @@ -623,7 +624,7 @@ static int nccl_net_ofi_plugin_assign_device(nccl_net_ofi_plugin_t *plugin,
static nccl_net_ofi_device_t * nccl_net_ofi_plugin_get_device(nccl_net_ofi_plugin_t *plugin,
size_t device_index)
{
if (device_index < 0 || device_index >= plugin->p_num_devs) {
if (device_index >= plugin->p_num_devs) {
NCCL_OFI_WARN("Invalid device index %zu", device_index);
return NULL;
}
Expand Down
22 changes: 10 additions & 12 deletions src/nccl_ofi_rdma.c
Original file line number Diff line number Diff line change
Expand Up @@ -5093,8 +5093,7 @@ static int send_progress(nccl_net_ofi_rdma_req_t *req)

ret = post_rdma_eager_send(req, comm_rail, xfer_info);
} else {
for (int rail_it = send_data->xferred_rail_id;
rail_it < schedule->num_xfer_infos; rail_it++) {
for (size_t rail_it = send_data->xferred_rail_id; rail_it < schedule->num_xfer_infos; rail_it++) {
/* Get xfer information from the schedule */
nccl_net_ofi_xfer_info_t *xfer_info = &xfers[rail_it];
/* Get communicator rail information to xfer the req */
Expand Down Expand Up @@ -5460,8 +5459,7 @@ static int send(nccl_net_ofi_send_comm_t *send_comm, void *data, int size, int t

/* Determine if this should be sent eagerly. */
eager = false;
if ((!have_ctrl && size <= eager_max_size) ||
(size == 0)) {
if ((!have_ctrl && (size_t)size <= eager_max_size) || (size == 0)) {
eager = true;
}

Expand Down Expand Up @@ -7196,7 +7194,7 @@ static inline int nccl_net_ofi_rdma_plugin_complete_init(nccl_net_ofi_plugin_t *
}

/* Allocate and initialize nccl_net devices */
for (int dev_id = 0 ; dev_id != rdma_plugin->base.p_num_devs ; ++dev_id) {
for (size_t dev_id = 0; dev_id != rdma_plugin->base.p_num_devs; ++dev_id) {
struct fi_info *info_list;

/* Retrieve NIC info list from topology */
Expand All @@ -7208,18 +7206,19 @@ static inline int nccl_net_ofi_rdma_plugin_complete_init(nccl_net_ofi_plugin_t *
}

/* Allocate device */
nccl_net_ofi_rdma_device_t *device =
nccl_net_ofi_rdma_device_create(&rdma_plugin->base, dev_id,
info_list, rdma_plugin->topo,
ofi_nccl_round_robin_threshold());
nccl_net_ofi_rdma_device_t *device = nccl_net_ofi_rdma_device_create(&rdma_plugin->base,
(int)dev_id,
info_list,
rdma_plugin->topo,
ofi_nccl_round_robin_threshold());
if (device == NULL) {
NCCL_OFI_WARN("Device creation failed");
return -ENOMEM;
}

ret = plugin->assign_device(plugin, dev_id, &device->base);
if (ret != 0) {
NCCL_OFI_WARN("Assigning device %d failed", dev_id);
NCCL_OFI_WARN("Assigning device %ld failed", dev_id);
return ret;
}
}
Expand Down Expand Up @@ -7338,8 +7337,7 @@ int nccl_net_ofi_rdma_init(const char *provider_filter,
goto error;
}

if (ofi_nccl_eager_max_size() < 0 ||
ofi_nccl_eager_max_size() > ofi_nccl_round_robin_threshold()) {
if (ofi_nccl_eager_max_size() > ofi_nccl_round_robin_threshold()) {
NCCL_OFI_WARN("Invalid value for EAGER_MAX_SIZE");
ret = ncclInvalidArgument;
goto error;
Expand Down
6 changes: 3 additions & 3 deletions src/nccl_ofi_scheduler.c
Original file line number Diff line number Diff line change
Expand Up @@ -57,9 +57,9 @@ void nccl_net_ofi_set_multiplexing_schedule(size_t size, int num_rails,
* @brief Assign message round-robin
*/
static inline int set_round_robin_schedule(nccl_net_ofi_threshold_scheduler_t *scheduler,
size_t size,
int num_rails,
nccl_net_ofi_schedule_t *schedule)
size_t size,
size_t num_rails,
nccl_net_ofi_schedule_t *schedule)
{
int rail_id;

Expand Down
11 changes: 5 additions & 6 deletions src/nccl_ofi_sendrecv.c
Original file line number Diff line number Diff line change
Expand Up @@ -2392,7 +2392,7 @@ nccl_net_ofi_sendrecv_device_create(nccl_net_ofi_plugin_t *plugin,
nccl_net_ofi_sendrecv_device_t *device =
(nccl_net_ofi_sendrecv_device_t *)calloc(1, sizeof(nccl_net_ofi_sendrecv_device_t));
if (device == NULL) {
NCCL_OFI_WARN("Unable to allocate device %i", dev_id);
NCCL_OFI_WARN("Unable to allocate device %d", dev_id);
return NULL;
}

Expand Down Expand Up @@ -2543,7 +2543,7 @@ static inline int nccl_net_ofi_sendrecv_plugin_complete_init(nccl_net_ofi_plugin
{
nccl_net_ofi_sendrecv_plugin_t *sendrecv_plugin = (nccl_net_ofi_sendrecv_plugin_t *)plugin;
struct fi_info *info;
int dev_id = 0;
size_t dev_id = 0;
int ret;

/* Allocate and initialize nccl_net devices */
Expand All @@ -2554,16 +2554,15 @@ static inline int nccl_net_ofi_sendrecv_plugin_complete_init(nccl_net_ofi_plugin
return -EINVAL;
}

nccl_net_ofi_sendrecv_device_t *device =
nccl_net_ofi_sendrecv_device_create(plugin, dev_id, info);
nccl_net_ofi_sendrecv_device_t *device = nccl_net_ofi_sendrecv_device_create(plugin, (int)dev_id, info);
if (device == NULL) {
NCCL_OFI_WARN("Unable to allocate device %i", dev_id);
NCCL_OFI_WARN("Unable to allocate device %li", dev_id);
return -ENOMEM;
}

ret = plugin->assign_device(plugin, dev_id, &device->base);
if (ret != 0) {
NCCL_OFI_WARN("Assigning device %d failed", dev_id);
NCCL_OFI_WARN("Assigning device %li failed", dev_id);
return ret;
}

Expand Down
2 changes: 1 addition & 1 deletion src/nccl_ofi_topo.c
Original file line number Diff line number Diff line change
Expand Up @@ -1041,7 +1041,7 @@ static int get_device_property(unsigned domain, unsigned bus,
int ret = 0;
FILE *file;
const char *path_format = "/sys/bus/pci/devices/%04x:%02x:%02x.%01x/%s";
size_t path_len;
ssize_t path_len;
char *path = NULL;

if ((path_len = snprintf(NULL, 0, path_format, domain, bus, dev, func, prop_name)) < 0) {
Expand Down
2 changes: 1 addition & 1 deletion src/platform-aws.c
Original file line number Diff line number Diff line change
Expand Up @@ -550,7 +550,7 @@ int platform_init(const char **provider_filter)

ret = snprintf(topology_path, sizeof(topology_path), "%s/%s",
XML_DIR, platform_data->topology);
if (ret < 0 || ret >= sizeof(topology_path)) {
if (ret < 0 || (size_t)ret >= sizeof(topology_path)) {
NCCL_OFI_WARN("Error occurred while forming the complete topology XML file path. RC: %d, Buffer Size: %d, XML dir: %s, Topology file: %s",
ret, PATH_MAX, XML_DIR, platform_data->topology);
ret = -ENOMEM;
Expand Down
2 changes: 1 addition & 1 deletion src/tuner/nccl_ofi_regions.c
Original file line number Diff line number Diff line change
Expand Up @@ -140,7 +140,7 @@ int is_inside_region(nccl_ofi_tuner_point_t point, nccl_ofi_tuner_region_t *regi
{
assert(region->num_vertices > 1);

int i, k;
size_t i, k;
nccl_ofi_tuner_point_t *pv;
double min_x, max_x, min_y, max_y;
const double eps = 1e-10;
Expand Down
4 changes: 2 additions & 2 deletions src/tuner/nccl_ofi_tuner.c
Original file line number Diff line number Diff line change
Expand Up @@ -347,7 +347,7 @@ ncclResult_t nccl_ofi_tuner_get_coll_info(void *context,
nccl_ofi_tuner_point_t p = {.x = nBytes, .y = nccl_ofi_tuner_ctx->dims.num_ranks};

/* Check all regions */
for (int i = 0; i < nccl_ofi_tuner_ctx->num_regions && in_out < 0; i++) {
for (size_t i = 0; i < nccl_ofi_tuner_ctx->num_regions && in_out < 0; i++) {
algorithm = nccl_ofi_tuner_ctx->regions[i].algorithm;
protocol = nccl_ofi_tuner_ctx->regions[i].protocol;
if (table[algorithm][protocol] == NCCL_ALGO_PROTO_IGNORE || algorithm >= numAlgo ||
Expand Down Expand Up @@ -413,7 +413,7 @@ ncclResult_t nccl_ofi_tuner_get_coll_info_v2(
nccl_ofi_tuner_point_t p = {.x = nBytes, .y = nccl_ofi_tuner_ctx->dims.num_ranks};

/* Check all regions */
for (int i = 0; i < nccl_ofi_tuner_ctx->num_regions && in_out < 0; i++) {
for (size_t i = 0; i < nccl_ofi_tuner_ctx->num_regions && in_out < 0; i++) {
if (nccl_ofi_tuner_ctx->regions[i].algorithm == NCCL_ALGO_NVLS_TREE && nvlsSupport == 0) {
continue;
}
Expand Down

0 comments on commit 105142c

Please sign in to comment.