From db51f19654cebb83a126de6d4b3d93029cc1c108 Mon Sep 17 00:00:00 2001 From: Nicholas Sielicki Date: Wed, 4 Sep 2024 13:09:06 -0700 Subject: [PATCH] fix(rdma): define COMM_ID_INVALID (#574) Previously, this used ~0 as the invalid signal. This defaults to a signed type, which breaks under -wsign-compare. Prefer to use COMM_ID_INVALID (defined as COMM_ID_MASK) as the marker. Signed-off-by: Nicholas Sielicki --- src/nccl_ofi_rdma.c | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/src/nccl_ofi_rdma.c b/src/nccl_ofi_rdma.c index 453c568f2..f9b672f24 100644 --- a/src/nccl_ofi_rdma.c +++ b/src/nccl_ofi_rdma.c @@ -46,6 +46,11 @@ */ #define COMM_ID_MASK (((uint64_t)1 << NCCL_OFI_RDMA_COMM_ID_BITS) - 1) +/* + * @brief Signifier for an invalid Communicator ID + */ +#define COMM_ID_INVALID (COMM_ID_MASK) + /* * @brief Message sequence number bitmask for immediate data */ @@ -4173,7 +4178,7 @@ static nccl_net_ofi_rdma_recv_comm_t *prepare_recv_comm(nccl_net_ofi_rdma_listen /* Allocate recv communicator ID */ comm_id = nccl_ofi_idpool_allocate_id(device->comm_idpool); if (OFI_UNLIKELY(comm_id < 0)) { - r_comm->local_comm_id = ~0; + r_comm->local_comm_id = COMM_ID_INVALID; goto error; } r_comm->local_comm_id = (uint32_t)comm_id; @@ -4343,7 +4348,7 @@ static nccl_net_ofi_rdma_recv_comm_t *prepare_recv_comm(nccl_net_ofi_rdma_listen nccl_ofi_freelist_fini(r_comm->nccl_ofi_reqs_fl); if (r_comm->msgbuff) nccl_ofi_msgbuff_destroy(r_comm->msgbuff); - if (~0 != r_comm->local_comm_id) { + if (COMM_ID_INVALID != r_comm->local_comm_id) { ret = nccl_ofi_idpool_free_id(device->comm_idpool, r_comm->local_comm_id); if (ret != 0) { NCCL_OFI_WARN("Error freeing communicator ID %" PRIu32, r_comm->local_comm_id); @@ -4769,7 +4774,7 @@ static int listen(nccl_net_ofi_ep_t *base_ep, /* Allocate listen communicator ID */ comm_id = nccl_ofi_idpool_allocate_id(device->comm_idpool); if (OFI_UNLIKELY(comm_id < 0)) { - l_comm->comm_id = ~0; + l_comm->comm_id = COMM_ID_INVALID; ret = comm_id; goto error; } @@ -4788,8 +4793,8 @@ static int listen(nccl_net_ofi_ep_t *base_ep, goto exit; - error: - if (l_comm && ~0 != l_comm->comm_id) { +error: + if (l_comm && COMM_ID_INVALID != l_comm->comm_id) { if (0 != nccl_ofi_idpool_free_id(device->comm_idpool, l_comm->comm_id)) { NCCL_OFI_WARN("Error freeing communicator ID %" PRIu32, l_comm->comm_id); } @@ -5950,7 +5955,7 @@ static inline int create_send_comm(nccl_net_ofi_conn_handle_t *handle, /* Allocate send communicator ID */ comm_id = nccl_ofi_idpool_allocate_id(device->comm_idpool); if (OFI_UNLIKELY(comm_id < 0)) { - ret_s_comm->local_comm_id = ~0; + ret_s_comm->local_comm_id = COMM_ID_INVALID; ret = comm_id; goto error; } @@ -6016,7 +6021,7 @@ static inline int create_send_comm(nccl_net_ofi_conn_handle_t *handle, error: if (ret_s_comm) { - if (~0 != ret_s_comm->local_comm_id) { + if (COMM_ID_INVALID != ret_s_comm->local_comm_id) { if (0 != nccl_ofi_idpool_free_id(device->comm_idpool, ret_s_comm->local_comm_id)) { NCCL_OFI_WARN("Error freeing communicator ID %" PRIu32, ret_s_comm->local_comm_id); }