Skip to content

Commit

Permalink
fix(rdma): define COMM_ID_INVALID (aws#574)
Browse files Browse the repository at this point in the history
Previously, this used ~0 as the invalid signal. This defaults to a
signed type, which breaks under -wsign-compare. Prefer to use
COMM_ID_INVALID (defined as COMM_ID_MASK) as the marker.

Signed-off-by: Nicholas Sielicki <[email protected]>
  • Loading branch information
aws-nslick committed Sep 27, 2024
1 parent 32deaf1 commit a2b26df
Showing 1 changed file with 15 additions and 10 deletions.
25 changes: 15 additions & 10 deletions src/nccl_ofi_rdma.c
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,11 @@
*/
#define COMM_ID_MASK (((uint64_t)1 << NCCL_OFI_RDMA_COMM_ID_BITS) - 1)

/*
* @brief Signifier for an invalid Communicator ID
*/
#define COMM_ID_INVALID (COMM_ID_MASK)

/*
* @brief Message sequence number bitmask for immediate data
*/
Expand Down Expand Up @@ -4173,7 +4178,7 @@ static nccl_net_ofi_rdma_recv_comm_t *prepare_recv_comm(nccl_net_ofi_rdma_listen
/* Allocate recv communicator ID */
comm_id = nccl_ofi_idpool_allocate_id(device->comm_idpool);
if (OFI_UNLIKELY(comm_id < 0)) {
r_comm->local_comm_id = ~0;
r_comm->local_comm_id = COMM_ID_INVALID;
goto error;
}
r_comm->local_comm_id = (uint32_t)comm_id;
Expand Down Expand Up @@ -4343,7 +4348,7 @@ static nccl_net_ofi_rdma_recv_comm_t *prepare_recv_comm(nccl_net_ofi_rdma_listen
nccl_ofi_freelist_fini(r_comm->nccl_ofi_reqs_fl);
if (r_comm->msgbuff)
nccl_ofi_msgbuff_destroy(r_comm->msgbuff);
if (~0 != r_comm->local_comm_id) {
if (COMM_ID_INVALID != r_comm->local_comm_id) {
ret = nccl_ofi_idpool_free_id(device->comm_idpool, r_comm->local_comm_id);
if (ret != 0) {
NCCL_OFI_WARN("Error freeing communicator ID %" PRIu32, r_comm->local_comm_id);
Expand Down Expand Up @@ -4769,7 +4774,7 @@ static int listen(nccl_net_ofi_ep_t *base_ep,
/* Allocate listen communicator ID */
comm_id = nccl_ofi_idpool_allocate_id(device->comm_idpool);
if (OFI_UNLIKELY(comm_id < 0)) {
l_comm->comm_id = ~0;
l_comm->comm_id = COMM_ID_INVALID;
ret = comm_id;
goto error;
}
Expand All @@ -4789,11 +4794,11 @@ static int listen(nccl_net_ofi_ep_t *base_ep,
goto exit;

error:
if (l_comm && ~0 != l_comm->comm_id) {
if (0 != nccl_ofi_idpool_free_id(device->comm_idpool, l_comm->comm_id)) {
NCCL_OFI_WARN("Error freeing communicator ID %" PRIu32, l_comm->comm_id);
}
}
if (l_comm && COMM_ID_INVALID != l_comm->comm_id) {
if (0 != nccl_ofi_idpool_free_id(device->comm_idpool, l_comm->comm_id)) {
NCCL_OFI_WARN("Error freeing communicator ID %" PRIu32, l_comm->comm_id);
}
}
free(l_comm);
exit:
return ret;
Expand Down Expand Up @@ -5950,7 +5955,7 @@ static inline int create_send_comm(nccl_net_ofi_conn_handle_t *handle,
/* Allocate send communicator ID */
comm_id = nccl_ofi_idpool_allocate_id(device->comm_idpool);
if (OFI_UNLIKELY(comm_id < 0)) {
ret_s_comm->local_comm_id = ~0;
ret_s_comm->local_comm_id = COMM_ID_INVALID;
ret = comm_id;
goto error;
}
Expand Down Expand Up @@ -6016,7 +6021,7 @@ static inline int create_send_comm(nccl_net_ofi_conn_handle_t *handle,

error:
if (ret_s_comm) {
if (~0 != ret_s_comm->local_comm_id) {
if (COMM_ID_INVALID != ret_s_comm->local_comm_id) {
if (0 != nccl_ofi_idpool_free_id(device->comm_idpool, ret_s_comm->local_comm_id)) {
NCCL_OFI_WARN("Error freeing communicator ID %" PRIu32, ret_s_comm->local_comm_id);
}
Expand Down

0 comments on commit a2b26df

Please sign in to comment.