diff --git a/prov/efa/src/efa_base_ep.h b/prov/efa/src/efa_base_ep.h index 3562a64fe34..86657c5dc12 100644 --- a/prov/efa/src/efa_base_ep.h +++ b/prov/efa/src/efa_base_ep.h @@ -14,6 +14,7 @@ #define EFA_QP_DEFAULT_SERVICE_LEVEL 0 #define EFA_QP_LOW_LATENCY_SERVICE_LEVEL 8 +#define EFA_ERROR_MSG_BUFFER_LENGTH 1024 #define efa_rx_flags(efa_base_ep) ((efa_base_ep)->util_ep.rx_op_flags) #define efa_tx_flags(efa_base_ep) ((efa_base_ep)->util_ep.tx_op_flags) diff --git a/prov/efa/src/rdm/efa_rdm_cq.c b/prov/efa/src/rdm/efa_rdm_cq.c index 622f9b71fee..5a18ef17003 100644 --- a/prov/efa/src/rdm/efa_rdm_cq.c +++ b/prov/efa/src/rdm/efa_rdm_cq.c @@ -348,11 +348,11 @@ static void efa_rdm_cq_handle_recv_completion(struct efa_ibv_cq *ibv_cq, struct * QP and we cannot cancel that. */ if (OFI_UNLIKELY(ep->use_zcpy_rx && efa_rdm_pkt_type_is_rtm(pkt_type))) { - void *errbuf; + char errbuf[EFA_ERROR_MSG_BUFFER_LENGTH] = {0}; size_t errbuf_len; /* local & peer host-id & ep address will be logged by efa_rdm_write_error_msg */ - if (!efa_rdm_write_error_msg(ep, pkt_entry->addr, FI_EFA_ERR_INVALID_PKT_TYPE_ZCPY_RX, &errbuf, &errbuf_len)) + if (!efa_rdm_write_error_msg(ep, pkt_entry->addr, FI_EFA_ERR_INVALID_PKT_TYPE_ZCPY_RX, errbuf, &errbuf_len)) EFA_WARN(FI_LOG_CQ, "Error: %s\n", (const char *) errbuf); efa_base_ep_write_eq_error(&ep->base_ep, FI_EINVAL, FI_EFA_ERR_INVALID_PKT_TYPE_ZCPY_RX); efa_rdm_pke_release_rx(pkt_entry); diff --git a/prov/efa/src/rdm/efa_rdm_ep.h b/prov/efa/src/rdm/efa_rdm_ep.h index a3429756b30..1b888e182a4 100644 --- a/prov/efa/src/rdm/efa_rdm_ep.h +++ b/prov/efa/src/rdm/efa_rdm_ep.h @@ -10,7 +10,6 @@ #include "efa_base_ep.h" #include "efa_rdm_rxe_map.h" -#define EFA_RDM_ERROR_MSG_BUFFER_LENGTH 1024 /** @brief Information of a queued copy. 
* @@ -186,7 +185,6 @@ struct efa_rdm_ep { bool sendrecv_in_order_aligned_128_bytes; /**< whether to support in order send/recv of each aligned 128 bytes memory region */ bool write_in_order_aligned_128_bytes; /**< whether to support in order write of each aligned 128 bytes memory region */ - char err_msg[EFA_RDM_ERROR_MSG_BUFFER_LENGTH]; /* A large enough buffer to store CQ/EQ error data used by e.g. fi_cq_readerr */ struct efa_rdm_pke **pke_vec; struct dlist_entry entry; /* the count of opes queued before handshake is made with their peers */ diff --git a/prov/efa/src/rdm/efa_rdm_ope.c b/prov/efa/src/rdm/efa_rdm_ope.c index f24d9c0150e..58a0f51ecaa 100644 --- a/prov/efa/src/rdm/efa_rdm_ope.c +++ b/prov/efa/src/rdm/efa_rdm_ope.c @@ -556,6 +556,7 @@ void efa_rdm_rxe_handle_error(struct efa_rdm_ope *rxe, int err, int prov_errno) struct dlist_entry *tmp; struct efa_rdm_pke *pkt_entry; int write_cq_err; + char err_msg[EFA_ERROR_MSG_BUFFER_LENGTH] = {0}; assert(rxe->type == EFA_RDM_RXE); @@ -603,8 +604,10 @@ void efa_rdm_rxe_handle_error(struct efa_rdm_ope *rxe, int err, int prov_errno) err_entry.data = rxe->cq_entry.data; err_entry.tag = rxe->cq_entry.tag; if (OFI_UNLIKELY(efa_rdm_write_error_msg(ep, rxe->addr, prov_errno, - &err_entry.err_data, &err_entry.err_data_size))) { + err_msg, &err_entry.err_data_size))) { err_entry.err_data_size = 0; + } else { + err_entry.err_data = err_msg; } EFA_WARN(FI_LOG_CQ, "err: %d, message: %s (%d)\n", @@ -660,6 +663,7 @@ void efa_rdm_txe_handle_error(struct efa_rdm_ope *txe, int err, int prov_errno) struct dlist_entry *tmp; struct efa_rdm_pke *pkt_entry; int write_cq_err; + char err_msg[EFA_ERROR_MSG_BUFFER_LENGTH] = {0}; ep = txe->ep; memset(&err_entry, 0, sizeof(err_entry)); @@ -695,8 +699,10 @@ void efa_rdm_txe_handle_error(struct efa_rdm_ope *txe, int err, int prov_errno) err_entry.data = txe->cq_entry.data; err_entry.tag = txe->cq_entry.tag; if (OFI_UNLIKELY(efa_rdm_write_error_msg(ep, txe->addr, prov_errno, - 
&err_entry.err_data, &err_entry.err_data_size))) { + err_msg, &err_entry.err_data_size))) { err_entry.err_data_size = 0; + } else { + err_entry.err_data = err_msg; } EFA_WARN(FI_LOG_CQ, "err: %d, message: %s (%d)\n", diff --git a/prov/efa/src/rdm/efa_rdm_rma.c b/prov/efa/src/rdm/efa_rdm_rma.c index a10e37edabc..87267f6d8ae 100644 --- a/prov/efa/src/rdm/efa_rdm_rma.c +++ b/prov/efa/src/rdm/efa_rdm_rma.c @@ -360,6 +360,7 @@ ssize_t efa_rdm_rma_post_write(struct efa_rdm_ep *ep, struct efa_rdm_ope *txe) bool delivery_complete_requested; int ctrl_type, iface, use_p2p; size_t max_eager_rtw_data_size; + char err_msg[EFA_ERROR_MSG_BUFFER_LENGTH] = {0}; /* * A handshake is required to choose the correct protocol (whether to use device write/read). @@ -377,7 +378,7 @@ ssize_t efa_rdm_rma_post_write(struct efa_rdm_ep *ep, struct efa_rdm_ope *txe) */ if ((txe->fi_flags & FI_REMOTE_CQ_DATA) && (efa_rdm_ep_support_unsolicited_write_recv(ep) != efa_rdm_peer_support_unsolicited_write_recv(txe->peer))) { - (void) efa_rdm_construct_msg_with_local_and_peer_information(ep, txe->addr, ep->err_msg, "", EFA_RDM_ERROR_MSG_BUFFER_LENGTH); + (void) efa_rdm_construct_msg_with_local_and_peer_information(ep, txe->addr, err_msg, "", EFA_ERROR_MSG_BUFFER_LENGTH); EFA_WARN(FI_LOG_EP_DATA, "Inconsistent support status detected on unsolicited write recv.\n" "My support status: %d, peer support status: %d. 
%s.\n" @@ -385,7 +386,7 @@ ssize_t efa_rdm_rma_post_write(struct efa_rdm_ep *ep, struct efa_rdm_ope *txe) "Please use consistent software versions on both hosts, or disable the unsolicited write " "recv feature by setting environment variable FI_EFA_USE_UNSOLICITED_WRITE_RECV=0\n", efa_use_unsolicited_write_recv(), efa_rdm_peer_support_unsolicited_write_recv(txe->peer), - ep->err_msg); + err_msg); return -FI_EOPNOTSUPP; } efa_rdm_ope_prepare_to_post_write(txe); diff --git a/prov/efa/src/rdm/efa_rdm_util.c b/prov/efa/src/rdm/efa_rdm_util.c index d8ec83b9305..c9d65061e1b 100644 --- a/prov/efa/src/rdm/efa_rdm_util.c +++ b/prov/efa/src/rdm/efa_rdm_util.c @@ -149,24 +149,22 @@ int efa_rdm_construct_msg_with_local_and_peer_information(struct efa_rdm_ep *ep, * @param[in] ep EFA RDM endpoint * @param[in] addr Remote peer fi_addr_t * @param[in] prov_errno EFA provider * error code(must be positive) - * @param[out] buf Pointer to the address of error data written by this function + * @param[out] err_msg Caller-provided buffer of at least EFA_ERROR_MSG_BUFFER_LENGTH bytes that receives the error message * @param[out] buflen Pointer to the returned error data size * @return A status code. 0 if the error data was written successfully, otherwise a negative FI error code. 
*/ -int efa_rdm_write_error_msg(struct efa_rdm_ep *ep, fi_addr_t addr, int prov_errno, void **buf, size_t *buflen) +int efa_rdm_write_error_msg(struct efa_rdm_ep *ep, fi_addr_t addr, int prov_errno, char *err_msg, size_t *buflen) { const char *base_msg = efa_strerror(prov_errno); int ret; - *buf = NULL; - *buflen = 0; + *buflen = 0; - ret = efa_rdm_construct_msg_with_local_and_peer_information(ep, addr, ep->err_msg, base_msg, EFA_RDM_ERROR_MSG_BUFFER_LENGTH); + ret = efa_rdm_construct_msg_with_local_and_peer_information(ep, addr, err_msg, base_msg, EFA_ERROR_MSG_BUFFER_LENGTH); if (ret) return ret; - *buf = ep->err_msg; - *buflen = EFA_RDM_ERROR_MSG_BUFFER_LENGTH; + *buflen = EFA_ERROR_MSG_BUFFER_LENGTH; return 0; } diff --git a/prov/efa/src/rdm/efa_rdm_util.h b/prov/efa/src/rdm/efa_rdm_util.h index f52496b195e..7c3daa3432f 100644 --- a/prov/efa/src/rdm/efa_rdm_util.h +++ b/prov/efa/src/rdm/efa_rdm_util.h @@ -21,7 +21,7 @@ void efa_rdm_get_desc_for_shm(int numdesc, void **efa_desc, void **shm_desc); int efa_rdm_construct_msg_with_local_and_peer_information(struct efa_rdm_ep *ep, fi_addr_t addr, char *msg, const char *base_msg, size_t msg_len); -int efa_rdm_write_error_msg(struct efa_rdm_ep *ep, fi_addr_t addr, int prov_errno, void **buf, size_t *buflen); +int efa_rdm_write_error_msg(struct efa_rdm_ep *ep, fi_addr_t addr, int prov_errno, char *err_msg, size_t *buflen); #ifdef ENABLE_EFA_POISONING static inline void efa_rdm_poison_mem_region(void *ptr, size_t size) diff --git a/prov/efa/test/efa_unit_test_cq.c b/prov/efa/test/efa_unit_test_cq.c index 3d72a6460c1..29a06fc1579 100644 --- a/prov/efa/test/efa_unit_test_cq.c +++ b/prov/efa/test/efa_unit_test_cq.c @@ -155,7 +155,7 @@ static void test_rdm_cq_read_bad_send_status(struct efa_resource *resource, assert_int_equal(ret, -FI_EAVAIL); /* Allocate memory to read CQ error */ - cq_err_entry.err_data_size = EFA_RDM_ERROR_MSG_BUFFER_LENGTH; + cq_err_entry.err_data_size = EFA_ERROR_MSG_BUFFER_LENGTH; 
cq_err_entry.err_data = malloc(cq_err_entry.err_data_size); assert_non_null(cq_err_entry.err_data);