Skip to content

Commit

Permalink
prov/efa: Remove err_msg from efa_rdm_ep
Browse files Browse the repository at this point in the history
Use a temporary char array to hold error message. err_data can be
safely released after ofi_cq_write_err returns. No need to use a
persistent field in ep.

Signed-off-by: Jessie Yang <[email protected]>
  • Loading branch information
jiaxiyan authored and shijin-aws committed Jan 7, 2025
1 parent f3f3f0a commit a190ce4
Show file tree
Hide file tree
Showing 8 changed files with 21 additions and 17 deletions.
1 change: 1 addition & 0 deletions prov/efa/src/efa_base_ep.h
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@

#define EFA_QP_DEFAULT_SERVICE_LEVEL 0
#define EFA_QP_LOW_LATENCY_SERVICE_LEVEL 8
#define EFA_ERROR_MSG_BUFFER_LENGTH 1024

#define efa_rx_flags(efa_base_ep) ((efa_base_ep)->util_ep.rx_op_flags)
#define efa_tx_flags(efa_base_ep) ((efa_base_ep)->util_ep.tx_op_flags)
Expand Down
4 changes: 2 additions & 2 deletions prov/efa/src/rdm/efa_rdm_cq.c
Original file line number Diff line number Diff line change
Expand Up @@ -348,11 +348,11 @@ static void efa_rdm_cq_handle_recv_completion(struct efa_ibv_cq *ibv_cq, struct
* QP and we cannot cancel that.
*/
if (OFI_UNLIKELY(ep->use_zcpy_rx && efa_rdm_pkt_type_is_rtm(pkt_type))) {
void *errbuf;
char errbuf[EFA_ERROR_MSG_BUFFER_LENGTH] = {0};
size_t errbuf_len;

/* local & peer host-id & ep address will be logged by efa_rdm_write_error_msg */
if (!efa_rdm_write_error_msg(ep, pkt_entry->addr, FI_EFA_ERR_INVALID_PKT_TYPE_ZCPY_RX, &errbuf, &errbuf_len))
if (!efa_rdm_write_error_msg(ep, pkt_entry->addr, FI_EFA_ERR_INVALID_PKT_TYPE_ZCPY_RX, errbuf, &errbuf_len))
EFA_WARN(FI_LOG_CQ, "Error: %s\n", (const char *) errbuf);
efa_base_ep_write_eq_error(&ep->base_ep, FI_EINVAL, FI_EFA_ERR_INVALID_PKT_TYPE_ZCPY_RX);
efa_rdm_pke_release_rx(pkt_entry);
Expand Down
2 changes: 0 additions & 2 deletions prov/efa/src/rdm/efa_rdm_ep.h
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,6 @@
#include "efa_base_ep.h"
#include "efa_rdm_rxe_map.h"

#define EFA_RDM_ERROR_MSG_BUFFER_LENGTH 1024

/** @brief Information of a queued copy.
*
Expand Down Expand Up @@ -186,7 +185,6 @@ struct efa_rdm_ep {

bool sendrecv_in_order_aligned_128_bytes; /**< whether to support in order send/recv of each aligned 128 bytes memory region */
bool write_in_order_aligned_128_bytes; /**< whether to support in order write of each aligned 128 bytes memory region */
char err_msg[EFA_RDM_ERROR_MSG_BUFFER_LENGTH]; /* A large enough buffer to store CQ/EQ error data used by e.g. fi_cq_readerr */
struct efa_rdm_pke **pke_vec;
struct dlist_entry entry;
/* the count of opes queued before handshake is made with their peers */
Expand Down
10 changes: 8 additions & 2 deletions prov/efa/src/rdm/efa_rdm_ope.c
Original file line number Diff line number Diff line change
Expand Up @@ -556,6 +556,7 @@ void efa_rdm_rxe_handle_error(struct efa_rdm_ope *rxe, int err, int prov_errno)
struct dlist_entry *tmp;
struct efa_rdm_pke *pkt_entry;
int write_cq_err;
char err_msg[EFA_ERROR_MSG_BUFFER_LENGTH] = {0};

assert(rxe->type == EFA_RDM_RXE);

Expand Down Expand Up @@ -603,8 +604,10 @@ void efa_rdm_rxe_handle_error(struct efa_rdm_ope *rxe, int err, int prov_errno)
err_entry.data = rxe->cq_entry.data;
err_entry.tag = rxe->cq_entry.tag;
if (OFI_UNLIKELY(efa_rdm_write_error_msg(ep, rxe->addr, prov_errno,
&err_entry.err_data, &err_entry.err_data_size))) {
err_msg, &err_entry.err_data_size))) {
err_entry.err_data_size = 0;
} else {
err_entry.err_data = err_msg;
}

EFA_WARN(FI_LOG_CQ, "err: %d, message: %s (%d)\n",
Expand Down Expand Up @@ -660,6 +663,7 @@ void efa_rdm_txe_handle_error(struct efa_rdm_ope *txe, int err, int prov_errno)
struct dlist_entry *tmp;
struct efa_rdm_pke *pkt_entry;
int write_cq_err;
char err_msg[EFA_ERROR_MSG_BUFFER_LENGTH] = {0};

ep = txe->ep;
memset(&err_entry, 0, sizeof(err_entry));
Expand Down Expand Up @@ -695,8 +699,10 @@ void efa_rdm_txe_handle_error(struct efa_rdm_ope *txe, int err, int prov_errno)
err_entry.data = txe->cq_entry.data;
err_entry.tag = txe->cq_entry.tag;
if (OFI_UNLIKELY(efa_rdm_write_error_msg(ep, txe->addr, prov_errno,
&err_entry.err_data, &err_entry.err_data_size))) {
err_msg, &err_entry.err_data_size))) {
err_entry.err_data_size = 0;
} else {
err_entry.err_data = err_msg;
}

EFA_WARN(FI_LOG_CQ, "err: %d, message: %s (%d)\n",
Expand Down
5 changes: 3 additions & 2 deletions prov/efa/src/rdm/efa_rdm_rma.c
Original file line number Diff line number Diff line change
Expand Up @@ -360,6 +360,7 @@ ssize_t efa_rdm_rma_post_write(struct efa_rdm_ep *ep, struct efa_rdm_ope *txe)
bool delivery_complete_requested;
int ctrl_type, iface, use_p2p;
size_t max_eager_rtw_data_size;
char err_msg[EFA_ERROR_MSG_BUFFER_LENGTH] = {0};

/*
* A handshake is required to choose the correct protocol (whether to use device write/read).
Expand All @@ -377,15 +378,15 @@ ssize_t efa_rdm_rma_post_write(struct efa_rdm_ep *ep, struct efa_rdm_ope *txe)
*/
if ((txe->fi_flags & FI_REMOTE_CQ_DATA) &&
(efa_rdm_ep_support_unsolicited_write_recv(ep) != efa_rdm_peer_support_unsolicited_write_recv(txe->peer))) {
(void) efa_rdm_construct_msg_with_local_and_peer_information(ep, txe->addr, ep->err_msg, "", EFA_RDM_ERROR_MSG_BUFFER_LENGTH);
(void) efa_rdm_construct_msg_with_local_and_peer_information(ep, txe->addr, err_msg, "", EFA_ERROR_MSG_BUFFER_LENGTH);
EFA_WARN(FI_LOG_EP_DATA,
"Inconsistent support status detected on unsolicited write recv.\n"
"My support status: %d, peer support status: %d. %s.\n"
"This is usually caused by inconsistent efa driver, libfabric, or rdma-core versions.\n"
"Please use consistent software versions on both hosts, or disable the unsolicited write "
"recv feature by setting environment variable FI_EFA_USE_UNSOLICITED_WRITE_RECV=0\n",
efa_use_unsolicited_write_recv(), efa_rdm_peer_support_unsolicited_write_recv(txe->peer),
ep->err_msg);
err_msg);
return -FI_EOPNOTSUPP;
}
efa_rdm_ope_prepare_to_post_write(txe);
Expand Down
12 changes: 5 additions & 7 deletions prov/efa/src/rdm/efa_rdm_util.c
Original file line number Diff line number Diff line change
Expand Up @@ -149,24 +149,22 @@ int efa_rdm_construct_msg_with_local_and_peer_information(struct efa_rdm_ep *ep,
* @param[in] ep EFA RDM endpoint
* @param[in] addr Remote peer fi_addr_t
* @param[in] prov_errno EFA provider * error code(must be positive)
* @param[out] buf Pointer to the address of error data written by this function
* @param[out] err_msg Pointer to the address of error message written by this function
* @param[out] buflen Pointer to the returned error data size
* @return A status code. 0 if the error data was written successfully, otherwise a negative FI error code.
*/
int efa_rdm_write_error_msg(struct efa_rdm_ep *ep, fi_addr_t addr, int prov_errno, void **buf, size_t *buflen)
int efa_rdm_write_error_msg(struct efa_rdm_ep *ep, fi_addr_t addr, int prov_errno, char *err_msg, size_t *buflen)
{
const char *base_msg = efa_strerror(prov_errno);
int ret;

*buf = NULL;
*buflen = 0;
*buflen = 0;

ret = efa_rdm_construct_msg_with_local_and_peer_information(ep, addr, ep->err_msg, base_msg, EFA_RDM_ERROR_MSG_BUFFER_LENGTH);
ret = efa_rdm_construct_msg_with_local_and_peer_information(ep, addr, err_msg, base_msg, EFA_ERROR_MSG_BUFFER_LENGTH);
if (ret)
return ret;

*buf = ep->err_msg;
*buflen = EFA_RDM_ERROR_MSG_BUFFER_LENGTH;
*buflen = EFA_ERROR_MSG_BUFFER_LENGTH;

return 0;
}
2 changes: 1 addition & 1 deletion prov/efa/src/rdm/efa_rdm_util.h
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ void efa_rdm_get_desc_for_shm(int numdesc, void **efa_desc, void **shm_desc);

int efa_rdm_construct_msg_with_local_and_peer_information(struct efa_rdm_ep *ep, fi_addr_t addr, char *msg, const char *base_msg, size_t msg_len);

int efa_rdm_write_error_msg(struct efa_rdm_ep *ep, fi_addr_t addr, int prov_errno, void **buf, size_t *buflen);
int efa_rdm_write_error_msg(struct efa_rdm_ep *ep, fi_addr_t addr, int prov_errno, char *err_msg, size_t *buflen);

#ifdef ENABLE_EFA_POISONING
static inline void efa_rdm_poison_mem_region(void *ptr, size_t size)
Expand Down
2 changes: 1 addition & 1 deletion prov/efa/test/efa_unit_test_cq.c
Original file line number Diff line number Diff line change
Expand Up @@ -155,7 +155,7 @@ static void test_rdm_cq_read_bad_send_status(struct efa_resource *resource,
assert_int_equal(ret, -FI_EAVAIL);

/* Allocate memory to read CQ error */
cq_err_entry.err_data_size = EFA_RDM_ERROR_MSG_BUFFER_LENGTH;
cq_err_entry.err_data_size = EFA_ERROR_MSG_BUFFER_LENGTH;
cq_err_entry.err_data = malloc(cq_err_entry.err_data_size);
assert_non_null(cq_err_entry.err_data);

Expand Down

0 comments on commit a190ce4

Please sign in to comment.