From ed5560a27db86a6aa625b7362150fbe3b086c0cd Mon Sep 17 00:00:00 2001 From: Shi Jin Date: Fri, 20 Dec 2024 00:31:08 +0000 Subject: [PATCH] prov/efa: Detect unsolicited write recv support status on both sides Currently, when local support unsolicited write recv while the peer doesn't support it, the peer will crash because it expects to get a valid wr_id for IBV_WC_RECV_RDMA_WITH_IMM op code. This peer crash can cause weird error message on sender side's cq when it is still sending data to it. When local doesn't support unsolicited write recv while the peer support it, local will get cq error for the rdma op as "Unexpected status" as well. This patch makes the initiator of rdma write imm detect the unsolicited write recv support status on both sides. If there is inconsistency, the initiator will return error with clear error messages that instruct the mitigation. Signed-off-by: Shi Jin --- prov/efa/docs/efa_rdm_protocol_v4.md | 18 ++++++ prov/efa/src/rdm/efa_rdm_ep.h | 6 ++ prov/efa/src/rdm/efa_rdm_ep_fiops.c | 3 + prov/efa/src/rdm/efa_rdm_peer.h | 17 +++++ prov/efa/src/rdm/efa_rdm_protocol.h | 3 +- prov/efa/src/rdm/efa_rdm_rma.c | 18 ++++++ prov/efa/src/rdm/efa_rdm_util.c | 91 +++++++++++++++++---------- prov/efa/src/rdm/efa_rdm_util.h | 2 + prov/efa/test/efa_unit_test_ep.c | 94 ++++++++++++++++++++++++++++ prov/efa/test/efa_unit_tests.c | 2 + prov/efa/test/efa_unit_tests.h | 2 + 11 files changed, 221 insertions(+), 35 deletions(-) diff --git a/prov/efa/docs/efa_rdm_protocol_v4.md b/prov/efa/docs/efa_rdm_protocol_v4.md index 9016ec00958..1877156779b 100644 --- a/prov/efa/docs/efa_rdm_protocol_v4.md +++ b/prov/efa/docs/efa_rdm_protocol_v4.md @@ -68,6 +68,12 @@ Chapter 4 "extra features/requests" describes the extra features/requests define * Section 4.6 describe the extra feature: RDMA-Write based message transfer. + * Section 4.7 describe the extra feature: Long read and runting read nack protocol. + + * Section 4.8 describe the extra feature: User receive QP. + + * Section 4.9 describe the extra feature: Unsolicited write recv. + Chapter 5 "What's not covered?" describes the contents that are intentionally left out of this document because they are considered "implementation details". @@ -323,6 +329,7 @@ Table: 2.1 a list of extra features/requests | 5 | RDMA-Write based data transfer | extra feature | libfabric 1.18.0 | Section 4.6 | | 6 | Read nack packets | extra feature | libfabric 1.20.0 | Section 4.7 | | 7 | User recv QP | extra feature & request| libfabric 1.22.0 | Section 4.8 | +| 8 | Unsolicited write recv | extra feature | libfabric 1.22.0 | Section 4.9 | How does protocol v4 maintain backward compatibility when extra features/requests are introduced? @@ -1611,6 +1618,17 @@ zero-copy receive mode. If a receiver gets RTM packets delivered to its default QP, it raises an error because it requests all RTM packets must be delivered to its user recv QP. +### 4.9 Unsolicited write recv + +The "Unsolicited write recv" is an extra feature that was +introduced with the libfabric 1.22.0. When this feature is on, rdma-write +with immediate data will not consume an rx buffer on the responder side. It is +defined as an extra feature because there is a set of requirements (firmware, +EFA kernel module and rdma-core) to be met before an endpoint can use the unsolicited +write recv capability, therefore an endpoint cannot assume the other party supports +unsolicited write recv. The rdma-write with immediate data cannot be issued if there +is a discrepancy on this feature between local and peer. + ## 5. What's not covered? The purpose of this document is to define the communication protocol. Therefore, it is intentionally written diff --git a/prov/efa/src/rdm/efa_rdm_ep.h b/prov/efa/src/rdm/efa_rdm_ep.h index aecb391ec55..9b198026d1b 100644 --- a/prov/efa/src/rdm/efa_rdm_ep.h +++ b/prov/efa/src/rdm/efa_rdm_ep.h @@ -455,4 +455,10 @@ static inline int efa_rdm_attempt_to_sync_memops_ioc(struct efa_rdm_ep *ep, stru return err; } +static inline +bool efa_rdm_ep_support_unsolicited_write_recv(struct efa_rdm_ep *ep) +{ + return ep->extra_info[0] & EFA_RDM_EXTRA_FEATURE_UNSOLICITED_WRITE_RECV; +} + #endif diff --git a/prov/efa/src/rdm/efa_rdm_ep_fiops.c b/prov/efa/src/rdm/efa_rdm_ep_fiops.c index 7f75caf8e92..2574e493cbb 100644 --- a/prov/efa/src/rdm/efa_rdm_ep_fiops.c +++ b/prov/efa/src/rdm/efa_rdm_ep_fiops.c @@ -1071,6 +1071,9 @@ void efa_rdm_ep_set_extra_info(struct efa_rdm_ep *ep) ep->extra_info[0] |= EFA_RDM_EXTRA_FEATURE_DELIVERY_COMPLETE; + if (efa_rdm_use_unsolicited_write_recv()) + ep->extra_info[0] |= EFA_RDM_EXTRA_FEATURE_UNSOLICITED_WRITE_RECV; + if (ep->use_zcpy_rx) { /* * When zcpy rx is enabled, an extra QP is created to diff --git a/prov/efa/src/rdm/efa_rdm_peer.h b/prov/efa/src/rdm/efa_rdm_peer.h index cc4deefa0ae..21585051921 100644 --- a/prov/efa/src/rdm/efa_rdm_peer.h +++ b/prov/efa/src/rdm/efa_rdm_peer.h @@ -115,6 +115,23 @@ bool efa_rdm_peer_support_rdma_write(struct efa_rdm_peer *peer) (peer->extra_info[0] & EFA_RDM_EXTRA_FEATURE_RDMA_WRITE); } +/** + * @brief check for peer's unsolicited write support, assuming HANDSHAKE has already occurred + * + * @param[in] peer A peer which we have already received a HANDSHAKE from + * @return bool The peer's unsolicited write recv support + */ +static inline +bool efa_rdm_peer_support_unsolicited_write_recv(struct efa_rdm_peer *peer) +{ + /* Unsolicited write recv is an extra feature defined in version 4 (the base version). + * Because it is an extra feature, an EP will assume the peer does not support + * it before a handshake packet was received. + */ + return (peer->flags & EFA_RDM_PEER_HANDSHAKE_RECEIVED) && + (peer->extra_info[0] & EFA_RDM_EXTRA_FEATURE_UNSOLICITED_WRITE_RECV); +} + static inline bool efa_rdm_peer_support_delivery_complete(struct efa_rdm_peer *peer) { diff --git a/prov/efa/src/rdm/efa_rdm_protocol.h b/prov/efa/src/rdm/efa_rdm_protocol.h index 1b94b5338d1..8840ce5f401 100644 --- a/prov/efa/src/rdm/efa_rdm_protocol.h +++ b/prov/efa/src/rdm/efa_rdm_protocol.h @@ -40,7 +40,8 @@ struct efa_ep_addr { #define EFA_RDM_EXTRA_FEATURE_RDMA_WRITE BIT_ULL(5) #define EFA_RDM_EXTRA_FEATURE_READ_NACK BIT_ULL(6) #define EFA_RDM_EXTRA_FEATURE_REQUEST_USER_RECV_QP BIT_ULL(7) -#define EFA_RDM_NUM_EXTRA_FEATURE_OR_REQUEST 8 +#define EFA_RDM_EXTRA_FEATURE_UNSOLICITED_WRITE_RECV BIT_ULL(8) +#define EFA_RDM_NUM_EXTRA_FEATURE_OR_REQUEST 9 /* * The length of 64-bit extra_info array used in efa_rdm_ep * and efa_rdm_peer diff --git a/prov/efa/src/rdm/efa_rdm_rma.c b/prov/efa/src/rdm/efa_rdm_rma.c index 36b2d5171da..cfb399ef055 100644 --- a/prov/efa/src/rdm/efa_rdm_rma.c +++ b/prov/efa/src/rdm/efa_rdm_rma.c @@ -370,6 +370,24 @@ ssize_t efa_rdm_rma_post_write(struct efa_rdm_ep *ep, struct efa_rdm_ope *txe) return efa_rdm_ep_enforce_handshake_for_txe(ep, txe); if (efa_rdm_rma_should_write_using_rdma(ep, txe, txe->peer)) { + /** + * Unsolicited write recv is a feature that makes rdma-write with + * imm not consume an rx buffer on the responder side, and this + * feature requires consistent support status on both sides. + */ + if ((txe->fi_flags & FI_REMOTE_CQ_DATA) && + (efa_rdm_ep_support_unsolicited_write_recv(ep) != efa_rdm_peer_support_unsolicited_write_recv(txe->peer))) { + (void) efa_rdm_construct_msg_with_local_and_peer_information(ep, txe->addr, ep->err_msg, "", EFA_RDM_ERROR_MSG_BUFFER_LENGTH); + EFA_WARN(FI_LOG_EP_DATA, + "Inconsistent support status detected on unsolicited write recv.\n" + "My support status: %d, peer support status: %d. %s.\n" + "This is usually caused by inconsistent efa driver, libfabric, or rdma-core versions.\n" + "Please use consistent software versions on both hosts, or disable the unsolicited write " + "recv feature by setting environment variable FI_EFA_USE_UNSOLICITED_WRITE_RECV=0\n", + efa_rdm_use_unsolicited_write_recv(), efa_rdm_peer_support_unsolicited_write_recv(txe->peer), + ep->err_msg); + return -FI_EOPNOTSUPP; + } efa_rdm_ope_prepare_to_post_write(txe); return efa_rdm_ope_post_remote_write(txe); } diff --git a/prov/efa/src/rdm/efa_rdm_util.c b/prov/efa/src/rdm/efa_rdm_util.c index 02880c09dfd..0175b3884a9 100644 --- a/prov/efa/src/rdm/efa_rdm_util.c +++ b/prov/efa/src/rdm/efa_rdm_util.c @@ -97,6 +97,53 @@ void efa_rdm_get_desc_for_shm(int numdesc, void **efa_desc, void **shm_desc) } } +/** + * @brief Construct a message that contains the local and peer information, + * including the efa address and the host id. + * + * @param ep EFA RDM endpoint + * @param addr Remote peer fi_addr_t + * @param msg the ptr of the msg to be constructed (needs to be allocated already!) + * @param base_msg ptr to the base msg that will show at the beginning of msg + * @param msg_len the length of the message + * @return int 0 on success, negative integer on failure + */ +int efa_rdm_construct_msg_with_local_and_peer_information(struct efa_rdm_ep *ep, fi_addr_t addr, char *msg, const char *base_msg, size_t msg_len) +{ + char ep_addr_str[OFI_ADDRSTRLEN] = {0}, peer_addr_str[OFI_ADDRSTRLEN] = {0}; + char peer_host_id_str[EFA_HOST_ID_STRING_LENGTH + 1] = {0}; + char local_host_id_str[EFA_HOST_ID_STRING_LENGTH + 1] = {0}; + size_t len = 0; + int ret; + struct efa_rdm_peer *peer = efa_rdm_ep_get_peer(ep, addr); + + len = sizeof(ep_addr_str); + efa_rdm_ep_raw_addr_str(ep, ep_addr_str, &len); + len = sizeof(peer_addr_str); + efa_rdm_ep_get_peer_raw_addr_str(ep, addr, peer_addr_str, &len); + + if (!ep->host_id || EFA_HOST_ID_STRING_LENGTH != snprintf(local_host_id_str, EFA_HOST_ID_STRING_LENGTH + 1, "i-%017lx", ep->host_id)) { + strcpy(local_host_id_str, "N/A"); + } + + if (!peer->host_id || EFA_HOST_ID_STRING_LENGTH != snprintf(peer_host_id_str, EFA_HOST_ID_STRING_LENGTH + 1, "i-%017lx", peer->host_id)) { + strcpy(peer_host_id_str, "N/A"); + } + + ret = snprintf(msg, msg_len, "%s My EFA addr: %s My host id: %s Peer EFA addr: %s Peer host id: %s", + base_msg, ep_addr_str, local_host_id_str, peer_addr_str, peer_host_id_str); + + if (ret < 0 || ret > msg_len - 1) { + return -FI_EINVAL; + } + + if (strlen(msg) >= msg_len) { + return -FI_ENOBUFS; + } + + return FI_SUCCESS; +} + /** * @brief Write the error message and return its byte length * @param[in] ep EFA RDM endpoint @@ -108,42 +155,18 @@ void efa_rdm_get_desc_for_shm(int numdesc, void **efa_desc, void **shm_desc) */ int efa_rdm_write_error_msg(struct efa_rdm_ep *ep, fi_addr_t addr, int prov_errno, void **buf, size_t *buflen) { - char ep_addr_str[OFI_ADDRSTRLEN] = {0}, peer_addr_str[OFI_ADDRSTRLEN] = {0}; - char peer_host_id_str[EFA_HOST_ID_STRING_LENGTH + 1] = {0}; - char local_host_id_str[EFA_HOST_ID_STRING_LENGTH + 1] = {0}; - const char *base_msg = efa_strerror(prov_errno); - size_t len = 0; - struct efa_rdm_peer *peer = efa_rdm_ep_get_peer(ep, addr); - - *buf = NULL; - *buflen = 0; - - len = sizeof(ep_addr_str); - efa_rdm_ep_raw_addr_str(ep, ep_addr_str, &len); - len = sizeof(peer_addr_str); - efa_rdm_ep_get_peer_raw_addr_str(ep, addr, peer_addr_str, &len); - - if (!ep->host_id || EFA_HOST_ID_STRING_LENGTH != snprintf(local_host_id_str, EFA_HOST_ID_STRING_LENGTH + 1, "i-%017lx", ep->host_id)) { - strcpy(local_host_id_str, "N/A"); - } - - if (!peer->host_id || EFA_HOST_ID_STRING_LENGTH != snprintf(peer_host_id_str, EFA_HOST_ID_STRING_LENGTH + 1, "i-%017lx", peer->host_id)) { - strcpy(peer_host_id_str, "N/A"); - } - - int ret = snprintf(ep->err_msg, EFA_RDM_ERROR_MSG_BUFFER_LENGTH, "%s My EFA addr: %s My host id: %s Peer EFA addr: %s Peer host id: %s", - base_msg, ep_addr_str, local_host_id_str, peer_addr_str, peer_host_id_str); + const char *base_msg = efa_strerror(prov_errno); + int ret; - if (ret < 0 || ret > EFA_RDM_ERROR_MSG_BUFFER_LENGTH - 1) { - return -FI_EINVAL; - } + *buf = NULL; + *buflen = 0; - if (strlen(ep->err_msg) >= EFA_RDM_ERROR_MSG_BUFFER_LENGTH) { - return -FI_ENOBUFS; - } + ret = efa_rdm_construct_msg_with_local_and_peer_information(ep, addr, ep->err_msg, base_msg, EFA_RDM_ERROR_MSG_BUFFER_LENGTH); + if (ret) + return ret; - *buf = ep->err_msg; - *buflen = EFA_RDM_ERROR_MSG_BUFFER_LENGTH; + *buf = ep->err_msg; + *buflen = EFA_RDM_ERROR_MSG_BUFFER_LENGTH; - return 0; + return 0; } diff --git a/prov/efa/src/rdm/efa_rdm_util.h b/prov/efa/src/rdm/efa_rdm_util.h index a2ba0083295..b79bafb4e85 100644 --- a/prov/efa/src/rdm/efa_rdm_util.h +++ b/prov/efa/src/rdm/efa_rdm_util.h @@ -19,6 +19,8 @@ bool efa_rdm_get_use_device_rdma(uint32_t fabric_api_version); void efa_rdm_get_desc_for_shm(int numdesc, void **efa_desc, void **shm_desc); +int efa_rdm_construct_msg_with_local_and_peer_information(struct efa_rdm_ep *ep, fi_addr_t addr, char *msg, const char *base_msg, size_t msg_len); + int efa_rdm_write_error_msg(struct efa_rdm_ep *ep, fi_addr_t addr, int prov_errno, void **buf, size_t *buflen); #ifdef ENABLE_EFA_POISONING diff --git a/prov/efa/test/efa_unit_test_ep.c b/prov/efa/test/efa_unit_test_ep.c index 1ac044ce00c..3adc8a136f9 100644 --- a/prov/efa/test/efa_unit_test_ep.c +++ b/prov/efa/test/efa_unit_test_ep.c @@ -659,6 +659,79 @@ void test_efa_rdm_ep_read_queue_before_handshake(struct efa_resource **state) test_efa_rdm_ep_rma_queue_before_handshake(state, ofi_op_read_req); } +/** + * @brief When local support unsolicited write, but the peer doesn't, fi_writedata + * (use rdma-write with imm) should fail as FI_EINVAL + * + * @param state struct efa_resource that is managed by the framework + */ +void test_efa_rdm_ep_rma_inconsistent_unsolicited_write_recv(struct efa_resource **state) +{ + struct efa_resource *resource = *state; + struct efa_rdm_ep *efa_rdm_ep; + struct efa_ep_addr raw_addr = {0}; + size_t raw_addr_len = sizeof(struct efa_ep_addr); + fi_addr_t peer_addr; + int num_addr; + const int buf_len = 8; + char buf[8] = {0}; + int err; + uint64_t rma_addr, rma_key; + struct efa_rdm_peer *peer; + + resource->hints = efa_unit_test_alloc_hints(FI_EP_RDM); + resource->hints->caps |= FI_MSG | FI_TAGGED | FI_RMA; + resource->hints->domain_attr->mr_mode |= MR_MODE_BITS; + efa_unit_test_resource_construct_with_hints(resource, FI_EP_RDM, FI_VERSION(1, 22), + resource->hints, true, true); + + efa_rdm_ep = container_of(resource->ep, struct efa_rdm_ep, base_ep.util_ep.ep_fid); + + /** + * TODO: It's better to mock this function + * so we can test on platform that doesn't + * support rdma-write. + */ + if (!(efa_rdm_ep_support_rdma_write(efa_rdm_ep))) + skip(); + + /* Make local ep support unsolicited write recv */ + efa_rdm_ep->extra_info[0] |= EFA_RDM_EXTRA_FEATURE_UNSOLICITED_WRITE_RECV; + + /* create a fake peer */ + err = fi_getname(&resource->ep->fid, &raw_addr, &raw_addr_len); + assert_int_equal(err, 0); + raw_addr.qpn = 1; + raw_addr.qkey = 0x1234; + num_addr = fi_av_insert(resource->av, &raw_addr, 1, &peer_addr, 0, NULL); + assert_int_equal(num_addr, 1); + + /* create a fake rma_key and address. fi_read should return before + * they are needed. */ + rma_key = 0x1234; + rma_addr = (uint64_t) &buf; + + /* + * Fake a peer that has made handshake and + * does not support unsolicited write recv + */ + peer = efa_rdm_ep_get_peer(efa_rdm_ep, peer_addr); + peer->flags |= EFA_RDM_PEER_HANDSHAKE_RECEIVED; + peer->extra_info[0] |= EFA_RDM_EXTRA_FEATURE_RDMA_WRITE; + peer->extra_info[0] &= ~EFA_RDM_EXTRA_FEATURE_UNSOLICITED_WRITE_RECV; + /* make sure shm is not used */ + peer->is_local = false; + + err = fi_writedata(resource->ep, buf, buf_len, + NULL, /* desc, not required */ + 0x1234, + peer_addr, + rma_addr, + rma_key, + NULL); /* context */ + assert_int_equal(err, -FI_EOPNOTSUPP); +} + /** * @brief verify that when shm was used to send a small message (<4k), no copy was performed. * @@ -1299,3 +1372,24 @@ void test_efa_rdm_ep_rx_refill_threshold_larger_than_rx_size(struct efa_resource { test_efa_rdm_ep_rx_refill_impl(state, 128, 64); } + +/** + * @brief when unsolicited write recv is supported (by device + env), + * efa_rdm_ep_support_unsolicited_write_recv + * should return true, otherwise it should return false + * + * @param[in] state struct efa_resource that is managed by the framework + * @param[in] is_supported support status + */ +void test_efa_rdm_ep_support_unsolicited_write_recv(struct efa_resource **state) +{ + struct efa_rdm_ep *efa_rdm_ep; + struct efa_resource *resource = *state; + + efa_unit_test_resource_construct(resource, FI_EP_RDM); + + efa_rdm_ep = container_of(resource->ep, struct efa_rdm_ep, base_ep.util_ep.ep_fid); + + assert_int_equal(efa_rdm_use_unsolicited_write_recv(), + efa_rdm_ep_support_unsolicited_write_recv(efa_rdm_ep)); +} diff --git a/prov/efa/test/efa_unit_tests.c b/prov/efa/test/efa_unit_tests.c index 8d90a988bb9..017f4e65ded 100644 --- a/prov/efa/test/efa_unit_tests.c +++ b/prov/efa/test/efa_unit_tests.c @@ -117,6 +117,8 @@ int main(void) cmocka_unit_test_setup_teardown(test_efa_rdm_ep_post_handshake_error_handling_pke_exhaustion, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown), cmocka_unit_test_setup_teardown(test_efa_rdm_ep_rx_refill_threshold_smaller_than_rx_size, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown), cmocka_unit_test_setup_teardown(test_efa_rdm_ep_rx_refill_threshold_larger_than_rx_size, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown), + cmocka_unit_test_setup_teardown(test_efa_rdm_ep_rma_inconsistent_unsolicited_write_recv, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown), + cmocka_unit_test_setup_teardown(test_efa_rdm_ep_support_unsolicited_write_recv, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown), cmocka_unit_test_setup_teardown(test_dgram_cq_read_empty_cq, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown), cmocka_unit_test_setup_teardown(test_ibv_cq_ex_read_empty_cq, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown), cmocka_unit_test_setup_teardown(test_ibv_cq_ex_read_failed_poll, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown), diff --git a/prov/efa/test/efa_unit_tests.h b/prov/efa/test/efa_unit_tests.h index b3a6fcbedee..4a796e5385f 100644 --- a/prov/efa/test/efa_unit_tests.h +++ b/prov/efa/test/efa_unit_tests.h @@ -139,6 +139,8 @@ void test_efa_rdm_ep_zcpy_recv_eagain(); void test_efa_rdm_ep_post_handshake_error_handling_pke_exhaustion(); void test_efa_rdm_ep_rx_refill_threshold_smaller_than_rx_size(); void test_efa_rdm_ep_rx_refill_threshold_larger_than_rx_size(); +void test_efa_rdm_ep_support_unsolicited_write_recv(); +void test_efa_rdm_ep_rma_inconsistent_unsolicited_write_recv(); void test_dgram_cq_read_empty_cq(); void test_ibv_cq_ex_read_empty_cq(); void test_ibv_cq_ex_read_failed_poll();