From 051224f13a511bc1205420711b2192d34a9cd6b8 Mon Sep 17 00:00:00 2001 From: Alexander Grissik Date: Wed, 6 Nov 2024 16:16:26 +0000 Subject: [PATCH 1/7] issue: 4154211 Removing ring_tap Removing TC and flow-message functionality, from Daemon, which was used by TAP logic. Signed-off-by: Alexander Grissik --- src/core/Makefile.am | 2 - src/core/dev/ib_ctx_handler_collection.cpp | 8 +- src/core/dev/net_device_table_mgr.cpp | 70 +- src/core/dev/net_device_val.cpp | 39 +- src/core/dev/net_device_val.h | 2 +- src/core/dev/rfs.cpp | 14 +- src/core/dev/rfs_mc.cpp | 4 +- src/core/dev/rfs_uc.cpp | 4 +- src/core/dev/ring_bond.cpp | 259 ++----- src/core/dev/ring_bond.h | 39 +- src/core/dev/ring_simple.cpp | 6 +- src/core/dev/ring_simple.h | 8 +- src/core/dev/ring_slave.cpp | 16 +- src/core/dev/ring_slave.h | 7 +- src/core/dev/ring_tap.cpp | 630 ----------------- src/core/dev/ring_tap.h | 146 ---- src/core/event/event_handler_manager.cpp | 3 - src/core/sock/fd_collection.cpp | 45 -- src/core/sock/fd_collection.h | 45 -- src/core/util/agent.cpp | 47 -- src/core/util/agent.h | 1 - src/core/util/agent_def.h | 52 -- src/core/util/sys_vars.h | 14 +- src/core/util/utils.cpp | 53 +- src/core/util/utils.h | 2 - src/core/util/xlio_stats.h | 11 - src/stats/stats_reader.cpp | 170 ++--- tests/gtest/Makefile.am | 3 +- tests/gtest/xliod/xliod_flow.cc | 402 ----------- tools/daemon/Makefile.am | 9 +- tools/daemon/daemon.c | 55 -- tools/daemon/daemon.h | 146 ---- tools/daemon/flow.c | 772 -------------------- tools/daemon/loop.c | 10 - tools/daemon/message.c | 185 ----- tools/daemon/nl.c | 239 ------- tools/daemon/nl.h | 123 ---- tools/daemon/notify.c | 21 - tools/daemon/store.c | 1 - tools/daemon/tc.c | 786 --------------------- tools/daemon/tc.h | 252 ------- 41 files changed, 171 insertions(+), 4530 deletions(-) delete mode 100644 src/core/dev/ring_tap.cpp delete mode 100644 src/core/dev/ring_tap.h delete mode 100644 tests/gtest/xliod/xliod_flow.cc delete mode 100644 tools/daemon/flow.c 
delete mode 100644 tools/daemon/nl.c delete mode 100644 tools/daemon/nl.h delete mode 100644 tools/daemon/tc.c delete mode 100644 tools/daemon/tc.h diff --git a/src/core/Makefile.am b/src/core/Makefile.am index f40e48211..656d07a80 100644 --- a/src/core/Makefile.am +++ b/src/core/Makefile.am @@ -96,7 +96,6 @@ libxlio_la_SOURCES := \ dev/ring_bond.cpp \ dev/ring_slave.cpp \ dev/ring_simple.cpp \ - dev/ring_tap.cpp \ dev/ring_allocation_logic.cpp \ \ event/delta_timer.cpp \ @@ -202,7 +201,6 @@ libxlio_la_SOURCES := \ dev/ring_bond.h \ dev/ring_slave.h \ dev/ring_simple.h \ - dev/ring_tap.h \ dev/ring_allocation_logic.h \ dev/wqe_send_handler.h \ dev/xlio_ti.h \ diff --git a/src/core/dev/ib_ctx_handler_collection.cpp b/src/core/dev/ib_ctx_handler_collection.cpp index ab53d2704..48db4b66a 100644 --- a/src/core/dev/ib_ctx_handler_collection.cpp +++ b/src/core/dev/ib_ctx_handler_collection.cpp @@ -193,15 +193,9 @@ void ib_ctx_handler_collection::print_val_tbl() ib_ctx_handler *ib_ctx_handler_collection::get_ib_ctx(const char *ifa_name) { char active_slave[IFNAMSIZ] = {0}; - unsigned int slave_flags = 0; ib_context_map_t::iterator ib_ctx_iter; - if (check_netvsc_device_exist(ifa_name)) { - if (!get_netvsc_slave(ifa_name, active_slave, slave_flags)) { - return nullptr; - } - ifa_name = (const char *)active_slave; - } else if (check_bond_device_exist(ifa_name)) { + if (check_bond_device_exist(ifa_name)) { /* active/backup: return active slave */ if (!get_bond_active_slave_name(ifa_name, active_slave, sizeof(active_slave))) { char slaves[IFNAMSIZ * 16] = {0}; diff --git a/src/core/dev/net_device_table_mgr.cpp b/src/core/dev/net_device_table_mgr.cpp index d31a3f98b..71d569424 100644 --- a/src/core/dev/net_device_table_mgr.cpp +++ b/src/core/dev/net_device_table_mgr.cpp @@ -351,25 +351,6 @@ net_device_val *net_device_table_mgr::get_net_device_val(int if_index) goto out; } } - /* Check if interface is new netvsc slave */ - if (net_dev->get_is_bond() == net_device_val::NETVSC) { 
- char if_name[IFNAMSIZ] = {0}; - char sys_path[256] = {0}; - int ret = 0; - if (if_indextoname(if_index, if_name)) { - ret = snprintf(sys_path, sizeof(sys_path), NETVSC_DEVICE_UPPER_FILE, if_name, - net_dev->get_ifname()); - if (ret > 0 && (size_t)ret < sizeof(sys_path)) { - ret = errno; /* to suppress errno */ - int fd = SYSCALL(open, sys_path, O_RDONLY); - if (fd >= 0) { - SYSCALL(close, fd); - goto out; - } - errno = ret; - } - } - } } ndtm_logdbg("Can't find net_device for index: %d", if_index); @@ -572,61 +553,24 @@ void net_device_table_mgr::get_net_devices(local_dev_vector &vec) void net_device_table_mgr::del_link_event(const netlink_link_info *info) { + NOT_IN_USE(info); ndtm_logdbg("netlink event: RTM_DELLINK if_index: %d", info->ifindex); - - /* This flow is actual when interface is removed quickly - * w/o moving it in DOWN state. - * Usually interface is removed during sequence of RTM_NEWLINK events - * that puts it in DOWN state. In this case XLIO has more time to release - * resources correctly. - */ - if (info->flags & IFF_SLAVE) { - net_device_val *net_dev = nullptr; - int if_index = info->ifindex; - - ndtm_logdbg("netlink event: if_index: %d state: %s", info->ifindex, - (info->flags & IFF_RUNNING ? "Up" : "Down")); - - net_dev = get_net_device_val(if_index); - if (net_dev && (if_index != net_dev->get_if_idx()) && - (net_dev->get_is_bond() == net_device_val::NETVSC) && (net_dev->get_slave(if_index))) { - ndtm_logdbg("found entry [%p]: if_index: %d : %s", net_dev, net_dev->get_if_idx(), - net_dev->get_ifname()); - net_dev->update_netvsc_slaves(info->ifindex, info->flags); - } - } + // Leftover from NETVSC bond (Tap ring). + // Can be reused for Dynamic Interface tracking. } void net_device_table_mgr::new_link_event(const netlink_link_info *info) { + NOT_IN_USE(info); ndtm_logdbg("netlink event: RTM_NEWLINK if_index: %d", info->ifindex); - - /* This flow is used to process interface UP and DOWN scenarios. 
- * It is important that interface can be removed w/o putting it into - * DOWN state (see RTM_DELLINK). - */ - if (info->flags & IFF_SLAVE) { - net_device_val *net_dev = nullptr; - int if_index = info->ifindex; - - ndtm_logdbg("netlink event: if_index: %d state: %s", info->ifindex, - (info->flags & IFF_RUNNING ? "Up" : "Down")); - - net_dev = get_net_device_val(if_index); - if (net_dev && (if_index != net_dev->get_if_idx()) && - (net_dev->get_is_bond() == net_device_val::NETVSC) && - ((net_dev->get_slave(if_index) && !(info->flags & IFF_RUNNING)) || - (!net_dev->get_slave(if_index) && (info->flags & IFF_RUNNING)))) { - ndtm_logdbg("found entry [%p]: if_index: %d : %s", net_dev, net_dev->get_if_idx(), - net_dev->get_ifname()); - net_dev->update_netvsc_slaves(info->ifindex, info->flags); - } - } + // Leftover from NETVSC bond (Tap ring) + // Can be reused for Dynamic Interface tracking. } void net_device_table_mgr::notify_cb(event *ev) { ndtm_logdbg("netlink event: LINK"); + // Leftover from NETVSC bond (Tap ring) link_nl_event *link_netlink_ev = dynamic_cast(ev); if (!link_netlink_ev) { diff --git a/src/core/dev/net_device_val.cpp b/src/core/dev/net_device_val.cpp index ba84d431b..61d30c2c9 100644 --- a/src/core/dev/net_device_val.cpp +++ b/src/core/dev/net_device_val.cpp @@ -53,7 +53,6 @@ #include "event/event_handler_manager.h" #include "proto/L2_address.h" #include "dev/ib_ctx_handler_collection.h" -#include "dev/ring_tap.h" #include "dev/ring_simple.h" #include "dev/ring_slave.h" #include "dev/ring_bond.h" @@ -229,8 +228,6 @@ net_device_val::net_device_val(struct net_device_val_desc *desc) /* Identify device type */ if ((get_flags() & IFF_MASTER) || check_bond_device_exist(get_ifname_link())) { verify_bonding_mode(); - } else if (check_netvsc_device_exist(get_ifname_link())) { - m_bond = NETVSC; } else { m_bond = NO_BOND; } @@ -240,16 +237,6 @@ net_device_val::net_device_val(struct net_device_val_desc *desc) valid = false; ib_ctx = 
g_p_ib_ctx_handler_collection->get_ib_ctx(get_ifname_link()); switch (m_bond) { - case NETVSC: - if (get_type() == ARPHRD_ETHER) { - char slave_ifname[IFNAMSIZ] = {0}; - unsigned int slave_flags = 0; - /* valid = true; uncomment it is valid flow to operate w/o SRIOV */ - if (get_netvsc_slave(get_ifname_link(), slave_ifname, slave_flags)) { - valid = verify_qp_creation(slave_ifname, IBV_QPT_RAW_PACKET); - } - } - break; case LAG_8023ad: case ACTIVE_BACKUP: // this is a bond interface (or a vlan/alias over bond), find the slaves @@ -518,9 +505,6 @@ const std::string net_device_val::to_str_ex() const rc += " ("; switch (m_bond) { - case NETVSC: - rc += "netvsc"; - break; case LAG_8023ad: rc += "lag 8023ad"; break; @@ -579,16 +563,7 @@ void net_device_val::set_slave_array() nd_logdbg(""); - if (m_bond == NETVSC) { - slave_data_t *s = nullptr; - unsigned int slave_flags = 0; - if (get_netvsc_slave(get_ifname_link(), active_slave, slave_flags)) { - if ((slave_flags & IFF_UP) && verify_qp_creation(active_slave, IBV_QPT_RAW_PACKET)) { - s = new slave_data_t(if_nametoindex(active_slave)); - m_slaves.push_back(s); - } - } - } else if (m_bond == NO_BOND) { + if (m_bond == NO_BOND) { slave_data_t *s = new slave_data_t(if_nametoindex(get_ifname())); m_slaves.push_back(s); } else { @@ -651,10 +626,6 @@ void net_device_val::set_slave_array() } } - if (m_bond == NETVSC) { - m_slaves[i]->active = true; - } - if (m_bond == NO_BOND) { m_slaves[i]->active = true; } @@ -677,7 +648,7 @@ void net_device_val::set_slave_array() } } - if (m_slaves.empty() && NETVSC != m_bond) { + if (m_slaves.empty()) { m_state = INVALID; nd_logpanic("No slave found."); } @@ -1392,16 +1363,12 @@ ring *net_device_val_eth::create_ring(resource_allocation_key *key) try { switch (m_bond) { case NO_BOND: - ring = new ring_eth(get_if_idx(), nullptr, RING_ETH, true, - (key ? key->get_use_locks() : true)); + ring = new ring_eth(get_if_idx(), nullptr, true, (key ? 
key->get_use_locks() : true)); break; case ACTIVE_BACKUP: case LAG_8023ad: ring = new ring_bond_eth(get_if_idx()); break; - case NETVSC: - ring = new ring_bond_netvsc(get_if_idx()); - break; default: nd_logdbg("Unknown ring type"); break; diff --git a/src/core/dev/net_device_val.h b/src/core/dev/net_device_val.h index 151753241..bd6e1d300 100644 --- a/src/core/dev/net_device_val.h +++ b/src/core/dev/net_device_val.h @@ -172,7 +172,7 @@ typedef std::unordered_map tc_class_priority_map; class net_device_val { public: enum state { DOWN, UP, RUNNING, INVALID }; - enum bond_type { NO_BOND, ACTIVE_BACKUP, LAG_8023ad, NETVSC }; + enum bond_type { NO_BOND, ACTIVE_BACKUP, LAG_8023ad }; enum bond_xmit_hash_policy { XHP_LAYER_2, XHP_LAYER_3_4, diff --git a/src/core/dev/rfs.cpp b/src/core/dev/rfs.cpp index b17b8f404..ce7b3899e 100644 --- a/src/core/dev/rfs.cpp +++ b/src/core/dev/rfs.cpp @@ -166,15 +166,11 @@ rfs::~rfs() int counter = 0; prepare_filter_detach(counter, true); if (counter == 0) { - if (m_p_ring->is_simple()) { - destroy_flow(); - } + destroy_flow(); m_p_rule_filter->m_map.erase(m_p_rule_filter->m_key); } } else if (m_b_tmp_is_attached) { - if (m_p_ring->is_simple()) { - destroy_flow(); - } + destroy_flow(); } if (m_p_rule_filter) { @@ -271,14 +267,14 @@ bool rfs::attach_flow(sockinfo *sink) // We also check if this is the FIRST sink so we need to call ibv_attach_flow if ((m_n_sinks_list_entries == 0) && (!m_b_tmp_is_attached) && (filter_counter == 1)) { - if (m_p_ring->is_simple() && !create_flow()) { + if (!create_flow()) { return false; } filter_keep_attached(filter_iter); } else { rfs_logdbg("rfs: Joining existing flow"); #if defined(DEFINED_NGINX) || defined(DEFINED_ENVOY) - if (g_p_app->type != APP_NONE && m_p_ring->is_simple() && g_p_app->add_second_4t_rule) { + if (g_p_app->type != APP_NONE && g_p_app->add_second_4t_rule) { // This is second 4 tuple rule for the same worker (when number // of workers is not power of two) create_flow(); @@ -313,7 +309,7 @@ 
bool rfs::detach_flow(sockinfo *sink) prepare_filter_detach(filter_counter, false); // We also need to check if this is the LAST sink so we need to call ibv_attach_flow - if (m_p_ring->is_simple() && (m_n_sinks_list_entries == 0) && (filter_counter == 0)) { + if ((m_n_sinks_list_entries == 0) && (filter_counter == 0)) { ret = destroy_flow(); } diff --git a/src/core/dev/rfs_mc.cpp b/src/core/dev/rfs_mc.cpp index b5966d0cb..becd35ea7 100644 --- a/src/core/dev/rfs_mc.cpp +++ b/src/core/dev/rfs_mc.cpp @@ -57,9 +57,7 @@ rfs_mc::rfs_mc(flow_tuple *flow_spec_5t, ring_slave *p_ring, } BULLSEYE_EXCLUDE_BLOCK_END - if (m_p_ring->is_simple()) { - prepare_flow_spec(); - } + prepare_flow_spec(); } void rfs_mc::prepare_flow_spec() diff --git a/src/core/dev/rfs_uc.cpp b/src/core/dev/rfs_uc.cpp index fac0701ba..f250ed710 100644 --- a/src/core/dev/rfs_uc.cpp +++ b/src/core/dev/rfs_uc.cpp @@ -60,9 +60,7 @@ rfs_uc::rfs_uc(flow_tuple *flow_spec_5t, ring_slave *p_ring, rfs_rule_filter *ru } BULLSEYE_EXCLUDE_BLOCK_END - if (m_p_ring->is_simple()) { - prepare_flow_spec(); - } + prepare_flow_spec(); } void rfs_uc::prepare_flow_spec() diff --git a/src/core/dev/ring_bond.cpp b/src/core/dev/ring_bond.cpp index dc05d0529..add028a7a 100644 --- a/src/core/dev/ring_bond.cpp +++ b/src/core/dev/ring_bond.cpp @@ -35,7 +35,6 @@ #include "sock/sockinfo.h" #include "dev/ring_simple.h" -#include "dev/ring_tap.h" #undef MODULE_NAME #define MODULE_NAME "ring_bond" @@ -156,185 +155,76 @@ void ring_bond::restart() m_lock_ring_rx.lock(); m_lock_ring_tx.lock(); - if (p_ndev->get_is_bond() == net_device_val::NETVSC) { - ring_bond_netvsc *p_ring_bond_netvsc = dynamic_cast(this); - if (p_ring_bond_netvsc) { - ring_tap *p_ring_tap = dynamic_cast(p_ring_bond_netvsc->m_tap_ring); - if (p_ring_tap) { - size_t num_ring_rx_fds = 0; - int epfd = -1; - int fd = -1; - int rc = 0; - size_t i, j, k; - - if (slaves.empty()) { - num_ring_rx_fds = p_ring_bond_netvsc->m_vf_ring->get_rx_channels_num(); - - for (k = 0; k < 
num_ring_rx_fds; k++) { - epfd = g_p_net_device_table_mgr->global_ring_epfd_get(); - if (epfd > 0) { - fd = p_ring_bond_netvsc->m_vf_ring->get_rx_channel_fd(k); - rc = SYSCALL(epoll_ctl, epfd, EPOLL_CTL_DEL, fd, nullptr); - ring_logdbg("Remove fd=%d from epfd=%d rc=%d errno=%d", fd, epfd, rc, - errno); - } - } - - for (j = 0; j < m_rx_flows.size(); j++) { - sockinfo *si = static_cast(m_rx_flows[j].sink); - for (k = 0; k < num_ring_rx_fds; k++) { - epfd = si->get_rx_epfd(); - if (epfd > 0) { - fd = p_ring_bond_netvsc->m_vf_ring->get_rx_channel_fd(k); - rc = SYSCALL(epoll_ctl, epfd, EPOLL_CTL_DEL, fd, NULL); - ring_logdbg("Remove fd=%d from epfd=%d rc=%d errno=%d", fd, epfd, - rc, errno); - } - epfd = si->get_epoll_context_fd(); - if (epfd > 0) { - fd = p_ring_bond_netvsc->m_vf_ring->get_rx_channel_fd(k); - rc = SYSCALL(epoll_ctl, epfd, EPOLL_CTL_DEL, fd, NULL); - ring_logdbg("Remove fd=%d from epfd=%d rc=%d errno=%d", fd, epfd, - rc, errno); - } - } - } - - p_ring_tap->m_active = true; - p_ring_tap->inc_vf_plugouts(); - p_ring_bond_netvsc->slave_destroy( - p_ring_bond_netvsc->m_vf_ring->get_if_index()); - p_ring_bond_netvsc->m_vf_ring = nullptr; - p_ring_tap->set_vf_ring(nullptr); - } else { - for (i = 0; i < slaves.size(); i++) { - if (slaves[i]->if_index != p_ring_tap->get_if_index()) { - p_ring_tap->m_active = false; - slave_create(slaves[i]->if_index); - p_ring_tap->set_vf_ring(p_ring_bond_netvsc->m_vf_ring); - - num_ring_rx_fds = p_ring_bond_netvsc->m_vf_ring->get_rx_channels_num(); - - for (k = 0; k < num_ring_rx_fds; k++) { - epfd = g_p_net_device_table_mgr->global_ring_epfd_get(); - if (epfd > 0) { - epoll_event ev = {0, {nullptr}}; - fd = p_ring_bond_netvsc->m_vf_ring->get_rx_channel_fd(k); - ev.events = EPOLLIN; - ev.data.fd = fd; - rc = SYSCALL(epoll_ctl, epfd, EPOLL_CTL_ADD, fd, &ev); - ring_logdbg("Add fd=%d from epfd=%d rc=%d errno=%d", fd, epfd, - rc, errno); - } - } - for (j = 0; j < m_rx_flows.size(); j++) { - sockinfo *si = 
static_cast(m_rx_flows[j].sink); - p_ring_bond_netvsc->m_vf_ring->attach_flow(m_rx_flows[j].flow, - m_rx_flows[j].sink); - for (k = 0; k < num_ring_rx_fds; k++) { - epfd = si->get_rx_epfd(); - if (epfd > 0) { - epoll_event ev = {0, {0}}; - fd = p_ring_bond_netvsc->m_vf_ring->get_rx_channel_fd(k); - ev.events = EPOLLIN; - ev.data.fd = fd; - rc = SYSCALL(epoll_ctl, epfd, EPOLL_CTL_ADD, fd, &ev); - ring_logdbg("Add fd=%d from epfd=%d rc=%d errno=%d", fd, - epfd, rc, errno); - } - epfd = si->get_epoll_context_fd(); - if (epfd > 0) { -#define CQ_FD_MARK 0xabcd /* see sockinfo */ - epoll_event ev = {0, {0}}; - fd = p_ring_bond_netvsc->m_vf_ring->get_rx_channel_fd(k); - ev.events = EPOLLIN | EPOLLPRI; - ev.data.u64 = (((uint64_t)CQ_FD_MARK << 32) | fd); - rc = SYSCALL(epoll_ctl, epfd, EPOLL_CTL_ADD, fd, &ev); - ring_logdbg("Add fd=%d from epfd=%d rc=%d errno=%d", fd, - epfd, rc, errno); - } - } - } - break; - } - } - } - NOT_IN_USE(rc); // Suppress --enable-opt-log=high warning - } + /* for active-backup mode + * It is guaranteed that the first slave is active by popup_active_rings() + */ + ring_simple *previously_active = dynamic_cast(m_xmit_rings[0]); + + for (uint32_t i = 0; i < m_bond_rings.size(); i++) { + ring_simple *tmp_ring = dynamic_cast(m_bond_rings[i]); + + if (!tmp_ring) { + continue; } - } else { - /* for active-backup mode - * It is guaranteed that the first slave is active by popup_active_rings() - */ - ring_simple *previously_active = dynamic_cast(m_xmit_rings[0]); - for (uint32_t i = 0; i < m_bond_rings.size(); i++) { - ring_simple *tmp_ring = dynamic_cast(m_bond_rings[i]); + for (uint32_t j = 0; j < slaves.size(); j++) { - if (!tmp_ring) { + if (slaves[j]->if_index != m_bond_rings[i]->get_if_index()) { continue; } - for (uint32_t j = 0; j < slaves.size(); j++) { - - if (slaves[j]->if_index != m_bond_rings[i]->get_if_index()) { - continue; + /* For RoCE LAG device income data is processed by single ring only + * Consider using ring related slave with 
lag_tx_port_affinity = 1 + * even if slave is not active. + * Always keep this ring active for RX + * but keep common logic for TX + */ + if (slaves[j]->active) { + ring_logdbg("ring %d active", i); + if (slaves[j]->lag_tx_port_affinity != 1) { + tmp_ring->start_active_queue_tx(); + /* coverity[sleep] */ + tmp_ring->start_active_queue_rx(); } - - /* For RoCE LAG device income data is processed by single ring only - * Consider using ring related slave with lag_tx_port_affinity = 1 - * even if slave is not active. - * Always keep this ring active for RX - * but keep common logic for TX - */ - if (slaves[j]->active) { - ring_logdbg("ring %d active", i); - if (slaves[j]->lag_tx_port_affinity != 1) { - tmp_ring->start_active_queue_tx(); - /* coverity[sleep] */ - tmp_ring->start_active_queue_rx(); - } - m_bond_rings[i]->m_active = true; - } else { - ring_logdbg("ring %d not active", i); - if (slaves[j]->lag_tx_port_affinity != 1) { - /* coverity[sleep] */ - tmp_ring->stop_active_queue_tx(); - /* coverity[sleep] */ - tmp_ring->stop_active_queue_rx(); - } - m_bond_rings[i]->m_active = false; + m_bond_rings[i]->m_active = true; + } else { + ring_logdbg("ring %d not active", i); + if (slaves[j]->lag_tx_port_affinity != 1) { + /* coverity[sleep] */ + tmp_ring->stop_active_queue_tx(); + /* coverity[sleep] */ + tmp_ring->stop_active_queue_rx(); } - break; + m_bond_rings[i]->m_active = false; } + break; } - popup_xmit_rings(); - - if (!request_notification(CQT_RX)) { - ring_logdbg("Failed arming RX notification"); - } - if (!request_notification(CQT_TX)) { - ring_logdbg("Failed arming TX notification"); - } + } + popup_xmit_rings(); - if (m_type == net_device_val::ACTIVE_BACKUP) { - ring_simple *currently_active = dynamic_cast(m_xmit_rings[0]); - if (currently_active && safe_mce_sys().cq_moderation_enable) { - if (likely(previously_active)) { - currently_active->m_cq_moderation_info.period = - previously_active->m_cq_moderation_info.period; - 
currently_active->m_cq_moderation_info.count = - previously_active->m_cq_moderation_info.count; - } else { - currently_active->m_cq_moderation_info.period = - safe_mce_sys().cq_moderation_period_usec; - currently_active->m_cq_moderation_info.count = - safe_mce_sys().cq_moderation_count; - } + if (!request_notification(CQT_RX)) { + ring_logdbg("Failed arming RX notification"); + } + if (!request_notification(CQT_TX)) { + ring_logdbg("Failed arming TX notification"); + } - currently_active->modify_cq_moderation(safe_mce_sys().cq_moderation_period_usec, - safe_mce_sys().cq_moderation_count); + if (m_type == net_device_val::ACTIVE_BACKUP) { + ring_simple *currently_active = dynamic_cast(m_xmit_rings[0]); + if (currently_active && safe_mce_sys().cq_moderation_enable) { + if (likely(previously_active)) { + currently_active->m_cq_moderation_info.period = + previously_active->m_cq_moderation_info.period; + currently_active->m_cq_moderation_info.count = + previously_active->m_cq_moderation_info.count; + } else { + currently_active->m_cq_moderation_info.period = + safe_mce_sys().cq_moderation_period_usec; + currently_active->m_cq_moderation_info.count = safe_mce_sys().cq_moderation_count; } + + currently_active->modify_cq_moderation(safe_mce_sys().cq_moderation_period_usec, + safe_mce_sys().cq_moderation_count); } } @@ -767,13 +657,8 @@ void ring_bond::popup_recv_rings() * - For RoCE LAG device (lag_tx_port_affinity > 0) income data is processed by single ring only * Consider using ring related slave with lag_tx_port_affinity = 1 * even if slave is not active. - * - For NETVSC device all rings (vf and tap) should be ready for receive. 
*/ for (uint32_t i = 0; i < m_bond_rings.size(); i++) { - if (p_ndev->get_is_bond() == net_device_val::NETVSC) { - m_recv_rings.push_back(m_bond_rings[i]); - continue; - } for (uint32_t j = 0; j < slaves.size(); j++) { if (slaves[j]->if_index != m_bond_rings[i]->get_if_index()) { continue; @@ -950,33 +835,3 @@ void ring_bond_eth::slave_create(int if_index) popup_recv_rings(); update_rx_channel_fds(); } - -void ring_bond_netvsc::slave_create(int if_index) -{ - ring_slave *cur_slave = nullptr; - net_device_val *p_ndev = nullptr; - - p_ndev = g_p_net_device_table_mgr->get_net_device_val(m_parent->get_if_index()); - if (!p_ndev) { - ring_logpanic("Error creating bond ring"); - } - - if (if_index == p_ndev->get_if_idx()) { - cur_slave = new ring_tap(if_index, this); - m_tap_ring = cur_slave; - } else { - cur_slave = new ring_eth(if_index, this); - m_vf_ring = cur_slave; - update_cap(cur_slave); - } - - m_bond_rings.push_back(cur_slave); - - if (m_bond_rings.size() > 2) { - ring_logpanic("Error creating bond ring with more than %d resource", 2); - } - - popup_xmit_rings(); - popup_recv_rings(); - update_rx_channel_fds(); -} diff --git a/src/core/dev/ring_bond.h b/src/core/dev/ring_bond.h index 7dff0c605..6af6902ba 100644 --- a/src/core/dev/ring_bond.h +++ b/src/core/dev/ring_bond.h @@ -34,9 +34,7 @@ #ifndef RING_BOND_H #define RING_BOND_H -#include "ring.h" - -#include "dev/ring_tap.h" +#include "ring_slave.h" #include "dev/net_device_table_mgr.h" typedef std::vector ring_slave_vector_t; @@ -196,39 +194,4 @@ class ring_bond_eth : public ring_bond { virtual void slave_create(int if_index); }; -class ring_bond_netvsc : public ring_bond { -public: - ring_bond_netvsc(int if_index) - : ring_bond(if_index) - { - net_device_val *p_ndev = - g_p_net_device_table_mgr->get_net_device_val(m_parent->get_if_index()); - - m_vf_ring = nullptr; - m_tap_ring = nullptr; - if (p_ndev) { - const slave_data_vector_t &slaves = p_ndev->get_slave_array(); - update_cap(); - 
slave_create(p_ndev->get_if_idx()); - for (size_t i = 0; i < slaves.size(); i++) { - slave_create(slaves[i]->if_index); - } - - if (m_tap_ring && m_vf_ring) { - ring_tap *p_ring_tap = dynamic_cast(m_tap_ring); - if (p_ring_tap) { - p_ring_tap->set_vf_ring(m_vf_ring); - } - } - } - } - -protected: - virtual void slave_create(int if_index); - -public: - ring_slave *m_vf_ring; - ring_slave *m_tap_ring; -}; - #endif /* RING_BOND_H */ diff --git a/src/core/dev/ring_simple.cpp b/src/core/dev/ring_simple.cpp index 5041b08fa..492f5fccb 100644 --- a/src/core/dev/ring_simple.cpp +++ b/src/core/dev/ring_simple.cpp @@ -85,8 +85,8 @@ inline void ring_simple::send_status_handler(int ret, xlio_ibv_send_wr *p_send_w BULLSEYE_EXCLUDE_BLOCK_END } -ring_simple::ring_simple(int if_index, ring *parent, ring_type_t type, bool use_locks) - : ring_slave(if_index, parent, type, use_locks) +ring_simple::ring_simple(int if_index, ring *parent, bool use_locks) + : ring_slave(if_index, parent, use_locks) , m_lock_ring_tx_buf_wait("ring:lock_tx_buf_wait") , m_p_doca_mmap(g_buffer_pool_tx->get_doca_mmap()) , m_gro_mgr(safe_mce_sys().gro_streams_max, MAX_GRO_BUFS) @@ -900,7 +900,7 @@ mem_buf_desc_t *ring_simple::get_tx_buffers(pbuf_type type, uint32_t n_num_mem_b /* * TODO Unify request_more_tx_buffers so ring_slave * keeps number of buffers instead of reinventing it in - * ring_simple and ring_tap. + * ring_simple. 
*/ if (type == PBUF_ZEROCOPY) { m_zc_num_bufs += count; diff --git a/src/core/dev/ring_simple.h b/src/core/dev/ring_simple.h index 2321018ca..ac52e3f35 100644 --- a/src/core/dev/ring_simple.h +++ b/src/core/dev/ring_simple.h @@ -61,7 +61,7 @@ struct cq_moderation_info { */ class ring_simple : public ring_slave { public: - ring_simple(int if_index, ring *parent, ring_type_t type, bool use_locks); + ring_simple(int if_index, ring *parent, bool use_locks); virtual ~ring_simple(); bool request_notification(cq_type_t cq_type) override; @@ -398,9 +398,9 @@ class ring_simple : public ring_slave { class ring_eth : public ring_simple { public: - ring_eth(int if_index, ring *parent = nullptr, ring_type_t type = RING_ETH, - bool call_create_res = true, bool use_locks = true) - : ring_simple(if_index, parent, type, use_locks) + ring_eth(int if_index, ring *parent = nullptr, bool call_create_res = true, + bool use_locks = true) + : ring_simple(if_index, parent, use_locks) { net_device_val_eth *p_ndev = dynamic_cast( g_p_net_device_table_mgr->get_net_device_val(m_parent->get_if_index())); diff --git a/src/core/dev/ring_slave.cpp b/src/core/dev/ring_slave.cpp index a7a23f8ff..2f2cdb7d9 100644 --- a/src/core/dev/ring_slave.cpp +++ b/src/core/dev/ring_slave.cpp @@ -57,7 +57,7 @@ static lock_base *get_new_lock(const char *name, bool real_lock) : static_cast(&t_lock_dummy_ring)); } -ring_slave::ring_slave(int if_index, ring *parent, ring_type_t type, bool use_locks) +ring_slave::ring_slave(int if_index, ring *parent, bool use_locks) : ring() , m_steering_ipv4(*this) , m_steering_ipv6(*this) @@ -68,7 +68,6 @@ ring_slave::ring_slave(int if_index, ring *parent, ring_type_t type, bool use_lo , m_flow_tag_enabled(false) , m_b_sysvar_eth_mc_l2_only_rules(safe_mce_sys().eth_mc_l2_only_rules) , m_b_sysvar_mc_force_flowtag(safe_mce_sys().mc_force_flowtag) - , m_type(type) { net_device_val *p_ndev = nullptr; const slave_data_t *p_slave = nullptr; @@ -88,15 +87,11 @@ ring_slave::ring_slave(int 
if_index, ring *parent, ring_type_t type, bool use_lo /* Configure ring_slave() fields */ m_transport_type = p_ndev->get_transport_type(); - /* Set the same ring active status as related slave has for all ring types - * excluding ring with type RING_TAP that does not have related slave device. - * So it is marked as active just in case related netvsc device is absent. - */ + /* Set the same ring active status as related slave has for all ring types */ m_active = p_slave ? p_slave->active : p_ndev->get_slave_array().empty(); // use local copy of stats by default memset(m_p_ring_stat.get(), 0, sizeof(ring_stats_t)); - m_p_ring_stat->n_type = m_type; if (m_parent != this) { m_p_ring_stat->p_ring_master = m_parent; } @@ -124,9 +119,8 @@ ring_slave::~ring_slave() void ring_slave::print_val() { - ring_logdbg("%d: %p: parent %p type %s", m_if_index, this, - ((uintptr_t)this == (uintptr_t)m_parent ? nullptr : m_parent), - ring_type_str[m_type]); + ring_logdbg("%d: %p: parent %p", m_if_index, this, + ((uintptr_t)this == (uintptr_t)m_parent ? 
nullptr : m_parent)); } void ring_slave::restart() @@ -340,7 +334,7 @@ bool steering_handler::attach_flow(flow_tuple &flow_spec_5t, new rfs_rule_filter(m_ring.m_tcp_dst_port_attach_map, rule_key, tcp_3t_only); } try { - if (safe_mce_sys().gro_streams_max && m_ring.is_simple()) { + if (safe_mce_sys().gro_streams_max) { p_tmp_rfs = new (std::nothrow) rfs_uc_tcp_gro(&flow_spec_5t, &m_ring, dst_port_filter, flow_tag_id); } else { diff --git a/src/core/dev/ring_slave.h b/src/core/dev/ring_slave.h index c20cc826f..631cc9e75 100644 --- a/src/core/dev/ring_slave.h +++ b/src/core/dev/ring_slave.h @@ -274,7 +274,7 @@ template class steering_handler { class ring_slave : public ring { public: - ring_slave(int if_index, ring *parent, ring_type_t type, bool use_locks); + ring_slave(int if_index, ring *parent, bool use_locks); virtual ~ring_slave(); virtual void print_val(); @@ -300,9 +300,7 @@ class ring_slave : public ring { rfs_rule *tls_rx_create_rule(const flow_tuple &flow_spec_5t, xlio_tir *tir); #endif /* DEFINED_UTLS */ - inline bool is_simple() const { return m_type != RING_TAP; } transport_type_t get_transport_type() const { return m_transport_type; } - inline ring_type_t get_type() const { return m_type; } virtual void flow_del_all_rfs_safe(); @@ -338,8 +336,7 @@ class ring_slave : public ring { template friend class steering_handler; private: - ring_type_t m_type; /* ring type */ - uint8_t padding[8]; // make class size up to a whole cache line + uint8_t padding[32]; // make class size up to a whole cache line }; #endif /* RING_SLAVE_H_ */ diff --git a/src/core/dev/ring_tap.cpp b/src/core/dev/ring_tap.cpp deleted file mode 100644 index 41b1dc1fc..000000000 --- a/src/core/dev/ring_tap.cpp +++ /dev/null @@ -1,630 +0,0 @@ -/* - * SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES - * Copyright (c) 2001-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. - * - * This software is available to you under a choice of one of two - * licenses. 
You may choose to be licensed under the terms of the GNU - * General Public License (GPL) Version 2, available from the file - * COPYING in the main directory of this source tree, or the - * BSD license below: - * - * Redistribution and use in source and binary forms, with or - * without modification, are permitted provided that the following - * conditions are met: - * - * - Redistributions of source code must retain the above - * copyright notice, this list of conditions and the following - * disclaimer. - * - * - Redistributions in binary form must reproduce the above - * copyright notice, this list of conditions and the following - * disclaimer in the documentation and/or other materials - * provided with the distribution. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ - -#include "ring_tap.h" - -#include -#include "util/sg_array.h" -#include "sock/fd_collection.h" -#include "dev/net_device_table_mgr.h" - -#undef MODULE_NAME -#define MODULE_NAME "ring_tap" -#undef MODULE_HDR -#define MODULE_HDR MODULE_NAME "%d:%s() " - -ring_tap::ring_tap(int if_index, ring *parent) - : ring_slave(if_index, parent, RING_TAP, true) - , m_tap_fd(-1) - , m_vf_ring(nullptr) - , m_sysvar_qp_compensation_level(safe_mce_sys().qp_compensation_level) - , m_tap_data_available(false) -{ - int rc = 0; - struct xlio_msg_flow data; - char tap_if_name[IFNAMSIZ] = {0}; - net_device_val *p_ndev = g_p_net_device_table_mgr->get_net_device_val(m_parent->get_if_index()); - - /* Create TAP device and update ring class with new if_index */ - tap_create(p_ndev); - - /* Register tap ring to the internal thread */ - if (m_tap_fd >= 0) { - g_p_fd_collection->addtapfd(m_tap_fd, this); - g_p_event_handler_manager->update_epfd(m_tap_fd, EPOLL_CTL_ADD, - EPOLLIN | EPOLLPRI | EPOLLONESHOT); - } - - /* Initialize RX buffer poll */ - request_more_rx_buffers(); - m_rx_pool.set_id("ring_tap (%p) : m_rx_pool", this); - - /* Initialize TX buffer poll */ - request_more_tx_buffers(PBUF_RAM, m_sysvar_qp_compensation_level, 0); - - /* Update ring statistics */ - m_p_ring_stat->tap.n_tap_fd = m_tap_fd; - if_indextoname(get_if_index(), tap_if_name); - memcpy(m_p_ring_stat->tap.s_tap_name, tap_if_name, IFNAMSIZ); - - /* create egress rule (redirect traffic from tap device to physical interface) */ - rc = prepare_flow_message(data, XLIO_MSG_FLOW_ADD); - if (rc != 0) { - ring_logwarn("Add TC rule failed with error=%d", rc); - } -} - -ring_tap::~ring_tap() -{ - m_lock_ring_rx.lock(); - flow_del_all_rfs(); - m_lock_ring_rx.unlock(); - - g_p_event_handler_manager->update_epfd(m_tap_fd, EPOLL_CTL_DEL, - EPOLLIN | EPOLLPRI | EPOLLONESHOT); - - if (g_p_fd_collection) { - g_p_fd_collection->del_tapfd(m_tap_fd); - } - - /* Release RX buffer poll */ - 
g_buffer_pool_rx_ptr->put_buffers_thread_safe(&m_rx_pool, m_rx_pool.size()); - - /* TAP device release */ - tap_destroy(); -} - -void ring_tap::tap_create(net_device_val *p_ndev) -{ -#define TAP_NAME_FORMAT "t%x%x" // t -#define TAP_STR_LENGTH 512 -#define TAP_DISABLE_IPV6 "sysctl -w net.ipv6.conf.%s.disable_ipv6=1" - - int rc = 0, tap_if_index = -1, ioctl_sock = -1; - struct ifreq ifr; - char command_str[TAP_STR_LENGTH], return_str[TAP_STR_LENGTH], tap_name[IFNAMSIZ]; - unsigned char hw_addr[ETH_ALEN]; - - /* Open TAP device */ - if ((m_tap_fd = SYSCALL(open, "/dev/net/tun", O_RDWR)) < 0) { - ring_logerr("FAILED to open tap %m"); - rc = -errno; - goto error; - } - - /* Tap name */ - rc = snprintf(tap_name, sizeof(tap_name), TAP_NAME_FORMAT, getpid() & 0xFFFFFFF, - m_tap_fd & 0xFFFFFFF); - if (unlikely(((int)sizeof(tap_name) < rc) || (rc < 0))) { - ring_logerr("FAILED to create tap name %m"); - rc = -errno; - goto error; - } - - /* Init ifr */ - memset(&ifr, 0, sizeof(ifr)); - rc = snprintf(ifr.ifr_name, IFNAMSIZ, "%s", tap_name); - if (unlikely((IFNAMSIZ < rc) || (rc < 0))) { - ring_logerr("FAILED to create tap name %m"); - rc = -errno; - goto error; - } - - /* Setting TAP attributes */ - ifr.ifr_flags = IFF_TAP | IFF_NO_PI | IFF_ONE_QUEUE; - if ((rc = SYSCALL(ioctl, m_tap_fd, TUNSETIFF, (void *)&ifr)) < 0) { - ring_logerr("ioctl failed fd = %d, %d %m", m_tap_fd, rc); - rc = -errno; - goto error; - } - - /* Set TAP fd nonblocking */ - if ((rc = SYSCALL(fcntl, m_tap_fd, F_SETFL, O_NONBLOCK)) < 0) { - ring_logerr("ioctl failed fd = %d, %d %m", m_tap_fd, rc); - rc = -errno; - goto error; - } - - /* Disable Ipv6 for TAP interface */ - snprintf(command_str, TAP_STR_LENGTH, TAP_DISABLE_IPV6, tap_name); - if (run_and_retreive_system_command(command_str, return_str, TAP_STR_LENGTH) < 0) { - ring_logerr("sysctl ipv6 failed fd = %d, %m", m_tap_fd); - rc = -errno; - goto error; - } - - /* Create socket */ - if ((ioctl_sock = SYSCALL(socket, AF_INET, SOCK_DGRAM, 0)) < 0) { - 
ring_logerr("FAILED to open socket"); - rc = -errno; - goto error; - } - - /* Set MAC address */ - ifr.ifr_hwaddr.sa_family = AF_LOCAL; - get_local_ll_addr(p_ndev->get_ifname_link(), hw_addr, ETH_ALEN, false); - memcpy(ifr.ifr_hwaddr.sa_data, hw_addr, ETH_ALEN); - if ((rc = SYSCALL(ioctl, ioctl_sock, SIOCSIFHWADDR, &ifr)) < 0) { - ring_logerr("ioctl SIOCSIFHWADDR failed %d %m, %s", rc, tap_name); - rc = -errno; - goto error; - } - - /* Set link UP */ - ifr.ifr_flags |= (IFF_UP | IFF_SLAVE); - if ((rc = SYSCALL(ioctl, ioctl_sock, SIOCSIFFLAGS, &ifr)) < 0) { - ring_logerr("ioctl SIOCGIFFLAGS failed %d %m, %s", rc, tap_name); - rc = -errno; - goto error; - } - - /* Get TAP interface index */ - tap_if_index = if_nametoindex(tap_name); - if (!tap_if_index) { - ring_logerr("if_nametoindex failed to get tap index [%s]", tap_name); - rc = -errno; - goto error; - } - - /* Update if_index on ring class */ - set_if_index(tap_if_index); - - SYSCALL(close, ioctl_sock); - - ring_logdbg("Tap device %d: %s [fd=%d] was created successfully", tap_if_index, ifr.ifr_name, - m_tap_fd); - - return; - -error: - ring_logerr("Tap device creation failed %d, %m", rc); - - if (ioctl_sock >= 0) { - SYSCALL(close, ioctl_sock); - } - - if (m_tap_fd >= 0) { - SYSCALL(close, m_tap_fd); - } - - m_tap_fd = -1; -} - -void ring_tap::tap_destroy() -{ - if (m_tap_fd >= 0) { - SYSCALL(close, m_tap_fd); - - m_tap_fd = -1; - } -} - -bool ring_tap::attach_flow(flow_tuple &flow_spec_5t, sockinfo *sink, bool force_5t) -{ - std::lock_guard lock(m_lock_ring_rx); - bool ret = ring_slave::attach_flow(flow_spec_5t, sink, force_5t); - - if (ret && (flow_spec_5t.is_tcp() || flow_spec_5t.is_udp_uc())) { - int rc = 0; - struct xlio_msg_flow data; - rc = prepare_flow_message(data, XLIO_MSG_FLOW_ADD, flow_spec_5t); - if (rc != 0) { - if (!g_b_exit) { - ring_logwarn("Add TC rule failed with error=%d", rc); - } - ring_slave::detach_flow(flow_spec_5t, sink); - ret = false; - } - } - - return ret; -} - -bool 
ring_tap::detach_flow(flow_tuple &flow_spec_5t, sockinfo *sink) -{ - std::lock_guard lock(m_lock_ring_rx); - bool ret = ring_slave::detach_flow(flow_spec_5t, sink); - - if (flow_spec_5t.is_tcp() || flow_spec_5t.is_udp_uc()) { - int rc = 0; - struct xlio_msg_flow data; - rc = prepare_flow_message(data, XLIO_MSG_FLOW_DEL, flow_spec_5t); - if (rc != 0) { - if (!g_b_exit) { - ring_logwarn("Del TC rule failed with error=%d", rc); - } - ret = false; - } - } - - return ret; -} - -int ring_tap::get_rx_channel_fd(size_t ch_idx) const -{ - NOT_IN_USE(ch_idx); - return m_tap_fd; -} - -bool ring_tap::poll_and_process_element_rx(void *pv_fd_ready_array) -{ - return (process_element_rx(pv_fd_ready_array) == 0); -} - -int ring_tap::drain_and_proccess() -{ - return process_element_rx(nullptr); -} - -bool ring_tap::reclaim_recv_buffers(descq_t *rx_reuse) -{ - while (!rx_reuse->empty()) { - mem_buf_desc_t *buff = rx_reuse->get_and_pop_front(); - reclaim_recv_buffers(buff); - } - - if (m_rx_pool.size() >= m_sysvar_qp_compensation_level * 2) { - int buff_to_rel = m_rx_pool.size() - m_sysvar_qp_compensation_level; - - g_buffer_pool_rx_ptr->put_buffers_thread_safe(&m_rx_pool, buff_to_rel); - m_p_ring_stat->tap.n_rx_buffers = m_rx_pool.size(); - } - - return true; -} - -bool ring_tap::reclaim_recv_buffers(mem_buf_desc_t *buff) -{ - if (buff && (buff->dec_ref_count() <= 1)) { - mem_buf_desc_t *temp = nullptr; - while (buff) { - if (buff->lwip_pbuf_dec_ref_count() <= 0) { - temp = buff; - buff = temp->p_next_desc; - temp->clear_transport_data(); - temp->p_next_desc = nullptr; - temp->p_prev_desc = nullptr; - temp->reset_ref_count(); - free_lwip_pbuf(&temp->lwip_pbuf); - m_rx_pool.push_back(temp); - } else { - buff->reset_ref_count(); - buff = buff->p_next_desc; - } - } - m_p_ring_stat->tap.n_rx_buffers = m_rx_pool.size(); - return true; - } - return false; -} - -void ring_tap::send_ring_buffer(ring_user_id_t id, xlio_ibv_send_wr *p_send_wqe, - xlio_wr_tx_packet_attr attr) -{ - 
NOT_IN_USE(id); - compute_tx_checksum((mem_buf_desc_t *)(p_send_wqe->wr_id), attr & XLIO_TX_PACKET_L3_CSUM, - attr & XLIO_TX_PACKET_L4_CSUM); - - std::lock_guard lock(m_lock_ring_tx); - int ret = send_buffer(p_send_wqe, attr); - send_status_handler(ret, p_send_wqe); -} - -int ring_tap::send_lwip_buffer(ring_user_id_t id, xlio_ibv_send_wr *p_send_wqe, - xlio_wr_tx_packet_attr attr, xlio_tis *tis) -{ - NOT_IN_USE(id); - NOT_IN_USE(tis); - compute_tx_checksum((mem_buf_desc_t *)(p_send_wqe->wr_id), attr & XLIO_TX_PACKET_L3_CSUM, - attr & XLIO_TX_PACKET_L4_CSUM); - - std::lock_guard lock(m_lock_ring_tx); - int ret = send_buffer(p_send_wqe, attr); - send_status_handler(ret, p_send_wqe); - return ret; -} - -int ring_tap::prepare_flow_message(xlio_msg_flow &data, msg_flow_t flow_action, - flow_tuple &flow_spec_5t) -{ - int rc = 0; - - memset(&data, 0, sizeof(data)); - data.hdr.code = XLIO_MSG_FLOW; - data.hdr.ver = XLIO_AGENT_VER; - data.hdr.pid = getpid(); - - data.action = flow_action; - data.if_id = get_parent()->get_if_index(); - data.tap_id = get_if_index(); - - data.flow.dst.family = flow_spec_5t.get_family(); - data.flow.dst.port = flow_spec_5t.get_dst_port(); - if (data.flow.dst.family == AF_INET) { - data.flow.dst.addr.ipv4 = flow_spec_5t.get_dst_ip().get_in4_addr().s_addr; - } else { - memcpy(&data.flow.dst.addr.ipv6[0], &flow_spec_5t.get_dst_ip().get_in6_addr(), - sizeof(data.flow.dst.addr.ipv6)); - } - - if (flow_spec_5t.is_3_tuple()) { - data.type = flow_spec_5t.is_tcp() ? XLIO_MSG_FLOW_TCP_3T : XLIO_MSG_FLOW_UDP_3T; - } else { - data.type = flow_spec_5t.is_tcp() ? 
XLIO_MSG_FLOW_TCP_5T : XLIO_MSG_FLOW_UDP_5T; - data.flow.src.family = flow_spec_5t.get_family(); - data.flow.src.port = flow_spec_5t.get_src_port(); - if (data.flow.src.family == AF_INET) { - data.flow.src.addr.ipv4 = flow_spec_5t.get_src_ip().get_in4_addr().s_addr; - } else { - memcpy(&data.flow.src.addr.ipv6[0], &flow_spec_5t.get_src_ip().get_in6_addr(), - sizeof(data.flow.src.addr.ipv6)); - } - } - - rc = g_p_agent->send_msg_flow(&data); - - return rc; -} - -int ring_tap::prepare_flow_message(xlio_msg_flow &data, msg_flow_t flow_action) -{ - int rc = 0; - - memset(&data, 0, sizeof(data)); - data.hdr.code = XLIO_MSG_FLOW; - data.hdr.ver = XLIO_AGENT_VER; - data.hdr.pid = getpid(); - data.action = flow_action; - data.if_id = get_parent()->get_if_index(); - data.tap_id = get_if_index(); - data.type = XLIO_MSG_FLOW_EGRESS; - - rc = g_p_agent->send_msg_flow(&data); - - return rc; -} - -int ring_tap::process_element_rx(void *pv_fd_ready_array) -{ - int ret = 0; - - if (m_tap_data_available) { - std::lock_guard lock(m_lock_ring_rx); - if (m_rx_pool.size() || request_more_rx_buffers()) { - mem_buf_desc_t *buff = m_rx_pool.get_and_pop_front(); - ret = SYSCALL(read, m_tap_fd, buff->p_buffer, buff->sz_buffer); - if (ret > 0) { - /* Data was read and processed successfully */ - buff->sz_data = ret; - buff->rx.is_sw_csum_need = 1; - if ((ret = rx_process_buffer(buff, pv_fd_ready_array))) { - m_p_ring_stat->tap.n_rx_buffers--; - } - } - if (ret <= 0) { - /* Unable to read data, return buffer to pool */ - ret = 0; - m_rx_pool.push_front(buff); - } - - m_tap_data_available = false; - g_p_event_handler_manager->update_epfd(m_tap_fd, EPOLL_CTL_MOD, - EPOLLIN | EPOLLPRI | EPOLLONESHOT); - } - } - - return ret; -} - -bool ring_tap::request_more_rx_buffers() -{ - ring_logfuncall("Allocating additional %d buffers for internal use", - m_sysvar_qp_compensation_level); - - bool res = g_buffer_pool_rx_ptr->get_buffers_thread_safe(m_rx_pool, this, - m_sysvar_qp_compensation_level, 0); - 
if (!res) { - ring_logfunc("Out of mem_buf_desc from RX free pool for internal object pool"); - return false; - } - - m_p_ring_stat->tap.n_rx_buffers = m_rx_pool.size(); - - return true; -} - -mem_buf_desc_t *ring_tap::mem_buf_tx_get(ring_user_id_t id, bool b_block, pbuf_type type, - int n_num_mem_bufs) -{ - mem_buf_desc_t *head = nullptr; - - NOT_IN_USE(id); - NOT_IN_USE(b_block); - NOT_IN_USE(type); - - ring_logfuncall("n_num_mem_bufs=%d", n_num_mem_bufs); - - m_lock_ring_tx.lock(); - - if (unlikely((int)m_tx_pool.size() < n_num_mem_bufs)) { - request_more_tx_buffers(PBUF_RAM, m_sysvar_qp_compensation_level, 0); - - if (unlikely((int)m_tx_pool.size() < n_num_mem_bufs)) { - m_lock_ring_tx.unlock(); - return head; - } - } - - head = m_tx_pool.get_and_pop_back(); - head->lwip_pbuf.ref = 1; - n_num_mem_bufs--; - - mem_buf_desc_t *next = head; - while (n_num_mem_bufs) { - next->p_next_desc = m_tx_pool.get_and_pop_back(); - next = next->p_next_desc; - next->lwip_pbuf.ref = 1; - n_num_mem_bufs--; - } - - m_lock_ring_tx.unlock(); - - return head; -} - -inline void ring_tap::return_to_global_pool() -{ - if (m_tx_pool.size() >= m_sysvar_qp_compensation_level * 2) { - int return_bufs = m_tx_pool.size() - m_sysvar_qp_compensation_level; - g_buffer_pool_tx->put_buffers_thread_safe(&m_tx_pool, return_bufs); - } -} - -void ring_tap::mem_buf_desc_return_single_to_owner_tx(mem_buf_desc_t *p_mem_buf_desc) -{ - std::lock_guard lock(m_lock_ring_tx); - - if (likely(p_mem_buf_desc)) { - // potential race, ref is protected here by ring_tx lock, and in dst_entry_tcp & - // sockinfo_tcp by tcp lock - if (likely(p_mem_buf_desc->lwip_pbuf.ref)) { - p_mem_buf_desc->lwip_pbuf.ref--; - } else { - ring_logerr("ref count of %p is already zero, double free??", p_mem_buf_desc); - } - - if (p_mem_buf_desc->lwip_pbuf.ref == 0) { - p_mem_buf_desc->p_next_desc = nullptr; - if (unlikely(p_mem_buf_desc->lwip_pbuf.type == PBUF_ZEROCOPY)) { - g_buffer_pool_zc->put_buffers_thread_safe(p_mem_buf_desc); - 
return; - } - free_lwip_pbuf(&p_mem_buf_desc->lwip_pbuf); - m_tx_pool.push_back(p_mem_buf_desc); - } - } - - return_to_global_pool(); -} - -void ring_tap::mem_buf_desc_return_single_multi_ref(mem_buf_desc_t *p_mem_buf_desc, unsigned ref) -{ - if (unlikely(ref == 0)) { - return; - } - - m_lock_ring_tx.lock(); - p_mem_buf_desc->lwip_pbuf.ref -= std::min(p_mem_buf_desc->lwip_pbuf.ref, ref - 1); - m_lock_ring_tx.unlock(); - mem_buf_desc_return_single_to_owner_tx(p_mem_buf_desc); -} - -int ring_tap::mem_buf_tx_release(mem_buf_desc_t *buff_list, bool b_accounting, bool trylock) -{ - int count = 0, freed = 0; - mem_buf_desc_t *next; - - NOT_IN_USE(b_accounting); - - if (!trylock) { - m_lock_ring_tx.lock(); - } else if (m_lock_ring_tx.trylock()) { - return 0; - } - - while (buff_list) { - next = buff_list->p_next_desc; - buff_list->p_next_desc = nullptr; - - // potential race, ref is protected here by ring_tx lock, and in dst_entry_tcp & - // sockinfo_tcp by tcp lock - if (likely(buff_list->lwip_pbuf.ref)) { - buff_list->lwip_pbuf.ref--; - } else { - ring_logerr("ref count of %p is already zero, double free??", buff_list); - } - - if (buff_list->lwip_pbuf.ref == 0) { - free_lwip_pbuf(&buff_list->lwip_pbuf); - m_tx_pool.push_back(buff_list); - freed++; - } - count++; - buff_list = next; - } - - return_to_global_pool(); - m_lock_ring_tx.unlock(); - - ring_logfunc("buf_list: %p count: %d freed: %d\n", buff_list, count, freed); - NOT_IN_USE(freed); - - return count; -} - -int ring_tap::send_buffer(xlio_ibv_send_wr *wr, xlio_wr_tx_packet_attr attr) -{ - int ret = 0; - iovec iovec[wr->num_sge]; - NOT_IN_USE(attr); - - for (int i = 0; i < wr->num_sge; i++) { - iovec[i].iov_base = (void *)wr->sg_list[i].addr; - iovec[i].iov_len = wr->sg_list[i].length; - } - - ret = SYSCALL(writev, m_tap_fd, iovec, wr->num_sge); - if (ret < 0) { - ring_logdbg("writev: tap_fd %d, errno: %d\n", m_tap_fd, errno); - } - - return ret; -} - -void ring_tap::send_status_handler(int ret, xlio_ibv_send_wr 
*p_send_wqe) -{ - // Pay attention that there is a difference in return values in ring_simple and ring_tap - // Non positive value of ret means that we are on error flow (unlike for ring_simple). - if (p_send_wqe) { - mem_buf_desc_t *p_mem_buf_desc = (mem_buf_desc_t *)(p_send_wqe->wr_id); - - if (likely(ret > 0)) { - // Update TX statistics - sg_array sga(p_send_wqe->sg_list, p_send_wqe->num_sge); - m_p_ring_stat->n_tx_byte_count += sga.length(); - ++m_p_ring_stat->n_tx_pkt_count; - } - - mem_buf_tx_release(p_mem_buf_desc, true); - } -} diff --git a/src/core/dev/ring_tap.h b/src/core/dev/ring_tap.h deleted file mode 100644 index 64d50cdf9..000000000 --- a/src/core/dev/ring_tap.h +++ /dev/null @@ -1,146 +0,0 @@ -/* - * SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES - * Copyright (c) 2001-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. - * - * This software is available to you under a choice of one of two - * licenses. You may choose to be licensed under the terms of the GNU - * General Public License (GPL) Version 2, available from the file - * COPYING in the main directory of this source tree, or the - * BSD license below: - * - * Redistribution and use in source and binary forms, with or - * without modification, are permitted provided that the following - * conditions are met: - * - * - Redistributions of source code must retain the above - * copyright notice, this list of conditions and the following - * disclaimer. - * - * - Redistributions in binary form must reproduce the above - * copyright notice, this list of conditions and the following - * disclaimer in the documentation and/or other materials - * provided with the distribution. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -#ifndef RING_TAP_H_ -#define RING_TAP_H_ - -#include "ring_slave.h" -#include "util/agent.h" - -class ring_tap : public ring_slave { -public: - ring_tap(int if_index, ring *parent); - virtual ~ring_tap(); - - virtual bool is_up() { return (m_vf_ring || m_active); } - virtual bool attach_flow(flow_tuple &flow_spec_5t, sockinfo *sink, bool force_5t = false); - virtual bool detach_flow(flow_tuple &flow_spec_5t, sockinfo *sink); - virtual bool poll_and_process_element_rx(void *pv_fd_ready_array = NULL); - virtual int poll_and_process_element_tx() { return 0; } - virtual void clear_rx_notification() {}; - virtual int drain_and_proccess(); - virtual size_t get_rx_channels_num() const { return 1U; }; - virtual int get_rx_channel_fd(size_t ch_idx) const; - virtual int get_tx_channel_fd() const { return m_tap_fd; } - virtual bool reclaim_recv_buffers(descq_t *rx_reuse); - virtual bool reclaim_recv_buffers(mem_buf_desc_t *buff); - virtual void send_ring_buffer(ring_user_id_t id, xlio_ibv_send_wr *p_send_wqe, - xlio_wr_tx_packet_attr attr); - virtual int send_lwip_buffer(ring_user_id_t id, xlio_ibv_send_wr *p_send_wqe, - xlio_wr_tx_packet_attr attr, xlio_tis *tis); - virtual void mem_buf_desc_return_single_to_owner_tx(mem_buf_desc_t *p_mem_buf_desc); - virtual void mem_buf_desc_return_single_multi_ref(mem_buf_desc_t *p_mem_buf_desc, unsigned ref); - virtual mem_buf_desc_t *mem_buf_tx_get(ring_user_id_t id, bool b_block, pbuf_type type, - int n_num_mem_bufs = 1); - virtual int mem_buf_tx_release(mem_buf_desc_t *p_mem_buf_desc_list, bool b_accounting, - bool trylock = false); - virtual bool get_hw_dummy_send_support(ring_user_id_t id, xlio_ibv_send_wr *p_send_wqe) - { - NOT_IN_USE(id); - 
NOT_IN_USE(p_send_wqe); - return false; - } - virtual bool request_notification(cq_type_t cq_type) - { - NOT_IN_USE(cq_type); - return 0; - } - virtual void adapt_cq_moderation() {} - - virtual int modify_ratelimit(struct xlio_rate_limit_t &rate_limit) - { - NOT_IN_USE(rate_limit); - return 0; - } - void inc_cq_moderation_stats() {} - virtual uint32_t get_tx_user_lkey(void *addr, size_t length) - { - NOT_IN_USE(addr); - NOT_IN_USE(length); - return LKEY_ERROR; - } - virtual uint32_t get_max_inline_data() { return 0; } - ib_ctx_handler *get_ctx(ring_user_id_t id) - { - NOT_IN_USE(id); - return nullptr; - } - virtual uint32_t get_max_send_sge(void) { return 1; } - virtual uint32_t get_max_payload_sz(void) { return 0; } - virtual uint16_t get_max_header_sz(void) { return 0; } - virtual uint32_t get_tx_lkey(ring_user_id_t id) - { - NOT_IN_USE(id); - return 0; - } - virtual bool is_tso(void) { return false; } - - inline void set_tap_data_available() { m_tap_data_available = true; } - inline void set_vf_ring(ring_slave *p_ring) { m_vf_ring = p_ring; } - inline void inc_vf_plugouts() { m_p_ring_stat->tap.n_vf_plugouts++; } - uint32_t send_doca_single(void *ptr, uint32_t len, mem_buf_desc_t *buff) - { - NOT_IN_USE(ptr); - NOT_IN_USE(len); - NOT_IN_USE(buff); - return -1; - } - uint32_t send_doca_lso(struct iovec &h, struct pbuf *p, uint16_t mss, bool is_zerocopy) - { - NOT_IN_USE(h); - NOT_IN_USE(p); - NOT_IN_USE(mss); - NOT_IN_USE(is_zerocopy); - return -1; - } - -private: - inline void return_to_global_pool(); - int prepare_flow_message(xlio_msg_flow &data, msg_flow_t flow_action, flow_tuple &flow_spec_5t); - int prepare_flow_message(xlio_msg_flow &data, msg_flow_t flow_action); - int process_element_rx(void *pv_fd_ready_array); - bool request_more_rx_buffers(); - int send_buffer(xlio_ibv_send_wr *p_send_wqe, xlio_wr_tx_packet_attr attr); - void send_status_handler(int ret, xlio_ibv_send_wr *p_send_wqe); - void tap_create(net_device_val *p_ndev); - void tap_destroy(); - 
- /* These fields are NETVSC mode specific */ - int m_tap_fd; /* file descriptor of tap device */ - ring_slave *m_vf_ring; - const uint32_t m_sysvar_qp_compensation_level; - descq_t m_rx_pool; - bool m_tap_data_available; -}; - -#endif /* RING_TAP_H_ */ diff --git a/src/core/event/event_handler_manager.cpp b/src/core/event/event_handler_manager.cpp index 6fe873a09..46c16a1f0 100644 --- a/src/core/event/event_handler_manager.cpp +++ b/src/core/event/event_handler_manager.cpp @@ -1046,9 +1046,6 @@ void *event_handler_manager::thread_loop() event_handler_map_t::iterator i = m_event_handler_map.find(fd); if (i == m_event_handler_map.end()) { // No event handler - this is probably a poll_os event! - if (!g_p_fd_collection->set_immediate_os_sample(fd)) { - evh_logdbg("No event handler (fd=%d)", fd); - } continue; } diff --git a/src/core/sock/fd_collection.cpp b/src/core/sock/fd_collection.cpp index 468584f01..12772fd2c 100644 --- a/src/core/sock/fd_collection.cpp +++ b/src/core/sock/fd_collection.cpp @@ -84,9 +84,6 @@ fd_collection::fd_collection() m_p_cq_channel_map = new cq_channel_info *[m_n_fd_map_size]; memset(m_p_cq_channel_map, 0, m_n_fd_map_size * sizeof(cq_channel_info *)); - - m_p_tap_map = new ring_tap *[m_n_fd_map_size]; - memset(m_p_tap_map, 0, m_n_fd_map_size * sizeof(ring_tap *)); } fd_collection::~fd_collection() @@ -105,9 +102,6 @@ fd_collection::~fd_collection() delete[] m_p_cq_channel_map; m_p_cq_channel_map = nullptr; - delete[] m_p_tap_map; - m_p_tap_map = nullptr; - m_epfd_lst.clear_without_cleanup(); m_pending_to_remove_lst.clear_without_cleanup(); } @@ -192,11 +186,6 @@ void fd_collection::clear() m_p_cq_channel_map[fd] = nullptr; fdcoll_logdbg("destroyed cq_channel_fd=%d", fd); } - - if (m_p_tap_map[fd]) { - m_p_tap_map[fd] = nullptr; - fdcoll_logdbg("destroyed tapfd=%d", fd); - } } unlock(); @@ -390,29 +379,6 @@ int fd_collection::addepfd(int epfd, int size) return 0; } -int fd_collection::addtapfd(int tapfd, ring_tap *p_ring) -{ - 
fdcoll_logfunc("tapfd=%d, p_ring=%p", tapfd, p_ring); - - if (!is_valid_fd(tapfd)) { - return -1; - } - - lock(); - - if (get_tapfd(tapfd)) { - fdcoll_logwarn("[tapfd=%d] already exist in the collection (ring %p)", tapfd, - get_tapfd(tapfd)); - return -1; - } - - m_p_tap_map[tapfd] = p_ring; - - unlock(); - - return 0; -} - int fd_collection::add_cq_channel_fd(int cq_ch_fd, ring *p_ring) { fdcoll_logfunc("cq_ch_fd=%d", cq_ch_fd); @@ -524,17 +490,6 @@ int fd_collection::del_cq_channel_fd(int fd, bool b_cleanup /*=false*/) return del(fd, b_cleanup, m_p_cq_channel_map); } -void fd_collection::del_tapfd(int fd) -{ - if (!is_valid_fd(fd)) { - return; - } - - lock(); - m_p_tap_map[fd] = nullptr; - unlock(); -} - template int fd_collection::del(int fd, bool b_cleanup, cls **map_type) { fdcoll_logfunc("fd=%d%s", fd, diff --git a/src/core/sock/fd_collection.h b/src/core/sock/fd_collection.h index 59c97cbc3..1403b97f4 100644 --- a/src/core/sock/fd_collection.h +++ b/src/core/sock/fd_collection.h @@ -38,7 +38,6 @@ #include #include "vlogger/vlogger.h" -#include "dev/ring_tap.h" #include "event/event_handler_manager.h" #include "event/timer_handler.h" #include "sock/cleanable_obj.h" @@ -104,14 +103,6 @@ class fd_collection : private lock_mutex_recursive { */ int add_cq_channel_fd(int cq_ch_fd, ring *p_ring); - /** - * Add tap fd index to tap_map. - * @param tapfd: tap fd. - * @param p_ring: pointer to ring owner of the tap. - * @return 0 on success, -1 on failure. - */ - int addtapfd(int tapfd, ring_tap *p_ring); - /** * Remove sockinfo. */ @@ -128,18 +119,9 @@ class fd_collection : private lock_mutex_recursive { */ int del_cq_channel_fd(int fd, bool b_cleanup = false); - /** - * Remove tap_fd from tap_map. - */ - void del_tapfd(int fd); - void set_socket(int fd, sockinfo *si) { m_p_sockfd_map[fd] = si; } void clear_socket(int fd) { m_p_sockfd_map[fd] = nullptr; } void clear_sockets(); - /** - * Call set_immediate_os_sample of the input fd. 
- */ - inline bool set_immediate_os_sample(int fd); inline void reuse_sockfd(int fd, sockinfo *p_sfd_api_obj); inline void destroy_sockfd(sockinfo *p_sfd_api_obj); @@ -158,11 +140,6 @@ class fd_collection : private lock_mutex_recursive { */ inline cq_channel_info *get_cq_channel_fd(int fd); - /** - * Get rint_tap by tap fd. - */ - inline ring_tap *get_tapfd(int fd); - /** * Get the fd_map size. */ @@ -213,7 +190,6 @@ class fd_collection : private lock_mutex_recursive { sockinfo **m_p_sockfd_map; epfd_info **m_p_epfd_map; cq_channel_info **m_p_cq_channel_map; - ring_tap **m_p_tap_map; epfd_info_list_t m_epfd_lst; // Contains fds which are in closing process @@ -251,22 +227,6 @@ template inline cls *fd_collection::get(int fd, cls **map_type) return obj; } -inline bool fd_collection::set_immediate_os_sample(int fd) -{ - ring_tap *p_ring; - - lock(); - - if ((p_ring = get_tapfd(fd))) { - p_ring->set_tap_data_available(); - unlock(); - return true; - } - - unlock(); - return false; -} - inline void fd_collection::reuse_sockfd(int fd, sockinfo *p_sfd_api_obj) { lock(); @@ -300,11 +260,6 @@ inline cq_channel_info *fd_collection::get_cq_channel_fd(int fd) return get(fd, m_p_cq_channel_map); } -inline ring_tap *fd_collection::get_tapfd(int fd) -{ - return get(fd, m_p_tap_map); -} - inline int fd_collection::get_fd_map_size() { return m_n_fd_map_size; diff --git a/src/core/util/agent.cpp b/src/core/util/agent.cpp index f4cbaafe3..f83461d58 100644 --- a/src/core/util/agent.cpp +++ b/src/core/util/agent.cpp @@ -578,53 +578,6 @@ int agent::send_msg_exit(void) return rc; } -int agent::send_msg_flow(struct xlio_msg_flow *data) -{ - int rc = 0; - struct xlio_msg_flow answer; - - if (AGENT_ACTIVE != m_state) { - return -ENODEV; - } - - if (m_sock_fd < 0) { - return -EBADF; - } - - /* wait answer */ - data->hdr.status = 1; - - /* send(XLIO_MSG_TC) in blocking manner */ - sys_call(rc, send, m_sock_fd, data, sizeof(*data), 0); - if (rc < 0) { - __log_dbg("Failed to send(XLIO_MSG_TC) 
errno %d (%s)", errno, strerror(errno)); - rc = -errno; - goto err; - } - - /* recv(XLIO_MSG_TC|ACK) in blocking manner */ - memset(&answer, 0, sizeof(answer)); - sys_call(rc, recv, m_sock_fd, &answer.hdr, sizeof(answer.hdr), 0); - if (rc < (int)sizeof(answer.hdr)) { - __log_dbg("Failed to recv(XLIO_MSG_TC) errno %d (%s)", errno, strerror(errno)); - rc = -ECONNREFUSED; - goto err; - } - - /* reply sanity check */ - if (!(answer.hdr.code == (data->hdr.code | XLIO_MSG_ACK) && answer.hdr.ver == data->hdr.ver && - answer.hdr.pid == data->hdr.pid)) { - __log_dbg("Protocol version mismatch: code = 0x%X ver = 0x%X pid = %d", answer.hdr.code, - answer.hdr.ver, answer.hdr.pid); - rc = -EPROTO; - goto err; - } - - rc = answer.hdr.status; -err: - return rc; -} - int agent::create_agent_socket(void) { int rc = 0; diff --git a/src/core/util/agent.h b/src/core/util/agent.h index 2e2c6fff6..4e847ba38 100644 --- a/src/core/util/agent.h +++ b/src/core/util/agent.h @@ -86,7 +86,6 @@ class agent { void unregister_cb(agent_cb_t fn, void *arg); int put(const void *data, size_t length, intptr_t tag); void progress(void); - int send_msg_flow(struct xlio_msg_flow *data); private: /* state of this object */ diff --git a/src/core/util/agent_def.h b/src/core/util/agent_def.h index 5acf35a21..0bf6a05e7 100644 --- a/src/core/util/agent_def.h +++ b/src/core/util/agent_def.h @@ -34,21 +34,6 @@ #ifndef _AGENT_DEF_H_ #define _AGENT_DEF_H_ -#ifndef offsetof -#define offsetof(type, member) ((uintptr_t) & ((type *)0)->member) -#endif - -#ifndef container_of -/** - * container_of - cast a member of a structure out to the containing structure - * @ptr: the pointer to the member. - * @type: the type of the container struct this is embedded in. - * @member: the name of the member within the struct. - * - */ -#define container_of(ptr, type, member) (type *)((char *)(ptr)-offsetof(type, member)) -#endif - /* List of supported messages in range 0..63 * Two bits as 6-7 are reserved. 
* 6-bit is reserved @@ -58,7 +43,6 @@ #define XLIO_MSG_INIT 0x01 #define XLIO_MSG_STATE 0x02 #define XLIO_MSG_EXIT 0x03 -#define XLIO_MSG_FLOW 0x04 #define XLIO_MSG_ACK 0x80 @@ -109,42 +93,6 @@ struct xlio_msg_state { uint8_t state; }; -enum { - XLIO_MSG_FLOW_EGRESS = 0, - XLIO_MSG_FLOW_UDP_5T = 1, - XLIO_MSG_FLOW_UDP_3T = 2, - XLIO_MSG_FLOW_TCP_5T = 3, - XLIO_MSG_FLOW_TCP_3T = 4 -}; - -typedef enum { XLIO_MSG_FLOW_ADD = 1, XLIO_MSG_FLOW_DEL = 2 } msg_flow_t; - -struct xlio_msg_flow { - struct xlio_hdr hdr; - uint8_t type; /* format of tc rule command */ - uint8_t action; /* add, del */ - uint32_t if_id; /* interface index */ - uint32_t tap_id; /* tap device index */ - struct { - struct { - uint16_t family; - uint16_t port; - union { - uint32_t ipv4; - uint8_t ipv6[16]; - } addr; - } src; - struct { - uint16_t family; - uint16_t port; - union { - uint32_t ipv4; - uint8_t ipv6[16]; - } addr; - } dst; - } flow; -}; - #pragma pack(pop) #endif /* _AGENT_DEF_H_ */ diff --git a/src/core/util/sys_vars.h b/src/core/util/sys_vars.h index 73fe6f966..61f7e6725 100644 --- a/src/core/util/sys_vars.h +++ b/src/core/util/sys_vars.h @@ -888,15 +888,11 @@ extern mce_sys_var &safe_mce_sys(); #define OPER_STATE_PARAM_FILE "/sys/class/net/%s/operstate" #define FLOW_STEERING_MGM_ENTRY_SIZE_PARAM_FILE \ "/sys/module/mlx4_core/parameters/log_num_mgm_entry_size" -#define VIRTUAL_DEVICE_FOLDER "/sys/devices/virtual/net/%s/" -#define BOND_DEVICE_FILE "/proc/net/bonding/%s" -#define NETVSC_DEVICE_CLASS_FILE "/sys/class/net/%s/device/class_id" -#define NETVSC_DEVICE_LOWER_FILE "/sys/class/net/%s/lower_%s/ifindex" -#define NETVSC_DEVICE_UPPER_FILE "/sys/class/net/%s/upper_%s/ifindex" -#define NETVSC_ID "{f8615163-df3e-46c5-913f-f2d2f965ed0e}\n" - -#define MAX_STATS_FD_NUM 1024U -#define MAX_WINDOW_SCALING 14 +#define VIRTUAL_DEVICE_FOLDER "/sys/devices/virtual/net/%s/" +#define BOND_DEVICE_FILE "/proc/net/bonding/%s" +#define DEVICE_UPPER_FILE "/sys/class/net/%s/upper_%s/ifindex" +#define 
MAX_STATS_FD_NUM 1024U +#define MAX_WINDOW_SCALING 14 #define STRQ_MIN_STRIDES_NUM 512 #define STRQ_MAX_STRIDES_NUM 65536 diff --git a/src/core/util/utils.cpp b/src/core/util/utils.cpp index 3e636ddf9..c0d64a4ac 100644 --- a/src/core/util/utils.cpp +++ b/src/core/util/utils.cpp @@ -1046,8 +1046,7 @@ bool get_bond_name(IN const char *ifname, OUT char *bond_name, IN int sz) } for (ifa = ifaddr; ifa; ifa = ifa->ifa_next) { - snprintf(upper_path, sizeof(upper_path), NETVSC_DEVICE_UPPER_FILE, base_ifname, - ifa->ifa_name); + snprintf(upper_path, sizeof(upper_path), DEVICE_UPPER_FILE, base_ifname, ifa->ifa_name); int fd = SYSCALL(open, upper_path, O_RDONLY); if (fd >= 0) { SYSCALL(close, fd); @@ -1095,56 +1094,6 @@ bool check_bond_roce_lag_exist(OUT char *bond_roce_lag_path, int sz, IN const ch return false; } -bool get_netvsc_slave(IN const char *ifname, OUT char *slave_name, OUT unsigned int &slave_flags) -{ - char netvsc_path[256]; - char base_ifname[IFNAMSIZ]; - get_base_interface_name(ifname, base_ifname, sizeof(base_ifname)); - struct ifaddrs *ifaddr, *ifa; - bool ret = false; - - if (getifaddrs(&ifaddr) == -1) { - __log_err("getifaddrs() failed (errno = %d %m)", errno); - return ret; - } - - for (ifa = ifaddr; ifa; ifa = ifa->ifa_next) { - snprintf(netvsc_path, sizeof(netvsc_path), NETVSC_DEVICE_LOWER_FILE, base_ifname, - ifa->ifa_name); - int fd = SYSCALL(open, netvsc_path, O_RDONLY); - if (fd >= 0) { - SYSCALL(close, fd); - memcpy(slave_name, ifa->ifa_name, IFNAMSIZ); - slave_flags = ifa->ifa_flags; - __log_dbg("Found slave_name = %s, slave_flags = %u", slave_name, slave_flags); - ret = true; - break; - } - } - - freeifaddrs(ifaddr); - - return ret; -} - -bool check_netvsc_device_exist(const char *ifname) -{ - int ret = -1; - char device_path[256] = {0}; - char base_ifname[IFNAMSIZ]; - get_base_interface_name(ifname, base_ifname, sizeof(base_ifname)); - sprintf(device_path, NETVSC_DEVICE_CLASS_FILE, base_ifname); - char sys_res[1024] = {0}; - if ((ret = 
priv_read_file(device_path, sys_res, sizeof(sys_res) - 1, VLOG_FUNC)) > 0) { - sys_res[ret] = '\0'; - if (strcmp(sys_res, NETVSC_ID) == 0) { - return true; - } - } - - return false; -} - /* * this function will work only for kernel > 3.14 or RH7.2 and higher */ diff --git a/src/core/util/utils.h b/src/core/util/utils.h index 6c48d13a4..4b2b5fdbf 100644 --- a/src/core/util/utils.h +++ b/src/core/util/utils.h @@ -293,8 +293,6 @@ bool get_bond_slaves_name_list(IN const char *bond_name, OUT char *slaves_list, bool check_bond_roce_lag_exist(OUT char *bond_roce_lag_path, int sz, IN const char *slave_name); bool check_device_exist(const char *ifname, const char *path); bool check_device_name_ib_name(const char *ifname, const char *ibname); -bool check_netvsc_device_exist(const char *ifname); -bool get_netvsc_slave(IN const char *ifname, OUT char *slave_name, OUT unsigned int &slave_flags); bool get_interface_oper_state(IN const char *interface_name, OUT char *slaves_list, IN int sz); bool validate_user_has_cap_net_raw_privliges(); diff --git a/src/core/util/xlio_stats.h b/src/core/util/xlio_stats.h index bb4baf1c5..fae1e8659 100644 --- a/src/core/util/xlio_stats.h +++ b/src/core/util/xlio_stats.h @@ -353,10 +353,6 @@ typedef struct { cq_stats_t cq_stats; } cq_instance_block_t; -typedef enum { RING_ETH = 0, RING_TAP } ring_type_t; - -static const char *const ring_type_str[] = {"RING_ETH", "RING_TAP"}; - // Ring stat info typedef struct { uint64_t n_rx_pkt_count; @@ -369,7 +365,6 @@ typedef struct { uint32_t n_tx_tls_contexts; uint32_t n_rx_tls_contexts; #endif /* DEFINED_UTLS */ - ring_type_t n_type; union { struct { uint64_t n_tx_tso_pkt_count; @@ -386,12 +381,6 @@ typedef struct { uint32_t n_tx_num_bufs; uint32_t n_zc_num_bufs; } simple; - struct { - char s_tap_name[IFNAMSIZ]; - uint32_t n_tap_fd; - uint32_t n_rx_buffers; - uint32_t n_vf_plugouts; - } tap; }; } ring_stats_t; diff --git a/src/stats/stats_reader.cpp b/src/stats/stats_reader.cpp index 07e3c15f6..802c5c19b 
100644 --- a/src/stats/stats_reader.cpp +++ b/src/stats/stats_reader.cpp @@ -114,7 +114,6 @@ typedef enum { e_K = 1024, e_M = 1048576 } units_t; #define FORMAT_RING_INTERRUPT "%-20s %zu / %zu [requests/received] %-3s\n" #define FORMAT_RING_MODERATION "%-20s %u / %u [frames/usec period] %-3s\n" #define FORMAT_RING_DM_STATS "%-20s %zu / %zu / %zu [kilobytes/packets/oob] %-3s\n" -#define FORMAT_RING_TAP_NAME "%-20s %s\n" #define FORMAT_RING_MASTER "%-20s %p\n" #define INTERVAL 1 @@ -370,50 +369,39 @@ void update_delta_ring_stat(ring_stats_t *p_curr_ring_stats, ring_stats_t *p_pre p_prev_ring_stats->n_rx_tls_contexts = (p_curr_ring_stats->n_rx_tls_contexts - p_prev_ring_stats->n_rx_tls_contexts) / delay; #endif /* DEFINED_UTLS */ - - if (p_prev_ring_stats->n_type == RING_TAP) { - memcpy(p_prev_ring_stats->tap.s_tap_name, p_curr_ring_stats->tap.s_tap_name, - sizeof(p_curr_ring_stats->tap.s_tap_name)); - p_prev_ring_stats->tap.n_tap_fd = p_curr_ring_stats->tap.n_tap_fd; - p_prev_ring_stats->tap.n_rx_buffers = p_curr_ring_stats->tap.n_rx_buffers; - p_prev_ring_stats->tap.n_vf_plugouts = - (p_curr_ring_stats->tap.n_vf_plugouts - p_prev_ring_stats->tap.n_vf_plugouts); - } else { - p_prev_ring_stats->simple.n_tx_tso_pkt_count = - (p_curr_ring_stats->simple.n_tx_tso_pkt_count - - p_prev_ring_stats->simple.n_tx_tso_pkt_count) / - delay; - p_prev_ring_stats->simple.n_tx_tso_byte_count = - (p_curr_ring_stats->simple.n_tx_tso_byte_count - - p_prev_ring_stats->simple.n_tx_tso_byte_count) / - delay; - p_prev_ring_stats->simple.n_rx_interrupt_received = - (p_curr_ring_stats->simple.n_rx_interrupt_received - - p_prev_ring_stats->simple.n_rx_interrupt_received) / - delay; - p_prev_ring_stats->simple.n_rx_interrupt_requests = - (p_curr_ring_stats->simple.n_rx_interrupt_requests - - p_prev_ring_stats->simple.n_rx_interrupt_requests) / - delay; - p_prev_ring_stats->simple.n_rx_cq_moderation_count = - p_curr_ring_stats->simple.n_rx_cq_moderation_count; - 
p_prev_ring_stats->simple.n_rx_cq_moderation_period = - p_curr_ring_stats->simple.n_rx_cq_moderation_period; - p_prev_ring_stats->simple.n_tx_dev_mem_allocated = - p_curr_ring_stats->simple.n_tx_dev_mem_allocated; - p_prev_ring_stats->simple.n_tx_dev_mem_byte_count = - (p_curr_ring_stats->simple.n_tx_dev_mem_byte_count - - p_prev_ring_stats->simple.n_tx_dev_mem_byte_count) / - delay; - p_prev_ring_stats->simple.n_tx_dev_mem_pkt_count = - (p_curr_ring_stats->simple.n_tx_dev_mem_pkt_count - - p_prev_ring_stats->simple.n_tx_dev_mem_pkt_count) / - delay; - p_prev_ring_stats->simple.n_tx_dev_mem_oob = - (p_curr_ring_stats->simple.n_tx_dev_mem_oob - - p_prev_ring_stats->simple.n_tx_dev_mem_oob) / - delay; - } + p_prev_ring_stats->simple.n_tx_tso_pkt_count = + (p_curr_ring_stats->simple.n_tx_tso_pkt_count - + p_prev_ring_stats->simple.n_tx_tso_pkt_count) / + delay; + p_prev_ring_stats->simple.n_tx_tso_byte_count = + (p_curr_ring_stats->simple.n_tx_tso_byte_count - + p_prev_ring_stats->simple.n_tx_tso_byte_count) / + delay; + p_prev_ring_stats->simple.n_rx_interrupt_received = + (p_curr_ring_stats->simple.n_rx_interrupt_received - + p_prev_ring_stats->simple.n_rx_interrupt_received) / + delay; + p_prev_ring_stats->simple.n_rx_interrupt_requests = + (p_curr_ring_stats->simple.n_rx_interrupt_requests - + p_prev_ring_stats->simple.n_rx_interrupt_requests) / + delay; + p_prev_ring_stats->simple.n_rx_cq_moderation_count = + p_curr_ring_stats->simple.n_rx_cq_moderation_count; + p_prev_ring_stats->simple.n_rx_cq_moderation_period = + p_curr_ring_stats->simple.n_rx_cq_moderation_period; + p_prev_ring_stats->simple.n_tx_dev_mem_allocated = + p_curr_ring_stats->simple.n_tx_dev_mem_allocated; + p_prev_ring_stats->simple.n_tx_dev_mem_byte_count = + (p_curr_ring_stats->simple.n_tx_dev_mem_byte_count - + p_prev_ring_stats->simple.n_tx_dev_mem_byte_count) / + delay; + p_prev_ring_stats->simple.n_tx_dev_mem_pkt_count = + (p_curr_ring_stats->simple.n_tx_dev_mem_pkt_count - + 
p_prev_ring_stats->simple.n_tx_dev_mem_pkt_count) / + delay; + p_prev_ring_stats->simple.n_tx_dev_mem_oob = (p_curr_ring_stats->simple.n_tx_dev_mem_oob - + p_prev_ring_stats->simple.n_tx_dev_mem_oob) / + delay; } } @@ -505,8 +493,6 @@ void print_ring_stats(ring_instance_block_t *p_ring_inst_arr) p_ring_stats = &p_ring_inst_arr[i].ring_stats; printf("======================================================\n"); - printf("\t%s=[%u]\n", ring_type_str[p_ring_stats->n_type], i); - if (p_ring_stats->p_ring_master) { printf(FORMAT_RING_MASTER, "Master:", p_ring_stats->p_ring_master); } @@ -541,46 +527,36 @@ void print_ring_stats(ring_instance_block_t *p_ring_inst_arr) } #endif /* DEFINED_UTLS */ - if (p_ring_stats->n_type == RING_TAP) { - printf(FORMAT_STATS_32bit, "Rx Buffers:", p_ring_stats->tap.n_rx_buffers); - if (p_ring_stats->tap.n_vf_plugouts) { - printf(FORMAT_STATS_32bit, "VF Plugouts:", p_ring_stats->tap.n_vf_plugouts); - } - printf(FORMAT_STATS_32bit, "Tap fd:", p_ring_stats->tap.n_tap_fd); - printf(FORMAT_RING_TAP_NAME, "Tap Device:", p_ring_stats->tap.s_tap_name); - } else { - if (p_ring_stats->simple.n_tx_tso_pkt_count || - p_ring_stats->simple.n_tx_tso_byte_count) { - printf(FORMAT_RING_PACKETS, "TSO Offload:", - p_ring_stats->simple.n_tx_tso_byte_count / BYTES_TRAFFIC_UNIT, - p_ring_stats->simple.n_tx_tso_pkt_count, post_fix); - } - if (p_ring_stats->simple.n_rx_interrupt_requests || - p_ring_stats->simple.n_rx_interrupt_received) { - printf(FORMAT_RING_INTERRUPT, - "Interrupts:", p_ring_stats->simple.n_rx_interrupt_requests, - p_ring_stats->simple.n_rx_interrupt_received, post_fix); - } - if (p_ring_stats->simple.n_rx_cq_moderation_count || - p_ring_stats->simple.n_rx_cq_moderation_period) { - printf(FORMAT_RING_MODERATION, - "Moderation:", p_ring_stats->simple.n_rx_cq_moderation_count, - p_ring_stats->simple.n_rx_cq_moderation_period, post_fix); - } - if (p_ring_stats->simple.n_tx_dev_mem_allocated) { - printf(FORMAT_STATS_32bit, - "Dev Mem Alloc:", 
p_ring_stats->simple.n_tx_dev_mem_allocated); - printf(FORMAT_RING_DM_STATS, "Dev Mem Stats:", - p_ring_stats->simple.n_tx_dev_mem_byte_count / BYTES_TRAFFIC_UNIT, - p_ring_stats->simple.n_tx_dev_mem_pkt_count, - p_ring_stats->simple.n_tx_dev_mem_oob, post_fix); - } - - printf(FORMAT_STATS_32bit, - "TX buffers inflight:", p_ring_stats->simple.n_tx_num_bufs); + if (p_ring_stats->simple.n_tx_tso_pkt_count || + p_ring_stats->simple.n_tx_tso_byte_count) { + printf(FORMAT_RING_PACKETS, "TSO Offload:", + p_ring_stats->simple.n_tx_tso_byte_count / BYTES_TRAFFIC_UNIT, + p_ring_stats->simple.n_tx_tso_pkt_count, post_fix); + } + if (p_ring_stats->simple.n_rx_interrupt_requests || + p_ring_stats->simple.n_rx_interrupt_received) { + printf(FORMAT_RING_INTERRUPT, + "Interrupts:", p_ring_stats->simple.n_rx_interrupt_requests, + p_ring_stats->simple.n_rx_interrupt_received, post_fix); + } + if (p_ring_stats->simple.n_rx_cq_moderation_count || + p_ring_stats->simple.n_rx_cq_moderation_period) { + printf(FORMAT_RING_MODERATION, + "Moderation:", p_ring_stats->simple.n_rx_cq_moderation_count, + p_ring_stats->simple.n_rx_cq_moderation_period, post_fix); + } + if (p_ring_stats->simple.n_tx_dev_mem_allocated) { printf(FORMAT_STATS_32bit, - "TX ZC buffers inflight:", p_ring_stats->simple.n_zc_num_bufs); + "Dev Mem Alloc:", p_ring_stats->simple.n_tx_dev_mem_allocated); + printf(FORMAT_RING_DM_STATS, "Dev Mem Stats:", + p_ring_stats->simple.n_tx_dev_mem_byte_count / BYTES_TRAFFIC_UNIT, + p_ring_stats->simple.n_tx_dev_mem_pkt_count, + p_ring_stats->simple.n_tx_dev_mem_oob, post_fix); } + + printf(FORMAT_STATS_32bit, "TX buffers inflight:", p_ring_stats->simple.n_tx_num_bufs); + printf(FORMAT_STATS_32bit, + "TX ZC buffers inflight:", p_ring_stats->simple.n_zc_num_bufs); } } printf("======================================================\n"); @@ -1823,20 +1799,16 @@ void zero_ring_stats(ring_stats_t *p_ring_stats) p_ring_stats->n_tx_tls_contexts = 0; p_ring_stats->n_rx_tls_contexts = 0; #endif 
/* DEFINED_UTLS */ - if (p_ring_stats->n_type == RING_TAP) { - p_ring_stats->tap.n_vf_plugouts = 0; - } else { - p_ring_stats->simple.n_tx_tso_pkt_count = 0; - p_ring_stats->simple.n_tx_tso_byte_count = 0; - p_ring_stats->simple.n_rx_interrupt_received = 0; - p_ring_stats->simple.n_rx_interrupt_requests = 0; - p_ring_stats->simple.n_tx_dropped_wqes = 0; - p_ring_stats->simple.n_tx_dev_mem_byte_count = 0; - p_ring_stats->simple.n_tx_dev_mem_pkt_count = 0; - p_ring_stats->simple.n_tx_dev_mem_oob = 0; - p_ring_stats->simple.n_tx_num_bufs = 0; - p_ring_stats->simple.n_zc_num_bufs = 0; - } + p_ring_stats->simple.n_tx_tso_pkt_count = 0; + p_ring_stats->simple.n_tx_tso_byte_count = 0; + p_ring_stats->simple.n_rx_interrupt_received = 0; + p_ring_stats->simple.n_rx_interrupt_requests = 0; + p_ring_stats->simple.n_tx_dropped_wqes = 0; + p_ring_stats->simple.n_tx_dev_mem_byte_count = 0; + p_ring_stats->simple.n_tx_dev_mem_pkt_count = 0; + p_ring_stats->simple.n_tx_dev_mem_oob = 0; + p_ring_stats->simple.n_tx_num_bufs = 0; + p_ring_stats->simple.n_zc_num_bufs = 0; } void zero_cq_stats(cq_stats_t *p_cq_stats) diff --git a/tests/gtest/Makefile.am b/tests/gtest/Makefile.am index 2bda6993b..2e2ad7d14 100644 --- a/tests/gtest/Makefile.am +++ b/tests/gtest/Makefile.am @@ -128,8 +128,7 @@ gtest_SOURCES = \ xliod/xliod_bitmap.cc \ xliod/xliod_hash.cc \ xliod/xliod_init.cc \ - xliod/xliod_state.cc \ - xliod/xliod_flow.cc + xliod/xliod_state.cc noinst_HEADERS = \ common/tap.h \ diff --git a/tests/gtest/xliod/xliod_flow.cc b/tests/gtest/xliod/xliod_flow.cc deleted file mode 100644 index 5bf8c1a1e..000000000 --- a/tests/gtest/xliod/xliod_flow.cc +++ /dev/null @@ -1,402 +0,0 @@ -/* - * SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES - * Copyright (c) 2001-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. - * - * This software is available to you under a choice of one of two - * licenses. 
You may choose to be licensed under the terms of the GNU - * General Public License (GPL) Version 2, available from the file - * COPYING in the main directory of this source tree, or the - * BSD license below: - * - * Redistribution and use in source and binary forms, with or - * without modification, are permitted provided that the following - * conditions are met: - * - * - Redistributions of source code must retain the above - * copyright notice, this list of conditions and the following - * disclaimer. - * - * - Redistributions in binary form must reproduce the above - * copyright notice, this list of conditions and the following - * disclaimer in the documentation and/or other materials - * provided with the distribution. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ - -#include "common/def.h" -#include "common/log.h" -#include "common/sys.h" -#include "common/base.h" -#include "common/cmn.h" - -#include "xliod_base.h" - -#include "src/core/util/agent_def.h" - -class xliod_flow : public xliod_base { -protected: - struct xlio_msg_flow m_data; - pid_t m_pid; - int m_if; - int m_tap; - xliod_flow() - { - - char opt_val[IF_NAMESIZE]; - socklen_t opt_len; - - m_pid = 0x464C4F57; - memset(&m_data, 0, sizeof(m_data)); - m_data.hdr.code = XLIO_MSG_FLOW; - m_data.hdr.ver = XLIO_AGENT_VER; - m_data.hdr.pid = m_pid; - - opt_val[0] = '\0'; - opt_len = sizeof(opt_val); - sys_addr2dev((struct sockaddr *)&server_addr, opt_val, opt_len); - m_if = if_nametoindex(opt_val); - sys_addr2dev((struct sockaddr *)&client_addr, opt_val, opt_len); - m_tap = if_nametoindex(opt_val); - m_data.if_id = m_if; - m_data.tap_id = m_tap; - } -}; - -/** - * @test xliod_flow.ti_1 - * @brief - * Send valid TCP 3tuple XLIO_MSG_FLOW(ADD) - * @details - */ -TEST_F(xliod_flow, ti_1) -{ - int rc = 0; - struct xlio_hdr answer; - - rc = xliod_base::msg_init(m_pid); - ASSERT_LT(0, rc); - - m_data.hdr.status = 1; - m_data.action = XLIO_MSG_FLOW_ADD; - m_data.type = XLIO_MSG_FLOW_TCP_3T; - m_data.flow.dst.family = m_family; - if (m_family == PF_INET) { - m_data.flow.dst.addr.ipv4 = ((struct sockaddr_in *)&server_addr)->sin_addr.s_addr; - } else { - memcpy(&m_data.flow.dst.addr.ipv6[0], - &((struct sockaddr_in6 *)&server_addr)->sin6_addr.s6_addr[0], - sizeof(m_data.flow.dst.addr.ipv6)); - } - m_data.flow.dst.port = htons(sys_get_port((struct sockaddr *)&server_addr)); - - errno = 0; - rc = send(m_sock_fd, &m_data, sizeof(m_data), 0); - EXPECT_EQ(0, errno); - EXPECT_EQ((int)sizeof(m_data), rc); - - memset(&answer, 0, sizeof(answer)); - rc = recv(m_sock_fd, &answer, sizeof(answer), 0); - EXPECT_EQ((int)sizeof(answer), rc); - - EXPECT_EQ((XLIO_MSG_FLOW | XLIO_MSG_ACK), answer.code); - EXPECT_LE(XLIO_AGENT_VER, answer.ver); - EXPECT_EQ(m_pid, answer.pid); - EXPECT_EQ(0, 
answer.status); - - rc = xliod_base::msg_exit(m_pid); - ASSERT_LT(0, rc); -} - -/** - * @test xliod_flow.ti_2 - * @brief - * Send valid TCP 5tuple XLIO_MSG_FLOW(ADD) - * @details - */ -TEST_F(xliod_flow, ti_2) -{ - int rc = 0; - struct xlio_hdr answer; - - rc = xliod_base::msg_init(m_pid); - ASSERT_LT(0, rc); - - m_data.hdr.status = 1; - m_data.action = XLIO_MSG_FLOW_ADD; - m_data.type = XLIO_MSG_FLOW_TCP_5T; - m_data.flow.src.family = m_family; - if (m_family == PF_INET) { - m_data.flow.src.addr.ipv4 = ((struct sockaddr_in *)&client_addr)->sin_addr.s_addr; - } else { - memcpy(&m_data.flow.src.addr.ipv6[0], - &((struct sockaddr_in6 *)&client_addr)->sin6_addr.s6_addr[0], - sizeof(m_data.flow.src.addr.ipv6)); - } - m_data.flow.src.port = htons(sys_get_port((struct sockaddr *)&client_addr)); - m_data.flow.dst.family = m_family; - if (m_family == PF_INET) { - m_data.flow.dst.addr.ipv4 = ((struct sockaddr_in *)&server_addr)->sin_addr.s_addr; - } else { - memcpy(&m_data.flow.dst.addr.ipv6[0], - &((struct sockaddr_in6 *)&server_addr)->sin6_addr.s6_addr[0], - sizeof(m_data.flow.dst.addr.ipv6)); - } - m_data.flow.dst.port = htons(sys_get_port((struct sockaddr *)&server_addr)); - - errno = 0; - rc = send(m_sock_fd, &m_data, sizeof(m_data), 0); - EXPECT_EQ(0, errno); - EXPECT_EQ((int)sizeof(m_data), rc); - - memset(&answer, 0, sizeof(answer)); - rc = recv(m_sock_fd, &answer, sizeof(answer), 0); - EXPECT_EQ((int)sizeof(answer), rc); - - EXPECT_EQ((XLIO_MSG_FLOW | XLIO_MSG_ACK), answer.code); - EXPECT_LE(XLIO_AGENT_VER, answer.ver); - EXPECT_EQ(m_pid, answer.pid); - EXPECT_EQ(0, answer.status); - - rc = xliod_base::msg_exit(m_pid); - ASSERT_LT(0, rc); -} - -/** - * @test xliod_flow.ti_3 - * @brief - * Send valid 3tuple XLIO_MSG_FLOW(ADD) and XLIO_MSG_FLOW(DEL) - * @details - */ -TEST_F(xliod_flow, ti_3) -{ - int rc = 0; - struct xlio_hdr answer; - - rc = xliod_base::msg_init(m_pid); - ASSERT_LT(0, rc); - - m_data.hdr.status = 1; - m_data.action = XLIO_MSG_FLOW_ADD; - 
m_data.type = XLIO_MSG_FLOW_TCP_3T; - m_data.flow.dst.family = m_family; - if (m_family == PF_INET) { - m_data.flow.dst.addr.ipv4 = ((struct sockaddr_in *)&server_addr)->sin_addr.s_addr; - } else { - memcpy(&m_data.flow.dst.addr.ipv6[0], - &((struct sockaddr_in6 *)&server_addr)->sin6_addr.s6_addr[0], - sizeof(m_data.flow.dst.addr.ipv6)); - } - m_data.flow.dst.port = htons(sys_get_port((struct sockaddr *)&server_addr)); - - errno = 0; - rc = send(m_sock_fd, &m_data, sizeof(m_data), 0); - EXPECT_EQ(0, errno); - EXPECT_EQ((int)sizeof(m_data), rc); - - memset(&answer, 0, sizeof(answer)); - rc = recv(m_sock_fd, &answer, sizeof(answer), 0); - EXPECT_EQ((int)sizeof(answer), rc); - - EXPECT_EQ((XLIO_MSG_FLOW | XLIO_MSG_ACK), answer.code); - EXPECT_LE(XLIO_AGENT_VER, answer.ver); - EXPECT_EQ(m_pid, answer.pid); - EXPECT_EQ(0, answer.status); - - m_data.hdr.status = 1; - m_data.action = XLIO_MSG_FLOW_DEL; - - errno = 0; - rc = send(m_sock_fd, &m_data, sizeof(m_data), 0); - EXPECT_EQ(0, errno); - EXPECT_EQ((int)sizeof(m_data), rc); - - memset(&answer, 0, sizeof(answer)); - rc = recv(m_sock_fd, &answer, sizeof(answer), 0); - EXPECT_EQ((int)sizeof(answer), rc); - - EXPECT_EQ((XLIO_MSG_FLOW | XLIO_MSG_ACK), answer.code); - EXPECT_LE(XLIO_AGENT_VER, answer.ver); - EXPECT_EQ(m_pid, answer.pid); - EXPECT_EQ(0, answer.status); - - rc = xliod_base::msg_exit(m_pid); - ASSERT_LT(0, rc); -} - -/** - * @test xliod_flow.ti_4 - * @brief - * Send valid 5tuple XLIO_MSG_FLOW(ADD) and XLIO_MSG_FLOW(DEL) - * @details - */ -TEST_F(xliod_flow, ti_4) -{ - int rc = 0; - struct xlio_hdr answer; - - rc = xliod_base::msg_init(m_pid); - ASSERT_LT(0, rc); - - m_data.hdr.status = 1; - m_data.action = XLIO_MSG_FLOW_ADD; - m_data.type = XLIO_MSG_FLOW_TCP_5T; - m_data.flow.src.family = m_family; - if (m_family == PF_INET) { - m_data.flow.src.addr.ipv4 = ((struct sockaddr_in *)&client_addr)->sin_addr.s_addr; - } else { - memcpy(&m_data.flow.src.addr.ipv6[0], - &((struct sockaddr_in6 
*)&client_addr)->sin6_addr.s6_addr[0], - sizeof(m_data.flow.src.addr.ipv6)); - } - m_data.flow.src.port = htons(sys_get_port((struct sockaddr *)&client_addr)); - m_data.flow.dst.family = m_family; - if (m_family == PF_INET) { - m_data.flow.dst.addr.ipv4 = ((struct sockaddr_in *)&server_addr)->sin_addr.s_addr; - } else { - memcpy(&m_data.flow.dst.addr.ipv6[0], - &((struct sockaddr_in6 *)&server_addr)->sin6_addr.s6_addr[0], - sizeof(m_data.flow.dst.addr.ipv6)); - } - m_data.flow.dst.port = htons(sys_get_port((struct sockaddr *)&server_addr)); - - errno = 0; - rc = send(m_sock_fd, &m_data, sizeof(m_data), 0); - EXPECT_EQ(0, errno); - EXPECT_EQ((int)sizeof(m_data), rc); - - memset(&answer, 0, sizeof(answer)); - rc = recv(m_sock_fd, &answer, sizeof(answer), 0); - EXPECT_EQ((int)sizeof(answer), rc); - - EXPECT_EQ((XLIO_MSG_FLOW | XLIO_MSG_ACK), answer.code); - EXPECT_LE(XLIO_AGENT_VER, answer.ver); - EXPECT_EQ(m_pid, answer.pid); - EXPECT_EQ(0, answer.status); - - m_data.hdr.status = 1; - m_data.action = XLIO_MSG_FLOW_DEL; - - errno = 0; - rc = send(m_sock_fd, &m_data, sizeof(m_data), 0); - EXPECT_EQ(0, errno); - EXPECT_EQ((int)sizeof(m_data), rc); - - memset(&answer, 0, sizeof(answer)); - rc = recv(m_sock_fd, &answer, sizeof(answer), 0); - EXPECT_EQ((int)sizeof(answer), rc); - - EXPECT_EQ((XLIO_MSG_FLOW | XLIO_MSG_ACK), answer.code); - EXPECT_LE(XLIO_AGENT_VER, answer.ver); - EXPECT_EQ(m_pid, answer.pid); - EXPECT_EQ(0, answer.status); - - rc = xliod_base::msg_exit(m_pid); - ASSERT_LT(0, rc); -} - -/** - * @test xliod_flow.ti_51 - * @brief - * Send valid UDP 3tuple XLIO_MSG_FLOW(ADD) - * @details - */ -TEST_F(xliod_flow, ti_5) -{ - int rc = 0; - struct xlio_hdr answer; - - rc = xliod_base::msg_init(m_pid); - ASSERT_LT(0, rc); - - m_data.hdr.status = 1; - m_data.action = XLIO_MSG_FLOW_ADD; - m_data.type = XLIO_MSG_FLOW_UDP_3T; - m_data.flow.dst.family = m_family; - if (m_family == PF_INET) { - m_data.flow.dst.addr.ipv4 = ((struct sockaddr_in 
*)&server_addr)->sin_addr.s_addr; - } else { - memcpy(&m_data.flow.dst.addr.ipv6[0], - &((struct sockaddr_in6 *)&server_addr)->sin6_addr.s6_addr[0], - sizeof(m_data.flow.dst.addr.ipv6)); - } - m_data.flow.dst.port = htons(sys_get_port((struct sockaddr *)&server_addr)); - - errno = 0; - rc = send(m_sock_fd, &m_data, sizeof(m_data), 0); - EXPECT_EQ(0, errno); - EXPECT_EQ((int)sizeof(m_data), rc); - - memset(&answer, 0, sizeof(answer)); - rc = recv(m_sock_fd, &answer, sizeof(answer), 0); - EXPECT_EQ((int)sizeof(answer), rc); - - EXPECT_EQ((XLIO_MSG_FLOW | XLIO_MSG_ACK), answer.code); - EXPECT_LE(XLIO_AGENT_VER, answer.ver); - EXPECT_EQ(m_pid, answer.pid); - EXPECT_EQ(0, answer.status); - - rc = xliod_base::msg_exit(m_pid); - ASSERT_LT(0, rc); -} - -/** - * @test xliod_flow.ti_6 - * @brief - * Send valid UDP 5tuple XLIO_MSG_FLOW(ADD) - * @details - */ -TEST_F(xliod_flow, ti_6) -{ - int rc = 0; - struct xlio_hdr answer; - - rc = xliod_base::msg_init(m_pid); - ASSERT_LT(0, rc); - - m_data.hdr.status = 1; - m_data.action = XLIO_MSG_FLOW_ADD; - m_data.type = XLIO_MSG_FLOW_UDP_5T; - m_data.flow.src.family = m_family; - if (m_family == PF_INET) { - m_data.flow.src.addr.ipv4 = ((struct sockaddr_in *)&client_addr)->sin_addr.s_addr; - } else { - memcpy(&m_data.flow.src.addr.ipv6[0], - &((struct sockaddr_in6 *)&client_addr)->sin6_addr.s6_addr[0], - sizeof(m_data.flow.src.addr.ipv6)); - } - m_data.flow.src.port = htons(sys_get_port((struct sockaddr *)&client_addr)); - m_data.flow.dst.family = m_family; - if (m_family == PF_INET) { - m_data.flow.dst.addr.ipv4 = ((struct sockaddr_in *)&server_addr)->sin_addr.s_addr; - } else { - memcpy(&m_data.flow.dst.addr.ipv6[0], - &((struct sockaddr_in6 *)&server_addr)->sin6_addr.s6_addr[0], - sizeof(m_data.flow.dst.addr.ipv6)); - } - m_data.flow.dst.port = htons(sys_get_port((struct sockaddr *)&server_addr)); - - errno = 0; - rc = send(m_sock_fd, &m_data, sizeof(m_data), 0); - EXPECT_EQ(0, errno); - EXPECT_EQ((int)sizeof(m_data), rc); - - 
memset(&answer, 0, sizeof(answer)); - rc = recv(m_sock_fd, &answer, sizeof(answer), 0); - EXPECT_EQ((int)sizeof(answer), rc); - - EXPECT_EQ((XLIO_MSG_FLOW | XLIO_MSG_ACK), answer.code); - EXPECT_LE(XLIO_AGENT_VER, answer.ver); - EXPECT_EQ(m_pid, answer.pid); - EXPECT_EQ(0, answer.status); - - rc = xliod_base::msg_exit(m_pid); - ASSERT_LT(0, rc); -} diff --git a/tools/daemon/Makefile.am b/tools/daemon/Makefile.am index 626c090f1..c7b25b849 100644 --- a/tools/daemon/Makefile.am +++ b/tools/daemon/Makefile.am @@ -14,15 +14,10 @@ xliod_SOURCES = \ loop.c \ hash.c \ store.c \ - flow.c \ message.c \ - notify.c \ - nl.c \ - tc.c + notify.c noinst_HEADERS = \ daemon.h \ hash.h \ - bitmap.h \ - nl.h \ - tc.h + bitmap.h diff --git a/tools/daemon/daemon.c b/tools/daemon/daemon.c index 644065e28..f501ef32a 100644 --- a/tools/daemon/daemon.c +++ b/tools/daemon/daemon.c @@ -45,7 +45,6 @@ #endif #include "hash.h" -#include "tc.h" #include "daemon.h" extern int proc_loop(void); @@ -267,7 +266,6 @@ static int config_def(void) daemon_cfg.notify_fd = -1; daemon_cfg.notify_dir = XLIO_AGENT_PATH; daemon_cfg.ht = NULL; - daemon_cfg.tc = NULL; return rc; } @@ -421,56 +419,3 @@ ssize_t sys_sendto(int sockfd, const void *buf, size_t len, int flags, // coverity[return_overflow:FALSE] /*Turn off coverity check for overflow*/ return nb; } - -char *sys_exec(const char *format, ...) 
-{ - static __thread char outbuf[256]; - FILE *file = NULL; - va_list va; - char *cmd; - int ret; - - /* calculate needed size for command buffer */ - va_start(va, format); - ret = vsnprintf(NULL, 0, format, va); - va_end(va); - if (ret <= 0) { - goto err; - } - - /* allocate command buffer */ - ret += 1; - cmd = malloc(ret); - if (NULL == cmd) { - goto err; - } - - /* fill command buffer */ - va_start(va, format); - ret = vsnprintf(cmd, ret, format, va); - va_end(va); - if (ret <= 0) { - free(cmd); - goto err; - } - - /* execute command */ - file = popen(cmd, "r"); - log_trace("Run command: %s\n", cmd); - free(cmd); - if (NULL == file) { - goto err; - } - - /* save output */ - memset(outbuf, 0, sizeof(outbuf)); - if ((NULL == fgets(outbuf, sizeof(outbuf) - 1, file)) && (ferror(file))) { - pclose(file); - goto err; - } - pclose(file); - - return outbuf; -err: - return NULL; -} diff --git a/tools/daemon/daemon.h b/tools/daemon/daemon.h index d0d5a9c73..6d67680c9 100644 --- a/tools/daemon/daemon.h +++ b/tools/daemon/daemon.h @@ -127,12 +127,6 @@ sys_log(LOG_INFO, "[TRACE ] " fmt, ##__VA_ARGS__); \ } while (0) -#define log_hexdump(_ptr, _size) \ - do { \ - if (daemon_cfg.opt.log_level > 5) \ - sys_hexdump((_ptr), (_size)); \ - } while (0) - /** * @struct module_cfg * @brief Configuration parameters in global values @@ -160,7 +154,6 @@ struct module_cfg { int notify_fd; const char *notify_dir; hash_t ht; - tc_t tc; struct list_head if_list; }; @@ -185,7 +178,6 @@ struct sockaddr_store { struct store_pid { pid_t pid; /**< Process id */ hash_t ht; /**< Handle to socket store */ - struct list_head flow_list; /**< List of flows */ uint32_t lib_ver; /**< Library version that the process uses */ struct timeval t_start; /**< Start time of the process */ }; @@ -202,29 +194,11 @@ struct store_fid { uint8_t state; /**< Current TCP state of the connection */ }; -/** - * @struct store_flow - * @brief Describe flow - */ -struct store_flow { - struct list_head item; /**< Link to use 
in queue */ - uint32_t handle; /**< Handle value in term of tc */ - int type; /**< Flow type */ - uint32_t if_id; /**< Interface index */ - uint32_t tap_id; /**< Tap device index */ - struct { - struct sockaddr_store src; /**< Source address */ - struct sockaddr_store dst; /**< Destination address */ - } flow; -}; - void sys_log(int level, const char *format, ...); ssize_t sys_sendto(int sockfd, const void *buf, size_t len, int flags, const struct sockaddr *dest_addr, socklen_t addrlen); -char *sys_exec(const char *format, ...); - static inline char *sys_addr2str(struct sockaddr_store *addr) { static char buf[100]; @@ -240,124 +214,4 @@ static inline char *sys_addr2str(struct sockaddr_store *addr) return addrbuf; } -static inline char *sys_ip2str(struct sockaddr_store *addr) -{ - static __thread char ipbuf[100]; - if (addr->family == AF_INET) { - inet_ntop(addr->family, &addr->addr4.sin_addr, ipbuf, sizeof(ipbuf) - 1); - } else { - inet_ntop(addr->family, &addr->addr6.sin6_addr, ipbuf, sizeof(ipbuf) - 1); - } - - return ipbuf; -} - -static inline uint32_t sys_lo_ifindex(void) -{ - static __thread uint32_t lo_ifindex = 0; - struct ifaddrs *ifaddr, *ifa; - - if (lo_ifindex > 0) { - return lo_ifindex; - } - - if (!getifaddrs(&ifaddr)) { - for (ifa = ifaddr; NULL != ifa; ifa = ifa->ifa_next) { - if (ifa->ifa_addr->sa_family == AF_INET && (ifa->ifa_flags & IFF_LOOPBACK)) { - lo_ifindex = if_nametoindex(ifa->ifa_name); - break; - } - } - freeifaddrs(ifaddr); - } - - return lo_ifindex; -} - -static inline char *sys_lo_ifname(void) -{ - static __thread char lo_ifname[IF_NAMESIZE] = {0}; - - if (lo_ifname[0] > 0) { - return lo_ifname; - } - - if (NULL == if_indextoname(sys_lo_ifindex(), lo_ifname)) { - lo_ifname[0] = 0; - } - - return lo_ifname; -} - -static inline int sys_iplocal(uint32_t addr) -{ - int rc = 0; - struct ifaddrs *ifaddr, *ifa; - struct sockaddr_in *sa; - - if (!getifaddrs(&ifaddr)) { - for (ifa = ifaddr; NULL != ifa; ifa = ifa->ifa_next) { - if 
(ifa->ifa_addr->sa_family == AF_INET) { - sa = (struct sockaddr_in *)ifa->ifa_addr; - if (addr == sa->sin_addr.s_addr) { - rc = 1; - break; - } - } - } - freeifaddrs(ifaddr); - } - - return rc; -} - -static inline void sys_hexdump(void *ptr, int buflen) -{ - unsigned char *buf = (unsigned char *)ptr; - char out_buf[256]; - int ret = 0; - int out_pos = 0; - int i, j; - - log_trace("dump data at %p\n", ptr); - for (i = 0; i < buflen; i += 16) { - out_pos = 0; - ret = sprintf(out_buf + out_pos, "%06x: ", i); - if (ret < 0) { - return; - } - out_pos += ret; - for (j = 0; j < 16; j++) { - if (i + j < buflen) { - ret = sprintf(out_buf + out_pos, "%02x ", buf[i + j]); - } else { - ret = sprintf(out_buf + out_pos, " "); - } - if (ret < 0) { - return; - } - out_pos += ret; - } - ret = sprintf(out_buf + out_pos, " "); - if (ret < 0) { - return; - } - out_pos += ret; - for (j = 0; j < 16; j++) { - if (i + j < buflen) { - ret = sprintf(out_buf + out_pos, "%c", isprint(buf[i + j]) ? buf[i + j] : '.'); - if (ret < 0) { - return; - } - out_pos += ret; - } - } - ret = sprintf(out_buf + out_pos, "\n"); - if (ret < 0) { - return; - } - log_trace("%s", out_buf); - } -} - #endif /* TOOLS_DAEMON_DAEMON_H_ */ diff --git a/tools/daemon/flow.c b/tools/daemon/flow.c deleted file mode 100644 index f8e4fc746..000000000 --- a/tools/daemon/flow.c +++ /dev/null @@ -1,772 +0,0 @@ -/* - * SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES - * Copyright (c) 2001-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. - * - * This software is available to you under a choice of one of two - * licenses. 
You may choose to be licensed under the terms of the GNU - * General Public License (GPL) Version 2, available from the file - * COPYING in the main directory of this source tree, or the - * BSD license below: - * - * Redistribution and use in source and binary forms, with or - * without modification, are permitted provided that the following - * conditions are met: - * - * - Redistributions of source code must retain the above - * copyright notice, this list of conditions and the following - * disclaimer. - * - * - Redistributions in binary form must reproduce the above - * copyright notice, this list of conditions and the following - * disclaimer in the documentation and/or other materials - * provided with the distribution. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -#ifdef HAVE_CONFIG_H -#include "config.h" -#endif - -#include -#include -#include -#include -#include -#include - -#include "hash.h" -#include "bitmap.h" -#include "tc.h" -#include "daemon.h" - -/** - * @struct htid_node_t - * @brief It is an object to be used for removal workaround. 
- */ -struct htid_node_t { - struct list_head node; - int htid; - int prio; -}; - -/** - * @struct flow_ctx - * @brief It is an object described extra details for flow element - */ -struct flow_ctx { - bitmap_t *ht; /**< bitmap of used hash tables */ - struct list_head pending_list; - struct { - int prio; - int id; - } ht_prio[4]; /**< internal hash tables related priority (size should be set as number of - possible priorities) */ -}; - -/** - * @struct flow_element - * @brief Object to describe tc element - */ -struct flow_element { - struct list_head item; /**< link sequence of elements in list */ - struct list_head list; /**< head of children list */ - int ref; /**< reference counter */ - uint32_t - value[5]; /**< data - size should be enough to keep ifindex, ipv4, ipv6, flow type etc. */ - union { - struct flow_ctx *ctx; /**< data related if */ - uint32_t ht_id; /**< data related ip (16 bytes for internal ht id 16 bytes ht id) */ - }; -}; - -int open_flow(void); -void close_flow(void); -int add_flow(struct store_pid *pid_value, struct store_flow *value); -int del_flow(struct store_pid *pid_value, struct store_flow *value); - -static int add_flow_egress(struct store_pid *pid_value, struct store_flow *value); -static inline void get_htid(struct flow_ctx *ctx, int prio, int *ht_krn, int *ht_id); -static inline void free_htid(struct flow_ctx *ctx, int ht_id); -static inline void add_pending_list(pid_t pid, struct flow_ctx *ctx, int if_index, int ht_id, - int prio, int *rc); -static inline void free_pending_list(pid_t pid, struct flow_ctx *ctx, int if_index, uint16_t proto); -static inline int get_prio(struct store_flow *value); -static inline int get_bkt(struct store_flow *value); -static inline int get_protocol(struct store_flow *value); -static inline int get_node(struct list_head **list); - -int open_flow(void) -{ - int rc = 0; - - INIT_LIST_HEAD(&daemon_cfg.if_list); - daemon_cfg.tc = tc_create(); - if (NULL == daemon_cfg.tc) { - rc = -EFAULT; - 
log_error("Failed to create TC object %d (%s)\n", errno, strerror(errno)); - goto err; - } - -err: - return rc; -} - -void close_flow(void) -{ - tc_destroy(daemon_cfg.tc); - daemon_cfg.tc = NULL; -} - -int add_flow(struct store_pid *pid_value, struct store_flow *value) -{ - int rc = 0; - pid_t pid = pid_value->pid; - struct list_head *cur_head = NULL; - struct flow_element *cur_element = NULL; - struct list_head *cur_entry = NULL; - uint32_t ip = value->flow.dst.addr4.sin_addr.s_addr; - int ht = HANDLE_HT(value->handle); - int bkt = HANDLE_BKT(value->handle); - int id = HANDLE_ID(value->handle); - int ht_internal = KERNEL_HT; - struct flow_ctx *ctx = NULL; - uint16_t proto = (value->flow.dst.family == AF_INET ? ETH_P_IP : ETH_P_IPV6); - - /* Egress rules should be created for new tap device - */ - if (XLIO_MSG_FLOW_EGRESS == value->type) { - return add_flow_egress(pid_value, value); - } - - errno = 0; - - /* interface list processing - * use interface index as unique identifier - * every network interface has qdisc - * so as first step let find if interface referenced in this flow exists - * in the if_list or allocate new element related one - */ - cur_head = &daemon_cfg.if_list; - list_for_each(cur_entry, cur_head) - { - cur_element = list_entry(cur_entry, struct flow_element, item); - if (cur_element->value[0] == value->if_id) { - break; - } - } - if (cur_entry == cur_head) { - cur_element = (void *)calloc(1, sizeof(*cur_element)); - if (NULL == cur_element) { - rc = -ENOMEM; - goto err; - } - - /* Cleanup from possible failure during last daemon session */ - tc_del_qdisc(daemon_cfg.tc, value->if_id); - - /* Create filter to redirect traffic from netvsc device to tap device */ - if (tc_add_qdisc(daemon_cfg.tc, value->if_id) < 0) { - log_error("[%d] failed tc_add_qdisc() errno = %d\n", pid, errno); - free(cur_element); - rc = -EFAULT; - goto err; - } - - INIT_LIST_HEAD(&cur_element->list); - cur_element->ref = 0; - cur_element->value[0] = value->if_id; - 
cur_element->ctx = (void *)calloc(1, sizeof(*cur_element->ctx)); - if (NULL == cur_element->ctx) { - free(cur_element); - rc = -ENOMEM; - goto err; - } - /* tables from 0x800 are reserved by kernel */ - bitmap_create(&cur_element->ctx->ht, (KERNEL_HT - 1)); - if (NULL == cur_element->ctx->ht) { - free(cur_element->ctx); - free(cur_element); - rc = -ENOMEM; - goto err; - } - - /* table id = 0 is not used */ - bitmap_set(cur_element->ctx->ht, 0); - INIT_LIST_HEAD(&(cur_element->ctx->pending_list)); - list_add_tail(&cur_element->item, cur_head); - } - if (NULL == cur_element) { - rc = -EFAULT; - goto err; - } - cur_element->ref++; - ctx = cur_element->ctx; - - log_debug("[%d] add flow (if): 0x%p value: %d ref: %d\n", pid, cur_element, - cur_element->value[0], cur_element->ref); - - /* table list processing - * table id calculation is based on type and ip (ipv4/ipv6) - * so as first step let find if hash table referenced in this flow exists - * in the list of tables related specific interface or allocate new element related one - */ - cur_head = &cur_element->list; - list_for_each(cur_entry, cur_head) - { - cur_element = list_entry(cur_entry, struct flow_element, item); - if (cur_element->value[0] == (uint32_t)value->type && cur_element->value[1] == ip) { - ht = cur_element->ht_id & 0x0000FFFF; - ht_internal = (cur_element->ht_id >> 16) & 0x0000FFFF; - break; - } - } - if (cur_entry == cur_head) { - cur_element = (void *)calloc(1, sizeof(*cur_element)); - if (NULL == cur_element) { - rc = -ENOMEM; - goto err; - } - - get_htid(ctx, get_prio(value), &ht_internal, &ht); - - if (tc_add_filter_divisor(daemon_cfg.tc, value->if_id, get_prio(value), ht, proto) < 0) { - log_error("[%d] failed tc_add_filter_divisor() errno = %d\n", pid, errno); - free(cur_element); - rc = -EFAULT; - goto err; - } - if (tc_add_filter_link(daemon_cfg.tc, value->if_id, get_prio(value), ht_internal, ht, - &value->flow.dst) < 0) { - log_error("[%d] failed tc_add_filter_link() errno = %d\n", pid, 
errno); - free(cur_element); - rc = -EFAULT; - goto err; - } - - INIT_LIST_HEAD(&cur_element->list); - cur_element->ref = 0; - cur_element->value[0] = value->type; - cur_element->value[1] = ip; - cur_element->ht_id = ((ht_internal << 16) & 0xFFFF0000) | (ht & 0x0000FFFF); - list_add_tail(&cur_element->item, cur_head); - } - assert(cur_element); - cur_element->ref++; - - log_debug("[%d] add flow (ht): 0x%p value: %d:%d ref: %d\n", pid, cur_element, - cur_element->value[0], cur_element->value[1], cur_element->ref); - - /* bucket list processing - * bucket number calculation can be different for flow types - * so as first step let find if bucket referenced in this flow exists - * in the list of buckets related specific hash table or allocate new element related one - */ - cur_head = &cur_element->list; - bkt = get_bkt(value); - if (bkt < 0) { - log_warn("[%d] invalid flow bkt: %d\n", pid, bkt); - goto err; - } - list_for_each(cur_entry, cur_head) - { - cur_element = list_entry(cur_entry, struct flow_element, item); - if ((int)cur_element->value[0] == bkt) { - break; - } - } - if (cur_entry == cur_head) { - cur_element = (void *)calloc(1, sizeof(*cur_element)); - if (NULL == cur_element) { - rc = -ENOMEM; - goto err; - } - - INIT_LIST_HEAD(&cur_element->list); - cur_element->ref = 0; - cur_element->value[0] = bkt; - list_add_tail(&cur_element->item, cur_head); - } - assert(cur_element); - cur_element->ref++; - - log_debug("[%d] add flow (bkt): 0x%p value: %d ref: %d\n", pid, cur_element, - cur_element->value[0], cur_element->ref); - - /* node list processing - * node number calculation can be different for flow types - * allocate new element related one - * cur_entry pointed by cur_head can depends on internal logic and - * direct a place in the list where new entry should be inserted - */ - cur_head = &cur_element->list; - id = get_node(&cur_head); - if (id <= 0) { - log_warn("[%d] invalid flow id: %d\n", pid, id); - goto err; - } else { - cur_element = (void 
*)calloc(1, sizeof(*cur_element)); - if (NULL == cur_element) { - rc = -ENOMEM; - goto err; - } - - switch (value->type) { - case XLIO_MSG_FLOW_TCP_3T: - case XLIO_MSG_FLOW_UDP_3T: - rc = tc_add_filter_dev2tap(daemon_cfg.tc, value->if_id, get_prio(value), ht, bkt, id, - get_protocol(value), &value->flow.dst, NULL, value->tap_id); - break; - case XLIO_MSG_FLOW_TCP_5T: - case XLIO_MSG_FLOW_UDP_5T: - rc = tc_add_filter_dev2tap(daemon_cfg.tc, value->if_id, get_prio(value), ht, bkt, id, - get_protocol(value), &value->flow.dst, &value->flow.src, - value->tap_id); - break; - default: - break; - } - if (rc < 0) { - log_error("[%d] failed tc_add_filter_dev2tap() errno = %d\n", pid, errno); - free(cur_element); - rc = -EFAULT; - goto err; - } - - INIT_LIST_HEAD(&cur_element->list); - cur_element->ref = 0; - cur_element->value[0] = id; - list_add_tail(&cur_element->item, cur_head); - } - assert(cur_element); - cur_element->ref++; - - log_debug("[%d] add flow (node): 0x%p value: %d ref: %d\n", pid, cur_element, - cur_element->value[0], cur_element->ref); - - free_pending_list(pid, ctx, value->if_id, proto); - -err: - // coverity[overflow_const:FALSE] /*Turn off coverity check for overflow*/ - value->handle = HANDLE_SET(ht, bkt, id); - log_debug("[%d] add flow filter: %x:%x:%x rc=%d\n", pid, ht, bkt, id, rc); - - return rc; -} - -int del_flow(struct store_pid *pid_value, struct store_flow *value) -{ - int rc = 0; - pid_t pid = pid_value->pid; - struct list_head *cur_head = NULL; - struct flow_element *cur_element = NULL; - struct list_head *cur_entry = NULL; - struct flow_element *save_element[3]; - struct list_head *save_entry[3]; - uint32_t ip = value->flow.dst.addr4.sin_addr.s_addr; - int ht = HANDLE_HT(value->handle); - int bkt = HANDLE_BKT(value->handle); - int id = HANDLE_ID(value->handle); - int ht_internal = KERNEL_HT; - struct flow_ctx *ctx = NULL; - int found = 0; - uint16_t proto = (value->flow.dst.family == AF_INET ? 
ETH_P_IP : ETH_P_IPV6); - - errno = 0; - - /* interface list processing */ - found = 0; - cur_head = &daemon_cfg.if_list; - list_for_each(cur_entry, cur_head) - { - cur_element = list_entry(cur_entry, struct flow_element, item); - if (cur_element->value[0] == value->if_id) { - found = 1; - break; - } - } - if (found) { - assert(cur_entry != cur_head); - assert(cur_element); - ctx = cur_element->ctx; - save_element[0] = cur_element; - save_entry[0] = cur_entry; - - /* table list processing */ - found = 0; - cur_head = &cur_element->list; - list_for_each(cur_entry, cur_head) - { - cur_element = list_entry(cur_entry, struct flow_element, item); - if (cur_element->value[0] == (uint32_t)value->type && cur_element->value[1] == ip) { - ht = cur_element->ht_id & 0x0000FFFF; - ht_internal = (cur_element->ht_id >> 16) & 0x0000FFFF; - found = 1; - break; - } - } - if (found) { - assert(cur_entry != cur_head); - assert(cur_element); - save_element[1] = cur_element; - save_entry[1] = cur_entry; - - /* bucket list processing */ - found = 0; - cur_head = &cur_element->list; - list_for_each(cur_entry, cur_head) - { - cur_element = list_entry(cur_entry, struct flow_element, item); - if ((int)cur_element->value[0] == bkt) { - found = 1; - break; - } - } - if (found) { - assert(cur_entry != cur_head); - assert(cur_element); - save_element[2] = cur_element; - save_entry[2] = cur_entry; - - /* node list processing */ - found = 0; - cur_head = &cur_element->list; - list_for_each(cur_entry, cur_head) - { - cur_element = list_entry(cur_entry, struct flow_element, item); - if ((int)cur_element->value[0] == id) { - found = 1; - break; - } - } - if (found) { - assert(cur_entry != cur_head); - assert(cur_element); - - cur_element->ref--; - - log_debug("[%d] del flow (node): 0x%p value: %d:%d ref: %d\n", pid, cur_element, - cur_element->value[0], cur_element->value[1], cur_element->ref); - if (list_empty(&cur_element->list) && (cur_element->ref <= 0)) { - - if (tc_del_filter(daemon_cfg.tc, 
value->if_id, get_prio(value), ht, bkt, id, - proto) < 0) { - log_warn("[%d] failed tc_del_filter() errno = %d\n", pid, errno); - rc = -EFAULT; - } - - list_del_init(cur_entry); - free(cur_element); - } - } - - cur_element = save_element[2]; - cur_entry = save_entry[2]; - cur_element->ref--; - - log_debug("[%d] del flow (bkt): 0x%p value: %d ref: %d\n", pid, cur_element, - cur_element->value[0], cur_element->ref); - if (list_empty(&cur_element->list) && (cur_element->ref <= 0)) { - list_del_init(cur_entry); - free(cur_element); - } - } - - cur_element = save_element[1]; - cur_entry = save_entry[1]; - cur_element->ref--; - - log_debug("[%d] del flow (ht): 0x%p value: %d:%d ref: %d\n", pid, cur_element, - cur_element->value[0], cur_element->value[1], cur_element->ref); - if (list_empty(&cur_element->list) && (cur_element->ref <= 0)) { - - if (tc_del_filter(daemon_cfg.tc, value->if_id, get_prio(value), ht_internal, 0, ht, - proto) < 0) { - log_warn("[%d] failed tc_del_filter() errno = %d\n", pid, errno); - rc = -EFAULT; - } - - /* Device busy error is returned while trying to remove table in this location */ - add_pending_list(pid, ctx, value->if_id, ht, get_prio(value), &rc); - - list_del_init(cur_entry); - free(cur_element); - } - } - - cur_element = save_element[0]; - cur_entry = save_entry[0]; - cur_element->ref--; - - log_debug("[%d] del flow (if): 0x%p value: %d ref: %d\n", pid, cur_element, - cur_element->value[0], cur_element->ref); - if (list_empty(&cur_element->list) && (cur_element->ref <= 0)) { - - if (tc_del_qdisc(daemon_cfg.tc, value->if_id) < 0) { - log_warn("[%d] failed tc_del_qdisc() errno = %d\n", pid, errno); - rc = -EFAULT; - } - - assert(ctx == cur_element->ctx); - free_pending_list(pid, cur_element->ctx, value->if_id, proto); - bitmap_destroy(cur_element->ctx->ht); - free(cur_element->ctx); - ctx = NULL; - list_del_init(cur_entry); - free(cur_element); - } - } - - free_pending_list(pid, ctx, value->if_id, proto); - - log_debug("[%d] del flow 
filter: %x:%x:%x rc=%d\n", pid, ht, bkt, id, rc); - - return rc; -} - -static int add_flow_egress(struct store_pid *pid_value, struct store_flow *value) -{ - int rc = 0; - pid_t pid = pid_value->pid; - struct list_head *cur_entry = NULL; - struct store_flow *cur_flow = NULL; - int prio = 0; - - errno = 0; - - /* Egress rules should be created for new tap device - */ - list_for_each(cur_entry, &pid_value->flow_list) - { - cur_flow = list_entry(cur_entry, struct store_flow, item); - if (value->tap_id == cur_flow->tap_id) { - break; - } - } - if (cur_entry == &pid_value->flow_list) { - struct ifaddrs *ifaddr, *ifa; - int handle = 1; - - /* This cleanup is done just to support verification */ - tc_del_qdisc(daemon_cfg.tc, value->tap_id); - - /* Create rules to process ingress trafic on tap device */ - if (tc_add_qdisc(daemon_cfg.tc, value->tap_id) < 0) { - log_error("[%d] failed tc_add_qdisc() errno = %d\n", pid, errno); - rc = -EFAULT; - goto err; - } - - if (!getifaddrs(&ifaddr)) { - for (ifa = ifaddr; NULL != ifa; ifa = ifa->ifa_next) { - if ((ifa->ifa_addr->sa_family == AF_INET || ifa->ifa_addr->sa_family == AF_INET6) && - !(ifa->ifa_flags & IFF_LOOPBACK) && - value->if_id == if_nametoindex(ifa->ifa_name)) { - - /* Ignore link-local IPv6 address */ - if (ifa->ifa_addr->sa_family == AF_INET6) { - struct sockaddr_in6 *sa6 = (struct sockaddr_in6 *)(ifa->ifa_addr); - if (IN6_IS_ADDR_LINKLOCAL(&sa6->sin6_addr) || - IN6_IS_ADDR_MC_LINKLOCAL(&sa6->sin6_addr)) { - continue; - } - } - - /* Create filter to redirect traffic from tap device to lo device - * in case destination address relates netvsc - */ - if (tc_add_filter_tap2dev( - daemon_cfg.tc, value->tap_id, ++prio, handle, ifa->ifa_addr->sa_family, - (struct sockaddr_store *)ifa->ifa_addr, sys_lo_ifindex()) < 0) { - log_error("[%d] failed tc_add_filter_tap2dev() errno = %d\n", pid, errno); - rc = -EFAULT; - goto err; - } - handle++; - } - } - freeifaddrs(ifaddr); - } - - /* Create filter to redirect traffic from tap 
device to netvsc device - * Use another prio value for common filter just to separate one - * actually the same value should work too - */ - if (tc_add_filter_tap2dev(daemon_cfg.tc, value->tap_id, ++prio, handle, AF_INET, NULL, - value->if_id) < 0) { - log_error("[%d] failed tc_add_filter_tap2dev() errno = %d\n", pid, errno); - rc = -EFAULT; - goto err; - } -#if 0 /* TODO: does not work */ - if (tc_add_filter_tap2dev(daemon_cfg.tc, value->tap_id, ++prio, handle, AF_INET6, NULL, value->if_id) < 0) { - log_error("[%d] failed tc_add_filter_tap2dev() errno = %d\n", pid, errno); - rc = -EFAULT; - goto err; - } -#endif - } - -err: - - return rc; -} - -static inline void get_htid(struct flow_ctx *ctx, int prio, int *ht_krn, int *ht_id) -{ - if (ht_krn) { - int i; - int free_index = -1; - int free_id = -1; - - *ht_krn = 0; - for (i = 0; i < (int)(sizeof(ctx->ht_prio) / sizeof(ctx->ht_prio[0])); i++) { - if (ctx->ht_prio[i].prio == prio) { - *ht_krn = (KERNEL_HT + ctx->ht_prio[i].id); - break; - } - if (ctx->ht_prio[i].prio == 0) { - free_index = i; - } else { - free_id = (free_id < ctx->ht_prio[i].id ? 
ctx->ht_prio[i].id : free_id); - } - } - - if ((0 == *ht_krn) && (0 <= free_index)) { - ctx->ht_prio[free_index].prio = prio; - ctx->ht_prio[free_index].id = free_id + 1; - - *ht_krn = (KERNEL_HT + ctx->ht_prio[free_index].id); - } - } - - if (ht_id) { - *ht_id = bitmap_find_first_zero(ctx->ht); - if (*ht_id >= 0) { - bitmap_set(ctx->ht, *ht_id); - } - } -} - -static inline void free_pending_list(pid_t pid, struct flow_ctx *ctx, int if_index, uint16_t proto) -{ - struct htid_node_t *cur_element = NULL; - struct list_head *cur_entry = NULL, *tmp_entry = NULL; - - if (ctx) { - list_for_each_safe(cur_entry, tmp_entry, &ctx->pending_list) - { - cur_element = list_entry(cur_entry, struct htid_node_t, node); - - if (tc_del_filter(daemon_cfg.tc, if_index, cur_element->prio, cur_element->htid, 0, 0, - proto) < 0) { - continue; - } - - log_debug("[%d] del flow request was removed successfully: if %d htid %d prio %d\n", - pid, if_index, cur_element->htid, cur_element->prio); - - list_del_init(&cur_element->node); - free_htid(ctx, cur_element->htid); - free(cur_element); - } - } -} - -static inline void add_pending_list(pid_t pid, struct flow_ctx *ctx, int if_index, int ht_id, - int prio, int *rc) -{ - struct htid_node_t *htid_node = (void *)calloc(1, sizeof(struct htid_node_t)); - if (NULL == htid_node) { - *rc = -ENOMEM; - return; - } - - INIT_LIST_HEAD(&htid_node->node); - htid_node->htid = ht_id; - htid_node->prio = prio; - - list_add(&htid_node->node, &ctx->pending_list); - - log_debug("[%d] del flow request was added to the pending list: if %d htid %d prio %d\n", pid, - if_index, ht_id, prio); -} - -static inline void free_htid(struct flow_ctx *ctx, int ht_id) -{ - bitmap_clear(ctx->ht, ht_id); -} - -static inline int get_prio(struct store_flow *value) -{ - return value->type; -} - -static inline int get_bkt(struct store_flow *value) -{ - uint16_t port = (value->flow.dst.family == AF_INET ? 
value->flow.dst.addr4.sin_port - : value->flow.dst.addr6.sin6_port); - return ntohs(port) & 0xFF; -} - -static inline int get_protocol(struct store_flow *value) -{ - switch (value->type) { - case XLIO_MSG_FLOW_UDP_3T: - case XLIO_MSG_FLOW_UDP_5T: - return IPPROTO_UDP; - - case XLIO_MSG_FLOW_TCP_3T: - case XLIO_MSG_FLOW_TCP_5T: - return IPPROTO_TCP; - - default: - return -EINVAL; - } -} - -static inline int get_node(struct list_head **cur_head) -{ - int id = 1; - struct flow_element *cur_element = NULL; - struct list_head *cur_entry = NULL; - - /* node id logic is smart (keep list entry in ascending order) - * there are two ways as - * 1 - simply take last entry in the list and increment id value until - * maximum value is not achieved - * 2 - if last entry has maximum possible value try look for first free - * entry from start in the list - */ - if (!list_empty((*cur_head))) { - cur_entry = (*cur_head)->prev; - cur_element = list_entry(cur_entry, struct flow_element, item); - if (cur_element->value[0] < MAX_ID) { - id = cur_element->value[0] + 1; - } else { - id = 1; - list_for_each(cur_entry, (*cur_head)) - { - cur_element = list_entry(cur_entry, struct flow_element, item); - if ((int)cur_element->value[0] > id) { - *cur_head = cur_entry; - break; - } - id++; - } - } - } - - if ((0 >= id) || (id > MAX_ID)) { - return -EINVAL; - } - - return id; -} diff --git a/tools/daemon/loop.c b/tools/daemon/loop.c index 972049fa3..2f80842b6 100644 --- a/tools/daemon/loop.c +++ b/tools/daemon/loop.c @@ -42,13 +42,10 @@ #include #include "hash.h" -#include "tc.h" #include "daemon.h" extern int open_store(void); extern void close_store(void); -extern int open_flow(void); -extern void close_flow(void); extern int open_message(void); extern void close_message(void); extern int proc_message(void); @@ -73,12 +70,6 @@ int proc_loop(void) goto err; } - log_debug("setting flow ...\n"); - rc = open_flow(); - if (rc < 0) { - goto err; - } - log_debug("setting notification ...\n"); rc = 
open_notify(); if (rc < 0) { @@ -137,7 +128,6 @@ int proc_loop(void) close_message(); close_notify(); - close_flow(); close_store(); return rc; diff --git a/tools/daemon/message.c b/tools/daemon/message.c index 90b2c0444..3744732e9 100644 --- a/tools/daemon/message.c +++ b/tools/daemon/message.c @@ -44,20 +44,15 @@ #include "core/lwip/tcp.h" /* display TCP states */ #include "hash.h" -#include "tc.h" #include "daemon.h" int open_message(void); void close_message(void); int proc_message(void); -extern int add_flow(struct store_pid *pid_value, struct store_flow *value); -extern int del_flow(struct store_pid *pid_value, struct store_flow *value); - static int proc_msg_init(struct xlio_hdr *msg_hdr, size_t size, struct sockaddr_un *peeraddr); static int proc_msg_exit(struct xlio_hdr *msg_hdr, size_t size); static int proc_msg_state(struct xlio_hdr *msg_hdr, size_t size); -static int proc_msg_flow(struct xlio_hdr *msg_hdr, size_t size, struct sockaddr_un *peeraddr); int open_message(void) { @@ -163,15 +158,6 @@ int proc_message(void) case XLIO_MSG_EXIT: rc = proc_msg_exit(msg_hdr, len); break; - case XLIO_MSG_FLOW: - /* Note: special loopback logic, it - * should be added first as far as observed issue with delay - * in activation loopback filters in case two processes - * communicate locally w/o SRIOV - */ - proc_msg_flow(msg_hdr, len, NULL); - rc = proc_msg_flow(msg_hdr, len, &peeraddr); - break; default: rc = -EPROTO; log_error("Received unknown message errno %d (%s)\n", errno, strerror(errno)); @@ -223,7 +209,6 @@ static int proc_msg_init(struct xlio_hdr *msg_hdr, size_t size, struct sockaddr_ value->pid = data->hdr.pid; value->lib_ver = data->ver; gettimeofday(&value->t_start, NULL); - INIT_LIST_HEAD(&value->flow_list); value->ht = hash_create(&free, daemon_cfg.opt.max_fid_num); if (NULL == value->ht) { @@ -270,17 +255,6 @@ static int proc_msg_exit(struct xlio_hdr *msg_hdr, size_t size) pid_value = hash_get(daemon_cfg.ht, data->hdr.pid); if (pid_value) { - struct 
store_flow *flow_value = NULL; - struct list_head *cur_entry = NULL; - struct list_head *tmp_entry = NULL; - list_for_each_safe(cur_entry, tmp_entry, &pid_value->flow_list) - { - flow_value = list_entry(cur_entry, struct store_flow, item); - list_del_init(&flow_value->item); - del_flow(pid_value, flow_value); - free(flow_value); - } - hash_del(daemon_cfg.ht, pid_value->pid); } @@ -375,162 +349,3 @@ static int proc_msg_state(struct xlio_hdr *msg_hdr, size_t size) return (sizeof(*data)); } - -static int proc_msg_flow(struct xlio_hdr *msg_hdr, size_t size, struct sockaddr_un *peeraddr) -{ - int rc = 0; - struct xlio_msg_flow *data; - struct store_pid *pid_value; - struct store_flow *value = NULL; - struct store_flow *cur_flow = NULL; - struct list_head *cur_entry = NULL; - int value_new = 0; - int ack = 0; - - assert(msg_hdr); - assert((msg_hdr->code & ~XLIO_MSG_ACK) == XLIO_MSG_FLOW); - assert(size); - - data = (struct xlio_msg_flow *)msg_hdr; - if (size < sizeof(*data)) { - rc = -EBADMSG; - goto err; - } - - /* Note: special loopback logic */ - if (NULL == peeraddr && data->type == XLIO_MSG_FLOW_EGRESS) { - return 0; - } - - ack = (1 == data->hdr.status); - - pid_value = hash_get(daemon_cfg.ht, data->hdr.pid); - if (NULL == pid_value) { - /* Return success because this case can be valid - * if the process is terminated using abnormal way - * So no needs in acknowledgement. - */ - log_debug("Failed hash_get() for pid %d errno %d (%s). 
The process should be abnormal " - "terminated\n", - data->hdr.pid, errno, strerror(errno)); - return ((int)sizeof(*data)); - } - - /* Allocate memory for this value in this place - */ - value = (void *)calloc(1, sizeof(*value)); - if (NULL == value) { - rc = -ENOMEM; - goto err; - } - - value->type = data->type; - value->if_id = data->if_id; - value->tap_id = data->tap_id; - value->flow.dst.family = data->flow.dst.family; - if (value->flow.dst.family == AF_INET) { - value->flow.dst.addr4.sin_port = data->flow.dst.port; - value->flow.dst.addr4.sin_addr.s_addr = data->flow.dst.addr.ipv4; - } else { - value->flow.dst.addr6.sin6_port = data->flow.dst.port; - memcpy(&value->flow.dst.addr6.sin6_addr.s6_addr[0], &data->flow.dst.addr.ipv6[0], - sizeof(value->flow.dst.addr6.sin6_addr.s6_addr)); - } - - switch (data->type) { - case XLIO_MSG_FLOW_EGRESS: - case XLIO_MSG_FLOW_TCP_3T: - case XLIO_MSG_FLOW_UDP_3T: - break; - case XLIO_MSG_FLOW_TCP_5T: - case XLIO_MSG_FLOW_UDP_5T: - value->flow.src.family = data->flow.src.family; - if (value->flow.src.family == AF_INET) { - value->flow.src.addr4.sin_port = data->flow.src.port; - value->flow.src.addr4.sin_addr.s_addr = data->flow.src.addr.ipv4; - } else { - value->flow.src.addr6.sin6_port = data->flow.src.port; - memcpy(&value->flow.src.addr6.sin6_addr.s6_addr[0], &data->flow.src.addr.ipv6[0], - sizeof(value->flow.src.addr6.sin6_addr.s6_addr)); - } - break; - default: - log_error("Received unknown message errno %d (%s)\n", errno, strerror(errno)); - rc = -EPROTO; - goto err; - } - - /* Note: - * - special loopback logic when peeraddr is null - * - avoid useless rules creation in case expected 5t traffic is local - */ - if (NULL == peeraddr) { - value->if_id = sys_lo_ifindex(); - ack = 0; - if (value->if_id <= 0) { - rc = -EFAULT; - goto err; - } - } else if ((XLIO_MSG_FLOW_TCP_5T == data->type || XLIO_MSG_FLOW_UDP_5T == data->type) && - sys_iplocal(value->flow.src.addr4.sin_addr.s_addr)) { - rc = 0; - goto err; - } - - if 
(XLIO_MSG_FLOW_ADD == data->action) { - list_for_each(cur_entry, &pid_value->flow_list) - { - cur_flow = list_entry(cur_entry, struct store_flow, item); - if (value->type == cur_flow->type && value->if_id == cur_flow->if_id && - value->tap_id == cur_flow->tap_id && - !memcmp(&value->flow, &cur_flow->flow, sizeof(cur_flow->flow))) { - break; - } - } - if (cur_entry == &pid_value->flow_list) { - rc = add_flow(pid_value, value); - if (rc < 0) { - goto err; - } - value_new = 1; /* mark value as new to avoid releasing */ - list_add_tail(&value->item, &pid_value->flow_list); - - log_debug("[%d] add flow handle: 0x%08X type: %d if_id: %d tap_id: %d\n", - pid_value->pid, value->handle, value->type, value->if_id, value->tap_id); - } - } - - if (XLIO_MSG_FLOW_DEL == data->action) { - list_for_each(cur_entry, &pid_value->flow_list) - { - cur_flow = list_entry(cur_entry, struct store_flow, item); - if (value->type == cur_flow->type && value->if_id == cur_flow->if_id && - value->tap_id == cur_flow->tap_id && - !memcmp(&value->flow, &cur_flow->flow, sizeof(cur_flow->flow))) { - log_debug("[%d] del flow handle: 0x%08X type: %d if_id: %d tap_id: %d\n", - pid_value->pid, cur_flow->handle, cur_flow->type, cur_flow->if_id, - cur_flow->tap_id); - list_del_init(&cur_flow->item); - rc = del_flow(pid_value, cur_flow); - free(cur_flow); - break; - } - } - } - -err: - if (ack) { - data->hdr.code |= XLIO_MSG_ACK; - data->hdr.status = (rc ? 1 : 0); - if (0 > sys_sendto(daemon_cfg.sock_fd, &data->hdr, sizeof(data->hdr), 0, - (struct sockaddr *)peeraddr, sizeof(*peeraddr))) { - log_warn("Failed sendto() message errno %d (%s)\n", errno, strerror(errno)); - } - } - - if (value && !value_new) { - free(value); - } - - return (rc ? 
rc : (int)sizeof(*data)); -} diff --git a/tools/daemon/nl.c b/tools/daemon/nl.c deleted file mode 100644 index fb27bece3..000000000 --- a/tools/daemon/nl.c +++ /dev/null @@ -1,239 +0,0 @@ -/* - * SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES - * Copyright (c) 2001-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. - * - * This software is available to you under a choice of one of two - * licenses. You may choose to be licensed under the terms of the GNU - * General Public License (GPL) Version 2, available from the file - * COPYING in the main directory of this source tree, or the - * BSD license below: - * - * Redistribution and use in source and binary forms, with or - * without modification, are permitted provided that the following - * conditions are met: - * - * - Redistributions of source code must retain the above - * copyright notice, this list of conditions and the following - * disclaimer. - * - * - Redistributions in binary form must reproduce the above - * copyright notice, this list of conditions and the following - * disclaimer in the documentation and/or other materials - * provided with the distribution. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ - -#include -#include -#include -#include -#include - -#include "hash.h" -#include "tc.h" -#include "daemon.h" -#include "nl.h" - -/** - * @struct nl_object - * @brief netlink container - */ -struct nl_object { - int fd; /**< the netlink socket file descriptor used for communication */ - int seq; /**< sequence number of send operation */ - char buf[81920]; /**< buffer for receive data */ -}; - -nl_t nl_create(void) -{ - nl_t nt = NULL; - int fd = -1; - - nt = (struct nl_object *)malloc(sizeof(*nt)); - if (nt) { - int sndbuf_size = 32768; - int rcvbuf_size = 32768; - struct sockaddr_nl local; - - fd = socket(AF_NETLINK, SOCK_RAW | SOCK_CLOEXEC, NETLINK_ROUTE); - if (fd < 0) { - log_error("Unable to create a netlink socket\n"); - goto err; - } - if (setsockopt(fd, SOL_SOCKET, SO_SNDBUF, &sndbuf_size, sizeof(int))) { - log_error("Unable to set SO_SNDBUF\n"); - goto err; - } - if (setsockopt(fd, SOL_SOCKET, SO_RCVBUF, &rcvbuf_size, sizeof(int))) { - log_error("Unable to set SO_RCVBUF\n"); - goto err; - } - memset(&local, 0, sizeof(local)); - local.nl_family = AF_NETLINK; - local.nl_groups = 0; - if (bind(fd, (struct sockaddr *)&local, sizeof(local)) < 0) { - log_error("Unable to bind to the netlink socket\n"); - goto err; - } - - memset(nt, 0, sizeof(*nt)); - nt->fd = fd; - nt->seq = 0; - } - - return nt; -err: - if (fd >= 0) { - close(fd); - } - if (nt) { - free(nt); - } - nt = NULL; - - return NULL; -} - -void nl_destroy(nl_t nt) -{ - if (nt) { - close(nt->fd); - free(nt); - nt = NULL; - } -} - -int nl_send(nl_t nt, struct nlmsghdr *nl_msg) -{ - struct sockaddr_nl nladdr; - struct iovec iov; - struct msghdr msg; - int ret = -1; - - nl_msg->nlmsg_seq = nt->seq++; - - memset(&nladdr, 0, sizeof(nladdr)); - nladdr.nl_family = AF_NETLINK; - nladdr.nl_pid = 0; - nladdr.nl_groups = 0; - - iov.iov_base = nl_msg; - iov.iov_len = nl_msg->nlmsg_len; - - memset(&msg, 0, sizeof(msg)); - msg.msg_name = &nladdr; - msg.msg_namelen = sizeof(nladdr); - msg.msg_iov = &iov; - 
msg.msg_iovlen = 1; - - log_hexdump((void *)nl_msg, nl_msg->nlmsg_len); - ret = sendmsg(nt->fd, &msg, 0); - if (ret < 0) { - log_error("Failed to send netlink message: %s (%d)\n", strerror(errno), errno); - return ret; - } - - return ret; -} - -int nl_recv(nl_t nt, int (*cb)(struct nlmsghdr *, void *arg), void *arg) -{ - struct sockaddr_nl nladdr; - struct iovec iov; - struct msghdr msg; - int ret = 0; - int multipart = 0; - - memset(&nladdr, 0, sizeof(nladdr)); - - iov.iov_base = nt->buf; - iov.iov_len = sizeof(nt->buf); - - memset(&msg, 0, sizeof(msg)); - msg.msg_name = &nladdr; - msg.msg_namelen = sizeof(nladdr); - msg.msg_iov = &iov; - msg.msg_iovlen = 1; - - do { - struct nlmsghdr *nl_msg; - int recv_bytes = 0; - - recv_bytes = recvmsg(nt->fd, &msg, 0); - if (recv_bytes <= 0) { - if (errno == EAGAIN || errno == EWOULDBLOCK || errno == EINTR) { - continue; - } - return -1; - } - - for (nl_msg = (struct nlmsghdr *)nt->buf; NLMSG_OK(nl_msg, (unsigned int)recv_bytes); - nl_msg = NLMSG_NEXT(nl_msg, recv_bytes)) { - if (nl_msg->nlmsg_type == NLMSG_ERROR) { - struct nlmsgerr *err_data = NLMSG_DATA(nl_msg); - - if (err_data->error < 0) { - errno = -err_data->error; - return -1; - } - /* Ack message. */ - return 0; - } - /* Multi-part msgs and their trailing DONE message. 
*/ - if (nl_msg->nlmsg_flags & NLM_F_MULTI) { - if (nl_msg->nlmsg_type == NLMSG_DONE) { - return 0; - } - multipart = 1; - } - if (cb) { - ret = cb(nl_msg, arg); - } - } - } while (multipart || (msg.msg_flags & MSG_TRUNC)); - - return ret; -} - -void nl_attr_add(struct nlmsghdr *nl_msg, unsigned short type, const void *data, - unsigned int data_len) -{ - struct rtattr *rta; - - if ((NLMSG_ALIGN(nl_msg->nlmsg_len) + RTA_ALIGN(RTA_LENGTH(data_len))) > NLMSG_BUF) { - log_error("Message size is: %zu that exceeds limit: %d\n", - (NLMSG_ALIGN(nl_msg->nlmsg_len) + RTA_ALIGN(RTA_LENGTH(data_len))), NLMSG_BUF); - return; - } - rta = (struct rtattr *)NLMSG_TAIL(nl_msg); - rta->rta_len = RTA_LENGTH(data_len); - rta->rta_type = type; - if (data && data_len) { - memcpy(RTA_DATA(rta), data, data_len); - } - nl_msg->nlmsg_len = NLMSG_ALIGN(nl_msg->nlmsg_len) + RTA_ALIGN(rta->rta_len); -} - -struct rtattr *nl_attr_nest_start(struct nlmsghdr *nl_msg, int type) -{ - struct rtattr *nest = NLMSG_TAIL(nl_msg); - - nl_attr_add(nl_msg, type, NULL, 0); - - return nest; -} - -int nl_attr_nest_end(struct nlmsghdr *nl_msg, struct rtattr *nest) -{ - nest->rta_len = (uintptr_t)NLMSG_TAIL(nl_msg) - (uintptr_t)nest; - - return nest->rta_len; -} diff --git a/tools/daemon/nl.h b/tools/daemon/nl.h deleted file mode 100644 index 3ffd35d05..000000000 --- a/tools/daemon/nl.h +++ /dev/null @@ -1,123 +0,0 @@ -/* - * SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES - * Copyright (c) 2001-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. - * - * This software is available to you under a choice of one of two - * licenses. 
You may choose to be licensed under the terms of the GNU - * General Public License (GPL) Version 2, available from the file - * COPYING in the main directory of this source tree, or the - * BSD license below: - * - * Redistribution and use in source and binary forms, with or - * without modification, are permitted provided that the following - * conditions are met: - * - * - Redistributions of source code must retain the above - * copyright notice, this list of conditions and the following - * disclaimer. - * - * - Redistributions in binary form must reproduce the above - * copyright notice, this list of conditions and the following - * disclaimer in the documentation and/or other materials - * provided with the distribution. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -#ifndef TOOLS_DAEMON_NL_H_ -#define TOOLS_DAEMON_NL_H_ - -#include -#include - -/* The nl_t opaque data type - */ -typedef struct nl_object *nl_t; - -#define NLMSG_BUF (16384) -#define NLMSG_TAIL(nl_msg) \ - ((struct rtattr *)(((char *)(nl_msg)) + NLMSG_ALIGN((nl_msg)->nlmsg_len))) - -struct nl_req { - struct nlmsghdr hdr; - struct tcmsg msg; - char buf[NLMSG_BUF]; -}; - -/** - * Initialize a netlink object for communicating with the kernel. - * - * @return - * the newly allocated netlink object. Must be freed with nl_destory. - */ -nl_t nl_create(void); - -/** - * Destroy up a netlink socket. - * - * @param nt - * The netlink object. 
- * - * @return - * @a none - */ -void nl_destroy(nl_t nt); - -/** - * Send a message to the kernel on the netlink socket. - * - * @param nl_t nt - * The netlink object used for communication. - * @param nl_msg - * The netlink message send to the kernel. - * - * @return - * the number of sent bytes on success, -1 otherwise. - */ -int nl_send(nl_t nt, struct nlmsghdr *nl_msg); - -/** - * Receive a message from the kernel on the netlink socket. - * - * @param nl_t nt - * The netlink object used for communication. - * @param cb - * The callback function to call for each netlink message received. - * @param arg - * Custom arguments for the callback. - * - * @return - * 0 on success, -1 otherwise with errno set. - */ -int nl_recv(nl_t nt, int (*cb)(struct nlmsghdr *, void *arg), void *arg); - -/** - * Append a netlink attribute to a message. - * - * @param nl_msg - * The netlink message to parse, received from the kernel. - * @param type - * The type of attribute to append. - * @param data - * The data to append. - * @param data_len - * The length of the data to append. 
- * - * @return - * @a none - */ -void nl_attr_add(struct nlmsghdr *nl_msg, unsigned short type, const void *data, - unsigned int data_len); - -struct rtattr *nl_attr_nest_start(struct nlmsghdr *nl_msg, int type); - -int nl_attr_nest_end(struct nlmsghdr *nl_msg, struct rtattr *nest); - -#endif /* TOOLS_DAEMON_NL_H_ */ diff --git a/tools/daemon/notify.c b/tools/daemon/notify.c index 3f08d451c..1c4b3b264 100644 --- a/tools/daemon/notify.c +++ b/tools/daemon/notify.c @@ -48,7 +48,6 @@ #endif #include "hash.h" -#include "tc.h" #include "daemon.h" #ifndef KERNEL_O_LARGEFILE @@ -108,9 +107,6 @@ int open_notify(void); void close_notify(void); int proc_notify(void); -extern int add_flow(struct store_pid *pid_value, struct store_flow *value); -extern int del_flow(struct store_pid *pid_value, struct store_flow *value); - static int setup_notify(void); static int create_raw_socket(void); static int clean_process(pid_t pid); @@ -315,25 +311,8 @@ static int clean_process(pid_t pid) if (pid_value) { struct rst_info rst; struct store_fid *fid_value = NULL; - struct store_flow *flow_value = NULL; - struct list_head *cur_entry = NULL; - struct list_head *tmp_entry = NULL; int i, j; - /* Cleanup flow store */ - j = 0; - list_for_each_safe(cur_entry, tmp_entry, &pid_value->flow_list) - { - flow_value = list_entry(cur_entry, struct store_flow, item); - j++; - log_debug("[%d] #%d found handle: 0x%08X type: %d if_id: %d tap_id: %d\n", - pid_value->pid, j, flow_value->handle, flow_value->type, - flow_value->if_id, flow_value->tap_id); - list_del_init(&flow_value->item); - del_flow(pid_value, flow_value); - free(flow_value); - } - /* Cleanup fid store */ j = 0; for (i = 0; (i < hash_size(pid_value->ht)) && (j < hash_count(pid_value->ht)); diff --git a/tools/daemon/store.c b/tools/daemon/store.c index 7f438c626..73b5f63c8 100644 --- a/tools/daemon/store.c +++ b/tools/daemon/store.c @@ -41,7 +41,6 @@ #include #include "hash.h" -#include "tc.h" #include "daemon.h" int open_store(void); diff 
--git a/tools/daemon/tc.c b/tools/daemon/tc.c deleted file mode 100644 index f7f4be567..000000000 --- a/tools/daemon/tc.c +++ /dev/null @@ -1,786 +0,0 @@ -/* - * SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES - * Copyright (c) 2001-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. - * - * This software is available to you under a choice of one of two - * licenses. You may choose to be licensed under the terms of the GNU - * General Public License (GPL) Version 2, available from the file - * COPYING in the main directory of this source tree, or the - * BSD license below: - * - * Redistribution and use in source and binary forms, with or - * without modification, are permitted provided that the following - * conditions are met: - * - * - Redistributions of source code must retain the above - * copyright notice, this list of conditions and the following - * disclaimer. - * - * - Redistributions in binary form must reproduce the above - * copyright notice, this list of conditions and the following - * disclaimer in the documentation and/or other materials - * provided with the distribution. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ - -#include -#include -#include -#include -#include - -#include "hash.h" -#include "tc.h" -#include "daemon.h" -#include "nl.h" - -/* Traffic control usage method - * 0 - tc application - * 1 - netlink api - */ -#define USE_NETLINK 1 - -/** - * @struct tc_object - * @brief tc container - */ -struct tc_object { - nl_t nl; /**< netlink object */ - struct nl_req req; /**< netlink request storage */ -}; - -#if defined(USE_NETLINK) && (USE_NETLINK == 1) -/* Use iproute2 / tc implementation as a reference - * to pack data for specific attribute - */ -static int pack_key(struct tc_u32_sel *sel, uint32_t key, uint32_t mask, int off, int offmask); -static int pack_key8(struct tc_u32_sel *sel, uint32_t key, uint32_t mask, int off, int offmask); -static int pack_key16(struct tc_u32_sel *sel, uint32_t key, uint32_t mask, int off, int offmask); -static int pack_key32(struct tc_u32_sel *sel, uint32_t key, uint32_t mask, int off, int offmask); -static int pack_ip6(struct tc_u32_sel *sel, uint8_t *addr, uint32_t mask, int off, int offmask); -#endif /* USE_NETLINK */ - -tc_t tc_create(void) -{ - tc_t tc = NULL; - - tc = (struct tc_object *)malloc(sizeof(*tc)); - if (tc) { - tc->nl = nl_create(); - if (NULL == tc->nl) { - log_error("Unable to create a netlink object\n"); - goto err; - } - memset(&tc->req, 0, sizeof(tc->req)); - } - - return tc; -err: - free(tc); - tc = NULL; - - return NULL; -} - -void tc_destroy(tc_t tc) -{ - if (tc) { - nl_destroy(tc->nl); - free(tc); - tc = NULL; - } -} - -void tc_req(tc_t tc, int ifindex, uint16_t proto, uint16_t type, uint16_t flags, - struct tc_qdisc qdisc) -{ - memset(&tc->req, 0, sizeof(tc->req)); - - tc->req.hdr.nlmsg_len = NLMSG_LENGTH(sizeof(tc->req.msg)); - tc->req.hdr.nlmsg_type = type; - tc->req.hdr.nlmsg_flags = (flags ? 
flags : (NLM_F_REQUEST | NLM_F_ACK)); - tc->req.hdr.nlmsg_pid = 0; /* to communicate kernel */ - tc->req.hdr.nlmsg_seq = 0; /* update during send */ - - tc->req.msg.tcm_family = AF_UNSPEC; - tc->req.msg.tcm_ifindex = ifindex; - tc->req.msg.tcm_handle = qdisc.handle; - tc->req.msg.tcm_parent = qdisc.parent; - tc->req.msg.tcm_info = TC_H_MAKE(qdisc.prio << 16, htons(proto)); -} - -int tc_add_qdisc(tc_t tc, int ifindex) -{ - int rc = 0; - - log_debug("add qdisc using if_id: %d\n", ifindex); - -#if defined(USE_NETLINK) && (USE_NETLINK == 1) - struct tc_qdisc qdisc = {TC_H_MAKE(TC_H_INGRESS, 0), TC_H_INGRESS, 0}; - struct rtattr *opts = NULL; - - tc_req(tc, ifindex, 0, RTM_NEWQDISC, (NLM_F_REQUEST | NLM_F_ACK | NLM_F_EXCL | NLM_F_CREATE), - qdisc); - - nl_attr_add(&tc->req.hdr, TCA_KIND, "ingress", sizeof("ingress")); - - opts = nl_attr_nest_start(&tc->req.hdr, TCA_OPTIONS); - nl_attr_nest_end(&tc->req.hdr, opts); - - if (nl_send(tc->nl, &tc->req.hdr) < 0) { - rc = -1; - goto err; - } - if ((nl_recv(tc->nl, NULL, NULL) < 0) && (errno != EEXIST)) { - rc = -1; - goto err; - } -#else - char *out_buf = NULL; - char if_name[IF_NAMESIZE]; - - NOT_IN_USE(tc); - - if (NULL == if_indextoname(ifindex, if_name)) { - rc = -errno; - goto err; - } - - out_buf = sys_exec("tc qdisc add dev %s handle ffff: ingress " - "> /dev/null 2>&1 || echo $?", - if_name); - if (NULL == out_buf || (out_buf[0] != '\0' && out_buf[0] != '0')) { - rc = -1; - goto err; - } -#endif /* USE_NETLINK */ - -err: - return rc; -} - -int tc_del_qdisc(tc_t tc, int ifindex) -{ - int rc = 0; - - log_debug("remove qdisc using if_id: %d\n", ifindex); - -#if defined(USE_NETLINK) && (USE_NETLINK == 1) - struct tc_qdisc qdisc = {TC_H_MAKE(TC_H_INGRESS, 0), TC_H_INGRESS, 0}; - struct rtattr *opts = NULL; - - tc_req(tc, ifindex, 0, RTM_DELQDISC, 0, qdisc); - - nl_attr_add(&tc->req.hdr, TCA_KIND, "ingress", sizeof("ingress")); - - opts = nl_attr_nest_start(&tc->req.hdr, TCA_OPTIONS); - nl_attr_nest_end(&tc->req.hdr, opts); 
- - if (nl_send(tc->nl, &tc->req.hdr) < 0) { - rc = -1; - goto err; - } - if ((nl_recv(tc->nl, NULL, NULL) < 0) && (errno != EINVAL)) { - rc = -1; - goto err; - } -#else - char *out_buf = NULL; - char if_name[IF_NAMESIZE]; - - NOT_IN_USE(tc); - - if (NULL == if_indextoname(ifindex, if_name)) { - rc = -errno; - goto err; - } - - out_buf = sys_exec("tc qdisc del dev %s handle ffff: ingress " - "> /dev/null 2>&1 || echo $?", - if_name); - if (NULL == out_buf || (out_buf[0] != '\0' && out_buf[0] != '0')) { - rc = -1; - goto err; - } -#endif /* USE_NETLINK */ - -err: - return rc; -} - -int tc_add_filter_divisor(tc_t tc, int ifindex, int prio, int ht, uint16_t proto) -{ - int rc = 0; - - log_debug("apply filter divisor using if_id: %d proto: %04hx\n", ifindex, proto); - -#if defined(USE_NETLINK) && (USE_NETLINK == 1) - struct tc_qdisc qdisc = {HANDLE_SET(ht, 0, 0), 0xffff0000, prio}; - char opt_kind[] = "u32"; - uint32_t opt_divisor = 256; - struct rtattr *opts = NULL; - - tc_req(tc, ifindex, proto, RTM_NEWTFILTER, - (NLM_F_REQUEST | NLM_F_ACK | NLM_F_EXCL | NLM_F_CREATE), qdisc); - - nl_attr_add(&tc->req.hdr, TCA_KIND, opt_kind, sizeof(opt_kind)); - - opts = nl_attr_nest_start(&tc->req.hdr, TCA_OPTIONS); - nl_attr_add(&tc->req.hdr, TCA_U32_DIVISOR, &opt_divisor, sizeof(opt_divisor)); - nl_attr_nest_end(&tc->req.hdr, opts); - - if (nl_send(tc->nl, &tc->req.hdr) < 0) { - rc = -1; - goto err; - } - if (nl_recv(tc->nl, NULL, NULL) < 0) { - rc = -1; - goto err; - } -#else - char *out_buf = NULL; - char if_name[IF_NAMESIZE]; - - NOT_IN_USE(tc); - - if (NULL == if_indextoname(ifindex, if_name)) { - rc = -errno; - goto err; - } - - out_buf = - sys_exec("tc filter add dev %s parent ffff: prio %d handle %x: protocol %s u32 divisor 256 " - "> /dev/null 2>&1 || echo $?", - if_name, prio, ht, (proto == ETH_P_IP ? 
"ip" : "ipv6")); - if (NULL == out_buf || (out_buf[0] != '\0' && out_buf[0] != '0')) { - rc = -1; - goto err; - } -#endif /* USE_NETLINK */ - -err: - return rc; -} - -int tc_add_filter_link(tc_t tc, int ifindex, int prio, int ht, int id, struct sockaddr_store *ip) -{ - int rc = 0; - uint16_t proto = (ip->family == AF_INET ? ETH_P_IP : ETH_P_IPV6); - - log_debug("add link filter using if_id: %d\n", ifindex); - -#if defined(USE_NETLINK) && (USE_NETLINK == 1) - struct tc_qdisc qdisc = {HANDLE_SET(0, 0, id), 0xffff0000, prio}; - char opt_kind[] = "u32"; - uint32_t opt_link = HANDLE_SET(id, 0, 0); - uint32_t opt_ht = HANDLE_SET(ht, 0, 0); - struct rtattr *opts = NULL; - struct { - union { - struct tc_u32_sel sel; - uint8_t pad[sizeof(struct tc_u32_sel) + sizeof(struct tc_u32_key) * 20U]; - }; - } opt_sel; - - tc_req(tc, ifindex, proto, RTM_NEWTFILTER, - (NLM_F_REQUEST | NLM_F_ACK | NLM_F_EXCL | NLM_F_CREATE), qdisc); - - nl_attr_add(&tc->req.hdr, TCA_KIND, opt_kind, sizeof(opt_kind)); - - opts = nl_attr_nest_start(&tc->req.hdr, TCA_OPTIONS); - nl_attr_add(&tc->req.hdr, TCA_U32_LINK, &opt_link, sizeof(opt_link)); - nl_attr_add(&tc->req.hdr, TCA_U32_HASH, &opt_ht, sizeof(opt_ht)); - memset(&opt_sel, 0, sizeof(opt_sel)); - /* hashkey option: - * mask: 0x000000ff - * at: 20 - */ - opt_sel.sel.hmask = htonl(0x000000ff); - opt_sel.sel.hoff = 20; - /* match option for ip protocol: - * dst: 16 - * addr/mask: ip/0xffffffff - */ - if (proto == ETH_P_IP) { - pack_key32(&opt_sel.sel, ntohl(ip->addr4.sin_addr.s_addr), 0xffffffff, 16, 0); - } else { - pack_ip6(&opt_sel.sel, ip->addr6.sin6_addr.s6_addr, 0xffffffff, 16, 0); - } - nl_attr_add(&tc->req.hdr, TCA_U32_SEL, &opt_sel, - sizeof(opt_sel.sel) + opt_sel.sel.nkeys * sizeof(opt_sel.sel.keys[0])); - nl_attr_nest_end(&tc->req.hdr, opts); - - if (nl_send(tc->nl, &tc->req.hdr) < 0) { - rc = -1; - goto err; - } - if (nl_recv(tc->nl, NULL, NULL) < 0) { - rc = -1; - goto err; - } -#else - char *out_buf = NULL; - char if_name[IF_NAMESIZE]; 
- - NOT_IN_USE(tc); - - if (NULL == if_indextoname(ifindex, if_name)) { - rc = -errno; - goto err; - } - - out_buf = sys_exec("tc filter add dev %s protocol %s parent ffff: prio %d handle ::%x u32 " - "ht %x:: match ip dst %s hashkey mask 0x000000ff at 20 link %x: " - "> /dev/null 2>&1 || echo $?", - if_name, (proto == ETH_P_IP ? "ip" : "ipv6"), prio, id, ht, - (proto == ETH_P_IP ? "ip" : "ip6"), sys_ip2str(ip), id); - if (NULL == out_buf || (out_buf[0] != '\0' && out_buf[0] != '0')) { - rc = -1; - goto err; - } -#endif /* USE_NETLINK */ - -err: - return rc; -} - -int tc_add_filter_tap2dev(tc_t tc, int ifindex, int prio, int id, uint16_t proto, - struct sockaddr_store *ip, int ifindex_to) -{ - int rc = 0; - - proto = (proto == AF_INET ? ETH_P_IP : ETH_P_IPV6); - log_debug("add filter to redirect traffic from if_id: %d to if_id: %d\n", ifindex, ifindex_to); - -#if defined(USE_NETLINK) && (USE_NETLINK == 1) - struct tc_qdisc qdisc = {HANDLE_SET(0, 0, id), 0xffff0000, prio}; - char opt_kind[] = "u32"; - uint32_t opt_ht = HANDLE_SET(0x800, 0, 0); - struct rtattr *opts = NULL; - struct { - union { - struct tc_u32_sel sel; - uint8_t pad[sizeof(struct tc_u32_sel) + sizeof(struct tc_u32_key) * 20U]; - }; - } opt_sel; - - tc_req(tc, ifindex, proto, RTM_NEWTFILTER, - (NLM_F_REQUEST | NLM_F_ACK | NLM_F_EXCL | NLM_F_CREATE), qdisc); - - nl_attr_add(&tc->req.hdr, TCA_KIND, opt_kind, sizeof(opt_kind)); - - /* [filter] options filling */ - opts = nl_attr_nest_start(&tc->req.hdr, TCA_OPTIONS); - { - struct rtattr *opts_action = NULL; - - /* [action] options filling */ - opts_action = nl_attr_nest_start(&tc->req.hdr, TCA_U32_ACT); - { - int opt_prio = 0; - char opt_act_kind[] = "mirred"; - struct rtattr *opts_action_prio = NULL; - - /* [mirred] options filling */ - opts_action_prio = nl_attr_nest_start(&tc->req.hdr, ++opt_prio); - nl_attr_add(&tc->req.hdr, TCA_ACT_KIND, opt_act_kind, sizeof(opt_act_kind)); - { - struct rtattr *opts_action_prio_mirred = NULL; - struct tc_mirred 
opt_mirred; - - opts_action_prio_mirred = nl_attr_nest_start(&tc->req.hdr, TCA_ACT_OPTIONS); - memset(&opt_mirred, 0, sizeof(opt_mirred)); - opt_mirred.eaction = TCA_EGRESS_REDIR; - opt_mirred.action = TC_ACT_STOLEN; - opt_mirred.ifindex = ifindex_to; - nl_attr_add(&tc->req.hdr, TCA_MIRRED_PARMS, &opt_mirred, sizeof(opt_mirred)); - - nl_attr_nest_end(&tc->req.hdr, opts_action_prio_mirred); - } - - nl_attr_nest_end(&tc->req.hdr, opts_action_prio); - } - - nl_attr_nest_end(&tc->req.hdr, opts_action); - } - - nl_attr_add(&tc->req.hdr, TCA_U32_HASH, &opt_ht, sizeof(opt_ht)); - memset(&opt_sel, 0, sizeof(opt_sel)); - /* match option for ip protocol: - * dst: 16 - * addr/mask: addr/0xffffffff - */ - if (ip) { - if (proto == ETH_P_IP) { - pack_key32(&opt_sel.sel, ntohl(ip->addr4.sin_addr.s_addr), 0xffffffff, 16, 0); - } else { - pack_ip6(&opt_sel.sel, ip->addr6.sin6_addr.s6_addr, 0xffffffff, 16, 0); - } - } else { - if (proto == ETH_P_IP) { - pack_key32(&opt_sel.sel, ntohl(0), 0, 0, 0); - } else { - uint32_t s_addr[4] = {0, 0, 0, 0}; - pack_ip6(&opt_sel.sel, (uint8_t *)s_addr, 0xffffffff, 0, 0); - } - } - opt_sel.sel.flags |= TC_U32_TERMINAL; - nl_attr_add(&tc->req.hdr, TCA_U32_SEL, &opt_sel, - sizeof(opt_sel.sel) + opt_sel.sel.nkeys * sizeof(opt_sel.sel.keys[0])); - - nl_attr_nest_end(&tc->req.hdr, opts); - - if (nl_send(tc->nl, &tc->req.hdr) < 0) { - rc = -1; - goto err; - } - if (nl_recv(tc->nl, NULL, NULL) < 0) { - rc = -1; - goto err; - } -#else - char *out_buf = NULL; - char if_name[IF_NAMESIZE]; - char tap_name[IF_NAMESIZE]; - - NOT_IN_USE(tc); - - if (NULL == if_indextoname(ifindex_to, if_name)) { - rc = -errno; - goto err; - } - - if (NULL == if_indextoname(ifindex, tap_name)) { - rc = -errno; - goto err; - } - - if (ip) { - out_buf = sys_exec("tc filter add dev %s protocol %s parent ffff: prio %d " - "handle ::%d u32 ht 800:: " - "match %s dst %s action mirred egress redirect dev %s " - "> /dev/null 2>&1 || echo $?", - tap_name, (proto == ETH_P_IP ? 
"ip" : "ipv6"), prio, id, - (proto == ETH_P_IP ? "ip" : "ip6"), sys_ip2str(ip), if_name); - } else { - out_buf = sys_exec("tc filter add dev %s protocol %s parent ffff: prio %d " - "handle ::%d u32 ht 800:: " - "match u8 0 0 action mirred egress redirect dev %s " - "> /dev/null 2>&1 || echo $?", - tap_name, (proto == ETH_P_IP ? "ip" : "ipv6"), prio, id, if_name); - } - if (NULL == out_buf || (out_buf[0] != '\0' && out_buf[0] != '0')) { - rc = -1; - goto err; - } -#endif /* USE_NETLINK */ - -err: - return rc; -} - -int tc_add_filter_dev2tap(tc_t tc, int ifindex, int prio, int ht, int bkt, int id, int l4_proto, - struct sockaddr_store *dst_ip, struct sockaddr_store *src_ip, - int ifindex_to) -{ - int rc = 0; - uint16_t proto = (dst_ip->family == AF_INET ? ETH_P_IP : ETH_P_IPV6); - uint16_t dst_port = - (dst_ip->family == AF_INET ? dst_ip->addr4.sin_port : dst_ip->addr6.sin6_port); - uint16_t src_port = - (src_ip ? (src_ip->family == AF_INET ? src_ip->addr4.sin_port : src_ip->addr6.sin6_port) - : 0); - - log_debug("add filter to redirect traffic from if_id: %d to if_id: %d\n", ifindex, ifindex_to); - -#if defined(USE_NETLINK) && (USE_NETLINK == 1) - struct tc_qdisc qdisc = {HANDLE_SET(0, 0, id), 0xffff0000, prio}; - char opt_kind[] = "u32"; - uint32_t opt_ht = HANDLE_SET(ht, bkt, 0); - struct rtattr *opts = NULL; - struct { - union { - struct tc_u32_sel sel; - uint8_t pad[sizeof(struct tc_u32_sel) + sizeof(struct tc_u32_key) * 10U]; - }; - } opt_sel; - - tc_req(tc, ifindex, proto, RTM_NEWTFILTER, - (NLM_F_REQUEST | NLM_F_ACK | NLM_F_EXCL | NLM_F_CREATE), qdisc); - - nl_attr_add(&tc->req.hdr, TCA_KIND, opt_kind, sizeof(opt_kind)); - - /* [filter] options filling */ - opts = nl_attr_nest_start(&tc->req.hdr, TCA_OPTIONS); - { - struct rtattr *opts_action = NULL; - - /* [action] options filling */ - opts_action = nl_attr_nest_start(&tc->req.hdr, TCA_U32_ACT); - { - int opt_prio = 0; - char opt_act_kind[] = "mirred"; - struct rtattr *opts_action_prio = NULL; - - /* 
[mirred] options filling */ - opts_action_prio = nl_attr_nest_start(&tc->req.hdr, ++opt_prio); - nl_attr_add(&tc->req.hdr, TCA_ACT_KIND, opt_act_kind, sizeof(opt_act_kind)); - { - struct rtattr *opts_action_prio_mirred = NULL; - struct tc_mirred opt_mirred; - - opts_action_prio_mirred = nl_attr_nest_start(&tc->req.hdr, TCA_ACT_OPTIONS); - memset(&opt_mirred, 0, sizeof(opt_mirred)); - opt_mirred.eaction = TCA_EGRESS_REDIR; - opt_mirred.action = TC_ACT_STOLEN; - opt_mirred.ifindex = ifindex_to; - nl_attr_add(&tc->req.hdr, TCA_MIRRED_PARMS, &opt_mirred, sizeof(opt_mirred)); - - nl_attr_nest_end(&tc->req.hdr, opts_action_prio_mirred); - } - - nl_attr_nest_end(&tc->req.hdr, opts_action_prio); - } - - nl_attr_nest_end(&tc->req.hdr, opts_action); - } - - nl_attr_add(&tc->req.hdr, TCA_U32_HASH, &opt_ht, sizeof(opt_ht)); - memset(&opt_sel, 0, sizeof(opt_sel)); - /* [match] protocol option */ - pack_key8(&opt_sel.sel, l4_proto, 0xff, 9, 0); - /* [match] nofrag option */ - pack_key16(&opt_sel.sel, 0, 0x3fff, 6, 0); - if (src_ip) { - /* [match] src option */ - if (proto == ETH_P_IP) { - pack_key32(&opt_sel.sel, ntohl(src_ip->addr4.sin_addr.s_addr), 0xffffffff, 12, 0); - } else { - pack_ip6(&opt_sel.sel, src_ip->addr6.sin6_addr.s6_addr, 0xffffffff, 12, 0); - } - /* [match] sport option */ - pack_key16(&opt_sel.sel, ntohs(src_port), 0xffff, 20, 0); - } - /* [match] dst option */ - if (proto == ETH_P_IP) { - pack_key32(&opt_sel.sel, ntohl(dst_ip->addr4.sin_addr.s_addr), 0xffffffff, 16, 0); - } else { - pack_ip6(&opt_sel.sel, dst_ip->addr6.sin6_addr.s6_addr, 0xffffffff, 16, 0); - } - /* [match] dport option */ - pack_key16(&opt_sel.sel, ntohs(dst_port), 0xffff, 22, 0); - opt_sel.sel.flags |= TC_U32_TERMINAL; - nl_attr_add(&tc->req.hdr, TCA_U32_SEL, &opt_sel, - sizeof(opt_sel.sel) + opt_sel.sel.nkeys * sizeof(opt_sel.sel.keys[0])); - - nl_attr_nest_end(&tc->req.hdr, opts); - - if (nl_send(tc->nl, &tc->req.hdr) < 0) { - rc = -1; - goto err; - } - if (nl_recv(tc->nl, NULL, NULL) < 0) 
{ - rc = -1; - goto err; - } -#else - char *out_buf = NULL; - char if_name[IF_NAMESIZE]; - char tap_name[IF_NAMESIZE]; - char str_tmp[100]; - - NOT_IN_USE(tc); - - if (NULL == if_indextoname(ifindex, if_name)) { - rc = -errno; - goto err; - } - - if (NULL == if_indextoname(ifindex_to, tap_name)) { - rc = -errno; - goto err; - } - - if (src_ip) { - strncpy(str_tmp, sys_ip2str(src_ip), sizeof(str_tmp)); - str_tmp[sizeof(str_tmp) - 1] = '\0'; - out_buf = sys_exec("tc filter add dev %s parent ffff: protocol %s " - "prio %d handle ::%x u32 ht %x:%x: " - "match %s protocol %d 0xff " - "match %s nofrag " - "match %s src %s match ip sport %d 0xffff " - "match %s dst %s match ip dport %d 0xffff " - "action mirred egress redirect dev %s " - "> /dev/null 2>&1 || echo $?", - if_name, (proto == ETH_P_IP ? "ip" : "ipv6"), prio, id, ht, bkt, - (proto == ETH_P_IP ? "ip" : "ip6"), l4_proto, - (proto == ETH_P_IP ? "ip" : "ip6"), (proto == ETH_P_IP ? "ip" : "ip6"), - str_tmp, src_port, (proto == ETH_P_IP ? "ip" : "ip6"), - sys_ip2str(dst_ip), ntohs(dst_port), tap_name); - } else { - out_buf = sys_exec("tc filter add dev %s parent ffff: protocol %s " - "prio %d handle ::%x u32 ht %x:%x: " - "match %s protocol %d 0xff " - "match %s nofrag " - "match %s dst %s match ip dport %d 0xffff " - "action mirred egress redirect dev %s " - "> /dev/null 2>&1 || echo $?", - if_name, (proto == ETH_P_IP ? "ip" : "ipv6"), prio, id, ht, bkt, - (proto == ETH_P_IP ? "ip" : "ip6"), l4_proto, - (proto == ETH_P_IP ? "ip" : "ip6"), (proto == ETH_P_IP ? 
"ip" : "ip6"), - sys_ip2str(dst_ip), ntohs(dst_port), tap_name); - } - if (NULL == out_buf || (out_buf[0] != '\0' && out_buf[0] != '0')) { - rc = -1; - goto err; - } -#endif /* USE_NETLINK */ - -err: - return rc; -} - -int tc_del_filter(tc_t tc, int ifindex, int prio, int ht, int bkt, int id, uint16_t proto) -{ - int rc = 0; - - log_debug("remove filter for if_id: %d proto: %04hx\n", ifindex, proto); - -#if defined(USE_NETLINK) && (USE_NETLINK == 1) - struct tc_qdisc qdisc = {HANDLE_SET(ht, bkt, id), 0xffff0000, prio}; - char opt_kind[] = "u32"; - - tc_req(tc, ifindex, proto, RTM_DELTFILTER, 0, qdisc); - - nl_attr_add(&tc->req.hdr, TCA_KIND, opt_kind, sizeof(opt_kind)); - - if (nl_send(tc->nl, &tc->req.hdr) < 0) { - rc = -1; - goto err; - } - if (nl_recv(tc->nl, NULL, NULL) < 0) { - rc = -1; - goto err; - } -#else - char *out_buf = NULL; - char if_name[IF_NAMESIZE]; - - NOT_IN_USE(tc); - - if (NULL == if_indextoname(ifindex, if_name)) { - rc = -errno; - goto err; - } - - out_buf = sys_exec("tc filter del dev %s parent ffff: protocol %s prio %d handle %x:%x:%x u32 " - "> /dev/null 2>&1 || echo $?", - if_name, (proto == ETH_P_IP ? 
"ip" : "ipv6"), prio, ht, bkt, id); - if (NULL == out_buf || (out_buf[0] != '\0' && out_buf[0] != '0')) { - rc = -1; - goto err; - } -#endif /* USE_NETLINK */ - -err: - return rc; -} - -#if defined(USE_NETLINK) && (USE_NETLINK == 1) -static int pack_key(struct tc_u32_sel *sel, uint32_t key, uint32_t mask, int off, int offmask) -{ - int i; - - key &= mask; - - for (i = 0; i < sel->nkeys; i++) { - if ((sel->keys[i].off == off) && (sel->keys[i].offmask == offmask)) { - uint32_t intersect = mask & sel->keys[i].mask; - - if ((key ^ sel->keys[i].val) & intersect) { - return -1; - } - sel->keys[i].val |= key; - sel->keys[i].mask |= mask; - return 0; - } - } - - if (off % 4) { - return -1; - } - sel->keys[sel->nkeys].val = key; - sel->keys[sel->nkeys].mask = mask; - sel->keys[sel->nkeys].off = off; - sel->keys[sel->nkeys].offmask = offmask; - sel->nkeys++; - - return 0; -} - -static int pack_key8(struct tc_u32_sel *sel, uint32_t key, uint32_t mask, int off, int offmask) -{ - if ((off & 3) == 0) { - key <<= 24; - mask <<= 24; - } else if ((off & 3) == 1) { - key <<= 16; - mask <<= 16; - } else if ((off & 3) == 2) { - key <<= 8; - mask <<= 8; - } - off &= ~3; - key = htonl(key); - mask = htonl(mask); - - return pack_key(sel, key, mask, off, offmask); -} - -static int pack_key16(struct tc_u32_sel *sel, uint32_t key, uint32_t mask, int off, int offmask) -{ - if ((off & 3) == 0) { - key <<= 16; - mask <<= 16; - } - off &= ~3; - key = htonl(key); - mask = htonl(mask); - - return pack_key(sel, key, mask, off, offmask); -} - -static int pack_key32(struct tc_u32_sel *sel, uint32_t key, uint32_t mask, int off, int offmask) -{ - key = htonl(key); - mask = htonl(mask); - - return pack_key(sel, key, mask, off, offmask); -} - -static int pack_ip6(struct tc_u32_sel *sel, uint8_t *addr, uint32_t mask, int off, int offmask) -{ - int ret = 0; - int i = 0; - uint32_t key = 0; - - for (i = 0; i < 4; i++) { - key = htonl(((uint32_t *)addr)[i]); - mask = htonl(mask); - ret = pack_key(sel, key, 
mask, off + 4 * (i - 1), offmask); - if (ret) { - return ret; - } - } - - return ret; -} -#endif /* USE_NETLINK */ diff --git a/tools/daemon/tc.h b/tools/daemon/tc.h deleted file mode 100644 index a4f294453..000000000 --- a/tools/daemon/tc.h +++ /dev/null @@ -1,252 +0,0 @@ -/* - * SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES - * Copyright (c) 2001-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. - * - * This software is available to you under a choice of one of two - * licenses. You may choose to be licensed under the terms of the GNU - * General Public License (GPL) Version 2, available from the file - * COPYING in the main directory of this source tree, or the - * BSD license below: - * - * Redistribution and use in source and binary forms, with or - * without modification, are permitted provided that the following - * conditions are met: - * - * - Redistributions of source code must retain the above - * copyright notice, this list of conditions and the following - * disclaimer. - * - * - Redistributions in binary form must reproduce the above - * copyright notice, this list of conditions and the following - * disclaimer in the documentation and/or other materials - * provided with the distribution. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ - -#ifndef TOOLS_DAEMON_TC_H_ -#define TOOLS_DAEMON_TC_H_ - -#include /* for the TC_H_* macros */ -#include -#include -#include - -struct sockaddr_store; - -/* The tc_t opaque data type - */ -typedef struct tc_object *tc_t; - -struct tc_qdisc { - uint32_t handle; - uint32_t parent; - int prio; -}; - -#define KERNEL_HT 0x800 -#define MAX_BKT 0xFF -#define MAX_ID 0xFFE -#define HANDLE_INVALID (uint32_t)(-1) - -#define HANDLE_SET(ht, bkt, id) \ - ((((uint32_t)(ht) << 20) & 0xFFF00000) | (((uint32_t)(bkt) << 12) & 0x000FF000) | \ - (((uint32_t)(id) << 0) & 0x00000FFF)) - -#define HANDLE_HT(value) ((((uint32_t)(value)) & 0xFFF00000) >> 20) /* 12bits by offset 20 */ -#define HANDLE_BKT(value) ((((uint32_t)(value)) & 0x000FF000) >> 12) /* 8bits by offset 12 */ -#define HANDLE_ID(value) ((((uint32_t)(value)) & 0x00000FFF) >> 0) /* 12bits by offset 0 */ - -/** - * Initialize a tc object. - * - * @return - * the newly allocated netlink object. Must be freed with nl_destory. - */ -tc_t tc_create(void); - -/** - * Destroy up a tc object. - * - * @param tc - * The tc object. - * - * @return - * @a none - */ -void tc_destroy(tc_t tc); - -/** - * Initialize a TC request. - * - * @param[in] tc - * The TC object. - * @param[in] ifindex - * The netdevice ifindex where the rule will be applied. - * @param[in] proto - * Protocol (IPv4/IPv6). - * @param[in] type - * The type of TC message to create (RTM_NEWTFILTER, RTM_NEWQDISC, etc.). - * @param[in] flags - * Overrides the default netlink flags for this msg with those specified. - * @param[in] qdisc - * Set qdisc data. - * - * @return - * @a none - */ -void tc_req(tc_t tc, int ifindex, uint16_t proto, uint16_t type, uint16_t flags, - struct tc_qdisc qdisc); - -/** - * Add qdisc as a TC request. - * - * @param[in] tc - * The TC object. - * @param[in] ifindex - * The netdevice ifindex where the rule will be applied. - * - * @return - * 0 on success, -1 otherwise with errno set. 
- */ -int tc_add_qdisc(tc_t tc, int ifindex); - -/** - * Remove qdisc as a TC request. - * - * @param[in] tc - * The TC object. - * @param[in] ifindex - * The netdevice ifindex where the rule will be applied. - * - * @return - * 0 on success, -1 otherwise with errno set. - */ -int tc_del_qdisc(tc_t tc, int ifindex); - -/** - * Add filter divisor for hash tables as a TC request. - * - * @param[in] tc - * The TC object. - * @param[in] ifindex - * The netdevice ifindex where the rule will be applied. - * @param[in] prio - * Priority value. - * @param[in] ht - * Hash table index. - * @param[in] proto - * Protocol (IPv4/IPv6). - * - * @return - * 0 on success, -1 otherwise with errno set. - */ -int tc_add_filter_divisor(tc_t tc, int ifindex, int prio, int ht, uint16_t proto); - -/** - * Add filter link as a TC request. - * - * @param[in] tc - * The TC object. - * @param[in] ifindex - * The netdevice ifindex where the rule will be applied. - * @param[in] prio - * Priority value. - * @param[in] ht - * Hash table index. - * @param[in] id - * Index in link table. - * @param[in] ip - * Destination ip address. - * - * @return - * 0 on success, -1 otherwise with errno set. - */ -int tc_add_filter_link(tc_t tc, int ifindex, int prio, int ht, int id, struct sockaddr_store *ip); - -/** - * Add filter to redirect traffic from tap device - * to Interface device as TC request. - * - * @param[in] tc - * The TC object. - * @param[in] ifindex - * The tap device ifindex. - * @param[in] prio - * Priority value. - * @param[in] id - * Item index. - * @param[in] proto - * Protocol (IPv4/IPv6). - * @param[in] ip - * Destination ip. - * @param[in] ifindex - * The netdevice ifindex where the rule will be applied. - * - * @return - * 0 on success, -1 otherwise with errno set. 
- */ -int tc_add_filter_tap2dev(tc_t tc, int ifindex, int prio, int id, uint16_t proto, - struct sockaddr_store *ip, int ifindex_to); - -/** - * Add filter to redirect traffic from ethernet device - * to tap device using 3tuple or 5tuple as TC request. - * - * @param[in] tc - * The TC object. - * @param[in] ifindex - * The netdevice ifindex where the rule will be applied. - * @param[in] prio - * Priority value. - * @param[in] ht - * Hash table index. - * @param[in] id - * Item index. - * @param[in] l4_proto - * Protocol type as tcp, udp etc. - * @param[in] dst_ip - * Destination ip. - * @param[in] src_ip - * Source ip. - * @param[in] ifindex - * The tap device ifindex. - * - * @return - * 0 on success, -1 otherwise with errno set. - */ -int tc_add_filter_dev2tap(tc_t tc, int ifindex, int prio, int ht, int bkt, int id, int l4_proto, - struct sockaddr_store *dst_ip, struct sockaddr_store *src_ip, - int ifindex_to); - -/** - * Remove specific filter as a TC request. - * - * @param[in] tc - * The TC object. - * @param[in] ifindex - * The netdevice ifindex where the rule will be applied. - * @param[in] prio - * Priority value. - * @param[in] ht - * Hash table index. - * @param[in] bkt - * Bucket index. - * @param[in] id - * Item index. - * @param[in] proto - * Protocol (IPv4/IPv6). - * - * @return - * 0 on success, -1 otherwise with errno set. 
- */ -int tc_del_filter(tc_t tc, int ifindex, int prio, int ht, int bkt, int id, uint16_t proto); - -#endif /* TOOLS_DAEMON_TC_H_ */ From ad58715b6a248f73feba6aaa8535c7d6009e3794 Mon Sep 17 00:00:00 2001 From: Alexander Grissik Date: Wed, 6 Nov 2024 16:26:02 +0000 Subject: [PATCH 2/7] issue: 4082814 Fix TX migration for DOCA DP Attempt TX migration as part of DOCA data path Signed-off-by: Alexander Grissik --- src/core/sock/sockinfo_tcp.cpp | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/src/core/sock/sockinfo_tcp.cpp b/src/core/sock/sockinfo_tcp.cpp index 4e68c9e03..1bbb164ab 100644 --- a/src/core/sock/sockinfo_tcp.cpp +++ b/src/core/sock/sockinfo_tcp.cpp @@ -1356,6 +1356,13 @@ err_t sockinfo_tcp::ip_output_doca(struct pbuf *p, struct tcp_seg *seg, void *v_ } else { ret = p_dst->doca_slow_path(p, flags, pcb->mss, p_si_tcp->m_so_ratelimit); } + + if (unlikely(safe_mce_sys().ring_migration_ratio_tx > 0)) { // Condition for cache optimization + if (p_dst->try_migrate_ring_tx(p_si_tcp->m_tcp_con_lock.get_lock_base())) { + IF_STATS_O(p_si_tcp, p_si_tcp->m_p_socket_stats->counters.n_tx_migrations++); + } + } + return (ret > 0 ? ERR_OK : ERR_WOULDBLOCK); } From f1cc7199f7379c97bb03f20b39b3fa10a4299ceb Mon Sep 17 00:00:00 2001 From: Alexander Grissik Date: Mon, 11 Nov 2024 15:07:59 +0000 Subject: [PATCH 3/7] issue: 4154211 Simplifying ring_stats class Removing the 'simple' union substructure used to differentitate ring_tap. 
Signed-off-by: Alexander Grissik --- src/core/dev/dm_mgr.cpp | 8 +-- src/core/dev/ring_simple.cpp | 20 +++--- src/core/dev/ring_simple.h | 4 +- src/core/util/xlio_stats.h | 30 ++++---- src/stats/stats_reader.cpp | 128 +++++++++++++++++------------------ 5 files changed, 93 insertions(+), 97 deletions(-) diff --git a/src/core/dev/dm_mgr.cpp b/src/core/dev/dm_mgr.cpp index 2fe60e4f8..cd25aa5f8 100644 --- a/src/core/dev/dm_mgr.cpp +++ b/src/core/dev/dm_mgr.cpp @@ -113,7 +113,7 @@ bool dm_mgr::allocate_resources(ib_ctx_handler *ib_ctx, ring_stats_t *ring_stats } m_allocation = allocation_size; - m_p_ring_stat->simple.n_tx_dev_mem_allocated = m_allocation; + m_p_ring_stat->n_tx_dev_mem_allocated = m_allocation; dm_logdbg("Device memory allocation completed successfully! device[%s] bytes[%zu] dm_mr " "handle[%d] dm_mr lkey[%d]", @@ -244,8 +244,8 @@ bool dm_mgr::copy_data(struct mlx5_wqe_data_seg *seg, uint8_t *src, uint32_t len m_used += dev_mem_length; // Update On Device Memory statistics - m_p_ring_stat->simple.n_tx_dev_mem_pkt_count++; - m_p_ring_stat->simple.n_tx_dev_mem_byte_count += length; + m_p_ring_stat->n_tx_dev_mem_pkt_count++; + m_p_ring_stat->n_tx_dev_mem_byte_count += length; NOT_IN_USE(continuous_left); dm_logfunc("Send completed successfully! 
Buffer[%p] length[%d] length_aligned_8[%d] " @@ -259,7 +259,7 @@ bool dm_mgr::copy_data(struct mlx5_wqe_data_seg *seg, uint8_t *src, uint32_t len "head[%zu] used[%zu]", buff, length, length_aligned_8, continuous_left, m_head, m_used); - m_p_ring_stat->simple.n_tx_dev_mem_oob++; + m_p_ring_stat->n_tx_dev_mem_oob++; return false; } diff --git a/src/core/dev/ring_simple.cpp b/src/core/dev/ring_simple.cpp index 492f5fccb..b5b135fef 100644 --- a/src/core/dev/ring_simple.cpp +++ b/src/core/dev/ring_simple.cpp @@ -412,7 +412,7 @@ bool ring_simple::request_notification(cq_type_t cq_type) { if (likely(CQT_RX == cq_type)) { std::lock_guard lock(m_lock_ring_rx); - ++m_p_ring_stat->simple.n_rx_interrupt_requests; + ++m_p_ring_stat->n_rx_interrupt_requests; return (!safe_mce_sys().doca_rx ? m_p_cq_mgr_rx->request_notification() : m_hqrx->request_notification()); } @@ -425,7 +425,7 @@ bool ring_simple::request_notification(cq_type_t cq_type) void ring_simple::clear_rx_notification() { std::lock_guard lock(m_lock_ring_rx); - ++m_p_ring_stat->simple.n_rx_interrupt_received; + ++m_p_ring_stat->n_rx_interrupt_received; if (!safe_mce_sys().doca_rx) { m_p_cq_mgr_rx->wait_for_notification(); } else { @@ -715,7 +715,7 @@ inline int ring_simple::send_buffer(xlio_ibv_send_wr *p_send_wqe, xlio_wr_tx_pac ring_logdbg("Silent packet drop, SQ is full!"); ret = -1; reinterpret_cast(p_send_wqe->wr_id)->p_next_desc = nullptr; - ++m_p_ring_stat->simple.n_tx_dropped_wqes; + ++m_p_ring_stat->n_tx_dropped_wqes; } return ret; } @@ -880,7 +880,7 @@ void ring_simple::init_tx_buffers(uint32_t count) { request_more_tx_buffers(PBUF_RAM, count, m_tx_lkey); m_tx_num_bufs = m_tx_pool.size(); - m_p_ring_stat->simple.n_tx_num_bufs = m_tx_num_bufs; + m_p_ring_stat->n_tx_num_bufs = m_tx_num_bufs; } void ring_simple::inc_cq_moderation_stats() @@ -904,10 +904,10 @@ mem_buf_desc_t *ring_simple::get_tx_buffers(pbuf_type type, uint32_t n_num_mem_b */ if (type == PBUF_ZEROCOPY) { m_zc_num_bufs += count; - 
m_p_ring_stat->simple.n_zc_num_bufs = m_zc_num_bufs; + m_p_ring_stat->n_zc_num_bufs = m_zc_num_bufs; } else { m_tx_num_bufs += count; - m_p_ring_stat->simple.n_tx_num_bufs = m_tx_num_bufs; + m_p_ring_stat->n_tx_num_bufs = m_tx_num_bufs; } } @@ -942,14 +942,14 @@ void ring_simple::return_to_global_pool() m_tx_num_bufs >= RING_TX_BUFS_COMPENSATE * 2)) { int return_bufs = m_tx_pool.size() / 2; m_tx_num_bufs -= return_bufs; - m_p_ring_stat->simple.n_tx_num_bufs = m_tx_num_bufs; + m_p_ring_stat->n_tx_num_bufs = m_tx_num_bufs; g_buffer_pool_tx->put_buffers_thread_safe(&m_tx_pool, return_bufs); } if (unlikely(m_zc_pool.size() > (m_zc_num_bufs / 2) && m_zc_num_bufs >= RING_TX_BUFS_COMPENSATE * 2)) { int return_bufs = m_zc_pool.size() / 2; m_zc_num_bufs -= return_bufs; - m_p_ring_stat->simple.n_zc_num_bufs = m_zc_num_bufs; + m_p_ring_stat->n_zc_num_bufs = m_zc_num_bufs; g_buffer_pool_zc->put_buffers_thread_safe(&m_zc_pool, return_bufs); } } @@ -1034,8 +1034,8 @@ void ring_simple::modify_cq_moderation(uint32_t period, uint32_t count) m_cq_moderation_info.period = period; m_cq_moderation_info.count = count; - m_p_ring_stat->simple.n_rx_cq_moderation_period = period; - m_p_ring_stat->simple.n_rx_cq_moderation_count = count; + m_p_ring_stat->n_rx_cq_moderation_period = period; + m_p_ring_stat->n_rx_cq_moderation_count = count; // todo all cqs or just active? what about HA? 
if (!safe_mce_sys().doca_rx) { diff --git a/src/core/dev/ring_simple.h b/src/core/dev/ring_simple.h index ac52e3f35..e091922c1 100644 --- a/src/core/dev/ring_simple.h +++ b/src/core/dev/ring_simple.h @@ -130,8 +130,8 @@ class ring_simple : public ring_slave { void update_tso_stats(uint64_t bytes) { - ++m_p_ring_stat->simple.n_tx_tso_pkt_count; - m_p_ring_stat->simple.n_tx_tso_byte_count += bytes; + ++m_p_ring_stat->n_tx_tso_pkt_count; + m_p_ring_stat->n_tx_tso_byte_count += bytes; } #ifdef DEFINED_UTLS diff --git a/src/core/util/xlio_stats.h b/src/core/util/xlio_stats.h index fae1e8659..4446b2be0 100644 --- a/src/core/util/xlio_stats.h +++ b/src/core/util/xlio_stats.h @@ -365,23 +365,19 @@ typedef struct { uint32_t n_tx_tls_contexts; uint32_t n_rx_tls_contexts; #endif /* DEFINED_UTLS */ - union { - struct { - uint64_t n_tx_tso_pkt_count; - uint64_t n_tx_tso_byte_count; - uint64_t n_rx_interrupt_requests; - uint64_t n_rx_interrupt_received; - uint32_t n_rx_cq_moderation_count; - uint32_t n_rx_cq_moderation_period; - uint64_t n_tx_dropped_wqes; - uint64_t n_tx_dev_mem_pkt_count; - uint64_t n_tx_dev_mem_byte_count; - uint64_t n_tx_dev_mem_oob; - uint32_t n_tx_dev_mem_allocated; - uint32_t n_tx_num_bufs; - uint32_t n_zc_num_bufs; - } simple; - }; + uint64_t n_tx_tso_pkt_count; + uint64_t n_tx_tso_byte_count; + uint64_t n_rx_interrupt_requests; + uint64_t n_rx_interrupt_received; + uint32_t n_rx_cq_moderation_count; + uint32_t n_rx_cq_moderation_period; + uint64_t n_tx_dropped_wqes; + uint64_t n_tx_dev_mem_pkt_count; + uint64_t n_tx_dev_mem_byte_count; + uint64_t n_tx_dev_mem_oob; + uint32_t n_tx_dev_mem_allocated; + uint32_t n_tx_num_bufs; + uint32_t n_zc_num_bufs; } ring_stats_t; typedef struct { diff --git a/src/stats/stats_reader.cpp b/src/stats/stats_reader.cpp index 802c5c19b..95d454e49 100644 --- a/src/stats/stats_reader.cpp +++ b/src/stats/stats_reader.cpp @@ -353,15 +353,15 @@ void update_delta_ring_stat(ring_stats_t *p_curr_ring_stats, ring_stats_t *p_pre 
(p_curr_ring_stats->n_tx_pkt_count - p_prev_ring_stats->n_tx_pkt_count) / delay; p_prev_ring_stats->n_tx_retransmits = (p_curr_ring_stats->n_tx_retransmits - p_prev_ring_stats->n_tx_retransmits) / delay; - p_prev_ring_stats->simple.n_tx_dropped_wqes = - (p_curr_ring_stats->simple.n_tx_dropped_wqes - - p_prev_ring_stats->simple.n_tx_dropped_wqes) / + p_prev_ring_stats->n_tx_dropped_wqes = + (p_curr_ring_stats->n_tx_dropped_wqes - + p_prev_ring_stats->n_tx_dropped_wqes) / delay; - p_prev_ring_stats->simple.n_tx_num_bufs = - (p_curr_ring_stats->simple.n_tx_num_bufs - p_prev_ring_stats->simple.n_tx_num_bufs) / + p_prev_ring_stats->n_tx_num_bufs = + (p_curr_ring_stats->n_tx_num_bufs - p_prev_ring_stats->n_tx_num_bufs) / delay; - p_prev_ring_stats->simple.n_zc_num_bufs = - (p_curr_ring_stats->simple.n_zc_num_bufs - p_prev_ring_stats->simple.n_zc_num_bufs) / + p_prev_ring_stats->n_zc_num_bufs = + (p_curr_ring_stats->n_zc_num_bufs - p_prev_ring_stats->n_zc_num_bufs) / delay; #ifdef DEFINED_UTLS p_prev_ring_stats->n_tx_tls_contexts = @@ -369,38 +369,38 @@ void update_delta_ring_stat(ring_stats_t *p_curr_ring_stats, ring_stats_t *p_pre p_prev_ring_stats->n_rx_tls_contexts = (p_curr_ring_stats->n_rx_tls_contexts - p_prev_ring_stats->n_rx_tls_contexts) / delay; #endif /* DEFINED_UTLS */ - p_prev_ring_stats->simple.n_tx_tso_pkt_count = - (p_curr_ring_stats->simple.n_tx_tso_pkt_count - - p_prev_ring_stats->simple.n_tx_tso_pkt_count) / + p_prev_ring_stats->n_tx_tso_pkt_count = + (p_curr_ring_stats->n_tx_tso_pkt_count - + p_prev_ring_stats->n_tx_tso_pkt_count) / delay; - p_prev_ring_stats->simple.n_tx_tso_byte_count = - (p_curr_ring_stats->simple.n_tx_tso_byte_count - - p_prev_ring_stats->simple.n_tx_tso_byte_count) / + p_prev_ring_stats->n_tx_tso_byte_count = + (p_curr_ring_stats->n_tx_tso_byte_count - + p_prev_ring_stats->n_tx_tso_byte_count) / delay; - p_prev_ring_stats->simple.n_rx_interrupt_received = - (p_curr_ring_stats->simple.n_rx_interrupt_received - - 
p_prev_ring_stats->simple.n_rx_interrupt_received) / + p_prev_ring_stats->n_rx_interrupt_received = + (p_curr_ring_stats->n_rx_interrupt_received - + p_prev_ring_stats->n_rx_interrupt_received) / delay; - p_prev_ring_stats->simple.n_rx_interrupt_requests = - (p_curr_ring_stats->simple.n_rx_interrupt_requests - - p_prev_ring_stats->simple.n_rx_interrupt_requests) / + p_prev_ring_stats->n_rx_interrupt_requests = + (p_curr_ring_stats->n_rx_interrupt_requests - + p_prev_ring_stats->n_rx_interrupt_requests) / delay; - p_prev_ring_stats->simple.n_rx_cq_moderation_count = - p_curr_ring_stats->simple.n_rx_cq_moderation_count; - p_prev_ring_stats->simple.n_rx_cq_moderation_period = - p_curr_ring_stats->simple.n_rx_cq_moderation_period; - p_prev_ring_stats->simple.n_tx_dev_mem_allocated = - p_curr_ring_stats->simple.n_tx_dev_mem_allocated; - p_prev_ring_stats->simple.n_tx_dev_mem_byte_count = - (p_curr_ring_stats->simple.n_tx_dev_mem_byte_count - - p_prev_ring_stats->simple.n_tx_dev_mem_byte_count) / + p_prev_ring_stats->n_rx_cq_moderation_count = + p_curr_ring_stats->n_rx_cq_moderation_count; + p_prev_ring_stats->n_rx_cq_moderation_period = + p_curr_ring_stats->n_rx_cq_moderation_period; + p_prev_ring_stats->n_tx_dev_mem_allocated = + p_curr_ring_stats->n_tx_dev_mem_allocated; + p_prev_ring_stats->n_tx_dev_mem_byte_count = + (p_curr_ring_stats->n_tx_dev_mem_byte_count - + p_prev_ring_stats->n_tx_dev_mem_byte_count) / delay; - p_prev_ring_stats->simple.n_tx_dev_mem_pkt_count = - (p_curr_ring_stats->simple.n_tx_dev_mem_pkt_count - - p_prev_ring_stats->simple.n_tx_dev_mem_pkt_count) / + p_prev_ring_stats->n_tx_dev_mem_pkt_count = + (p_curr_ring_stats->n_tx_dev_mem_pkt_count - + p_prev_ring_stats->n_tx_dev_mem_pkt_count) / delay; - p_prev_ring_stats->simple.n_tx_dev_mem_oob = (p_curr_ring_stats->simple.n_tx_dev_mem_oob - - p_prev_ring_stats->simple.n_tx_dev_mem_oob) / + p_prev_ring_stats->n_tx_dev_mem_oob = (p_curr_ring_stats->n_tx_dev_mem_oob - + 
p_prev_ring_stats->n_tx_dev_mem_oob) / delay; } } @@ -509,9 +509,9 @@ void print_ring_stats(ring_instance_block_t *p_ring_inst_arr) post_fix); } - if (p_ring_stats->simple.n_tx_dropped_wqes) { + if (p_ring_stats->n_tx_dropped_wqes) { printf(FORMAT_STATS_64bit, - "TX Dropped Send Reqs:", p_ring_stats->simple.n_tx_dropped_wqes, post_fix); + "TX Dropped Send Reqs:", p_ring_stats->n_tx_dropped_wqes, post_fix); } #ifdef DEFINED_UTLS @@ -527,36 +527,36 @@ void print_ring_stats(ring_instance_block_t *p_ring_inst_arr) } #endif /* DEFINED_UTLS */ - if (p_ring_stats->simple.n_tx_tso_pkt_count || - p_ring_stats->simple.n_tx_tso_byte_count) { + if (p_ring_stats->n_tx_tso_pkt_count || + p_ring_stats->n_tx_tso_byte_count) { printf(FORMAT_RING_PACKETS, "TSO Offload:", - p_ring_stats->simple.n_tx_tso_byte_count / BYTES_TRAFFIC_UNIT, - p_ring_stats->simple.n_tx_tso_pkt_count, post_fix); + p_ring_stats->n_tx_tso_byte_count / BYTES_TRAFFIC_UNIT, + p_ring_stats->n_tx_tso_pkt_count, post_fix); } - if (p_ring_stats->simple.n_rx_interrupt_requests || - p_ring_stats->simple.n_rx_interrupt_received) { + if (p_ring_stats->n_rx_interrupt_requests || + p_ring_stats->n_rx_interrupt_received) { printf(FORMAT_RING_INTERRUPT, - "Interrupts:", p_ring_stats->simple.n_rx_interrupt_requests, - p_ring_stats->simple.n_rx_interrupt_received, post_fix); + "Interrupts:", p_ring_stats->n_rx_interrupt_requests, + p_ring_stats->n_rx_interrupt_received, post_fix); } - if (p_ring_stats->simple.n_rx_cq_moderation_count || - p_ring_stats->simple.n_rx_cq_moderation_period) { + if (p_ring_stats->n_rx_cq_moderation_count || + p_ring_stats->n_rx_cq_moderation_period) { printf(FORMAT_RING_MODERATION, - "Moderation:", p_ring_stats->simple.n_rx_cq_moderation_count, - p_ring_stats->simple.n_rx_cq_moderation_period, post_fix); + "Moderation:", p_ring_stats->n_rx_cq_moderation_count, + p_ring_stats->n_rx_cq_moderation_period, post_fix); } - if (p_ring_stats->simple.n_tx_dev_mem_allocated) { + if 
(p_ring_stats->n_tx_dev_mem_allocated) { printf(FORMAT_STATS_32bit, - "Dev Mem Alloc:", p_ring_stats->simple.n_tx_dev_mem_allocated); + "Dev Mem Alloc:", p_ring_stats->n_tx_dev_mem_allocated); printf(FORMAT_RING_DM_STATS, "Dev Mem Stats:", - p_ring_stats->simple.n_tx_dev_mem_byte_count / BYTES_TRAFFIC_UNIT, - p_ring_stats->simple.n_tx_dev_mem_pkt_count, - p_ring_stats->simple.n_tx_dev_mem_oob, post_fix); + p_ring_stats->n_tx_dev_mem_byte_count / BYTES_TRAFFIC_UNIT, + p_ring_stats->n_tx_dev_mem_pkt_count, + p_ring_stats->n_tx_dev_mem_oob, post_fix); } - printf(FORMAT_STATS_32bit, "TX buffers inflight:", p_ring_stats->simple.n_tx_num_bufs); + printf(FORMAT_STATS_32bit, "TX buffers inflight:", p_ring_stats->n_tx_num_bufs); printf(FORMAT_STATS_32bit, - "TX ZC buffers inflight:", p_ring_stats->simple.n_zc_num_bufs); + "TX ZC buffers inflight:", p_ring_stats->n_zc_num_bufs); } } printf("======================================================\n"); @@ -1799,16 +1799,16 @@ void zero_ring_stats(ring_stats_t *p_ring_stats) p_ring_stats->n_tx_tls_contexts = 0; p_ring_stats->n_rx_tls_contexts = 0; #endif /* DEFINED_UTLS */ - p_ring_stats->simple.n_tx_tso_pkt_count = 0; - p_ring_stats->simple.n_tx_tso_byte_count = 0; - p_ring_stats->simple.n_rx_interrupt_received = 0; - p_ring_stats->simple.n_rx_interrupt_requests = 0; - p_ring_stats->simple.n_tx_dropped_wqes = 0; - p_ring_stats->simple.n_tx_dev_mem_byte_count = 0; - p_ring_stats->simple.n_tx_dev_mem_pkt_count = 0; - p_ring_stats->simple.n_tx_dev_mem_oob = 0; - p_ring_stats->simple.n_tx_num_bufs = 0; - p_ring_stats->simple.n_zc_num_bufs = 0; + p_ring_stats->n_tx_tso_pkt_count = 0; + p_ring_stats->n_tx_tso_byte_count = 0; + p_ring_stats->n_rx_interrupt_received = 0; + p_ring_stats->n_rx_interrupt_requests = 0; + p_ring_stats->n_tx_dropped_wqes = 0; + p_ring_stats->n_tx_dev_mem_byte_count = 0; + p_ring_stats->n_tx_dev_mem_pkt_count = 0; + p_ring_stats->n_tx_dev_mem_oob = 0; + p_ring_stats->n_tx_num_bufs = 0; + 
p_ring_stats->n_zc_num_bufs = 0; } void zero_cq_stats(cq_stats_t *p_cq_stats) From b0f226f95e830101cdf828b1c7519316c9c01fba Mon Sep 17 00:00:00 2001 From: Alexander Grissik Date: Tue, 12 Nov 2024 11:33:26 +0000 Subject: [PATCH 4/7] issue: 4159519 Introducing hw_queue_tx_stats for DOCA 1. Moving hw_queue_tx related stats from the ring to a separate object inside hw_queue_tx in order to avoid unnecessary indirections. 2. Fixing TX offload statistics Signed-off-by: Alexander Grissik --- src/core/dev/hw_queue_tx.cpp | 14 ++- src/core/dev/hw_queue_tx.h | 2 + src/core/dev/ring_simple.cpp | 8 +- src/core/dev/ring_simple.h | 6 -- src/core/dev/ring_slave.cpp | 6 -- src/core/util/xlio_stats.h | 25 +++--- src/stats/stats_data_reader.h | 2 +- src/stats/stats_publisher.cpp | 33 ++++--- src/stats/stats_reader.cpp | 162 ++++++++++++++++------------------ 9 files changed, 131 insertions(+), 127 deletions(-) diff --git a/src/core/dev/hw_queue_tx.cpp b/src/core/dev/hw_queue_tx.cpp index 12781180f..bc80e4adc 100644 --- a/src/core/dev/hw_queue_tx.cpp +++ b/src/core/dev/hw_queue_tx.cpp @@ -147,6 +147,7 @@ hw_queue_tx::hw_queue_tx(ring_simple *ring, const slave_data_t *slave, throw_xlio_exception("Failed to create DOCA TXQ"); } + memset(&m_hwq_tx_stats, 0, sizeof(m_hwq_tx_stats)); memset(&m_mlx5_qp, 0, sizeof(m_mlx5_qp)); m_mlx5_qp.cap.max_inline_data = safe_mce_sys().tx_max_inline; @@ -1029,7 +1030,8 @@ inline int hw_queue_tx::fill_wqe_lso(xlio_ibv_send_wr *pswr, int data_len) ctrl->opmod_idx_opcode = htonl(((m_sq_wqe_counter & 0xffff) << 8) | (get_mlx5_opcode(XLIO_IBV_WR_SEND) & 0xff)); } else { - m_p_ring->update_tso_stats(static_cast(data_len)); + ++m_hwq_tx_stats.n_tx_tso_pkt_count; + m_hwq_tx_stats.n_tx_tso_byte_count += static_cast(data_len); } eseg = (struct mlx5_wqe_eth_seg *)((uint8_t *)m_sq_wqe_hot + sizeof(*ctrl)); @@ -2079,8 +2081,12 @@ uint32_t hw_queue_tx::send_doca_single(void *ptr, uint32_t len, mem_buf_desc_t * if (DOCA_IS_ERROR(rc)) { return_doca_task(task); 
PRINT_DOCA_ERR(hwqtx_logerr, rc, "doca_eth_txq_task_send_as_doca_task"); + return 0; } - return (DOCA_IS_ERROR(rc) ? 0 : len); + + ++m_hwq_tx_stats.n_tx_pkt_count; + m_hwq_tx_stats.n_tx_byte_count += len; + return len; } uint32_t hw_queue_tx::send_doca_lso(struct iovec &h, struct pbuf *p, uint16_t mss, bool is_zerocopy) @@ -2174,6 +2180,10 @@ uint32_t hw_queue_tx::send_doca_lso(struct iovec &h, struct pbuf *p, uint16_t ms return 0; } + ++m_hwq_tx_stats.n_tx_pkt_count; + m_hwq_tx_stats.n_tx_byte_count += len_sent; + ++m_hwq_tx_stats.n_tx_tso_pkt_count; + m_hwq_tx_stats.n_tx_tso_byte_count += len_sent; return len_sent; } diff --git a/src/core/dev/hw_queue_tx.h b/src/core/dev/hw_queue_tx.h index 3804c3a42..c590f6772 100644 --- a/src/core/dev/hw_queue_tx.h +++ b/src/core/dev/hw_queue_tx.h @@ -98,6 +98,7 @@ struct sq_wqe_prop { // Once created it requests from the system a CQ to work with. class hw_queue_tx : public xlio_ti_owner { friend class cq_mgr_tx; + friend class ring_simple; public: hw_queue_tx(ring_simple *ring, const slave_data_t *slave, @@ -343,6 +344,7 @@ class hw_queue_tx : public xlio_ti_owner { doca_ctx *m_doca_ctx_txq = nullptr; doca_notification_handle_t m_notification_handle; doca_lso_metadata *m_p_doca_lso_metadata_list = nullptr; + hw_queue_tx_stats_t m_hwq_tx_stats; static void tx_task_completion_cb(doca_eth_txq_task_send *task_send, doca_data task_user_data, doca_data ctx_user_data); diff --git a/src/core/dev/ring_simple.cpp b/src/core/dev/ring_simple.cpp index b5b135fef..e85e34d12 100644 --- a/src/core/dev/ring_simple.cpp +++ b/src/core/dev/ring_simple.cpp @@ -75,8 +75,8 @@ inline void ring_simple::send_status_handler(int ret, xlio_ibv_send_wr *p_send_w } else { // Update TX statistics sg_array sga(p_send_wqe->sg_list, p_send_wqe->num_sge); - m_p_ring_stat->n_tx_byte_count += sga.length(); - ++m_p_ring_stat->n_tx_pkt_count; + m_hqtx->m_hwq_tx_stats.n_tx_byte_count += sga.length(); + ++m_hqtx->m_hwq_tx_stats.n_tx_pkt_count; // Decrease counter in 
order to keep track of how many missing buffers we have when // doing ring->restart() and then drain_tx_buffers_to_buffer_pool() @@ -158,6 +158,8 @@ ring_simple::~ring_simple() g_p_fd_collection->del_cq_channel_fd(get_tx_channel_fd(), true); } + xlio_stats_instance_remove_ring_block(m_p_ring_stat.get(), &m_hqtx->m_hwq_tx_stats); + delete m_hqtx; m_hqtx = nullptr; @@ -386,6 +388,8 @@ void ring_simple::create_resources() safe_mce_sys().cq_moderation_count); } + xlio_stats_instance_create_ring_block(m_p_ring_stat.get(), &m_hqtx->m_hwq_tx_stats); + ring_logdbg("new ring_simple() completed"); } diff --git a/src/core/dev/ring_simple.h b/src/core/dev/ring_simple.h index e091922c1..232eac56a 100644 --- a/src/core/dev/ring_simple.h +++ b/src/core/dev/ring_simple.h @@ -128,12 +128,6 @@ class ring_simple : public ring_slave { void modify_cq_moderation(uint32_t period, uint32_t count); - void update_tso_stats(uint64_t bytes) - { - ++m_p_ring_stat->n_tx_tso_pkt_count; - m_p_ring_stat->n_tx_tso_byte_count += bytes; - } - #ifdef DEFINED_UTLS bool tls_tx_supported(void) override { return m_tls.tls_tx; } bool tls_rx_supported(void) override { return m_tls.tls_rx; } diff --git a/src/core/dev/ring_slave.cpp b/src/core/dev/ring_slave.cpp index 2f2cdb7d9..6312a92e8 100644 --- a/src/core/dev/ring_slave.cpp +++ b/src/core/dev/ring_slave.cpp @@ -99,8 +99,6 @@ ring_slave::ring_slave(int if_index, ring *parent, bool use_locks) m_tx_pool.set_id("ring_slave (%p) : m_tx_pool", this); m_zc_pool.set_id("ring_slave (%p) : m_zc_pool", this); - xlio_stats_instance_create_ring_block(m_p_ring_stat.get()); - print_val(); } @@ -108,10 +106,6 @@ ring_slave::~ring_slave() { print_val(); - if (m_p_ring_stat) { - xlio_stats_instance_remove_ring_block(m_p_ring_stat.get()); - } - /* Release TX buffer poll */ g_buffer_pool_tx->put_buffers_thread_safe(&m_tx_pool, m_tx_pool.size()); g_buffer_pool_zc->put_buffers_thread_safe(&m_zc_pool, m_zc_pool.size()); diff --git a/src/core/util/xlio_stats.h 
b/src/core/util/xlio_stats.h index 4446b2be0..4edd75607 100644 --- a/src/core/util/xlio_stats.h +++ b/src/core/util/xlio_stats.h @@ -355,18 +355,15 @@ typedef struct { // Ring stat info typedef struct { + uint32_t n_tx_num_bufs; + uint32_t n_zc_num_bufs; + uint64_t n_tx_retransmits; uint64_t n_rx_pkt_count; uint64_t n_rx_byte_count; - uint64_t n_tx_pkt_count; - uint64_t n_tx_byte_count; - uint64_t n_tx_retransmits; - void *p_ring_master; #ifdef DEFINED_UTLS uint32_t n_tx_tls_contexts; uint32_t n_rx_tls_contexts; #endif /* DEFINED_UTLS */ - uint64_t n_tx_tso_pkt_count; - uint64_t n_tx_tso_byte_count; uint64_t n_rx_interrupt_requests; uint64_t n_rx_interrupt_received; uint32_t n_rx_cq_moderation_count; @@ -376,13 +373,21 @@ typedef struct { uint64_t n_tx_dev_mem_byte_count; uint64_t n_tx_dev_mem_oob; uint32_t n_tx_dev_mem_allocated; - uint32_t n_tx_num_bufs; - uint32_t n_zc_num_bufs; + void *p_ring_master; } ring_stats_t; +typedef struct { + uint64_t n_tx_pkt_count; + uint64_t n_tx_byte_count; + uint64_t n_tx_tso_pkt_count; + uint64_t n_tx_tso_byte_count; +} hw_queue_tx_stats_t; + typedef struct { bool b_enabled; ring_stats_t ring_stats; + hw_queue_tx_stats_t hwq_tx_stats; + } ring_instance_block_t; // Buffer Pool stat info @@ -528,8 +533,8 @@ void xlio_stats_instance_remove_socket_block(socket_stats_t *); void xlio_stats_mc_group_add(const ip_address &mc_grp, socket_stats_t *p_socket_stats); void xlio_stats_mc_group_remove(const ip_address &mc_grp, socket_stats_t *p_socket_stats); -void xlio_stats_instance_create_ring_block(ring_stats_t *); -void xlio_stats_instance_remove_ring_block(ring_stats_t *); +void xlio_stats_instance_create_ring_block(ring_stats_t *, hw_queue_tx_stats_t *local_hwq_tx_addr); +void xlio_stats_instance_remove_ring_block(ring_stats_t *, hw_queue_tx_stats_t *local_hwq_tx_addr); void xlio_stats_instance_create_cq_block(cq_stats_t *); void xlio_stats_instance_remove_cq_block(cq_stats_t *); diff --git a/src/stats/stats_data_reader.h 
b/src/stats/stats_data_reader.h index 4b8781ab0..57e1d1b3c 100644 --- a/src/stats/stats_data_reader.h +++ b/src/stats/stats_data_reader.h @@ -244,7 +244,7 @@ struct ring_packet_aggregate { auto count_if_enabled = [](pkt_cnt &val, const ring_instance_block_t &ring_stat) { // coverity[missing_lock:FALSE] /* Turn off coverity missing_lock check*/ if (ring_stat.b_enabled) { - val.tx += ring_stat.ring_stats.n_tx_pkt_count; + val.tx += ring_stat.hwq_tx_stats.n_tx_pkt_count; val.rx += ring_stat.ring_stats.n_rx_pkt_count; } return val; diff --git a/src/stats/stats_publisher.cpp b/src/stats/stats_publisher.cpp index a6d2cb927..86c284a74 100644 --- a/src/stats/stats_publisher.cpp +++ b/src/stats/stats_publisher.cpp @@ -477,15 +477,19 @@ void xlio_stats_mc_group_remove(const ip_address &mc_grp, socket_stats_t *p_sock g_lock_mc_info.unlock(); } -void xlio_stats_instance_create_ring_block(ring_stats_t *local_stats_addr) +void xlio_stats_instance_create_ring_block(ring_stats_t *local_stats_addr, + hw_queue_tx_stats_t *local_hwq_tx_addr) { ring_stats_t *p_instance_ring = NULL; + hw_queue_tx_stats_t *p_instance_hwq_tx = NULL; g_lock_ring_inst_arr.lock(); for (int i = 0; i < NUM_OF_SUPPORTED_RINGS; i++) { if (!g_sh_mem->ring_inst_arr[i].b_enabled) { g_sh_mem->ring_inst_arr[i].b_enabled = true; p_instance_ring = &g_sh_mem->ring_inst_arr[i].ring_stats; + p_instance_hwq_tx = &g_sh_mem->ring_inst_arr[i].hwq_tx_stats; memset(p_instance_ring, 0, sizeof(*p_instance_ring)); + memset(p_instance_hwq_tx, 0, sizeof(*p_instance_hwq_tx)); break; } } @@ -498,34 +502,35 @@ void xlio_stats_instance_create_ring_block(ring_stats_t *local_stats_addr) } else { g_p_stats_data_reader->add_data_reader(local_stats_addr, p_instance_ring, sizeof(ring_stats_t)); - __log_dbg("Added ring local=%p shm=%p", local_stats_addr, p_instance_ring); + if (local_hwq_tx_addr) { + g_p_stats_data_reader->add_data_reader(local_hwq_tx_addr, p_instance_hwq_tx, + sizeof(hw_queue_tx_stats_t)); + } + __log_dbg("Added ring 
local=%p shm=%p, local_hwq_tx=%p, shm_hwq_tx=%p", local_stats_addr, + p_instance_ring, local_hwq_tx_addr, p_instance_hwq_tx); } g_lock_ring_inst_arr.unlock(); } -void xlio_stats_instance_remove_ring_block(ring_stats_t *local_stats_addr) +void xlio_stats_instance_remove_ring_block(ring_stats_t *local_stats_addr, + hw_queue_tx_stats_t *local_hwq_tx_addr) { g_lock_ring_inst_arr.lock(); - __log_dbg("Remove ring local=%p", local_stats_addr); + __log_dbg("Remove ring local=%p, local_hwq_tx=%p", local_stats_addr, local_hwq_tx_addr); + + if (local_hwq_tx_addr) { + g_p_stats_data_reader->pop_data_reader(local_hwq_tx_addr); + } ring_stats_t *p_ring_stats = (ring_stats_t *)g_p_stats_data_reader->pop_data_reader(local_stats_addr); if (p_ring_stats == NULL) { // happens on the tx cq (why don't we keep tx cq stats?) - __log_dbg("application xlio_stats pointer is NULL"); + __log_dbg("application xlio_stats-ring pointer is NULL"); g_lock_ring_inst_arr.unlock(); return; } - // coverity - g_sh_mem->ring_inst_arr cannot be null - /*BULLSEYE_EXCLUDE_BLOCK_START - if (g_sh_mem->ring_inst_arr == NULL) { - vlog_printf(VLOG_ERROR,"%s:%d: g_sh_mem->instances_arr not init\n", __func__, __LINE__); - g_lock_skt_stats.unlock(); - return; - } - BULLSEYE_EXCLUDE_BLOCK_END*/ - // Search sh_mem block to release for (int i = 0; i < NUM_OF_SUPPORTED_RINGS; i++) { if (&g_sh_mem->ring_inst_arr[i].ring_stats == p_ring_stats) { diff --git a/src/stats/stats_reader.cpp b/src/stats/stats_reader.cpp index 95d454e49..bae55e859 100644 --- a/src/stats/stats_reader.cpp +++ b/src/stats/stats_reader.cpp @@ -339,6 +339,21 @@ void update_delta_iomux_stat(iomux_func_stats_t *p_curr_stats, iomux_func_stats_ } } +void update_delta_hwq_tx_stat(hw_queue_tx_stats_t *p_curr_hwq_tx_stats, + hw_queue_tx_stats_t *p_prev_hwq_tx_stats) +{ + int delay = user_params.interval; + p_prev_hwq_tx_stats->n_tx_byte_count = + (p_curr_hwq_tx_stats->n_tx_byte_count - p_prev_hwq_tx_stats->n_tx_byte_count) / delay; + 
p_prev_hwq_tx_stats->n_tx_pkt_count = + (p_curr_hwq_tx_stats->n_tx_pkt_count - p_prev_hwq_tx_stats->n_tx_pkt_count) / delay; + p_prev_hwq_tx_stats->n_tx_tso_pkt_count = + (p_curr_hwq_tx_stats->n_tx_tso_pkt_count - p_prev_hwq_tx_stats->n_tx_tso_pkt_count) / delay; + p_prev_hwq_tx_stats->n_tx_tso_byte_count = + (p_curr_hwq_tx_stats->n_tx_tso_byte_count - p_prev_hwq_tx_stats->n_tx_tso_byte_count) / + delay; +} + void update_delta_ring_stat(ring_stats_t *p_curr_ring_stats, ring_stats_t *p_prev_ring_stats) { int delay = user_params.interval; @@ -347,61 +362,37 @@ void update_delta_ring_stat(ring_stats_t *p_curr_ring_stats, ring_stats_t *p_pre (p_curr_ring_stats->n_rx_byte_count - p_prev_ring_stats->n_rx_byte_count) / delay; p_prev_ring_stats->n_rx_pkt_count = (p_curr_ring_stats->n_rx_pkt_count - p_prev_ring_stats->n_rx_pkt_count) / delay; - p_prev_ring_stats->n_tx_byte_count = - (p_curr_ring_stats->n_tx_byte_count - p_prev_ring_stats->n_tx_byte_count) / delay; - p_prev_ring_stats->n_tx_pkt_count = - (p_curr_ring_stats->n_tx_pkt_count - p_prev_ring_stats->n_tx_pkt_count) / delay; p_prev_ring_stats->n_tx_retransmits = (p_curr_ring_stats->n_tx_retransmits - p_prev_ring_stats->n_tx_retransmits) / delay; p_prev_ring_stats->n_tx_dropped_wqes = - (p_curr_ring_stats->n_tx_dropped_wqes - - p_prev_ring_stats->n_tx_dropped_wqes) / - delay; + (p_curr_ring_stats->n_tx_dropped_wqes - p_prev_ring_stats->n_tx_dropped_wqes) / delay; p_prev_ring_stats->n_tx_num_bufs = - (p_curr_ring_stats->n_tx_num_bufs - p_prev_ring_stats->n_tx_num_bufs) / - delay; + (p_curr_ring_stats->n_tx_num_bufs - p_prev_ring_stats->n_tx_num_bufs) / delay; p_prev_ring_stats->n_zc_num_bufs = - (p_curr_ring_stats->n_zc_num_bufs - p_prev_ring_stats->n_zc_num_bufs) / - delay; + (p_curr_ring_stats->n_zc_num_bufs - p_prev_ring_stats->n_zc_num_bufs) / delay; #ifdef DEFINED_UTLS p_prev_ring_stats->n_tx_tls_contexts = (p_curr_ring_stats->n_tx_tls_contexts - p_prev_ring_stats->n_tx_tls_contexts) / delay; 
p_prev_ring_stats->n_rx_tls_contexts = (p_curr_ring_stats->n_rx_tls_contexts - p_prev_ring_stats->n_rx_tls_contexts) / delay; #endif /* DEFINED_UTLS */ - p_prev_ring_stats->n_tx_tso_pkt_count = - (p_curr_ring_stats->n_tx_tso_pkt_count - - p_prev_ring_stats->n_tx_tso_pkt_count) / - delay; - p_prev_ring_stats->n_tx_tso_byte_count = - (p_curr_ring_stats->n_tx_tso_byte_count - - p_prev_ring_stats->n_tx_tso_byte_count) / + p_prev_ring_stats->n_rx_interrupt_received = (p_curr_ring_stats->n_rx_interrupt_received - + p_prev_ring_stats->n_rx_interrupt_received) / delay; - p_prev_ring_stats->n_rx_interrupt_received = - (p_curr_ring_stats->n_rx_interrupt_received - - p_prev_ring_stats->n_rx_interrupt_received) / + p_prev_ring_stats->n_rx_interrupt_requests = (p_curr_ring_stats->n_rx_interrupt_requests - + p_prev_ring_stats->n_rx_interrupt_requests) / delay; - p_prev_ring_stats->n_rx_interrupt_requests = - (p_curr_ring_stats->n_rx_interrupt_requests - - p_prev_ring_stats->n_rx_interrupt_requests) / + p_prev_ring_stats->n_rx_cq_moderation_count = p_curr_ring_stats->n_rx_cq_moderation_count; + p_prev_ring_stats->n_rx_cq_moderation_period = p_curr_ring_stats->n_rx_cq_moderation_period; + p_prev_ring_stats->n_tx_dev_mem_allocated = p_curr_ring_stats->n_tx_dev_mem_allocated; + p_prev_ring_stats->n_tx_dev_mem_byte_count = (p_curr_ring_stats->n_tx_dev_mem_byte_count - + p_prev_ring_stats->n_tx_dev_mem_byte_count) / delay; - p_prev_ring_stats->n_rx_cq_moderation_count = - p_curr_ring_stats->n_rx_cq_moderation_count; - p_prev_ring_stats->n_rx_cq_moderation_period = - p_curr_ring_stats->n_rx_cq_moderation_period; - p_prev_ring_stats->n_tx_dev_mem_allocated = - p_curr_ring_stats->n_tx_dev_mem_allocated; - p_prev_ring_stats->n_tx_dev_mem_byte_count = - (p_curr_ring_stats->n_tx_dev_mem_byte_count - - p_prev_ring_stats->n_tx_dev_mem_byte_count) / - delay; - p_prev_ring_stats->n_tx_dev_mem_pkt_count = - (p_curr_ring_stats->n_tx_dev_mem_pkt_count - - p_prev_ring_stats->n_tx_dev_mem_pkt_count) 
/ - delay; - p_prev_ring_stats->n_tx_dev_mem_oob = (p_curr_ring_stats->n_tx_dev_mem_oob - - p_prev_ring_stats->n_tx_dev_mem_oob) / + p_prev_ring_stats->n_tx_dev_mem_pkt_count = (p_curr_ring_stats->n_tx_dev_mem_pkt_count - + p_prev_ring_stats->n_tx_dev_mem_pkt_count) / delay; + p_prev_ring_stats->n_tx_dev_mem_oob = + (p_curr_ring_stats->n_tx_dev_mem_oob - p_prev_ring_stats->n_tx_dev_mem_oob) / delay; } } @@ -480,7 +471,6 @@ void update_delta_global_stat(global_stats_t *p_curr_global_stats, void print_ring_stats(ring_instance_block_t *p_ring_inst_arr) { - ring_stats_t *p_ring_stats = NULL; char post_fix[3] = ""; if (user_params.print_details_mode == e_deltas) { @@ -490,19 +480,24 @@ void print_ring_stats(ring_instance_block_t *p_ring_inst_arr) for (int i = 0; i < NUM_OF_SUPPORTED_RINGS; i++) { // coverity[missing_lock:FALSE] /* Turn off coverity missing_lock check*/ if (p_ring_inst_arr[i].b_enabled) { - p_ring_stats = &p_ring_inst_arr[i].ring_stats; + ring_stats_t *p_ring_stats = &p_ring_inst_arr[i].ring_stats; + hw_queue_tx_stats_t *p_hwq_tx_stats = &p_ring_inst_arr[i].hwq_tx_stats; printf("======================================================\n"); + printf("\tETH=[%u]\n", i); if (p_ring_stats->p_ring_master) { printf(FORMAT_RING_MASTER, "Master:", p_ring_stats->p_ring_master); } printf(FORMAT_RING_PACKETS, - "Tx Offload:", p_ring_stats->n_tx_byte_count / BYTES_TRAFFIC_UNIT, - p_ring_stats->n_tx_pkt_count, post_fix); - printf(FORMAT_RING_PACKETS, - "Rx Offload:", p_ring_stats->n_rx_byte_count / BYTES_TRAFFIC_UNIT, - p_ring_stats->n_rx_pkt_count, post_fix); + "TX Offload:", p_hwq_tx_stats->n_tx_byte_count / BYTES_TRAFFIC_UNIT, + p_hwq_tx_stats->n_tx_pkt_count, post_fix); + + if (p_hwq_tx_stats->n_tx_tso_pkt_count || p_hwq_tx_stats->n_tx_tso_byte_count) { + printf(FORMAT_RING_PACKETS, + "TSO Offload:", p_hwq_tx_stats->n_tx_tso_byte_count / BYTES_TRAFFIC_UNIT, + p_hwq_tx_stats->n_tx_tso_pkt_count, post_fix); + } if (p_ring_stats->n_tx_retransmits) { 
printf(FORMAT_STATS_64bit, "Retransmissions:", p_ring_stats->n_tx_retransmits, @@ -510,8 +505,33 @@ void print_ring_stats(ring_instance_block_t *p_ring_inst_arr) } if (p_ring_stats->n_tx_dropped_wqes) { - printf(FORMAT_STATS_64bit, - "TX Dropped Send Reqs:", p_ring_stats->n_tx_dropped_wqes, post_fix); + printf(FORMAT_STATS_64bit, "TX Dropped Reqs:", p_ring_stats->n_tx_dropped_wqes, + post_fix); + } + + printf(FORMAT_STATS_32bit, "TX buffers in use:", p_ring_stats->n_tx_num_bufs); + printf(FORMAT_STATS_32bit, "TX ZC buffers in use:", p_ring_stats->n_zc_num_bufs); + + if (p_ring_stats->n_tx_dev_mem_allocated) { + printf(FORMAT_STATS_32bit, "Dev Mem Alloc:", p_ring_stats->n_tx_dev_mem_allocated); + printf(FORMAT_RING_DM_STATS, + "Dev Mem Stats:", p_ring_stats->n_tx_dev_mem_byte_count / BYTES_TRAFFIC_UNIT, + p_ring_stats->n_tx_dev_mem_pkt_count, p_ring_stats->n_tx_dev_mem_oob, + post_fix); + } + + printf(FORMAT_RING_PACKETS, + "RX Offload:", p_ring_stats->n_rx_byte_count / BYTES_TRAFFIC_UNIT, + p_ring_stats->n_rx_pkt_count, post_fix); + + if (p_ring_stats->n_rx_interrupt_requests || p_ring_stats->n_rx_interrupt_received) { + printf(FORMAT_RING_INTERRUPT, "Interrupts:", p_ring_stats->n_rx_interrupt_requests, + p_ring_stats->n_rx_interrupt_received, post_fix); + } + if (p_ring_stats->n_rx_cq_moderation_count || p_ring_stats->n_rx_cq_moderation_period) { + printf(FORMAT_RING_MODERATION, + "Moderation:", p_ring_stats->n_rx_cq_moderation_count, + p_ring_stats->n_rx_cq_moderation_period, post_fix); } #ifdef DEFINED_UTLS @@ -526,37 +546,6 @@ void print_ring_stats(ring_instance_block_t *p_ring_inst_arr) post_fix); } #endif /* DEFINED_UTLS */ - - if (p_ring_stats->n_tx_tso_pkt_count || - p_ring_stats->n_tx_tso_byte_count) { - printf(FORMAT_RING_PACKETS, "TSO Offload:", - p_ring_stats->n_tx_tso_byte_count / BYTES_TRAFFIC_UNIT, - p_ring_stats->n_tx_tso_pkt_count, post_fix); - } - if (p_ring_stats->n_rx_interrupt_requests || - p_ring_stats->n_rx_interrupt_received) { - 
printf(FORMAT_RING_INTERRUPT, - "Interrupts:", p_ring_stats->n_rx_interrupt_requests, - p_ring_stats->n_rx_interrupt_received, post_fix); - } - if (p_ring_stats->n_rx_cq_moderation_count || - p_ring_stats->n_rx_cq_moderation_period) { - printf(FORMAT_RING_MODERATION, - "Moderation:", p_ring_stats->n_rx_cq_moderation_count, - p_ring_stats->n_rx_cq_moderation_period, post_fix); - } - if (p_ring_stats->n_tx_dev_mem_allocated) { - printf(FORMAT_STATS_32bit, - "Dev Mem Alloc:", p_ring_stats->n_tx_dev_mem_allocated); - printf(FORMAT_RING_DM_STATS, "Dev Mem Stats:", - p_ring_stats->n_tx_dev_mem_byte_count / BYTES_TRAFFIC_UNIT, - p_ring_stats->n_tx_dev_mem_pkt_count, - p_ring_stats->n_tx_dev_mem_oob, post_fix); - } - - printf(FORMAT_STATS_32bit, "TX buffers inflight:", p_ring_stats->n_tx_num_bufs); - printf(FORMAT_STATS_32bit, - "TX ZC buffers inflight:", p_ring_stats->n_zc_num_bufs); } } printf("======================================================\n"); @@ -1059,6 +1048,8 @@ void print_ring_deltas(ring_instance_block_t *p_curr_ring_stats, return; } for (int i = 0; i < NUM_OF_SUPPORTED_RINGS; i++) { + update_delta_hwq_tx_stat(&p_curr_ring_stats[i].hwq_tx_stats, + &p_prev_ring_stats[i].hwq_tx_stats); update_delta_ring_stat(&p_curr_ring_stats[i].ring_stats, &p_prev_ring_stats[i].ring_stats); } print_ring_stats(p_prev_ring_stats); @@ -1788,19 +1779,17 @@ void zero_iomux_stats(iomux_stats_t *p_iomux_stats) // memset(p_iomux_stats, 0, sizeof(*p_iomux_stats)); } -void zero_ring_stats(ring_stats_t *p_ring_stats) +void zero_ring_stats(ring_stats_t *p_ring_stats, hw_queue_tx_stats_t *p_hwq_tx_stats) { + memset(p_hwq_tx_stats, 0, sizeof(*p_hwq_tx_stats)); + p_ring_stats->n_rx_pkt_count = 0; p_ring_stats->n_rx_byte_count = 0; - p_ring_stats->n_tx_pkt_count = 0; - p_ring_stats->n_tx_byte_count = 0; p_ring_stats->n_tx_retransmits = 0; #ifdef DEFINED_UTLS p_ring_stats->n_tx_tls_contexts = 0; p_ring_stats->n_rx_tls_contexts = 0; #endif /* DEFINED_UTLS */ - 
p_ring_stats->n_tx_tso_pkt_count = 0; - p_ring_stats->n_tx_tso_byte_count = 0; p_ring_stats->n_rx_interrupt_received = 0; p_ring_stats->n_rx_interrupt_requests = 0; p_ring_stats->n_tx_dropped_wqes = 0; @@ -1837,7 +1826,8 @@ void zero_counters(sh_mem_t *p_sh_mem) zero_cq_stats(&p_sh_mem->cq_inst_arr[i].cq_stats); } for (int i = 0; i < NUM_OF_SUPPORTED_RINGS; i++) { - zero_ring_stats(&p_sh_mem->ring_inst_arr[i].ring_stats); + zero_ring_stats(&p_sh_mem->ring_inst_arr[i].ring_stats, + &p_sh_mem->ring_inst_arr[i].hwq_tx_stats); } for (int i = 0; i < NUM_OF_SUPPORTED_BPOOLS; i++) { zero_bpool_stats(&p_sh_mem->bpool_inst_arr[i].bpool_stats); From 65a1f68982542dab2d9a4e83e6996bb94e0d907b Mon Sep 17 00:00:00 2001 From: Alexander Grissik Date: Tue, 12 Nov 2024 13:33:40 +0000 Subject: [PATCH 5/7] issue: 4159519 Introducing hw_queue_rx_stats for DOCA 1. Moving hw_queue_rx related stats from the ring to a seperate object inside hw_queue_rx in order to avoid unnecessary indirections. 2. Fixing RX offload statistics 3. Moving DOCA RX related stats from CQ to hw_queue_rx. CQ will be deleted in the future. 
Signed-off-by: Alexander Grissik --- src/core/dev/cq_mgr_rx.cpp | 63 ++++++------ src/core/dev/cq_mgr_rx.h | 8 +- src/core/dev/cq_mgr_rx_inl.h | 2 + src/core/dev/cq_mgr_rx_regrq.cpp | 21 ++-- src/core/dev/cq_mgr_rx_regrq.h | 4 +- src/core/dev/cq_mgr_rx_strq.cpp | 30 +++--- src/core/dev/cq_mgr_rx_strq.h | 6 +- src/core/dev/hw_queue_rx.cpp | 30 ++++-- src/core/dev/hw_queue_rx.h | 17 +++- src/core/dev/hw_queue_tx.cpp | 5 +- src/core/dev/rfs_uc_tcp_gro.cpp | 11 +-- src/core/dev/ring_simple.cpp | 13 ++- src/core/dev/ring_slave.cpp | 3 - src/core/util/xlio_stats.h | 38 ++++--- src/stats/stats_data_reader.h | 2 +- src/stats/stats_publisher.cpp | 27 ++++- src/stats/stats_reader.cpp | 163 ++++++++++++++++++------------- 17 files changed, 252 insertions(+), 191 deletions(-) diff --git a/src/core/dev/cq_mgr_rx.cpp b/src/core/dev/cq_mgr_rx.cpp index e37ddd162..2da751117 100644 --- a/src/core/dev/cq_mgr_rx.cpp +++ b/src/core/dev/cq_mgr_rx.cpp @@ -71,9 +71,10 @@ atomic_t cq_mgr_rx::m_n_cq_id_counter_rx = ATOMIC_INIT(1); uint64_t cq_mgr_rx::m_n_global_sn_rx = 0; -cq_mgr_rx::cq_mgr_rx(ring_simple *p_ring, ib_ctx_handler *p_ib_ctx_handler, int cq_size, - struct ibv_comp_channel *p_comp_event_channel) - : m_p_ring(p_ring) +cq_mgr_rx::cq_mgr_rx(ring_simple *p_ring, hw_queue_rx *hqrx_ptr, ib_ctx_handler *p_ib_ctx_handler, + int cq_size, struct ibv_comp_channel *p_comp_event_channel) + : m_hqrx_ptr(hqrx_ptr) + , m_p_ring(p_ring) , m_n_sysvar_cq_poll_batch_max(safe_mce_sys().cq_poll_batch_max) , m_n_sysvar_progress_engine_wce_max(safe_mce_sys().progress_engine_wce_max) , m_p_cq_stat(&m_cq_stat_static) // use local copy of stats by default @@ -151,7 +152,7 @@ cq_mgr_rx::~cq_mgr_rx() m_p_cq_stat->n_rx_sw_queue_len = m_rx_queue.size(); g_buffer_pool_rx_rwqe->put_buffers_thread_safe(&m_rx_pool, m_rx_pool.size()); - m_p_cq_stat->n_buffer_pool_len = m_rx_pool.size(); + m_hqrx_ptr->update_rx_buffer_pool_len_stats(); } cq_logfunc("destroying ibv_cq"); @@ -170,19 +171,21 @@ 
cq_mgr_rx::~cq_mgr_rx() void cq_mgr_rx::statistics_print() { - if (m_p_cq_stat->n_rx_pkt_drop || m_p_cq_stat->n_rx_sw_queue_len || - m_p_cq_stat->n_rx_drained_at_once_max || m_p_cq_stat->n_buffer_pool_len) { + if (m_hqrx_ptr && + (m_p_cq_stat->n_rx_pkt_drop || m_p_cq_stat->n_rx_sw_queue_len || + m_hqrx_ptr->m_hwq_rx_stats.n_rx_drained_at_once_max || + m_hqrx_ptr->m_hwq_rx_stats.n_rx_buffer_pool_len)) { cq_logdbg_no_funcname("Packets dropped: %12llu", (unsigned long long int)m_p_cq_stat->n_rx_pkt_drop); - cq_logdbg_no_funcname("Drained max: %17u", m_p_cq_stat->n_rx_drained_at_once_max); + cq_logdbg_no_funcname("Drained max: %17u", + m_hqrx_ptr->m_hwq_rx_stats.n_rx_drained_at_once_max); cq_logdbg_no_funcname("CQE errors: %18llu", - (unsigned long long int)m_p_cq_stat->n_rx_cqe_error); + (unsigned long long int)m_hqrx_ptr->m_hwq_rx_stats.n_rx_task_error); } } -void cq_mgr_rx::add_hqrx(hw_queue_rx *hqrx_ptr) +void cq_mgr_rx::add_hqrx() { - m_hqrx_ptr = hqrx_ptr; m_hqrx_ptr->m_rq_wqe_counter = 0; // In case of bonded hqrx, wqe_counter must be reset to zero m_rx_hot_buffer = nullptr; @@ -191,19 +194,17 @@ void cq_mgr_rx::add_hqrx(hw_queue_rx *hqrx_ptr) } VALGRIND_MAKE_MEM_DEFINED(&m_mlx5_cq, sizeof(m_mlx5_cq)); - cq_logfunc("hqrx_ptr=%p m_mlx5_cq.dbrec=%p m_mlx5_cq.cq_buf=%p", hqrx_ptr, m_mlx5_cq.dbrec, + cq_logfunc("hqrx_ptr=%p m_mlx5_cq.dbrec=%p m_mlx5_cq.cq_buf=%p", m_hqrx_ptr, m_mlx5_cq.dbrec, m_mlx5_cq.cq_buf); descq_t temp_desc_list; temp_desc_list.set_id("cq_mgr_rx (%p) : temp_desc_list", this); - m_p_cq_stat->n_rx_drained_at_once_max = 0; - /* return_extra_buffers(); */ // todo?? 
// Initial fill of receiver work requests - uint32_t hqrx_wr_num = hqrx_ptr->get_rx_max_wr_num(); - cq_logdbg("Trying to push %d WRE to allocated hqrx (%p)", hqrx_wr_num, hqrx_ptr); + uint32_t hqrx_wr_num = m_hqrx_ptr->get_rx_max_wr_num(); + cq_logdbg("Trying to push %d WRE to allocated hqrx (%p)", hqrx_wr_num, m_hqrx_ptr); while (hqrx_wr_num) { uint32_t n_num_mem_bufs = m_n_sysvar_rx_num_wr_to_post_recv; if (n_num_mem_bufs > hqrx_wr_num) { @@ -215,14 +216,15 @@ void cq_mgr_rx::add_hqrx(hw_queue_rx *hqrx_ptr) VLOG_PRINTF_INFO_ONCE_THEN_ALWAYS( VLOG_WARNING, VLOG_DEBUG, "Out of mem_buf_desc in global RX buffer pool for hqrx initialization (hqrx=%p)", - hqrx_ptr); + m_hqrx_ptr); break; } - hqrx_ptr->post_recv_buffers(&temp_desc_list, temp_desc_list.size()); + m_hqrx_ptr->post_recv_buffers(&temp_desc_list, temp_desc_list.size()); if (!temp_desc_list.empty()) { cq_logdbg("hqrx_ptr post recv is already full (push=%d, planned=%d)", - hqrx_ptr->get_rx_max_wr_num() - hqrx_wr_num, hqrx_ptr->get_rx_max_wr_num()); + m_hqrx_ptr->get_rx_max_wr_num() - hqrx_wr_num, + m_hqrx_ptr->get_rx_max_wr_num()); g_buffer_pool_rx_rwqe->put_buffers_thread_safe(&temp_desc_list, temp_desc_list.size()); break; } @@ -230,24 +232,15 @@ void cq_mgr_rx::add_hqrx(hw_queue_rx *hqrx_ptr) } cq_logdbg("Successfully post_recv hqrx with %d new Rx buffers (planned=%d)", - hqrx_ptr->get_rx_max_wr_num() - hqrx_wr_num, hqrx_ptr->get_rx_max_wr_num()); + m_hqrx_ptr->get_rx_max_wr_num() - hqrx_wr_num, m_hqrx_ptr->get_rx_max_wr_num()); m_debt = 0; } -void cq_mgr_rx::del_hqrx(hw_queue_rx *hqrx_ptr) +void cq_mgr_rx::del_hqrx() { - BULLSEYE_EXCLUDE_BLOCK_START - if (m_hqrx_ptr != hqrx_ptr) { - cq_logdbg("wrong hqrx_ptr=%p != m_hqrx_ptr=%p", hqrx_ptr, m_hqrx_ptr); - return; - } - BULLSEYE_EXCLUDE_BLOCK_END - cq_logdbg("m_hqrx_ptr=%p", m_hqrx_ptr); return_extra_buffers(); - clean_cq(); - m_hqrx_ptr = nullptr; m_debt = 0; } @@ -318,7 +311,7 @@ bool cq_mgr_rx::request_more_buffers() return false; }; - 
m_p_cq_stat->n_buffer_pool_len = m_rx_pool.size(); + m_hqrx_ptr->update_rx_buffer_pool_len_stats(); return true; } @@ -331,7 +324,7 @@ void cq_mgr_rx::return_extra_buffers() cq_logfunc("releasing %d buffers to global rx pool", buff_to_rel); g_buffer_pool_rx_rwqe->put_buffers_thread_safe(&m_rx_pool, buff_to_rel); - m_p_cq_stat->n_buffer_pool_len = m_rx_pool.size(); + m_hqrx_ptr->update_rx_buffer_pool_len_stats(); } mem_buf_desc_t *cq_mgr_rx::cqe_process_rx(mem_buf_desc_t *p_mem_buf_desc, enum buff_status_e status) @@ -367,7 +360,7 @@ bool cq_mgr_rx::compensate_qp_poll_success(mem_buf_desc_t *buff_cur) size_t buffers = std::min(m_debt, m_rx_pool.size()); m_hqrx_ptr->post_recv_buffers(&m_rx_pool, buffers); m_debt -= buffers; - m_p_cq_stat->n_buffer_pool_len = m_rx_pool.size(); + m_hqrx_ptr->update_rx_buffer_pool_len_stats(); } else if (m_b_sysvar_cq_keep_qp_full || m_debt >= (int)m_hqrx_ptr->m_rx_num_wr) { m_p_cq_stat->n_rx_pkt_drop++; m_hqrx_ptr->post_recv_buffer(buff_cur); @@ -386,8 +379,8 @@ void cq_mgr_rx::compensate_qp_poll_failed() if (likely(m_rx_pool.size() || request_more_buffers())) { size_t buffers = std::min(m_debt, m_rx_pool.size()); m_hqrx_ptr->post_recv_buffers(&m_rx_pool, buffers); + m_hqrx_ptr->update_rx_buffer_pool_len_stats(); m_debt -= buffers; - m_p_cq_stat->n_buffer_pool_len = m_rx_pool.size(); } } } @@ -409,7 +402,7 @@ void cq_mgr_rx::reclaim_recv_buffer_helper(mem_buf_desc_t *buff) free_lwip_pbuf(&temp->lwip_pbuf); m_rx_pool.push_back(temp); } - m_p_cq_stat->n_buffer_pool_len = m_rx_pool.size(); + m_hqrx_ptr->update_rx_buffer_pool_len_stats(); } else { cq_logfunc("Buffer returned to wrong CQ"); g_buffer_pool_rx_rwqe->put_buffers_thread_safe(buff); @@ -472,6 +465,7 @@ bool cq_mgr_rx::request_notification() else { m_b_notification_armed = true; + ++m_hqrx_ptr->m_hwq_rx_stats.n_rx_interrupt_requests; } ENDIF_VERBS_FAILURE; } @@ -508,6 +502,7 @@ void cq_mgr_rx::wait_for_notification() // Clear flag m_b_notification_armed = false; + 
++m_hqrx_ptr->m_hwq_rx_stats.n_rx_interrupt_received; } ENDIF_VERBS_FAILURE; } else { diff --git a/src/core/dev/cq_mgr_rx.h b/src/core/dev/cq_mgr_rx.h index 66b17a894..f2c96a567 100644 --- a/src/core/dev/cq_mgr_rx.h +++ b/src/core/dev/cq_mgr_rx.h @@ -80,8 +80,8 @@ class cq_mgr_rx { BS_GENERAL_ERR }; - cq_mgr_rx(ring_simple *p_ring, ib_ctx_handler *p_ib_ctx_handler, int cq_size, - struct ibv_comp_channel *p_comp_event_channel); + cq_mgr_rx(ring_simple *p_ring, hw_queue_rx *hqrx_ptr, ib_ctx_handler *p_ib_ctx_handler, + int cq_size, struct ibv_comp_channel *p_comp_event_channel); virtual ~cq_mgr_rx(); void configure(int cq_size); @@ -124,8 +124,8 @@ class cq_mgr_rx { void mem_buf_desc_return_to_owner(mem_buf_desc_t *p_mem_buf_desc, void *pv_fd_ready_array = nullptr); - virtual void add_hqrx(hw_queue_rx *hqrx_ptr); - virtual void del_hqrx(hw_queue_rx *hqrx_ptr); + virtual void add_hqrx(); + virtual void del_hqrx(); virtual uint32_t clean_cq() = 0; diff --git a/src/core/dev/cq_mgr_rx_inl.h b/src/core/dev/cq_mgr_rx_inl.h index a5a714a1f..a6ba26b55 100644 --- a/src/core/dev/cq_mgr_rx_inl.h +++ b/src/core/dev/cq_mgr_rx_inl.h @@ -47,6 +47,8 @@ inline void cq_mgr_rx::process_recv_buffer(mem_buf_desc_t *p_mem_buf_desc, void *pv_fd_ready_array) { // Assume locked!!! 
+ m_hqrx_ptr->m_hwq_rx_stats.n_rx_byte_count += p_mem_buf_desc->sz_data; + ++m_hqrx_ptr->m_hwq_rx_stats.n_rx_pkt_count; // Pass the Rx buffer ib_comm_mgr for further IP processing if (!m_p_ring->rx_process_buffer(p_mem_buf_desc, pv_fd_ready_array)) { diff --git a/src/core/dev/cq_mgr_rx_regrq.cpp b/src/core/dev/cq_mgr_rx_regrq.cpp index f523c3921..797073823 100644 --- a/src/core/dev/cq_mgr_rx_regrq.cpp +++ b/src/core/dev/cq_mgr_rx_regrq.cpp @@ -52,9 +52,10 @@ #define cq_logpanic __log_info_panic #define cq_logfuncall __log_info_funcall -cq_mgr_rx_regrq::cq_mgr_rx_regrq(ring_simple *p_ring, ib_ctx_handler *p_ib_ctx_handler, - uint32_t cq_size, struct ibv_comp_channel *p_comp_event_channel) - : cq_mgr_rx(p_ring, p_ib_ctx_handler, cq_size, p_comp_event_channel) +cq_mgr_rx_regrq::cq_mgr_rx_regrq(ring_simple *p_ring, hw_queue_rx *hqrx_ptr, + ib_ctx_handler *p_ib_ctx_handler, uint32_t cq_size, + struct ibv_comp_channel *p_comp_event_channel) + : cq_mgr_rx(p_ring, hqrx_ptr, p_ib_ctx_handler, cq_size, p_comp_event_channel) { cq_logfunc(""); } @@ -65,10 +66,6 @@ uint32_t cq_mgr_rx_regrq::clean_cq() uint64_t cq_poll_sn = 0; mem_buf_desc_t *buff; - if (!m_hqrx_ptr) { // Sanity check - return 0; - } - buff_status_e status = BS_OK; while ((buff = poll(status))) { if (cqe_process_rx(buff, status)) { @@ -153,8 +150,8 @@ void cq_mgr_rx_regrq::cqe_to_mem_buff_desc(struct xlio_mlx5_cqe *cqe, (cqe->hds_ip_ext & MLX5_CQE_L3_OK)); if (cqe->lro_num_seg > 1) { lro_update_hdr(cqe, p_rx_wc_buf_desc); - m_p_cq_stat->n_rx_lro_packets++; - m_p_cq_stat->n_rx_lro_bytes += p_rx_wc_buf_desc->sz_data; + m_hqrx_ptr->m_hwq_rx_stats.n_rx_lro_packets++; + m_hqrx_ptr->m_hwq_rx_stats.n_rx_lro_bytes += p_rx_wc_buf_desc->sz_data; } return; } @@ -199,7 +196,7 @@ void cq_mgr_rx_regrq::cqe_to_mem_buff_desc(struct xlio_mlx5_cqe *cqe, case MLX5_CQE_INVALID: case MLX5_CQE_REQ_ERR: case MLX5_CQE_RESP_ERR: - m_p_cq_stat->n_rx_cqe_error++; + m_hqrx_ptr->m_hwq_rx_stats.n_rx_task_error++; break; } } @@ -268,8 +265,8 
@@ int cq_mgr_rx_regrq::drain_and_proccess(uintptr_t *p_recycle_buffers_last_wr_id // Update cq statistics m_p_cq_stat->n_rx_sw_queue_len = m_rx_queue.size(); - m_p_cq_stat->n_rx_drained_at_once_max = - std::max(ret_total, m_p_cq_stat->n_rx_drained_at_once_max); + m_hqrx_ptr->m_hwq_rx_stats.n_rx_drained_at_once_max = + std::max(ret_total, m_hqrx_ptr->m_hwq_rx_stats.n_rx_drained_at_once_max); return ret_total; } diff --git a/src/core/dev/cq_mgr_rx_regrq.h b/src/core/dev/cq_mgr_rx_regrq.h index 897ff501f..f4832f2d4 100644 --- a/src/core/dev/cq_mgr_rx_regrq.h +++ b/src/core/dev/cq_mgr_rx_regrq.h @@ -38,8 +38,8 @@ class cq_mgr_rx_regrq : public cq_mgr_rx { public: - cq_mgr_rx_regrq(ring_simple *p_ring, ib_ctx_handler *p_ib_ctx_handler, uint32_t cq_size, - struct ibv_comp_channel *p_comp_event_channel); + cq_mgr_rx_regrq(ring_simple *p_ring, hw_queue_rx *hqrx_ptr, ib_ctx_handler *p_ib_ctx_handler, + uint32_t cq_size, struct ibv_comp_channel *p_comp_event_channel); virtual ~cq_mgr_rx_regrq() override; diff --git a/src/core/dev/cq_mgr_rx_strq.cpp b/src/core/dev/cq_mgr_rx_strq.cpp index 9152c2a32..ef3b1a25f 100644 --- a/src/core/dev/cq_mgr_rx_strq.cpp +++ b/src/core/dev/cq_mgr_rx_strq.cpp @@ -55,10 +55,11 @@ ##log_args); \ } while (0) -cq_mgr_rx_strq::cq_mgr_rx_strq(ring_simple *p_ring, ib_ctx_handler *p_ib_ctx_handler, - uint32_t cq_size, uint32_t stride_size_bytes, uint32_t strides_num, +cq_mgr_rx_strq::cq_mgr_rx_strq(ring_simple *p_ring, hw_queue_rx *hqrx_ptr, + ib_ctx_handler *p_ib_ctx_handler, uint32_t cq_size, + uint32_t stride_size_bytes, uint32_t strides_num, struct ibv_comp_channel *p_comp_event_channel) - : cq_mgr_rx(p_ring, p_ib_ctx_handler, cq_size, p_comp_event_channel) + : cq_mgr_rx(p_ring, hqrx_ptr, p_ib_ctx_handler, cq_size, p_comp_event_channel) , _owner_ring(p_ring) , _stride_size_bytes(stride_size_bytes) , _strides_num(strides_num) @@ -127,10 +128,6 @@ uint32_t cq_mgr_rx_strq::clean_cq() uint32_t ret_total = 0; uint64_t cq_poll_sn = 0; - if (!m_hqrx_ptr) 
{ // Sanity check - return 0; - } - mem_buf_desc_t *stride_buf = nullptr; buff_status_e status = BS_OK; while (poll(status, stride_buf) || stride_buf) { @@ -257,8 +254,8 @@ inline bool cq_mgr_rx_strq::strq_cqe_to_mem_buff_desc(struct xlio_mlx5_cqe *cqe, #endif /* DEFINED_UTLS */ if (cqe->lro_num_seg > 1) { lro_update_hdr(cqe, _hot_buffer_stride); - m_p_cq_stat->n_rx_lro_packets++; - m_p_cq_stat->n_rx_lro_bytes += _hot_buffer_stride->sz_data; + m_hqrx_ptr->m_hwq_rx_stats.n_rx_lro_packets++; + m_hqrx_ptr->m_hwq_rx_stats.n_rx_lro_bytes += _hot_buffer_stride->sz_data; } break; } @@ -402,8 +399,8 @@ int cq_mgr_rx_strq::drain_and_proccess(uintptr_t *p_recycle_buffers_last_wr_id) // Update cq statistics m_p_cq_stat->n_rx_sw_queue_len = m_rx_queue.size(); - m_p_cq_stat->n_rx_drained_at_once_max = - std::max(ret_total, m_p_cq_stat->n_rx_drained_at_once_max); + m_hqrx_ptr->m_hwq_rx_stats.n_rx_drained_at_once_max = + std::max(ret_total, m_hqrx_ptr->m_hwq_rx_stats.n_rx_drained_at_once_max); return ret_total; } @@ -471,12 +468,12 @@ bool cq_mgr_rx_strq::poll_and_process_element_rx(void *pv_fd_ready_array) return (rx_polled < m_n_sysvar_cq_poll_batch_max); } -void cq_mgr_rx_strq::add_hqrx(hw_queue_rx *hqrx) +void cq_mgr_rx_strq::add_hqrx() { cq_logfunc(""); _hot_buffer_stride = nullptr; _current_wqe_consumed_bytes = 0U; - cq_mgr_rx::add_hqrx(hqrx); + cq_mgr_rx::add_hqrx(); } void cq_mgr_rx_strq::statistics_print() @@ -487,8 +484,9 @@ void cq_mgr_rx_strq::statistics_print() cq_logdbg_no_funcname("Max Strides per Packet: %12" PRIu16, m_p_cq_stat->n_rx_max_stirde_per_packet); cq_logdbg_no_funcname("Strides count: %12" PRIu64, m_p_cq_stat->n_rx_stride_count); - cq_logdbg_no_funcname("LRO packet count: %12" PRIu64, m_p_cq_stat->n_rx_lro_packets); - cq_logdbg_no_funcname("LRO bytes: %12" PRIu64, m_p_cq_stat->n_rx_lro_bytes); + cq_logdbg_no_funcname("LRO packet count: %12" PRIu64, + m_hqrx_ptr->m_hwq_rx_stats.n_rx_lro_packets); + cq_logdbg_no_funcname("LRO bytes: %12" PRIu64, 
m_hqrx_ptr->m_hwq_rx_stats.n_rx_lro_bytes); } void cq_mgr_rx_strq::reclaim_recv_buffer_helper(mem_buf_desc_t *buff) @@ -523,7 +521,7 @@ void cq_mgr_rx_strq::reclaim_recv_buffer_helper(mem_buf_desc_t *buff) return_stride(temp); } - m_p_cq_stat->n_buffer_pool_len = m_rx_pool.size(); + m_hqrx_ptr->update_rx_buffer_pool_len_stats(); } else { cq_logfunc("Stride returned to wrong CQ"); g_buffer_pool_rx_ptr->put_buffers_thread_safe(buff); diff --git a/src/core/dev/cq_mgr_rx_strq.h b/src/core/dev/cq_mgr_rx_strq.h index 03d5bcb5d..2f9bcb566 100644 --- a/src/core/dev/cq_mgr_rx_strq.h +++ b/src/core/dev/cq_mgr_rx_strq.h @@ -40,15 +40,15 @@ class cq_mgr_rx_strq : public cq_mgr_rx { public: - cq_mgr_rx_strq(ring_simple *p_ring, ib_ctx_handler *p_ib_ctx_handler, uint32_t cq_size, - uint32_t stride_size_bytes, uint32_t strides_num, + cq_mgr_rx_strq(ring_simple *p_ring, hw_queue_rx *hqrx_ptr, ib_ctx_handler *p_ib_ctx_handler, + uint32_t cq_size, uint32_t stride_size_bytes, uint32_t strides_num, struct ibv_comp_channel *p_comp_event_channel); virtual ~cq_mgr_rx_strq() override; virtual int drain_and_proccess(uintptr_t *p_recycle_buffers_last_wr_id = NULL) override; virtual bool poll_and_process_element_rx(void *pv_fd_ready_array = NULL) override; - virtual void add_hqrx(hw_queue_rx *qp) override; + virtual void add_hqrx() override; virtual uint32_t clean_cq() override; protected: diff --git a/src/core/dev/hw_queue_rx.cpp b/src/core/dev/hw_queue_rx.cpp index 23535e706..f87232823 100644 --- a/src/core/dev/hw_queue_rx.cpp +++ b/src/core/dev/hw_queue_rx.cpp @@ -69,6 +69,8 @@ hw_queue_rx::hw_queue_rx(ring_simple *ring, ib_ctx_handler *ib_ctx, { hwqrx_logfunc(""); + memset(&m_hwq_rx_stats, 0, sizeof(m_hwq_rx_stats)); + if (!prepare_doca_rxq()) { throw_xlio_exception("Failed to create DOCA RXQ"); } @@ -258,6 +260,7 @@ void hw_queue_rx::submit_rxq_tasks() } submit_rxq_task(DOCA_TASK_SUBMIT_FLAG_FLUSH); + update_rx_buffer_pool_len_stats(); } } @@ -309,8 +312,7 @@ bool 
hw_queue_rx::fill_buffers_from_global_pool() } } - // TODO DOCA: Add Statistics - // m_p_cq_stat->n_buffer_pool_len = m_rx_pool.size(); + update_rx_buffer_pool_len_stats(); return true; } @@ -433,7 +435,7 @@ void hw_queue_rx::reclaim_rx_buffer_chain_loop(mem_buf_desc_t *buff) free_lwip_pbuf(&temp->lwip_pbuf); m_rx_pool.push_front(temp); } - // m_p_cq_stat->n_buffer_pool_len = m_rx_pool.size(); + update_rx_buffer_pool_len_stats(); } else if (buff->lwip_pbuf.ref != (unsigned int)buff->get_ref_count()) { hwqrx_logwarn("Uneven lwip.ref and buf.ref %u,%d", buff->lwip_pbuf.ref, buff->get_ref_count()); @@ -472,7 +474,7 @@ void hw_queue_rx::return_extra_buffers() hwqrx_logfunc("Returning %zu buffers to global RX pool", return_buffs_num); g_buffer_pool_rx_rwqe->put_buffers_thread_safe(&m_rx_pool, return_buffs_num); - // m_p_cq_stat->n_buffer_pool_len = m_rx_pool.size(); + update_rx_buffer_pool_len_stats(); } } @@ -536,6 +538,7 @@ void hw_queue_rx::rx_task_error_cb(doca_eth_rxq_task_recv *task_recv, doca_data hw_rx->return_doca_task(task_recv); hw_rx->reclaim_rx_buffer_chain_loop(mem_buf); hw_rx->m_polled_buf = nullptr; + ++hw_rx->m_hwq_rx_stats.n_rx_task_error; } bool hw_queue_rx::poll_and_process_rx() @@ -566,6 +569,9 @@ bool hw_queue_rx::poll_and_process_rx() void hw_queue_rx::process_recv_buffer(mem_buf_desc_t *p_mem_buf_desc) { + m_hwq_rx_stats.n_rx_byte_count += p_mem_buf_desc->sz_data; + ++m_hwq_rx_stats.n_rx_pkt_count; + if (!m_p_ring->rx_process_buffer(p_mem_buf_desc, nullptr)) { // If buffer is dropped by callback - return to RX pool reclaim_rx_buffer_chain_loop(p_mem_buf_desc); @@ -580,6 +586,8 @@ bool hw_queue_rx::request_notification() PRINT_DOCA_ERR(hwqrx_logerr, rc, "doca_pe_request_notification"); return false; } + + ++m_hwq_rx_stats.n_rx_interrupt_requests; } hwqrx_logfunc("Requested notification hw_queue_rx: %p", this); @@ -594,6 +602,8 @@ void hw_queue_rx::clear_notification() doca_error_t rc = doca_pe_clear_notification(m_doca_pe.get(), 
m_notification_handle); if (unlikely(DOCA_IS_ERROR(rc))) { PRINT_DOCA_ERR(hwqrx_logerr, rc, "doca_pe_clear_notification"); + } else { + ++m_hwq_rx_stats.n_rx_interrupt_received; } } else { hwqrx_logwarn("Clear notification attempt on unarmed PE. hw_queue_rx: %p", this); @@ -606,6 +616,9 @@ void hw_queue_rx::modify_moderation(uint16_t period_usec, uint16_t comp_count) doca_eth_rxq_set_notification_moderation(m_doca_rxq.get(), period_usec, comp_count); if (unlikely(DOCA_IS_ERROR(rc))) { PRINT_DOCA_ERR(hwqrx_logerr, rc, "doca_eth_rxq_set_notification_moderation"); + } else { + m_hwq_rx_stats.n_rx_cq_moderation_period = period_usec; + m_hwq_rx_stats.n_rx_cq_moderation_count = comp_count; } } @@ -676,7 +689,7 @@ void hw_queue_rx::up() modify_queue_to_ready_state(); - m_p_cq_mgr_rx->add_hqrx(this); + m_p_cq_mgr_rx->add_hqrx(); } void hw_queue_rx::down() @@ -692,7 +705,7 @@ void hw_queue_rx::down() usleep(1000); release_rx_buffers(); - m_p_cq_mgr_rx->del_hqrx(this); + m_p_cq_mgr_rx->del_hqrx(); } void hw_queue_rx::release_rx_buffers() @@ -932,13 +945,14 @@ int hw_queue_rx::xlio_raw_post_recv(struct ibv_recv_wr **bad_wr) cq_mgr_rx *hw_queue_rx::init_rx_cq_mgr(struct ibv_comp_channel *p_rx_comp_event_channel) { if (safe_mce_sys().enable_striding_rq) { - return new cq_mgr_rx_strq(m_p_ring, m_p_ib_ctx_handler, + return new cq_mgr_rx_strq(m_p_ring, this, m_p_ib_ctx_handler, safe_mce_sys().strq_stride_num_per_rwqe * m_rx_num_wr, safe_mce_sys().strq_stride_size_bytes, safe_mce_sys().strq_stride_num_per_rwqe, p_rx_comp_event_channel); } - return new cq_mgr_rx_regrq(m_p_ring, m_p_ib_ctx_handler, m_rx_num_wr, p_rx_comp_event_channel); + return new cq_mgr_rx_regrq(m_p_ring, this, m_p_ib_ctx_handler, m_rx_num_wr, + p_rx_comp_event_channel); } #if defined(DEFINED_UTLS) diff --git a/src/core/dev/hw_queue_rx.h b/src/core/dev/hw_queue_rx.h index ca65d5055..0df1e3961 100644 --- a/src/core/dev/hw_queue_rx.h +++ b/src/core/dev/hw_queue_rx.h @@ -58,6 +58,7 @@ class hw_queue_rx : public 
xlio_ti_owner { friend class cq_mgr_rx; friend class cq_mgr_rx_regrq; friend class cq_mgr_rx_strq; + friend class ring_simple; public: hw_queue_rx(ring_simple *ring, ib_ctx_handler *ib_ctx, ibv_comp_channel *rx_comp_event_channel, @@ -85,6 +86,14 @@ class hw_queue_rx : public xlio_ti_owner { cq_mgr_rx *get_rx_cq_mgr() const { return m_p_cq_mgr_rx; } uint32_t get_rx_max_wr_num() const { return m_rx_num_wr; } uint16_t get_vlan() const { return m_vlan; }; + + void update_gro_stats(uint64_t gro_frags, uint64_t gro_bytes) + { + m_hwq_rx_stats.n_rx_gro_packets++; + m_hwq_rx_stats.n_rx_gro_frags += gro_frags; + m_hwq_rx_stats.n_rx_gro_bytes += gro_bytes; + } + void modify_queue_to_ready_state(); void modify_queue_to_error_state(); void release_rx_buffers(); @@ -133,6 +142,11 @@ class hw_queue_rx : public xlio_ti_owner { int xlio_raw_post_recv(struct ibv_recv_wr **bad_wr); bool is_rq_empty() const { return (m_rq_data.head == m_rq_data.tail); } + void update_rx_buffer_pool_len_stats() + { + m_hwq_rx_stats.n_rx_buffer_pool_len = static_cast(m_rx_pool.size()); + } + dpcp::tir *create_tir(bool is_tls = false); dpcp::tir *xlio_tir_to_dpcp_tir(xlio_tir *tir) { return tir->m_p_tir.get(); } @@ -162,10 +176,11 @@ class hw_queue_rx : public xlio_ti_owner { descq_t m_rx_pool; mem_buf_desc_t *m_polled_buf = nullptr; uint32_t m_rxq_task_debt = 0U; + uint32_t m_rx_debt_submit_treshold = 0U; + hw_queue_rx_stats_t m_hwq_rx_stats; uint32_t m_rxq_burst_size = 0U; uint32_t m_rx_buff_pool_treshold_max = 0U; uint32_t m_rx_buff_pool_treshold_min = 0U; - uint32_t m_rx_debt_submit_treshold = 0U; doca_notification_handle_t m_notification_handle; ring_simple *m_p_ring; bool m_notification_armed = false; diff --git a/src/core/dev/hw_queue_tx.cpp b/src/core/dev/hw_queue_tx.cpp index bc80e4adc..b55173f70 100644 --- a/src/core/dev/hw_queue_tx.cpp +++ b/src/core/dev/hw_queue_tx.cpp @@ -143,11 +143,12 @@ hw_queue_tx::hw_queue_tx(ring_simple *ring, const slave_data_t *slave, { hwqtx_logfunc(""); + 
memset(&m_hwq_tx_stats, 0, sizeof(m_hwq_tx_stats)); + if (!prepare_doca_txq()) { throw_xlio_exception("Failed to create DOCA TXQ"); } - memset(&m_hwq_tx_stats, 0, sizeof(m_hwq_tx_stats)); memset(&m_mlx5_qp, 0, sizeof(m_mlx5_qp)); m_mlx5_qp.cap.max_inline_data = safe_mce_sys().tx_max_inline; @@ -455,7 +456,7 @@ int hw_queue_tx::configure(const slave_data_t *slave, hwqtx_logerr("Failed allocating m_p_cq_mgr_tx (errno=%d %m)", errno); return -1; } - m_p_cq_mgr_rx_unused = new cq_mgr_rx_regrq(m_p_ring, m_p_ib_ctx_handler, 2, nullptr); + m_p_cq_mgr_rx_unused = new cq_mgr_rx_regrq(m_p_ring, nullptr, m_p_ib_ctx_handler, 2, nullptr); if (!m_p_cq_mgr_rx_unused) { hwqtx_logerr("Failed allocating m_p_cq_mgr_rx_unused (errno=%d %m)", errno); return -1; diff --git a/src/core/dev/rfs_uc_tcp_gro.cpp b/src/core/dev/rfs_uc_tcp_gro.cpp index 6863bb01d..c3b88a083 100644 --- a/src/core/dev/rfs_uc_tcp_gro.cpp +++ b/src/core/dev/rfs_uc_tcp_gro.cpp @@ -157,10 +157,7 @@ bool rfs_uc_tcp_gro::rx_dispatch_packet(mem_buf_desc_t *p_rx_pkt_mem_buf_desc_in flush_gro_desc(pv_fd_ready_array); } - cq_stats_t &cq_stats = *m_p_ring_simple->m_p_cq_mgr_rx->m_p_cq_stat; - cq_stats.n_rx_gro_packets++; - cq_stats.n_rx_gro_frags += 1; - cq_stats.n_rx_gro_bytes += p_rx_pkt_mem_buf_desc_info->lwip_pbuf.tot_len; + m_p_ring_simple->m_hqrx->update_gro_stats(1U, p_rx_pkt_mem_buf_desc_info->lwip_pbuf.tot_len); return rfs_uc::rx_dispatch_packet(p_rx_pkt_mem_buf_desc_info, pv_fd_ready_array); } @@ -255,10 +252,8 @@ void rfs_uc_tcp_gro::flush_gro_desc(void *pv_fd_ready_array) ntohl(m_gro_desc.p_tcp_h->seq), ntohl(m_gro_desc.p_tcp_h->ack_seq), ntohs(m_gro_desc.p_tcp_h->window), m_gro_desc.ip_tot_len, m_gro_desc.buf_count); - cq_stats_t &cq_stats = *m_p_ring_simple->m_p_cq_mgr_rx->m_p_cq_stat; - cq_stats.n_rx_gro_packets++; - cq_stats.n_rx_gro_frags += m_gro_desc.buf_count; - cq_stats.n_rx_gro_bytes += m_gro_desc.p_first->lwip_pbuf.tot_len; + m_p_ring_simple->m_hqrx->update_gro_stats(m_gro_desc.buf_count, + 
m_gro_desc.p_first->lwip_pbuf.tot_len); if (!rfs_uc::rx_dispatch_packet(m_gro_desc.p_first, pv_fd_ready_array)) { m_p_ring_simple->reclaim_recv_buffers_no_lock(m_gro_desc.p_first); diff --git a/src/core/dev/ring_simple.cpp b/src/core/dev/ring_simple.cpp index e85e34d12..a0daaa816 100644 --- a/src/core/dev/ring_simple.cpp +++ b/src/core/dev/ring_simple.cpp @@ -158,7 +158,8 @@ ring_simple::~ring_simple() g_p_fd_collection->del_cq_channel_fd(get_tx_channel_fd(), true); } - xlio_stats_instance_remove_ring_block(m_p_ring_stat.get(), &m_hqtx->m_hwq_tx_stats); + xlio_stats_instance_remove_ring_block(m_p_ring_stat.get(), &m_hqtx->m_hwq_tx_stats, + &m_hqrx->m_hwq_rx_stats); delete m_hqtx; m_hqtx = nullptr; @@ -388,7 +389,8 @@ void ring_simple::create_resources() safe_mce_sys().cq_moderation_count); } - xlio_stats_instance_create_ring_block(m_p_ring_stat.get(), &m_hqtx->m_hwq_tx_stats); + xlio_stats_instance_create_ring_block(m_p_ring_stat.get(), &m_hqtx->m_hwq_tx_stats, + &m_hqrx->m_hwq_rx_stats); ring_logdbg("new ring_simple() completed"); } @@ -416,7 +418,6 @@ bool ring_simple::request_notification(cq_type_t cq_type) { if (likely(CQT_RX == cq_type)) { std::lock_guard lock(m_lock_ring_rx); - ++m_p_ring_stat->n_rx_interrupt_requests; return (!safe_mce_sys().doca_rx ? m_p_cq_mgr_rx->request_notification() : m_hqrx->request_notification()); } @@ -429,7 +430,6 @@ bool ring_simple::request_notification(cq_type_t cq_type) void ring_simple::clear_rx_notification() { std::lock_guard lock(m_lock_ring_rx); - ++m_p_ring_stat->n_rx_interrupt_received; if (!safe_mce_sys().doca_rx) { m_p_cq_mgr_rx->wait_for_notification(); } else { @@ -1038,12 +1038,11 @@ void ring_simple::modify_cq_moderation(uint32_t period, uint32_t count) m_cq_moderation_info.period = period; m_cq_moderation_info.count = count; - m_p_ring_stat->n_rx_cq_moderation_period = period; - m_p_ring_stat->n_rx_cq_moderation_count = count; - // todo all cqs or just active? what about HA? 
if (!safe_mce_sys().doca_rx) { priv_ibv_modify_cq_moderation(m_p_cq_mgr_rx->get_ibv_cq_hndl(), period, count); + m_hqrx->m_hwq_rx_stats.n_rx_cq_moderation_period = period; + m_hqrx->m_hwq_rx_stats.n_rx_cq_moderation_count = count; } else if (m_hqrx) { m_hqrx->modify_moderation(static_cast(period), static_cast(count)); } diff --git a/src/core/dev/ring_slave.cpp b/src/core/dev/ring_slave.cpp index 6312a92e8..de28d228d 100644 --- a/src/core/dev/ring_slave.cpp +++ b/src/core/dev/ring_slave.cpp @@ -586,9 +586,6 @@ bool ring_slave::rx_process_buffer(mem_buf_desc_t *p_rx_wc_buf_desc, void *pv_fd inc_cq_moderation_stats(); - m_p_ring_stat->n_rx_byte_count += sz_data; - ++m_p_ring_stat->n_rx_pkt_count; - // This is an internal function (within ring and 'friends'). No need for lock mechanism. if (likely(m_flow_tag_enabled && p_rx_wc_buf_desc->rx.flow_tag_id && p_rx_wc_buf_desc->rx.flow_tag_id != FLOW_TAG_MASK && diff --git a/src/core/util/xlio_stats.h b/src/core/util/xlio_stats.h index 4edd75607..1417777c7 100644 --- a/src/core/util/xlio_stats.h +++ b/src/core/util/xlio_stats.h @@ -336,15 +336,7 @@ typedef struct { uint64_t n_rx_packet_count; uint64_t n_rx_consumed_rwqe_count; uint64_t n_rx_pkt_drop; - uint64_t n_rx_lro_packets; - uint64_t n_rx_lro_bytes; - uint64_t n_rx_gro_packets; - uint64_t n_rx_gro_bytes; - uint64_t n_rx_gro_frags; uint32_t n_rx_sw_queue_len; - uint32_t n_rx_drained_at_once_max; - uint32_t n_buffer_pool_len; - uint32_t n_rx_cqe_error; uint16_t n_rx_max_stirde_per_packet; } cq_stats_t; @@ -358,16 +350,10 @@ typedef struct { uint32_t n_tx_num_bufs; uint32_t n_zc_num_bufs; uint64_t n_tx_retransmits; - uint64_t n_rx_pkt_count; - uint64_t n_rx_byte_count; #ifdef DEFINED_UTLS uint32_t n_tx_tls_contexts; uint32_t n_rx_tls_contexts; #endif /* DEFINED_UTLS */ - uint64_t n_rx_interrupt_requests; - uint64_t n_rx_interrupt_received; - uint32_t n_rx_cq_moderation_count; - uint32_t n_rx_cq_moderation_period; uint64_t n_tx_dropped_wqes; uint64_t 
n_tx_dev_mem_pkt_count; uint64_t n_tx_dev_mem_byte_count; @@ -383,10 +369,28 @@ typedef struct { uint64_t n_tx_tso_byte_count; } hw_queue_tx_stats_t; +typedef struct { + uint64_t n_rx_pkt_count; + uint64_t n_rx_byte_count; + uint64_t n_rx_lro_packets; + uint64_t n_rx_lro_bytes; + uint64_t n_rx_gro_packets; + uint64_t n_rx_gro_bytes; + uint64_t n_rx_gro_frags; + uint32_t n_rx_buffer_pool_len; + uint32_t n_rx_drained_at_once_max; + uint64_t n_rx_task_error; + uint64_t n_rx_interrupt_requests; + uint64_t n_rx_interrupt_received; + uint32_t n_rx_cq_moderation_count; + uint32_t n_rx_cq_moderation_period; +} hw_queue_rx_stats_t; + typedef struct { bool b_enabled; ring_stats_t ring_stats; hw_queue_tx_stats_t hwq_tx_stats; + hw_queue_rx_stats_t hwq_rx_stats; } ring_instance_block_t; @@ -533,8 +537,10 @@ void xlio_stats_instance_remove_socket_block(socket_stats_t *); void xlio_stats_mc_group_add(const ip_address &mc_grp, socket_stats_t *p_socket_stats); void xlio_stats_mc_group_remove(const ip_address &mc_grp, socket_stats_t *p_socket_stats); -void xlio_stats_instance_create_ring_block(ring_stats_t *, hw_queue_tx_stats_t *local_hwq_tx_addr); -void xlio_stats_instance_remove_ring_block(ring_stats_t *, hw_queue_tx_stats_t *local_hwq_tx_addr); +void xlio_stats_instance_create_ring_block(ring_stats_t *, hw_queue_tx_stats_t *local_hwq_tx_addr, + hw_queue_rx_stats_t *local_hwq_rx_addr); +void xlio_stats_instance_remove_ring_block(ring_stats_t *, hw_queue_tx_stats_t *local_hwq_tx_addr, + hw_queue_rx_stats_t *local_hwq_rx_addr); void xlio_stats_instance_create_cq_block(cq_stats_t *); void xlio_stats_instance_remove_cq_block(cq_stats_t *); diff --git a/src/stats/stats_data_reader.h b/src/stats/stats_data_reader.h index 57e1d1b3c..5250a7611 100644 --- a/src/stats/stats_data_reader.h +++ b/src/stats/stats_data_reader.h @@ -245,7 +245,7 @@ struct ring_packet_aggregate { // coverity[missing_lock:FALSE] /* Turn off coverity missing_lock check*/ if (ring_stat.b_enabled) { val.tx += 
ring_stat.hwq_tx_stats.n_tx_pkt_count; - val.rx += ring_stat.ring_stats.n_rx_pkt_count; + val.rx += ring_stat.hwq_rx_stats.n_rx_pkt_count; } return val; }; diff --git a/src/stats/stats_publisher.cpp b/src/stats/stats_publisher.cpp index 86c284a74..32993917e 100644 --- a/src/stats/stats_publisher.cpp +++ b/src/stats/stats_publisher.cpp @@ -478,18 +478,22 @@ void xlio_stats_mc_group_remove(const ip_address &mc_grp, socket_stats_t *p_sock } void xlio_stats_instance_create_ring_block(ring_stats_t *local_stats_addr, - hw_queue_tx_stats_t *local_hwq_tx_addr) + hw_queue_tx_stats_t *local_hwq_tx_addr, + hw_queue_rx_stats_t *local_hwq_rx_addr) { ring_stats_t *p_instance_ring = NULL; hw_queue_tx_stats_t *p_instance_hwq_tx = NULL; + hw_queue_rx_stats_t *p_instance_hwq_rx = NULL; g_lock_ring_inst_arr.lock(); for (int i = 0; i < NUM_OF_SUPPORTED_RINGS; i++) { if (!g_sh_mem->ring_inst_arr[i].b_enabled) { g_sh_mem->ring_inst_arr[i].b_enabled = true; p_instance_ring = &g_sh_mem->ring_inst_arr[i].ring_stats; p_instance_hwq_tx = &g_sh_mem->ring_inst_arr[i].hwq_tx_stats; + p_instance_hwq_rx = &g_sh_mem->ring_inst_arr[i].hwq_rx_stats; memset(p_instance_ring, 0, sizeof(*p_instance_ring)); memset(p_instance_hwq_tx, 0, sizeof(*p_instance_hwq_tx)); + memset(p_instance_hwq_rx, 0, sizeof(*p_instance_hwq_rx)); break; } } @@ -506,22 +510,35 @@ void xlio_stats_instance_create_ring_block(ring_stats_t *local_stats_addr, g_p_stats_data_reader->add_data_reader(local_hwq_tx_addr, p_instance_hwq_tx, sizeof(hw_queue_tx_stats_t)); } - __log_dbg("Added ring local=%p shm=%p, local_hwq_tx=%p, shm_hwq_tx=%p", local_stats_addr, - p_instance_ring, local_hwq_tx_addr, p_instance_hwq_tx); + + if (local_hwq_rx_addr) { + g_p_stats_data_reader->add_data_reader(local_hwq_rx_addr, p_instance_hwq_rx, + sizeof(hw_queue_rx_stats_t)); + } + __log_dbg("Added ring local=%p shm=%p, local_hwq_tx=%p, shm_hwq_tx=%p, local_hwq_rx=%p, " + "shm_hwq_rx=%p", + local_stats_addr, p_instance_ring, local_hwq_tx_addr, 
p_instance_hwq_tx, + local_hwq_rx_addr, p_instance_hwq_rx); } g_lock_ring_inst_arr.unlock(); } void xlio_stats_instance_remove_ring_block(ring_stats_t *local_stats_addr, - hw_queue_tx_stats_t *local_hwq_tx_addr) + hw_queue_tx_stats_t *local_hwq_tx_addr, + hw_queue_rx_stats_t *local_hwq_rx_addr) { g_lock_ring_inst_arr.lock(); - __log_dbg("Remove ring local=%p, local_hwq_tx=%p", local_stats_addr, local_hwq_tx_addr); + __log_dbg("Remove ring local=%p, local_hwq_tx=%p, local_hwq_rx=%p", local_stats_addr, + local_hwq_tx_addr, local_hwq_rx_addr); if (local_hwq_tx_addr) { g_p_stats_data_reader->pop_data_reader(local_hwq_tx_addr); } + if (local_hwq_rx_addr) { + g_p_stats_data_reader->pop_data_reader(local_hwq_rx_addr); + } + ring_stats_t *p_ring_stats = (ring_stats_t *)g_p_stats_data_reader->pop_data_reader(local_stats_addr); diff --git a/src/stats/stats_reader.cpp b/src/stats/stats_reader.cpp index bae55e859..194bfa626 100644 --- a/src/stats/stats_reader.cpp +++ b/src/stats/stats_reader.cpp @@ -109,10 +109,10 @@ typedef enum { e_K = 1024, e_M = 1048576 } units_t; #define FORMAT_STATS_s_32bit "%-20s %d\n" #define FORMAT_STATS_64bit "%-20s %" PRIu64 " %-3s\n" #define FORMAT_STATS_double "%-20s %.1f\n" -#define FORMAT_RING_PACKETS "%-20s %zu / %zu [kilobytes/packets] %-3s\n" +#define FORMAT_RING_PACKETS "%-20s %zu / %zu [KBs/pkts] %-3s\n" #define FORMAT_RING_STRIDES "%-20s %zu / %zu / %zu [total/max-per-packet/packets-per-rwqe] %-3s\n" #define FORMAT_RING_INTERRUPT "%-20s %zu / %zu [requests/received] %-3s\n" -#define FORMAT_RING_MODERATION "%-20s %u / %u [frames/usec period] %-3s\n" +#define FORMAT_RING_MODERATION "%-20s %u / %u [frames/usec period]\n" #define FORMAT_RING_DM_STATS "%-20s %zu / %zu / %zu [kilobytes/packets/oob] %-3s\n" #define FORMAT_RING_MASTER "%-20s %p\n" @@ -354,14 +354,42 @@ void update_delta_hwq_tx_stat(hw_queue_tx_stats_t *p_curr_hwq_tx_stats, delay; } +void update_delta_hwq_rx_stat(hw_queue_rx_stats_t *p_curr_hwq_rx_stats, + hw_queue_rx_stats_t 
*p_prev_hwq_rx_stats) +{ + int delay = user_params.interval; + p_prev_hwq_rx_stats->n_rx_byte_count = + (p_curr_hwq_rx_stats->n_rx_byte_count - p_prev_hwq_rx_stats->n_rx_byte_count) / delay; + p_prev_hwq_rx_stats->n_rx_pkt_count = + (p_curr_hwq_rx_stats->n_rx_pkt_count - p_prev_hwq_rx_stats->n_rx_pkt_count) / delay; + p_prev_hwq_rx_stats->n_rx_lro_packets = + (p_curr_hwq_rx_stats->n_rx_lro_packets - p_prev_hwq_rx_stats->n_rx_lro_packets) / delay; + p_prev_hwq_rx_stats->n_rx_lro_bytes = + (p_curr_hwq_rx_stats->n_rx_lro_bytes - p_prev_hwq_rx_stats->n_rx_lro_bytes) / delay; + p_prev_hwq_rx_stats->n_rx_gro_packets = + (p_curr_hwq_rx_stats->n_rx_gro_packets - p_prev_hwq_rx_stats->n_rx_gro_packets) / delay; + p_prev_hwq_rx_stats->n_rx_gro_frags = + (p_curr_hwq_rx_stats->n_rx_gro_frags - p_prev_hwq_rx_stats->n_rx_gro_frags) / delay; + p_prev_hwq_rx_stats->n_rx_gro_bytes = + (p_curr_hwq_rx_stats->n_rx_gro_bytes - p_prev_hwq_rx_stats->n_rx_gro_bytes) / delay; + p_prev_hwq_rx_stats->n_rx_buffer_pool_len = p_curr_hwq_rx_stats->n_rx_buffer_pool_len; + p_prev_hwq_rx_stats->n_rx_task_error = + (p_curr_hwq_rx_stats->n_rx_task_error - p_prev_hwq_rx_stats->n_rx_task_error) / delay; + p_prev_hwq_rx_stats->n_rx_drained_at_once_max = p_curr_hwq_rx_stats->n_rx_drained_at_once_max; + p_prev_hwq_rx_stats->n_rx_interrupt_received = (p_curr_hwq_rx_stats->n_rx_interrupt_received - + p_prev_hwq_rx_stats->n_rx_interrupt_received) / + delay; + p_prev_hwq_rx_stats->n_rx_interrupt_requests = (p_curr_hwq_rx_stats->n_rx_interrupt_requests - + p_prev_hwq_rx_stats->n_rx_interrupt_requests) / + delay; + p_prev_hwq_rx_stats->n_rx_cq_moderation_count = p_curr_hwq_rx_stats->n_rx_cq_moderation_count; + p_prev_hwq_rx_stats->n_rx_cq_moderation_period = p_curr_hwq_rx_stats->n_rx_cq_moderation_period; +} + void update_delta_ring_stat(ring_stats_t *p_curr_ring_stats, ring_stats_t *p_prev_ring_stats) { int delay = user_params.interval; if (p_curr_ring_stats && p_prev_ring_stats) { - 
p_prev_ring_stats->n_rx_byte_count = - (p_curr_ring_stats->n_rx_byte_count - p_prev_ring_stats->n_rx_byte_count) / delay; - p_prev_ring_stats->n_rx_pkt_count = - (p_curr_ring_stats->n_rx_pkt_count - p_prev_ring_stats->n_rx_pkt_count) / delay; p_prev_ring_stats->n_tx_retransmits = (p_curr_ring_stats->n_tx_retransmits - p_prev_ring_stats->n_tx_retransmits) / delay; p_prev_ring_stats->n_tx_dropped_wqes = @@ -376,14 +404,6 @@ void update_delta_ring_stat(ring_stats_t *p_curr_ring_stats, ring_stats_t *p_pre p_prev_ring_stats->n_rx_tls_contexts = (p_curr_ring_stats->n_rx_tls_contexts - p_prev_ring_stats->n_rx_tls_contexts) / delay; #endif /* DEFINED_UTLS */ - p_prev_ring_stats->n_rx_interrupt_received = (p_curr_ring_stats->n_rx_interrupt_received - - p_prev_ring_stats->n_rx_interrupt_received) / - delay; - p_prev_ring_stats->n_rx_interrupt_requests = (p_curr_ring_stats->n_rx_interrupt_requests - - p_prev_ring_stats->n_rx_interrupt_requests) / - delay; - p_prev_ring_stats->n_rx_cq_moderation_count = p_curr_ring_stats->n_rx_cq_moderation_count; - p_prev_ring_stats->n_rx_cq_moderation_period = p_curr_ring_stats->n_rx_cq_moderation_period; p_prev_ring_stats->n_tx_dev_mem_allocated = p_curr_ring_stats->n_tx_dev_mem_allocated; p_prev_ring_stats->n_tx_dev_mem_byte_count = (p_curr_ring_stats->n_tx_dev_mem_byte_count - p_prev_ring_stats->n_tx_dev_mem_byte_count) / @@ -400,21 +420,9 @@ void update_delta_cq_stat(cq_stats_t *p_curr_cq_stats, cq_stats_t *p_prev_cq_sta { int delay = user_params.interval; if (p_curr_cq_stats && p_prev_cq_stats) { - p_prev_cq_stats->n_rx_drained_at_once_max = p_curr_cq_stats->n_rx_drained_at_once_max; p_prev_cq_stats->n_rx_pkt_drop = (p_curr_cq_stats->n_rx_pkt_drop - p_prev_cq_stats->n_rx_pkt_drop) / delay; p_prev_cq_stats->n_rx_sw_queue_len = p_curr_cq_stats->n_rx_sw_queue_len; - p_prev_cq_stats->n_buffer_pool_len = p_curr_cq_stats->n_buffer_pool_len; - p_prev_cq_stats->n_rx_lro_packets = - (p_curr_cq_stats->n_rx_lro_packets - 
p_prev_cq_stats->n_rx_lro_packets) / delay; - p_prev_cq_stats->n_rx_lro_bytes = - (p_curr_cq_stats->n_rx_lro_bytes - p_prev_cq_stats->n_rx_lro_bytes) / delay; - p_prev_cq_stats->n_rx_gro_packets = - (p_curr_cq_stats->n_rx_gro_packets - p_prev_cq_stats->n_rx_gro_packets) / delay; - p_prev_cq_stats->n_rx_gro_frags = - (p_curr_cq_stats->n_rx_gro_frags - p_prev_cq_stats->n_rx_gro_frags) / delay; - p_prev_cq_stats->n_rx_gro_bytes = - (p_curr_cq_stats->n_rx_gro_bytes - p_prev_cq_stats->n_rx_gro_bytes) / delay; p_prev_cq_stats->n_rx_consumed_rwqe_count = (p_curr_cq_stats->n_rx_consumed_rwqe_count - p_prev_cq_stats->n_rx_consumed_rwqe_count) / delay; @@ -423,8 +431,6 @@ void update_delta_cq_stat(cq_stats_t *p_curr_cq_stats, cq_stats_t *p_prev_cq_sta p_prev_cq_stats->n_rx_packet_count = (p_curr_cq_stats->n_rx_packet_count - p_prev_cq_stats->n_rx_packet_count) / delay; p_prev_cq_stats->n_rx_max_stirde_per_packet = p_curr_cq_stats->n_rx_max_stirde_per_packet; - p_prev_cq_stats->n_rx_cqe_error = - (p_curr_cq_stats->n_rx_cqe_error - p_prev_cq_stats->n_rx_cqe_error) / delay; } } @@ -482,6 +488,7 @@ void print_ring_stats(ring_instance_block_t *p_ring_inst_arr) if (p_ring_inst_arr[i].b_enabled) { ring_stats_t *p_ring_stats = &p_ring_inst_arr[i].ring_stats; hw_queue_tx_stats_t *p_hwq_tx_stats = &p_ring_inst_arr[i].hwq_tx_stats; + hw_queue_rx_stats_t *p_hwq_rx_stats = &p_ring_inst_arr[i].hwq_rx_stats; printf("======================================================\n"); printf("\tETH=[%u]\n", i); @@ -495,12 +502,12 @@ void print_ring_stats(ring_instance_block_t *p_ring_inst_arr) if (p_hwq_tx_stats->n_tx_tso_pkt_count || p_hwq_tx_stats->n_tx_tso_byte_count) { printf(FORMAT_RING_PACKETS, - "TSO Offload:", p_hwq_tx_stats->n_tx_tso_byte_count / BYTES_TRAFFIC_UNIT, + "TX TSO Offload:", p_hwq_tx_stats->n_tx_tso_byte_count / BYTES_TRAFFIC_UNIT, p_hwq_tx_stats->n_tx_tso_pkt_count, post_fix); } if (p_ring_stats->n_tx_retransmits) { - printf(FORMAT_STATS_64bit, "Retransmissions:", 
p_ring_stats->n_tx_retransmits, + printf(FORMAT_STATS_64bit, "TX Retransmissions:", p_ring_stats->n_tx_retransmits, post_fix); } @@ -509,31 +516,66 @@ void print_ring_stats(ring_instance_block_t *p_ring_inst_arr) post_fix); } - printf(FORMAT_STATS_32bit, "TX buffers in use:", p_ring_stats->n_tx_num_bufs); - printf(FORMAT_STATS_32bit, "TX ZC buffers in use:", p_ring_stats->n_zc_num_bufs); + printf(FORMAT_STATS_32bit, "TX buff in use:", p_ring_stats->n_tx_num_bufs); + printf(FORMAT_STATS_32bit, "TX ZC buff in use:", p_ring_stats->n_zc_num_bufs); if (p_ring_stats->n_tx_dev_mem_allocated) { - printf(FORMAT_STATS_32bit, "Dev Mem Alloc:", p_ring_stats->n_tx_dev_mem_allocated); - printf(FORMAT_RING_DM_STATS, - "Dev Mem Stats:", p_ring_stats->n_tx_dev_mem_byte_count / BYTES_TRAFFIC_UNIT, - p_ring_stats->n_tx_dev_mem_pkt_count, p_ring_stats->n_tx_dev_mem_oob, - post_fix); + printf(FORMAT_STATS_32bit, + "TX Dev Mem Alloc:", p_ring_stats->n_tx_dev_mem_allocated); + printf( + FORMAT_RING_DM_STATS, + "TX Dev Mem Stats:", p_ring_stats->n_tx_dev_mem_byte_count / BYTES_TRAFFIC_UNIT, + p_ring_stats->n_tx_dev_mem_pkt_count, p_ring_stats->n_tx_dev_mem_oob, post_fix); } printf(FORMAT_RING_PACKETS, - "RX Offload:", p_ring_stats->n_rx_byte_count / BYTES_TRAFFIC_UNIT, - p_ring_stats->n_rx_pkt_count, post_fix); + "RX Offload:", p_hwq_rx_stats->n_rx_byte_count / BYTES_TRAFFIC_UNIT, + p_hwq_rx_stats->n_rx_pkt_count, post_fix); - if (p_ring_stats->n_rx_interrupt_requests || p_ring_stats->n_rx_interrupt_received) { - printf(FORMAT_RING_INTERRUPT, "Interrupts:", p_ring_stats->n_rx_interrupt_requests, - p_ring_stats->n_rx_interrupt_received, post_fix); + if (p_hwq_rx_stats->n_rx_lro_packets) { + printf(FORMAT_RING_PACKETS, + "RX LRO Offload:", p_hwq_rx_stats->n_rx_lro_bytes / BYTES_TRAFFIC_UNIT, + p_hwq_rx_stats->n_rx_lro_packets, post_fix); } - if (p_ring_stats->n_rx_cq_moderation_count || p_ring_stats->n_rx_cq_moderation_period) { + + if (p_hwq_rx_stats->n_rx_gro_packets) { + 
printf(FORMAT_RING_PACKETS, + "RX GRO:", p_hwq_rx_stats->n_rx_gro_bytes / BYTES_TRAFFIC_UNIT, + p_hwq_rx_stats->n_rx_gro_packets, post_fix); + printf(FORMAT_STATS_64bit, "RX GRO avg pkt size:", + p_hwq_rx_stats->n_rx_gro_bytes / p_hwq_rx_stats->n_rx_gro_packets, post_fix); + printf(FORMAT_STATS_double, "RX GRO frags/pkt:", + static_cast(p_hwq_rx_stats->n_rx_gro_frags) / + p_hwq_rx_stats->n_rx_gro_packets); + } + + if (p_hwq_rx_stats->n_rx_interrupt_requests || + p_hwq_rx_stats->n_rx_interrupt_received) { + printf(FORMAT_RING_INTERRUPT, + "RX Interrupts:", p_hwq_rx_stats->n_rx_interrupt_requests, + p_hwq_rx_stats->n_rx_interrupt_received, post_fix); + } + + if (p_hwq_rx_stats->n_rx_cq_moderation_count || + p_hwq_rx_stats->n_rx_cq_moderation_period) { printf(FORMAT_RING_MODERATION, - "Moderation:", p_ring_stats->n_rx_cq_moderation_count, - p_ring_stats->n_rx_cq_moderation_period, post_fix); + "RX Moderation:", p_hwq_rx_stats->n_rx_cq_moderation_count, + p_hwq_rx_stats->n_rx_cq_moderation_period); + } + + if (p_hwq_rx_stats->n_rx_drained_at_once_max) { + printf(FORMAT_STATS_32bit, + "RX Drained max:", p_hwq_rx_stats->n_rx_drained_at_once_max); + } + + if (p_hwq_rx_stats->n_rx_task_error) { + printf(FORMAT_STATS_64bit, "RX Task errors:", p_hwq_rx_stats->n_rx_task_error, + post_fix); } + printf(FORMAT_STATS_32bit, + "RX Buffer pool size:", p_hwq_rx_stats->n_rx_buffer_pool_len); + #ifdef DEFINED_UTLS if (p_ring_stats->n_tx_tls_contexts) { printf(FORMAT_STATS_64bit, @@ -568,13 +610,10 @@ void print_cq_stats(cq_instance_block_t *p_cq_inst_arr) printf("\tCQ=[%u]\n", i); printf(FORMAT_STATS_64bit, "Packets dropped:", p_cq_stats->n_rx_pkt_drop, post_fix); printf(FORMAT_STATS_32bit, "Packets queue len:", p_cq_stats->n_rx_sw_queue_len); - printf(FORMAT_STATS_32bit, "Drained max:", p_cq_stats->n_rx_drained_at_once_max); - printf(FORMAT_STATS_32bit, "Buffer pool size:", p_cq_stats->n_buffer_pool_len); printf(FORMAT_STATS_64bit, "Packets received:", p_cq_stats->n_rx_packet_count, 
post_fix); printf(FORMAT_STATS_64bit, "Strides received:", p_cq_stats->n_rx_stride_count, post_fix); - printf(FORMAT_STATS_32bit, "CQE errors:", p_cq_stats->n_rx_cqe_error); printf(FORMAT_STATS_64bit, "Consumed rwqes:", p_cq_stats->n_rx_consumed_rwqe_count, post_fix); printf(FORMAT_STATS_32bit, "Max strides/packet:", @@ -585,21 +624,6 @@ void print_cq_stats(cq_instance_block_t *p_cq_inst_arr) printf(FORMAT_STATS_double, "Avg packets/rwqe:", p_cq_stats->n_rx_packet_count / static_cast(p_cq_stats->n_rx_consumed_rwqe_count + 1U)); - if (p_cq_stats->n_rx_lro_packets) { - printf(FORMAT_RING_PACKETS, - "Rx lro:", p_cq_stats->n_rx_lro_bytes / BYTES_TRAFFIC_UNIT, - p_cq_stats->n_rx_lro_packets, post_fix); - } - if (p_cq_stats->n_rx_gro_packets) { - printf(FORMAT_RING_PACKETS, - "Rx GRO:", p_cq_stats->n_rx_gro_bytes / BYTES_TRAFFIC_UNIT, - p_cq_stats->n_rx_gro_packets, post_fix); - printf(FORMAT_STATS_64bit, "Avg GRO packet size:", - p_cq_stats->n_rx_gro_bytes / p_cq_stats->n_rx_gro_packets, post_fix); - printf( - FORMAT_STATS_double, "GRO frags per packet:", - static_cast(p_cq_stats->n_rx_gro_frags) / p_cq_stats->n_rx_gro_packets); - } } } printf("======================================================\n"); @@ -1050,6 +1074,8 @@ void print_ring_deltas(ring_instance_block_t *p_curr_ring_stats, for (int i = 0; i < NUM_OF_SUPPORTED_RINGS; i++) { update_delta_hwq_tx_stat(&p_curr_ring_stats[i].hwq_tx_stats, &p_prev_ring_stats[i].hwq_tx_stats); + update_delta_hwq_rx_stat(&p_curr_ring_stats[i].hwq_rx_stats, + &p_prev_ring_stats[i].hwq_rx_stats); update_delta_ring_stat(&p_curr_ring_stats[i].ring_stats, &p_prev_ring_stats[i].ring_stats); } print_ring_stats(p_prev_ring_stats); @@ -1779,19 +1805,17 @@ void zero_iomux_stats(iomux_stats_t *p_iomux_stats) // memset(p_iomux_stats, 0, sizeof(*p_iomux_stats)); } -void zero_ring_stats(ring_stats_t *p_ring_stats, hw_queue_tx_stats_t *p_hwq_tx_stats) +void zero_ring_stats(ring_stats_t *p_ring_stats, hw_queue_tx_stats_t *p_hwq_tx_stats, + 
hw_queue_rx_stats_t *p_hwq_rx_stats) { memset(p_hwq_tx_stats, 0, sizeof(*p_hwq_tx_stats)); + memset(p_hwq_rx_stats, 0, sizeof(*p_hwq_rx_stats)); - p_ring_stats->n_rx_pkt_count = 0; - p_ring_stats->n_rx_byte_count = 0; p_ring_stats->n_tx_retransmits = 0; #ifdef DEFINED_UTLS p_ring_stats->n_tx_tls_contexts = 0; p_ring_stats->n_rx_tls_contexts = 0; #endif /* DEFINED_UTLS */ - p_ring_stats->n_rx_interrupt_received = 0; - p_ring_stats->n_rx_interrupt_requests = 0; p_ring_stats->n_tx_dropped_wqes = 0; p_ring_stats->n_tx_dev_mem_byte_count = 0; p_ring_stats->n_tx_dev_mem_pkt_count = 0; @@ -1827,7 +1851,8 @@ void zero_counters(sh_mem_t *p_sh_mem) } for (int i = 0; i < NUM_OF_SUPPORTED_RINGS; i++) { zero_ring_stats(&p_sh_mem->ring_inst_arr[i].ring_stats, - &p_sh_mem->ring_inst_arr[i].hwq_tx_stats); + &p_sh_mem->ring_inst_arr[i].hwq_tx_stats, + &p_sh_mem->ring_inst_arr[i].hwq_rx_stats); } for (int i = 0; i < NUM_OF_SUPPORTED_BPOOLS; i++) { zero_bpool_stats(&p_sh_mem->bpool_inst_arr[i].bpool_stats); From aa93da023d50d07337644e1b1df90420d071c530 Mon Sep 17 00:00:00 2001 From: Alexander Grissik Date: Tue, 12 Nov 2024 14:28:01 +0000 Subject: [PATCH 6/7] issue: 4082814 Adding DOCA send failure handling for future TLS It is crucial to revert expected seq num in case of TX drop for TLS offload. Adding this handling now to avoid serious debugging in the future. 
Signed-off-by: Alexander Grissik --- src/core/sock/sockinfo_nvme.cpp | 4 ++-- src/core/sock/sockinfo_nvme.h | 2 +- src/core/sock/sockinfo_tcp.cpp | 8 +++++++- src/core/sock/sockinfo_ulp.cpp | 6 +++--- src/core/sock/sockinfo_ulp.h | 4 ++-- 5 files changed, 15 insertions(+), 9 deletions(-) diff --git a/src/core/sock/sockinfo_nvme.cpp b/src/core/sock/sockinfo_nvme.cpp index d085d2e9d..843a3d044 100644 --- a/src/core/sock/sockinfo_nvme.cpp +++ b/src/core/sock/sockinfo_nvme.cpp @@ -222,9 +222,9 @@ int sockinfo_tcp_ops_nvme::postrouting(pbuf *p, tcp_seg *seg, xlio_send_attr &at return ERR_OK; } -bool sockinfo_tcp_ops_nvme::handle_send_ret(ssize_t ret, tcp_seg *seg) +bool sockinfo_tcp_ops_nvme::handle_send_ret(uint32_t ret, tcp_seg *seg) { - if (ret < 0 && seg) { + if (ret == 0U && seg) { m_expected_seqno -= seg->len; return false; } diff --git a/src/core/sock/sockinfo_nvme.h b/src/core/sock/sockinfo_nvme.h index 18da706a5..e162dbe13 100644 --- a/src/core/sock/sockinfo_nvme.h +++ b/src/core/sock/sockinfo_nvme.h @@ -66,7 +66,7 @@ class sockinfo_tcp_ops_nvme : public sockinfo_tcp_ops { int setsockopt(int __level, int __optname, const void *__optval, socklen_t __optlen) override; ssize_t tx(xlio_tx_call_attr_t &tx_arg) override; int postrouting(struct pbuf *p, struct tcp_seg *seg, xlio_send_attr &attr) override; - bool handle_send_ret(ssize_t ret, struct tcp_seg *seg) override; + bool handle_send_ret(uint32_t ret, struct tcp_seg *seg) override; err_t recv(struct pbuf *p) override; private: diff --git a/src/core/sock/sockinfo_tcp.cpp b/src/core/sock/sockinfo_tcp.cpp index 1bbb164ab..245759872 100644 --- a/src/core/sock/sockinfo_tcp.cpp +++ b/src/core/sock/sockinfo_tcp.cpp @@ -1357,6 +1357,12 @@ err_t sockinfo_tcp::ip_output_doca(struct pbuf *p, struct tcp_seg *seg, void *v_ ret = p_dst->doca_slow_path(p, flags, pcb->mss, p_si_tcp->m_so_ratelimit); } + bool rc = p_si_tcp->m_ops->handle_send_ret(ret, seg); + + if (unlikely(p_si_tcp->m_p_socket_stats && (flags & 
XLIO_TX_PACKET_REXMIT) && rc)) { + ++p_si_tcp->m_p_socket_stats->counters.n_tx_retransmits; + } + if (unlikely(safe_mce_sys().ring_migration_ratio_tx > 0)) { // Condition for cache optimization if (p_dst->try_migrate_ring_tx(p_si_tcp->m_tcp_con_lock.get_lock_base())) { IF_STATS_O(p_si_tcp, p_si_tcp->m_p_socket_stats->counters.n_tx_migrations++); @@ -1457,7 +1463,7 @@ err_t sockinfo_tcp::ip_output(struct pbuf *p, struct tcp_seg *seg, void *v_p_con ? p_dst->fast_send((struct iovec *)lwip_iovec, count, attr) : p_dst->slow_send((struct iovec *)lwip_iovec, count, attr, p_si_tcp->m_so_ratelimit); - rc = p_si_tcp->m_ops->handle_send_ret(ret, seg); + rc = p_si_tcp->m_ops->handle_send_ret(ret >= 0 ? 1U : 0U, seg); if (unlikely(safe_mce_sys().ring_migration_ratio_tx > 0)) { // Condition for cache optimization if (p_dst->try_migrate_ring_tx(p_si_tcp->m_tcp_con_lock.get_lock_base())) { diff --git a/src/core/sock/sockinfo_ulp.cpp b/src/core/sock/sockinfo_ulp.cpp index 9d29f9635..c97b99703 100644 --- a/src/core/sock/sockinfo_ulp.cpp +++ b/src/core/sock/sockinfo_ulp.cpp @@ -75,7 +75,7 @@ int sockinfo_tcp_ops::postrouting(struct pbuf *p, struct tcp_seg *seg, xlio_send } /*virtual*/ -bool sockinfo_tcp_ops::handle_send_ret(ssize_t ret, struct tcp_seg *seg) +bool sockinfo_tcp_ops::handle_send_ret(uint32_t ret, struct tcp_seg *seg) { NOT_IN_USE(ret); NOT_IN_USE(seg); @@ -997,9 +997,9 @@ int sockinfo_tcp_ops_tls::postrouting(struct pbuf *p, struct tcp_seg *seg, xlio_ return 0; } -bool sockinfo_tcp_ops_tls::handle_send_ret(ssize_t ret, struct tcp_seg *seg) +bool sockinfo_tcp_ops_tls::handle_send_ret(uint32_t ret, struct tcp_seg *seg) { - if (ret < 0 && seg) { + if (ret == 0U && seg) { m_expected_seqno -= seg->len; return false; } diff --git a/src/core/sock/sockinfo_ulp.h b/src/core/sock/sockinfo_ulp.h index 21176365e..9e7446dbc 100644 --- a/src/core/sock/sockinfo_ulp.h +++ b/src/core/sock/sockinfo_ulp.h @@ -61,7 +61,7 @@ class sockinfo_tcp_ops { virtual int setsockopt(int __level, int 
__optname, const void *__optval, socklen_t __optlen); virtual ssize_t tx(xlio_tx_call_attr_t &tx_arg); virtual int postrouting(struct pbuf *p, struct tcp_seg *seg, xlio_send_attr &attr); - virtual bool handle_send_ret(ssize_t ret, struct tcp_seg *seg); + virtual bool handle_send_ret(uint32_t ret, struct tcp_seg *seg); virtual err_t recv(struct pbuf *p) { @@ -90,7 +90,7 @@ class sockinfo_tcp_ops_tls : public sockinfo_tcp_ops { int setsockopt(int, int, const void *, socklen_t) override; ssize_t tx(xlio_tx_call_attr_t &tx_arg) override; int postrouting(struct pbuf *p, struct tcp_seg *seg, xlio_send_attr &attr) override; - bool handle_send_ret(ssize_t ret, struct tcp_seg *seg) override; + bool handle_send_ret(uint32_t ret, struct tcp_seg *seg) override; void get_record_buf(mem_buf_desc_t *&buf, uint8_t *&data, bool is_zerocopy); From 49be19af73e471213bcb372c17f0b5ad5bf84777 Mon Sep 17 00:00:00 2001 From: Alexander Grissik Date: Wed, 13 Nov 2024 11:31:34 +0000 Subject: [PATCH 7/7] issue: 4159519 Adding TX interrupt stats Adding statistics for TX interrupt request and receive. 
Signed-off-by: Alexander Grissik --- src/core/dev/hw_queue_tx.cpp | 4 ++++ src/core/util/xlio_stats.h | 2 ++ src/stats/stats_reader.cpp | 13 +++++++++++++ 3 files changed, 19 insertions(+) diff --git a/src/core/dev/hw_queue_tx.cpp b/src/core/dev/hw_queue_tx.cpp index b55173f70..4dcff7a7b 100644 --- a/src/core/dev/hw_queue_tx.cpp +++ b/src/core/dev/hw_queue_tx.cpp @@ -2025,6 +2025,8 @@ bool hw_queue_tx::request_notification() PRINT_DOCA_ERR(hwqtx_logerr, rc, "doca_pe_request_notification"); return false; } + + ++m_hwq_tx_stats.n_tx_interrupt_requests; } m_notification_armed = true; @@ -2038,6 +2040,8 @@ void hw_queue_tx::clear_notification() doca_error_t rc = doca_pe_clear_notification(m_doca_pe.get(), m_notification_handle); if (unlikely(DOCA_IS_ERROR(rc))) { PRINT_DOCA_ERR(hwqtx_logerr, rc, "doca_pe_clear_notification"); + } else { + ++m_hwq_tx_stats.n_tx_interrupt_received; } } else { hwqtx_logwarn("Clear notification attempt on unarmed PE. hw_queue_tx: %p", this); diff --git a/src/core/util/xlio_stats.h b/src/core/util/xlio_stats.h index 1417777c7..ebe48d4cc 100644 --- a/src/core/util/xlio_stats.h +++ b/src/core/util/xlio_stats.h @@ -367,6 +367,8 @@ typedef struct { uint64_t n_tx_byte_count; uint64_t n_tx_tso_pkt_count; uint64_t n_tx_tso_byte_count; + uint64_t n_tx_interrupt_requests; + uint64_t n_tx_interrupt_received; } hw_queue_tx_stats_t; typedef struct { diff --git a/src/stats/stats_reader.cpp b/src/stats/stats_reader.cpp index 194bfa626..09786f928 100644 --- a/src/stats/stats_reader.cpp +++ b/src/stats/stats_reader.cpp @@ -352,6 +352,12 @@ void update_delta_hwq_tx_stat(hw_queue_tx_stats_t *p_curr_hwq_tx_stats, p_prev_hwq_tx_stats->n_tx_tso_byte_count = (p_curr_hwq_tx_stats->n_tx_tso_byte_count - p_prev_hwq_tx_stats->n_tx_tso_byte_count) / delay; + p_prev_hwq_tx_stats->n_tx_interrupt_received = (p_curr_hwq_tx_stats->n_tx_interrupt_received - + p_prev_hwq_tx_stats->n_tx_interrupt_received) / + delay; + p_prev_hwq_tx_stats->n_tx_interrupt_requests = 
(p_curr_hwq_tx_stats->n_tx_interrupt_requests - + p_prev_hwq_tx_stats->n_tx_interrupt_requests) / + delay; } void update_delta_hwq_rx_stat(hw_queue_rx_stats_t *p_curr_hwq_rx_stats, @@ -519,6 +525,13 @@ void print_ring_stats(ring_instance_block_t *p_ring_inst_arr) printf(FORMAT_STATS_32bit, "TX buff in use:", p_ring_stats->n_tx_num_bufs); printf(FORMAT_STATS_32bit, "TX ZC buff in use:", p_ring_stats->n_zc_num_bufs); + if (p_hwq_tx_stats->n_tx_interrupt_requests || + p_hwq_tx_stats->n_tx_interrupt_received) { + printf(FORMAT_RING_INTERRUPT, + "TX Interrupts:", p_hwq_tx_stats->n_tx_interrupt_requests, + p_hwq_tx_stats->n_tx_interrupt_received, post_fix); + } + if (p_ring_stats->n_tx_dev_mem_allocated) { printf(FORMAT_STATS_32bit, "TX Dev Mem Alloc:", p_ring_stats->n_tx_dev_mem_allocated);