diff --git a/conf/dpvs.conf.items b/conf/dpvs.conf.items index df579f7f..29eda38f 100644 --- a/conf/dpvs.conf.items +++ b/conf/dpvs.conf.items @@ -43,6 +43,11 @@ netif_defs { ! mtu 1500 <1500,0-9000> ! promisc_mode ! kni_name dpdk0.kni + ! kni_isolate on + ! kni_ipaddress { + ! ipv4 + ! ipv6 + ! } } device dpdk1 { @@ -59,6 +64,11 @@ netif_defs { ! mtu 1500 ! promisc_mode ! kni_name dpdk1.kni + ! kni_isolate on + ! kni_ipaddress { + ! ipv4 + ! ipv6 + ! } } device bond0 { diff --git a/conf/dpvs.conf.sample b/conf/dpvs.conf.sample index f9baf3f7..e8c0c0a6 100644 --- a/conf/dpvs.conf.sample +++ b/conf/dpvs.conf.sample @@ -41,6 +41,11 @@ netif_defs { ! mtu 1500 ! promisc_mode kni_name dpdk0.kni + kni_isolate on + kni_ipaddress { + ipv4 192.168.0.1 + ipv6 0000:0000:0000:0000:0000:FFFF:C0A8:0001 + } } device dpdk1 { @@ -61,6 +66,11 @@ netif_defs { ! mtu 1500 ! promisc_mode kni_name dpdk1.kni + kni_isolate on + kni_ipaddress { + ipv4 192.168.0.2 + ipv6 0000:0000:0000:0000:0000:FFFF:C0A8:0002 + } } ! bonding bond0 { diff --git a/conf/dpvs.conf.single-nic.sample b/conf/dpvs.conf.single-nic.sample index 40a34dd9..5a82b8ff 100644 --- a/conf/dpvs.conf.single-nic.sample +++ b/conf/dpvs.conf.single-nic.sample @@ -40,6 +40,11 @@ netif_defs { ! mtu 1500 ! promisc_mode kni_name dpdk0.kni + kni_isolate on + kni_ipaddress { + ipv4 192.168.0.2 + ipv6 0000:0000:0000:0000:0000:FFFF:C0A8:0002 + } } } diff --git a/include/kni.h b/include/kni.h index d9fcd95d..8266a2a0 100644 --- a/include/kni.h +++ b/include/kni.h @@ -29,6 +29,28 @@ #include #include "netif.h" +/* + * NOTE: + * 1. local ip filter will make input set fixed on ixgbe/i40e. + * 2. dip filter is not supported by ixgbe and i40e under the + * premise of local ip filter. + * 3. use dip + dport + dst_port_mask filters to cover port range + * [0-65535] to replace dip filter on ixgbe/i40e. + * 4. kni fdir filter support tcp and udp, icmp not supported. + * 5. if (fdir_conf.mask.dst_port_mask & pkt.dport) equal to an + * element in the port_base_array, pkt will match kni fdir + * filter and redirected to kni rx queue. + * 6. rss rte_flow to specfic rss queue region should with lower + * priority than lip and kni fdir filter. + */ +typedef struct kni_fdir { + bool init_success; /* kni fdir init flag */ + uint16_t filter_mask; /* kni filter's port mask */ + uint16_t port_base_num; /* kni port_base num */ + __be16 port_base_array[DPVS_MAX_LCORE]; /* kni port_base set */ + uint32_t soft_id_array[DPVS_MAX_LCORE][MAX_FDIR_PROTO]; +} dp_vs_kni_fdir; + /* * @dev - real device kni attach to. * @kniname - optional, kni device name or auto generate. @@ -38,6 +60,11 @@ int kni_add_dev(struct netif_port *dev, const char *kniname); int kni_del_dev(struct netif_port *dev); int kni_init(void); +int kni_fdir_init(void); +int kni_fdir_filter_add(struct netif_port *dev, + const union inet_addr *kni_ip, + int af); + static inline bool kni_dev_exist(const struct netif_port *dev) { return dev->kni.kni ? 
true : false; @@ -51,4 +78,21 @@ static inline void kni_handle_request(const struct netif_port *dev) rte_kni_handle_request(dev->kni.kni); } +static inline bool kni_fwd_valid(const struct netif_port *dev, + kni_fwd_mode_t fwd_mode) +{ + if (fwd_mode == KNI_FWD_MODE_DEFAULT) { + return true; + } + + if ((fwd_mode == KNI_FWD_MODE_ISOLATE_RX) + && (dev->kni.fwd_mode == fwd_mode) + && (dev->kni.rx_queue_id != NETIF_QUEUE_ID_INVALID)) + { + return true; + } + + return false; +} + #endif /* __DPVS_KNI_H__ */ diff --git a/include/netdev_flow.h b/include/netdev_flow.h new file mode 100644 index 00000000..f44f40a5 --- /dev/null +++ b/include/netdev_flow.h @@ -0,0 +1,163 @@ +/* + * DPVS is a software load balancer (Virtual Server) based on DPDK. + * + * Copyright (C) 2020 ByteDance (www.bytedance.com). + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * Copyright (C) 2020 ByteDance (www.bytedance.com). + * All Rights Reserved. + * + * wanlebing@bytedance.com, 12/2020. + */ +#ifndef __NETDEV_FLOW_H__ +#define __NETDEV_FLOW_H__ + +#include +#include + +#include "conf/common.h" +#include "netif.h" + +#ifndef NETDEV +#define NETDEV +#define RTE_LOGTYPE_NETDEV RTE_LOGTYPE_USER1 +#endif + +#define DEFAULT_MAX_PATTERNS 6 +#define DEFAULT_MAX_ACTIONS 6 + +#define NETDEV_FLOW_DEFAULT_MARK_ID 1 +#define NETDEV_FLOW_DEFAULT_RSS_LEVEL 0 + +/* fuzzy match level with signature mode */ +#define DEFAULT_FUZZY_SPEC 2 +#define DEFAULT_FUZZY_LAST 0xfffffff0 +#define DEFAULT_FUZZY_MASK 0xffffffff + +#define NETDEV_IXGBE_DRIVER_NAME "ixgbe" +#define NETDEV_I40E_DRIVER_NAME "i40e" +#define NETDEV_MLNX_DRIVER_NAME "net_mlx5" + +/* flags for netdev flow */ +#define NETDEV_FLOW_F_SIP_FIELD (1 << 0) +#define NETDEV_FLOW_F_DIP_FIELD (1 << 1) +#define NETDEV_FLOW_F_SPORT_FIELD (1 << 2) +#define NETDEV_FLOW_F_DPORT_FIELD (1 << 3) +#define NETDEV_FLOW_F_L3_PROTO_FIELD (1 << 4) +#define NETDEV_FLOW_F_L4_PROTO_FIELD (1 << 5) + +/* + * assign static priority on various flow + * the smaller the priority higher on mellanox nic. 
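+ * e.g. FILTER flows (priority 1) are matched ahead of the RSS
+ * queue-region flow (priority 3), so kni/lip traffic is steered
+ * before the generic RSS rule takes effect.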
+ */ +enum netdev_flow_priority { + NETDEV_FLOW_PRIORITY_NONE = 0, + NETDEV_FLOW_PRIORITY_FILTER, + NETDEV_FLOW_PRIORITY_VXLAN, + NETDEV_FLOW_PRIORITY_RSS, +}; + +/* move to next acts index, abort on failure */ +#define get_next_acts_index(index) do { \ + assert((index) < DEFAULT_MAX_ACTIONS - 1); \ + (index)++; \ + } while(0) + +/* move to next patts index, abort on failure */ +#define get_next_patts_index(index) do { \ + assert((index) < DEFAULT_MAX_PATTERNS - 1); \ + (index)++; \ + } while(0) + +/* netdev rss flow init */ +#define NETDEV_RSS_FLOW_INIT(flow, port) do { \ + flow->type = NETDEV_FLOW_TYPE_RSS; \ + flow->port_id = port->id; \ + flow->flow_handle = NULL; \ + flow->hw_offloaded = false; \ + flow->flow_id = netdev_flow_hash(flow); \ + } while(0) + +enum netdev_flow_type { + NETDEV_FLOW_TYPE_RSS, + NETDEV_FLOW_TYPE_FILTER, + NETDEV_FLOW_TYPE_MAX +}; + +union netdev_flow_query { + struct rte_flow_query_count count; + struct rte_flow_action_queue queue; + struct rte_flow_action_rss rss_conf; +}; + +struct netdev_flow_stats { + uint64_t n_pkts; + uint64_t n_bytes; +}; + +struct netdev_flow { + enum netdev_flow_type type; + portid_t port_id; + + /* flow meta data */ + union { + struct { + queueid_t rss_queues[NETIF_MAX_QUEUES]; + uint32_t rss_queue_num; + } rss_info; + struct { + queueid_t queue_id; + uint16_t sport; + uint16_t dport; + uint8_t l3_proto; + uint8_t l4_proto; + union inet_addr saddr; + union inet_addr daddr; + } filter_info; + } data; + + uint32_t flags; + /* unique flow id */ + uint32_t flow_id; + + /* pointer to rte flow in hardware */ + struct rte_flow *flow_handle; + bool hw_offloaded; + struct list_head list; + struct netdev_flow_stats stats; +}; + +/* l4_proto used by i40e only */ +int netdev_flow_add_kni_filter(struct netif_port *port, + const union inet_addr *kni_ip, + queueid_t kni_queue_id, + uint8_t l3_proto, + uint8_t l4_proto); +/* called on dpvs initial */ +int netdev_flow_add_rss_filter(struct netif_port *port); + +/* + * NOTE: netdev flow api, operate flow on initial or terminal, + * need to use lock on rte_flow_* in case of concurrent. 
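+ *       invocations; netdev_flow_add/del/query/flush take
+ *       port->dev_lock internally.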
+ */ +int netdev_flow_init(struct netif_port *port); +int netdev_flow_add(struct netif_port *port, + struct netdev_flow *netdev_flow); +int netdev_flow_del(struct netif_port *port, + struct netdev_flow *netdev_flow); +int netdev_flow_query(struct netif_port *port, + struct netdev_flow *netdev_flow, + union netdev_flow_query *query); +int netdev_flow_flush(struct netif_port *port); + +#endif /* __NETDEV_FLOW_H__ */ diff --git a/include/netif.h b/include/netif.h index cf6a4acc..43dd6c41 100644 --- a/include/netif.h +++ b/include/netif.h @@ -49,6 +49,10 @@ enum { /* max tx/rx queue number for each nic */ #define NETIF_MAX_QUEUES 16 +/* invalid queue id for initial val */ +#define NETIF_QUEUE_ID_INVALID -1 +/* max addr count on kni interface */ +#define NETIF_KNI_ADDR_MAX_NUM 32 /* max nic number used in the program */ #define NETIF_MAX_PORTS 4096 /* maximum pkt number at a single burst */ @@ -73,6 +77,8 @@ enum { #define NETIF_LCORE_ID_INVALID 0xFF +#define MAX_FDIR_PROTO 2 + /************************* lcore conf ***************************/ struct rx_partner; @@ -165,13 +171,28 @@ typedef enum { PORT_TYPE_INVAL, } port_type_t; +typedef enum { + KNI_FWD_MODE_DEFAULT, + KNI_FWD_MODE_ISOLATE_RX, + KNI_FWD_MODE_MAX, +} kni_fwd_mode_t; + +struct kni_addr { + int af; + union inet_addr addr; +} __rte_cache_aligned; + struct netif_kni { char name[IFNAMSIZ]; struct rte_kni *kni; struct ether_addr addr; struct dpvs_timer kni_rtnl_timer; + int ip_addr_cnt; /* total count of kni addrs */ int kni_rtnl_fd; struct rte_ring *rx_ring; + struct kni_addr ip[NETIF_KNI_ADDR_MAX_NUM]; /* ipv4 or ipv6 */ + queueid_t rx_queue_id; /* only one kni queue supported by default */ + kni_fwd_mode_t fwd_mode; /* kni fwd mode: default or isolated rx */ } __rte_cache_aligned; union netif_bond { @@ -227,6 +248,12 @@ struct netif_hw_addr_list { int count; }; +struct flow_info { + struct list_head flow_list; /* store rte flow related on port */ + int flow_cnt; /* current flow count */ + int flow_err; /* error flow count */ +}; + struct netif_port { char name[IFNAMSIZ]; /* device name */ portid_t id; /* device id */ @@ -254,6 +281,9 @@ struct netif_port { struct vlan_info *vlan_info; /* VLANs info for real device */ struct netif_tc tc; /* traffic control */ struct netif_ops *netif_ops; + int rss_queue_num; + queueid_t rss_queues[NETIF_MAX_QUEUES]; + struct flow_info hw_flow_info; /* hardware rte flow on port */ } __rte_cache_aligned; /**************************** lcore API *******************************/ @@ -316,6 +346,7 @@ int netif_ctrl_term(void); /* netif ctrl plane cleanup */ void netif_cfgfile_init(void); void netif_keyword_value_init(void); void install_netif_keywords(void); +lcoreid_t netif_get_kni_lcore_id(void); void kni_ingress(struct rte_mbuf *mbuf, struct netif_port *dev); static inline void *netif_priv(struct netif_port *dev) @@ -339,4 +370,10 @@ static inline uint16_t dpvs_rte_eth_dev_count(void) extern bool dp_vs_fdir_filter_enable; +extern bool dp_vs_kni_isolate_rx_enable; + +typedef int (* netif_filter_op_func)(int af, struct netif_port *dev, lcoreid_t cid, + const union inet_addr *dip, __be16 dport, + uint32_t filter_id[], bool add); + #endif /* __DPVS_NETIF_H__ */ diff --git a/src/kni.c b/src/kni.c index 15b61890..0d2c7dd6 100644 --- a/src/kni.c +++ b/src/kni.c @@ -35,6 +35,7 @@ #include "dpdk.h" #include "netif.h" #include "netif_addr.h" +#include "netdev_flow.h" #include "kni.h" #define Kni /* KNI is defined */ @@ -46,6 +47,11 @@ static struct rte_mempool *kni_mbuf_pool[DPVS_MAX_SOCKET]; +extern 
netif_filter_op_func g_netif_filter_func; + +/* kni fdir info */ +static struct kni_fdir kni_fdir = { .init_success = false, .port_base_num = 0 }; + static void kni_fill_conf(const struct netif_port *dev, const char *ifname, struct rte_kni_conf *conf) { @@ -404,6 +410,130 @@ int kni_del_dev(struct netif_port *dev) return EDPVS_OK; } +/* + * @brief: init kni fdir, set dip + proto + dport + dst_port_mask + * filters for kni_ip to replace dip + proto filter + * + * @return EDPVS_OK on success, EDPVS_INVAL on failure + */ +int kni_fdir_init(void) +{ + int i, shift; + uint16_t port_base; + uint8_t total_nlcore; + uint8_t expected_nlcore; + uint64_t total_lcore_mask; + + /* get total lcore mask, including worker only */ + netif_get_slave_lcores(&total_nlcore, &total_lcore_mask); + + RTE_LOG(INFO, Kni, "[%s] total_nlcore: %d total_lcore_mask: %llx\n", + __func__, total_nlcore, total_lcore_mask); + + for (shift = 0; (0x1 << shift) < total_nlcore; shift++) + ; + /* exceed maxnum lcore num */ + if (shift >= 16) { + RTE_LOG(ERR, Kni, "[%s] invalid shift: %d total_nlcore: %d\n", + __func__, shift, total_nlcore); + return EDPVS_INVAL; + } + + port_base = 0; + expected_nlcore = (0x1 << shift); + kni_fdir.filter_mask = ~((~0x0) << shift); + + /* specify the num of filters to cover port range [0-65535] */ + for (i = 0; i < expected_nlcore; i++) { + kni_fdir.port_base_array[kni_fdir.port_base_num] = htons(port_base); + memset(kni_fdir.soft_id_array[kni_fdir.port_base_num], 0, MAX_FDIR_PROTO); + RTE_LOG(INFO, Kni, "[%s] port_base_array[%d]: %d\n", + __func__, kni_fdir.port_base_num, htons(port_base)); + kni_fdir.port_base_num++; + port_base++; + } + + RTE_LOG(INFO, Kni, "[%s] kni fdir init success, port_base_num: %d\n", + __func__, kni_fdir.port_base_num); + + /* set kni fdir init flag */ + kni_fdir.init_success = true; + + return EDPVS_OK; +} + +/* + * @brief: add kni fdir, set high priority for bgp/hc + * which will be redirected to kni core on hardware + * + * @param af - AF_INET or AF_INET6 on kni ip + * @param dev - lan or wan link to add kni fdir + * @param kni_ip - kni ip address + * + * @return EDPVS_OK on success, negative on failure + */ +int kni_fdir_filter_add(struct netif_port *dev, + const union inet_addr *kni_ip, + int af) +{ + int err = EDPVS_OK; + int i = 0; + char dst[64]; + union inet_addr addr; + lcoreid_t kni_lcore_id = netif_get_kni_lcore_id(); + + /* params assert */ + if (unlikely(dev == NULL || kni_ip == NULL + || kni_lcore_id == 0)) { + RTE_LOG(ERR, Kni, "[%s] invalid kni fdir params info on port\n", __func__); + return EDPVS_INVAL; + } + + if (unlikely(!kni_fdir.init_success || kni_fdir.port_base_num == 0 + || g_netif_filter_func == NULL)) { + RTE_LOG(ERR, Kni, "[%s] kni fdir info uninitialized on port\n", __func__); + return EDPVS_INVAL; + } + + /* signature mode is required for ipv6 filter on ixgbe */ + if (af == AF_INET6 + && dev->dev_conf.fdir_conf.mode != RTE_FDIR_MODE_SIGNATURE + && strstr(dev->dev_info.driver_name, NETDEV_IXGBE_DRIVER_NAME) != NULL) { + RTE_LOG(ERR, Kni, "[%s] kni ipv6 fdir filter require signature mode on ixgbe\n", __func__); + return EDPVS_INVAL; + } + + for (i = 0; i < kni_fdir.port_base_num; i++) { + err = g_netif_filter_func(af, dev, + kni_lcore_id, kni_ip, + kni_fdir.port_base_array[i], + kni_fdir.soft_id_array[i], + true); + if (unlikely(err != EDPVS_OK)) { + RTE_LOG(ERR, Kni, "[%s] fail to add kni fdir %s filter " + "on port: %s for port_base: %d\n", + __func__, af == AF_INET ? 
"ipv4" : "ipv6", + dev->name, kni_fdir.port_base_array[i]); + return EDPVS_NOTSUPP; + } + } + + if (af == AF_INET) { + addr.in.s_addr = kni_ip->in.s_addr; + RTE_LOG(INFO, Kni, "[%s] success to add kni fdir ipv4 filter " + "on port: %s for kni_ip: %s\n", + __func__, dev->name, + inet_ntop(AF_INET, &addr, dst, sizeof(dst)) ? dst: ""); + } else { + inet_ntop(AF_INET6, kni_ip, dst, sizeof(dst)); + RTE_LOG(INFO, Kni, "[%s] success to add kni fdir ipv6 filter " + "on port: %s for kni_ip: %s\n", + __func__, dev->name, dst); + } + + return EDPVS_OK; +} + int kni_init(void) { int i; diff --git a/src/main.c b/src/main.c index 6e5021d4..db52f0a6 100644 --- a/src/main.c +++ b/src/main.c @@ -38,6 +38,7 @@ #include "ipv4.h" #include "neigh.h" #include "sa_pool.h" +#include "kni.h" #include "ipvs/ipvs.h" #include "cfgfile.h" #include "ip_tunnel.h" @@ -90,6 +91,8 @@ extern int log_slave_init(void); inet_init, inet_term), \ DPVS_MODULE(MODULE_SA_POOL, "sa_pool", \ sa_pool_init, sa_pool_term), \ + DPVS_MODULE(MODULE_KNI_FDIR, "kni_fdir", \ + kni_fdir_init, NULL), \ DPVS_MODULE(MODULE_IP_TUNNEL, "tunnel", \ ip_tunnel_init, ip_tunnel_term), \ DPVS_MODULE(MODULE_VS, "ipvs", \ diff --git a/src/netdev_flow.c b/src/netdev_flow.c new file mode 100644 index 00000000..c068786a --- /dev/null +++ b/src/netdev_flow.c @@ -0,0 +1,1063 @@ +/* + * DPVS is a software load balancer (Virtual Server) based on DPDK. + * + * Copyright (C) 2020 ByteDance (www.bytedance.com). + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * Copyright (C) 2020 ByteDance (www.bytedance.com). + * All Rights Reserved. + * + * wanlebing@bytedance.com, 12/2020. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "netdev_flow.h" + +/* init seed for unique flow id */ +static uint32_t init_val = 0xdeadbeef; + +/* + * rte flow priority use for mellanox nic + * filter priority should take over other + * we will see something wrong if the priority swapped. 
+ */ +static const uint32_t priority_map[NETDEV_FLOW_TYPE_MAX] = { + [NETDEV_FLOW_TYPE_RSS] = NETDEV_FLOW_PRIORITY_RSS, + [NETDEV_FLOW_TYPE_FILTER] = NETDEV_FLOW_PRIORITY_FILTER, +}; + +/* + * @brief print detailed err msg from rte_flow_* api + */ +static void +netdev_flow_print_err_msg(struct rte_flow_error *error) +{ + static const char *const errstrlist[] = { + [RTE_FLOW_ERROR_TYPE_NONE] = "no error", + [RTE_FLOW_ERROR_TYPE_UNSPECIFIED] = "cause unspecified", + [RTE_FLOW_ERROR_TYPE_HANDLE] = "flow rule (handle)", + [RTE_FLOW_ERROR_TYPE_ATTR_GROUP] = "group field", + [RTE_FLOW_ERROR_TYPE_ATTR_PRIORITY] = "priority field", + [RTE_FLOW_ERROR_TYPE_ATTR_INGRESS] = "ingress field", + [RTE_FLOW_ERROR_TYPE_ATTR_EGRESS] = "egress field", + [RTE_FLOW_ERROR_TYPE_ATTR_TRANSFER] = "transfer field", + [RTE_FLOW_ERROR_TYPE_ATTR] = "attributes structure", + [RTE_FLOW_ERROR_TYPE_ITEM_NUM] = "pattern length", + [RTE_FLOW_ERROR_TYPE_ITEM_SPEC] = "item specification", + [RTE_FLOW_ERROR_TYPE_ITEM_LAST] = "item specification range", + [RTE_FLOW_ERROR_TYPE_ITEM_MASK] = "item specification mask", + [RTE_FLOW_ERROR_TYPE_ITEM] = "specific pattern item", + [RTE_FLOW_ERROR_TYPE_ACTION_NUM] = "number of actions", + [RTE_FLOW_ERROR_TYPE_ACTION_CONF] = "action configuration", + [RTE_FLOW_ERROR_TYPE_ACTION] = "specific action", + }; + + const char *errstr; + char buf[32]; + int err = rte_errno; + + if ((unsigned int)error->type >= RTE_DIM(errstrlist) || + !errstrlist[error->type]) + errstr = "unknown type"; + else + errstr = errstrlist[error->type]; + + RTE_LOG(ERR, NETDEV,"Caught error type %d (%s): %s%s: %s\n", + error->type, errstr, + error->cause ? (snprintf(buf, sizeof(buf), "cause: %p, ", + error->cause), buf) : "", + error->message ? error->message : "(no stated reason)", + rte_strerror(err)); +} + +/* + * @brief generate a unique flow id + * + * @return crc hash on flow key + */ +static inline uint32_t +netdev_flow_hash(struct netdev_flow *flow) +{ + return rte_hash_crc((void *)flow, + offsetof(struct netdev_flow, flow_id), + init_val); +} + +/* + * @brief find netdev_flow with flow id + * + * @return netdev flow associated with flow id + */ +static struct netdev_flow * +netdev_flow_lookup_by_uuid(struct netif_port *port, uint32_t uuid) +{ + struct netdev_flow *flow; + + rte_rwlock_write_lock(&port->dev_lock); + + /* lookup with flow list on port */ + list_for_each_entry(flow, &port->hw_flow_info.flow_list, list) { + if (flow->flow_id == uuid) { + rte_rwlock_write_unlock(&port->dev_lock); + return flow; + } + } + + rte_rwlock_write_unlock(&port->dev_lock); + return NULL; +} + +/* + * @brief add ingress flow attr + * + * @return void + */ +static inline void +netdev_flow_add_ingress_attribute(struct netdev_flow *netdev_flow, + struct rte_flow_attr *attr) +{ + struct netif_port *port = netif_port_get(netdev_flow->port_id); + if (unlikely(port == NULL)) { + RTE_LOG(ERR, NETDEV, + "[%s]: invalid netdev info on port\n", + __func__); + return; + } + + attr->ingress = 1; + /* priority supported by mellanox only */ + if (strstr(port->dev_info.driver_name, NETDEV_MLNX_DRIVER_NAME) != NULL) { + attr->priority = priority_map[netdev_flow->type]; + } +} + +/* + * @brief add fuzzy pattern + * refer to RTE_FLOW_ITEM_TYPE_FUZZY and signature_match(), + * Threshold 0 means perfect match (no fuzziness), while threshold + * 0xffffffff means fuzziest match. 
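+ * the fuzzy item is what ipv6 filters on ixgbe rely on, since they
+ * are only supported in signature (fuzzy) match mode, not perfect mode.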
+ * + * @return void + */ +static inline void +netdev_flow_add_fuzzy_pattern(struct netif_port *port, + struct rte_flow_item patts[], + int *index) +{ + static struct rte_flow_item_fuzzy fuzzy_spec = { .thresh = DEFAULT_FUZZY_SPEC }; + static struct rte_flow_item_fuzzy fuzzy_last = { .thresh = DEFAULT_FUZZY_LAST }; + static struct rte_flow_item_fuzzy fuzzy_mask = { .thresh = DEFAULT_FUZZY_MASK }; + + patts[*index].type = RTE_FLOW_ITEM_TYPE_FUZZY; + patts[*index].spec = &fuzzy_spec; + patts[*index].last = &fuzzy_last; + patts[*index].mask = &fuzzy_mask; +} + +/* + * @brief add mark action + * + * @return void + */ +static inline void +netdev_flow_add_mark_action(struct netif_port *port, + struct rte_flow_action acts[], + int *index) +{ + static struct rte_flow_action_mark mark = { .id = NETDEV_FLOW_DEFAULT_MARK_ID}; + + /* mark action not supported by ixgbe */ + if (strstr(port->dev_info.driver_name, NETDEV_IXGBE_DRIVER_NAME) == NULL) { + get_next_acts_index(*index); + acts[*index].type = RTE_FLOW_ACTION_TYPE_MARK; + acts[*index].conf = &mark; + } +} + +/* + * @brief add count action + * + * @return void + */ +static inline void +netdev_flow_add_count_action(struct netif_port *port, + struct rte_flow_action acts[], + int *index) +{ + static struct rte_flow_action_count count; + + /* count action supported by mellanox only */ + if (strstr(port->dev_info.driver_name, NETDEV_MLNX_DRIVER_NAME) != NULL) { + get_next_acts_index(*index); + acts[*index].type = RTE_FLOW_ACTION_TYPE_COUNT; + acts[*index].conf = &count; + } +} + +/* + * @brief add rss patterns + * + * @return void + */ +static inline void +netdev_flow_add_rss_patterns(struct netdev_flow *netdev_flow, + struct rte_flow_item patts[]) +{ + int index = 0; + struct netif_port *port = netif_port_get(netdev_flow->port_id); + if (unlikely(port == NULL)) { + RTE_LOG(ERR, NETDEV, + "[%s]: invalid netdev info on port\n", + __func__); + return; + } + + static struct rte_flow_item_eth eth_spec; + static struct rte_flow_item_eth eth_mask; + memset(ð_spec, 0, sizeof(struct rte_flow_item_eth)); + memset(ð_mask, 0, sizeof(struct rte_flow_item_eth)); + + /* + * rss pattern not supported on i40e + * TODO: without rss pattern maybe perf better + */ + if (strstr(port->dev_info.driver_name, NETDEV_I40E_DRIVER_NAME) == NULL) { + patts[index].type = RTE_FLOW_ITEM_TYPE_ETH; + patts[index].spec = ð_spec; + patts[index].mask = ð_mask; + } + + get_next_patts_index(index); + patts[index].type = RTE_FLOW_ITEM_TYPE_END; +} + +/* + * @brief add rss actions + * + * @return void + */ +static inline void +netdev_flow_add_rss_actions(portid_t port_id, + struct netdev_flow *netdev_flow, + struct rte_flow_action acts[]) +{ + int i, index = 0; + struct netif_port *port; + static struct rte_flow_action_rss rss; + static uint16_t queue[RTE_MAX_QUEUES_PER_PORT]; + + port = netif_port_get(port_id); + if (unlikely(port == NULL)) { + RTE_LOG(ERR, NETDEV, + "[%s]: invalid netdev info on port\n", + __func__); + return; + } + + /* queue region exclude kni and ha queue */ + for (i = 0; i < port->rss_queue_num; i++) { + queue[i] = port->rss_queues[i]; + } + + rss = (struct rte_flow_action_rss) { + .func = RTE_ETH_HASH_FUNCTION_DEFAULT, + .types = port->dev_conf.rx_adv_conf.rss_conf.rss_hf, + .key_len = port->dev_conf.rx_adv_conf.rss_conf.rss_key_len, + .queue_num = port->rss_queue_num, + .key = port->dev_conf.rx_adv_conf.rss_conf.rss_key, + .queue = queue, + .level = NETDEV_FLOW_DEFAULT_RSS_LEVEL, + }; + + acts[index].type = RTE_FLOW_ACTION_TYPE_RSS; + acts[index].conf = &rss; + + /* 
TODO: remove count action if perf degraded */ + netdev_flow_add_count_action(port, acts, &index); + + get_next_acts_index(index); + acts[index].type = RTE_FLOW_ACTION_TYPE_END; +} + +/* + * @brief add l3/l4 filter patterns + * + * @return void + */ +static inline void +netdev_flow_add_filter_patterns(struct netdev_flow *netdev_flow, + struct rte_flow_item patts[]) +{ + int index = 0; + struct netif_port *port; + static struct rte_flow_item_eth eth_spec; + static struct rte_flow_item_eth eth_mask; + static struct rte_flow_item_ipv4 ip4_spec; + static struct rte_flow_item_ipv4 ip4_mask; + static struct rte_flow_item_ipv6 ip6_spec; + static struct rte_flow_item_ipv6 ip6_mask; + static struct rte_flow_item_tcp tcp_spec; + static struct rte_flow_item_tcp tcp_mask; + static struct rte_flow_item_udp udp_spec; + static struct rte_flow_item_udp udp_mask; + + memset(ð_spec, 0, sizeof(struct rte_flow_item_eth)); + memset(ð_mask, 0, sizeof(struct rte_flow_item_eth)); + + port = netif_port_get(netdev_flow->port_id); + if (unlikely(port == NULL)) { + RTE_LOG(ERR, NETDEV, + "[%s]: invalid netdev info on port\n", + __func__); + return; + } + + /* fuzzy pattern is used by ipv6 flows on ixgbe */ + if (strstr(port->dev_info.driver_name, NETDEV_IXGBE_DRIVER_NAME) != NULL + && netdev_flow->data.filter_info.l3_proto == AF_INET6) { + netdev_flow_add_fuzzy_pattern(port, patts, &index); + } else { + /* mellanox and i40e fall into here */ + patts[index].type = RTE_FLOW_ITEM_TYPE_ETH; + patts[index].spec = ð_spec; + patts[index].mask = ð_mask; + } + + /* Fill inner L3 item */ + switch (netdev_flow->data.filter_info.l3_proto) { + case AF_INET: + get_next_patts_index(index); + memset(&ip4_spec, 0, sizeof(struct rte_flow_item_ipv4)); + memset(&ip4_mask, 0, sizeof(struct rte_flow_item_ipv4)); + + /* set dst ipv4 */ + ip4_spec.hdr.dst_addr = netdev_flow->data.filter_info.daddr.in.s_addr; + memset(&ip4_mask.hdr.dst_addr, 0xff, sizeof(ip4_mask.hdr.dst_addr)); + + patts[index].type = RTE_FLOW_ITEM_TYPE_IPV4; + patts[index].spec = &ip4_spec; + patts[index].mask = &ip4_mask; + break; + case AF_INET6: + get_next_patts_index(index); + memset(&ip6_spec, 0, sizeof(struct rte_flow_item_ipv6)); + memset(&ip6_mask, 0, sizeof(struct rte_flow_item_ipv6)); + + /* set src ipv6 */ + if (netdev_flow->flags & NETDEV_FLOW_F_SIP_FIELD) { + rte_memcpy(ip6_spec.hdr.src_addr, netdev_flow->data.filter_info.saddr.in6.s6_addr, + sizeof(ip6_spec.hdr.src_addr)); + memset(ip6_mask.hdr.src_addr, 0xff, sizeof(ip6_mask.hdr.src_addr)); + } + + /* set dst ipv6 */ + rte_memcpy(ip6_spec.hdr.dst_addr, netdev_flow->data.filter_info.daddr.in6.s6_addr, + sizeof(ip6_spec.hdr.dst_addr)); + memset(ip6_mask.hdr.dst_addr, 0xff, sizeof(ip6_mask.hdr.dst_addr)); + + patts[index].type = RTE_FLOW_ITEM_TYPE_IPV6; + patts[index].spec = &ip6_spec; + patts[index].mask = &ip6_mask; + break; + default: + RTE_LOG(WARNING, NETDEV, "[%s]: unknown l3 proto\n", __func__); + break; + } + + /* Fill inner L4 item */ + switch (netdev_flow->data.filter_info.l4_proto) { + case IPPROTO_TCP: + get_next_patts_index(index); + memset(&tcp_spec, 0, sizeof(struct rte_flow_item_tcp)); + memset(&tcp_mask, 0, sizeof(struct rte_flow_item_tcp)); + + /* set dst port */ + tcp_spec.hdr.dst_port = netdev_flow->data.filter_info.dport; + tcp_mask.hdr.dst_port = tcp_spec.hdr.dst_port == 0 ? 0x0 : 0xffff; + + /* set src port */ + tcp_spec.hdr.src_port = netdev_flow->data.filter_info.sport; + tcp_mask.hdr.src_port = tcp_spec.hdr.src_port == 0 ? 
0x0 : 0xffff; + + patts[index].type = RTE_FLOW_ITEM_TYPE_TCP; + patts[index].spec = &tcp_spec; + patts[index].mask = &tcp_mask; + break; + case IPPROTO_UDP: + get_next_patts_index(index); + memset(&udp_spec, 0, sizeof(struct rte_flow_item_udp)); + memset(&udp_mask, 0, sizeof(struct rte_flow_item_udp)); + + /* set dst port */ + udp_spec.hdr.dst_port = netdev_flow->data.filter_info.dport; + udp_mask.hdr.dst_port = udp_spec.hdr.dst_port == 0 ? 0x0 : 0xffff; + + patts[index].type = RTE_FLOW_ITEM_TYPE_UDP; + patts[index].spec = &udp_spec; + patts[index].mask = &udp_mask; + break; + default: + RTE_LOG(WARNING, NETDEV, "[%s]: unknown l4 proto\n", __func__); + break; + } + + get_next_patts_index(index); + patts[index].type = RTE_FLOW_ITEM_TYPE_END; +} + +/* + * @brief add l3/l4 filter actions + * + * @return void + */ +static inline void +netdev_flow_add_filter_actions(struct netdev_flow *netdev_flow, + struct rte_flow_action acts[]) +{ + int index = 0; + struct netif_port *port; + static struct rte_flow_action_queue queue = { .index = 0 }; + + port = netif_port_get(netdev_flow->port_id); + if (unlikely(port == NULL)) { + RTE_LOG(ERR, NETDEV, + "[%s]: invalid netdev info on port\n", + __func__); + return; + } + + queue = (struct rte_flow_action_queue) { + .index = netdev_flow->data.filter_info.queue_id, + }; + + /* queue action is essential */ + acts[index].type = RTE_FLOW_ACTION_TYPE_QUEUE; + acts[index].conf = &queue; + + /* + * attach an integer value to packets and + * set PKT_RX_FDIR and PKT_RX_FDIR_ID mbuf flags + */ + netdev_flow_add_mark_action(port, acts, &index); + + /* count action supported by mellanox only */ + netdev_flow_add_count_action(port, acts, &index); + + get_next_acts_index(index); + acts[index].type = RTE_FLOW_ACTION_TYPE_END; +} + +/* + * @brief add egress flow attr + * + * @return void + */ +static inline void +netdev_flow_add_egress_attribute(struct rte_flow_attr *attr) +{ + attr->egress = 1; +} + +/* + * @brief add netdev flow on port + * + * @param port - dpdk or other type port + * @param netdev_flow - flow store on netif_port + * + * @return EDPVS_OK on success, EDPVS_DPDKAPIFAIL on failure + */ +int netdev_flow_add(struct netif_port *port, + struct netdev_flow *netdev_flow) +{ + int err = EDPVS_OK; + struct rte_flow_error error; + struct rte_flow_attr attr; + struct rte_flow_item patts[DEFAULT_MAX_PATTERNS]; + struct rte_flow_action acts[DEFAULT_MAX_ACTIONS]; + union netdev_flow_query query; + struct rte_flow *flow = NULL; + portid_t port_id; + + if (unlikely(netdev_flow == NULL || port == NULL)) { + RTE_LOG(ERR, NETDEV, + "[%s]: invalid netdev flow info on port\n", + __func__); + return EDPVS_INVAL; + } + + port_id = port->id; + + memset(&error, 0, sizeof(error)); + memset(&attr, 0, sizeof(attr)); + memset(patts, 0, sizeof(patts)); + memset(acts, 0, sizeof(acts)); + + rte_rwlock_write_lock(&port->dev_lock); + + switch (netdev_flow->type) { + case NETDEV_FLOW_TYPE_RSS: + /* setup rss queues info */ + netdev_flow_add_ingress_attribute(netdev_flow, &attr); + netdev_flow_add_rss_patterns(netdev_flow, patts); + netdev_flow_add_rss_actions(port_id, netdev_flow, acts); + break; + case NETDEV_FLOW_TYPE_FILTER: + /* setup filter flow */ + netdev_flow_add_ingress_attribute(netdev_flow, &attr); + netdev_flow_add_filter_patterns(netdev_flow, patts); + netdev_flow_add_filter_actions(netdev_flow, acts); + break; + default: + RTE_LOG(WARNING, NETDEV, + "[%s]: unsupported netdev flow type\n", __func__); + rte_flow_error_set(&error, EINVAL, + RTE_FLOW_ERROR_TYPE_UNSPECIFIED, + 
NULL, "unsupported netdev flow type."); + goto err_out; + }; + + err = rte_flow_validate(port_id, &attr, patts, acts, &error); + if (unlikely(err == -ENOTSUP || err == -ENOSYS)) { + RTE_LOG(ERR, NETDEV, + "[%s]: rte_flow not supported on port %d\n", + __func__, port_id); + goto err_out; + } else if (err != EDPVS_OK) { + RTE_LOG(ERR, NETDEV, + "[%s]: failed to validate netdev flow on port %d\n", + __func__, port_id); + goto err_out; + } + + flow = rte_flow_create(port_id, &attr, patts, acts, &error); + if (unlikely(flow == NULL)) { + RTE_LOG(ERR, NETDEV, + "[%s]: failed to add netdev flow on port %d\n", + __func__, port_id); + goto err_out; + } + + netdev_flow->flow_handle = flow; + netdev_flow->hw_offloaded = true; + + /* store to flow list */ + port->hw_flow_info.flow_cnt++; + list_add_tail(&netdev_flow->list, &port->hw_flow_info.flow_list); + + rte_rwlock_write_unlock(&port->dev_lock); + + RTE_LOG(INFO, NETDEV, + "[%s]: success to add netdev flow on port %d\n", + __func__, port_id); + + /* + * verify flow existed in hardware + * supported only on mellanox. + */ + if (strstr(port->dev_info.driver_name, NETDEV_MLNX_DRIVER_NAME) != NULL) { + netdev_flow_query(port, + netdev_flow, + &query); + } + + return EDPVS_OK; + +err_out: + port->hw_flow_info.flow_err++; + rte_rwlock_write_unlock(&port->dev_lock); + netdev_flow_print_err_msg(&error); + return EDPVS_DPDKAPIFAIL; +} + +/* + * @brief destroy netdev flow on port + * + * @param port - dpdk or other type port + * @param netdev_flow - flow store on netif_port + * + * @return EDPVS_OK on success, negative on failure + */ +int netdev_flow_del(struct netif_port *port, + struct netdev_flow *netdev_flow) +{ + int err = EDPVS_OK; + struct rte_flow_error error; + + if (unlikely(NULL == netdev_flow || NULL == port)) { + RTE_LOG(ERR, NETDEV, + "[%s]: invalid netdev flow info on port\n", + __func__); + return EDPVS_INVAL; + } + + memset(&error, 0, sizeof(error)); + + rte_rwlock_write_lock(&port->dev_lock); + + err = rte_flow_destroy(port->id, netdev_flow->flow_handle, &error); + if (unlikely(err != EDPVS_OK)) { + RTE_LOG(ERR, NETDEV, + "[%s]: failed to remove netdev flow %#x on port %d\n", + __func__, netdev_flow->flow_id, port->id); + goto err_out; + } + + /* remove from flow list */ + port->hw_flow_info.flow_cnt--; + list_del(&netdev_flow->list); + + /* free flow on destroyed */ + rte_free(netdev_flow); + + rte_rwlock_write_unlock(&port->dev_lock); + return EDPVS_OK; + +err_out: + port->hw_flow_info.flow_err++; + rte_rwlock_write_unlock(&port->dev_lock); + netdev_flow_print_err_msg(&error); + return EDPVS_DPDKAPIFAIL; +} + +/* + * @brief print query info on port + */ +static void +print_query_info(const struct rte_flow_action *action, + union netdev_flow_query *query) +{ + if (unlikely(action == NULL)) + return; + + /* flow action query */ + switch (action->type) { + case RTE_FLOW_ACTION_TYPE_QUEUE: + RTE_LOG(INFO, NETDEV, + "[%s]: flow queue query index: %d\n", + __func__, query->queue.index); + break; + case RTE_FLOW_ACTION_TYPE_COUNT: + RTE_LOG(INFO, NETDEV, + "[%s]: flow count query:" + " hits_set: %u bytes_set: %u" + " hits: %"PRIu64" bytes: %"PRIu64"\n", + __func__, + query->count.hits_set, query->count.bytes_set, + query->count.hits, query->count.bytes); + break; + case RTE_FLOW_ACTION_TYPE_RSS: + break; + default: + break; + } +} + +/* + * @brief query netdev flow on port + * + * @param port - dpdk or other type port + * @param netdev_flow - flow store on netif_port + * @param query - flow query result + * + * @return EDPVS_OK on success, 
EDPVS_DPDKAPIFAIL on failure + */ +int netdev_flow_query(struct netif_port *port, + struct netdev_flow *netdev_flow, + union netdev_flow_query *query) +{ + int err = EDPVS_OK; + struct rte_flow_error error; + + /* default flow count query */ + struct rte_flow_action_count count = { .shared = 0, .id = 0 }; + const struct rte_flow_action action[] = { + { + .type = RTE_FLOW_ACTION_TYPE_COUNT, + .conf = &count, + }, + { + .type = RTE_FLOW_ACTION_TYPE_END, + }, + }; + + memset(&error, 0, sizeof(error)); + + rte_rwlock_write_lock(&port->dev_lock); + + err = rte_flow_query(port->id, netdev_flow->flow_handle, action, query, &error); + if (unlikely(err != EDPVS_OK)) { + RTE_LOG(ERR, NETDEV, "[%s]: failed to query flow %#x on" + " port %d, err %d\n", __func__, + netdev_flow->flow_id, port->id, err); + goto err_out; + } + + rte_rwlock_write_unlock(&port->dev_lock); + + print_query_info(action, query); + + return EDPVS_OK; + +err_out: + rte_rwlock_write_unlock(&port->dev_lock); + netdev_flow_print_err_msg(&error); + return EDPVS_DPDKAPIFAIL; +} + +/* + * @brief flush netdev flow on port + * + * @param port - dpdk or other type port + * + * @return EDPVS_OK on success, negative on failure + */ +int netdev_flow_flush(struct netif_port *port) +{ + int err = EDPVS_OK; + struct netdev_flow *flow; + struct rte_flow_error error; + + if (unlikely(port == NULL)) { + RTE_LOG(ERR, NETDEV, + "[%s]: invalid netdev info on port\n", + __func__); + return EDPVS_INVAL; + } + + memset(&error, 0, sizeof(error)); + + rte_rwlock_write_lock(&port->dev_lock); + + /* flush flows on port */ + err = rte_flow_flush(port->id, &error); + if (unlikely(err != EDPVS_OK)) { + RTE_LOG(ERR, NETDEV, + "[%s]: failed to flush flow on port %d, err %d\n", + __func__, port->id, err); + goto err_out; + } + + /* empty flow list, need lock here */ + list_for_each_entry(flow, &port->hw_flow_info.flow_list, list) { + port->hw_flow_info.flow_cnt--; + list_del(&flow->list); + rte_free(flow); + } + + rte_rwlock_write_unlock(&port->dev_lock); + return EDPVS_OK; + +err_out: + port->hw_flow_info.flow_err++; + rte_rwlock_write_unlock(&port->dev_lock); + netdev_flow_print_err_msg(&error); + return EDPVS_DPDKAPIFAIL; +} + +/* + * @brief init kni flow, params validated + * + * @param flow - netdev flow + * @param port - dpdk or other type port + * @param kni_ip - ip addr of kni port + * @param kni_queue_id - queue polled by kni core + * @param l3_proto - AF_INET or AF_INET6 + * @param l4_proto - IPPROTO_UDP IPPROTO_TCP or IPPROTO_IP + * + * @return EDPVS_OK on success, EDPVS_INVAL on failure + */ +static inline +int netdev_flow_init_kni_filter(struct netdev_flow *flow, + struct netif_port *port, + const union inet_addr *kni_ip, + queueid_t kni_queue_id, + uint8_t l3_proto, + uint8_t l4_proto) +{ + if (unlikely(flow == NULL || port == NULL + || kni_ip == NULL)) { + RTE_LOG(ERR, NETDEV, + "[%s]: invalid input info on port\n", + __func__); + return EDPVS_INVAL; + } + + flow->type = NETDEV_FLOW_TYPE_FILTER; + flow->port_id = port->id; + flow->data.filter_info.l3_proto = l3_proto; + flow->data.filter_info.l4_proto = l4_proto; + flow->data.filter_info.queue_id = kni_queue_id; + flow->flow_handle = NULL; + flow->hw_offloaded = false; + rte_memcpy(&(flow->data.filter_info.daddr), kni_ip, + sizeof(union inet_addr)); + flow->flags |= NETDEV_FLOW_F_DIP_FIELD; + flow->flags |= NETDEV_FLOW_F_L3_PROTO_FIELD; + flow->flags |= NETDEV_FLOW_F_L4_PROTO_FIELD; + flow->flow_id = netdev_flow_hash(flow); + + return EDPVS_OK; +} + +/* + * @brief log kni flow, params validated + * 
+ * @param flow - netdev flow + * @param port - dpdk or other type port + * @param kni_ip - ip addr of kni port + * @param kni_queue_id - queue polled by kni core + * @param l3_proto - AF_INET or AF_INET6 + * @param l4_proto - IPPROTO_UDP IPPROTO_TCP or IPPROTO_IP + * + * @return EDPVS_OK on success, EDPVS_INVAL on failure + */ +static inline +int netdev_flow_log_kni_filter(struct netdev_flow *flow, + struct netif_port *port, + const union inet_addr *kni_ip, + queueid_t kni_queue_id, + uint8_t l3_proto, + uint8_t l4_proto) +{ + char dst[64]; + union inet_addr addr; + + if (unlikely(flow == NULL || port == NULL + || kni_ip == NULL)) { + RTE_LOG(ERR, NETDEV, + "[%s]: invalid input info on port\n", + __func__); + return EDPVS_INVAL; + } + + if (l3_proto == AF_INET) { + addr.in.s_addr = kni_ip->in.s_addr; + RTE_LOG(INFO, NETDEV, "[%s] success to alloc kni ipv4 flow %#x " + "on port %d kni_ip %s kni_queue_id %d\n", + __func__, flow->flow_id, port->id, + inet_ntop(AF_INET, &addr, dst, sizeof(dst)) ? dst: "", + kni_queue_id); + } else { + inet_ntop(AF_INET6, kni_ip, dst, sizeof(dst)); + RTE_LOG(INFO, NETDEV, "[%s] success to alloc ha kni ipv6 flow %#x " + "on port %d kni_ip %s kni_queue_id %d\n", + __func__, flow->flow_id, port->id, + dst, kni_queue_id); + } + + return EDPVS_OK; +} + +/* + * @brief configure kni flow for kni port + * + * @param port - dpdk or other type port + * @param kni_ip - ip addr of kni port + * @param kni_queue_id - queue polled by kni core + * @param l3_proto - AF_INET or AF_INET6 + * @param l4_proto - IPPROTO_UDP IPPROTO_TCP or IPPROTO_IP + * + * @return EDPVS_OK on success, negative on failure + */ +int netdev_flow_add_kni_filter(struct netif_port *port, + const union inet_addr *kni_ip, + queueid_t kni_queue_id, + uint8_t l3_proto, + uint8_t l4_proto) +{ + int err = EDPVS_OK; + struct netdev_flow *flow; + + /* params assert */ + if (unlikely(port == NULL)) { + RTE_LOG(ERR, NETDEV, + "[%s]: invalid netdev info on port\n", + __func__); + return EDPVS_INVAL; + } + + if (unlikely(kni_ip == NULL)) { + RTE_LOG(ERR, NETDEV, + "[%s]: invalid kni ip info on port\n", + __func__); + return EDPVS_INVAL; + } + + flow = rte_zmalloc("kni_flow", + sizeof(struct netdev_flow), + RTE_CACHE_LINE_SIZE); + if (unlikely(flow == NULL)) { + RTE_LOG(ERR, NETDEV, + "[%s]: failed to alloc kni flow on port %d\n", + __func__, port->id); + port->hw_flow_info.flow_err++; + return EDPVS_NOMEM; + } + + /* init kni flow */ + netdev_flow_init_kni_filter(flow, port, + kni_ip, kni_queue_id, + l3_proto, l4_proto); + + /* log kni flow */ + netdev_flow_log_kni_filter(flow, port, + kni_ip, kni_queue_id, + l3_proto, l4_proto); + + /* lookup netdev flow on port */ + if (netdev_flow_lookup_by_uuid(port, flow->flow_id)) { + RTE_LOG(INFO, NETDEV, + "[%s]: netdev flow %#x already exists on port %d\n", + __func__, flow->flow_id, port->id); + err = EDPVS_INVAL; + goto done; + } + + /* add netdev flow on port */ + err = netdev_flow_add(port, flow); + if (unlikely(err == EDPVS_INVAL)) { + RTE_LOG(ERR, NETDEV, + "[%s]: failed to create kni flow %#x on port %d\n", + __func__, flow->flow_id, port->id); + goto done; + } + + return EDPVS_OK; + +done: + rte_free(flow); + return err; +} + +/* + * @brief configure rss queues region, + * exclude kni and ha queues, + * should called after rte_eth_rx_queue_setup(). 
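+ *        so that the queue ids referenced by the rss action already exist.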
+ * + * @param port - dpdk or other type port + * + * @return EDPVS_OK on success, negative on failure + */ +int netdev_flow_add_rss_filter(struct netif_port *port) +{ + int err = EDPVS_OK; + struct netdev_flow *flow; + + if (unlikely(port == NULL)) { + RTE_LOG(ERR, NETDEV, + "[%s]: invalid netdev info on port\n", + __func__); + return EDPVS_INVAL; + } + + flow = rte_zmalloc("rss_flow", + sizeof(struct netdev_flow), + RTE_CACHE_LINE_SIZE); + if (unlikely(flow == NULL)) { + RTE_LOG(ERR, NETDEV, + "[%s]: failed to alloc rss flow on port %d\n", + __func__, port->id); + port->hw_flow_info.flow_err++; + return EDPVS_NOMEM; + } + + /* init rss flow */ + NETDEV_RSS_FLOW_INIT(flow, port); + + RTE_LOG(INFO, NETDEV, + "[%s]: success to alloc rss flow %#x on port %d\n", + __func__, flow->flow_id, port->id); + + /* lookup netdev flow on port */ + if (netdev_flow_lookup_by_uuid(port, flow->flow_id)) { + RTE_LOG(INFO, NETDEV, + "[%s]: netdev flow %#x already exists on port %d\n", + __func__, flow->flow_id, port->id); + err = EDPVS_INVAL; + goto done; + } + + /* add netdev flow on port */ + err = netdev_flow_add(port, flow); + if (unlikely(err != EDPVS_OK)) { + RTE_LOG(ERR, NETDEV, + "[%s]: failed to create rss flow %#x on port %d err %d\n", + __func__, flow->flow_id, port->id, err); + goto done; + } + + return EDPVS_OK; + +done: + rte_free(flow); + return err; +} + +int netdev_flow_init(struct netif_port *port) +{ + int err = EDPVS_OK; + + /* flush rte flows, exception occured on i40e driver */ + if (strstr(port->dev_info.driver_name, NETDEV_I40E_DRIVER_NAME) == NULL) { + err = netdev_flow_flush(port); + if (unlikely(err != EDPVS_OK)) { + RTE_LOG(ERR, NETDEV, + "[%s]: failed to flush netdev flow on port %s\n", + __func__, port->name); + return EDPVS_DPDKAPIFAIL; + } + + RTE_LOG(INFO, NETDEV, + "[%s]: success to flush netdev flow on port %s\n", + __func__, port->name); + } + + /* config rss rte flows on port init */ + err = netdev_flow_add_rss_filter(port); + if (unlikely(err != EDPVS_OK)) { + RTE_LOG(ERR, NETDEV, + "[%s]: failed to config rss flow on port %s\n", + __func__, port->name); + return EDPVS_DPDKAPIFAIL; + } + + RTE_LOG(INFO, NETDEV, + "[%s]: success to config rss flow on port %s\n", + __func__, port->name); + + return err; +} diff --git a/src/netif.c b/src/netif.c index 730d86a1..190e558c 100644 --- a/src/netif.c +++ b/src/netif.c @@ -28,6 +28,7 @@ #include "conf/common.h" #include "netif.h" #include "netif_addr.h" +#include "netdev_flow.h" #include "vlan.h" #include "ctrl.h" #include "list.h" @@ -102,6 +103,9 @@ struct port_conf_stream { bool promisc_mode; + int kni_addr_cnt; + struct kni_addr kni_ip[NETIF_KNI_ADDR_MAX_NUM]; /* ipv4 or ipv6 */ + struct list_head port_list_node; }; @@ -161,6 +165,36 @@ static uint64_t g_isol_rx_lcore_mask; bool dp_vs_fdir_filter_enable = true; +bool dp_vs_kni_isolate_rx_enable = true; + +/* + * @brief assert kni rx is isolated on dev, kni + * isolated rx benefits for hc/ssh traffic + * on lan and for bgp connection on wan. 
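+ *        isolation also requires a dedicated kni rx queue (valid
+ *        rx_queue_id, nrxq != rss_queue_num), at least one configured
+ *        kni address and a general physical port.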
+ * + * @param dev - lan or wan link + * + * @return true on enabled, false on disabled + */ +static inline bool netif_kni_isolate_rx_enable(const struct netif_port *dev) +{ + /* the premise of kni isolate rx */ + if (likely(dp_vs_kni_isolate_rx_enable) + && dev->kni.rx_queue_id != NETIF_QUEUE_ID_INVALID + && dev->nrxq != dev->rss_queue_num + && dev->kni.ip_addr_cnt != 0 + && dev->type == PORT_TYPE_GENERAL) { + return true; + } + + return false; +} + +lcoreid_t netif_get_kni_lcore_id(void) +{ + return g_kni_lcore_id; +} + bool is_lcore_id_valid(lcoreid_t cid) { if (unlikely(cid >= DPVS_MAX_LCORE)) @@ -507,6 +541,74 @@ static void fdir_filter_handler(vector_t tokens) FREE_PTR(str); } +static void kni_isolate_handler(vector_t tokens) +{ + char *str = set_value(tokens); + + assert(str); + + if (strcasecmp(str, "on") == 0) + dp_vs_kni_isolate_rx_enable = true; + else if (strcasecmp(str, "off") == 0) + dp_vs_kni_isolate_rx_enable = false; + else + RTE_LOG(WARNING, IPVS, "invalid kni:isolated %s\n", str); + + RTE_LOG(INFO, IPVS, "kni:isolated = %s\n", dp_vs_kni_isolate_rx_enable ? "on" : "off"); + + FREE_PTR(str); +} + +static void kni_ipv4_address_handler(vector_t tokens) +{ + char *str = set_value(tokens); + struct port_conf_stream *current_device = list_entry(port_list.next, + struct port_conf_stream, port_list_node); + + assert(str); + + if (unlikely(strstr(str, ".") == NULL + || current_device->kni_addr_cnt >= NETIF_KNI_ADDR_MAX_NUM)) { + RTE_LOG(WARNING, NETIF, "%s: invalid kni v4 ip %s\n", current_device->name, str); + return; + } + + if (inet_pton(AF_INET, str, ¤t_device->kni_ip[current_device->kni_addr_cnt].addr) <= 0) { + RTE_LOG(WARNING, NETIF, "%s: parse kni v4 ip %s failed\n", current_device->name, str); + return; + } + + current_device->kni_ip[current_device->kni_addr_cnt++].af = AF_INET; + + RTE_LOG(INFO, NETIF, "%s: kni v4 addr = %s, count = %d\n", current_device->name, + str, current_device->kni_addr_cnt); +} + +static void kni_ipv6_address_handler(vector_t tokens) +{ + char *str = set_value(tokens); + struct port_conf_stream *current_device = list_entry(port_list.next, + struct port_conf_stream, port_list_node); + + assert(str); + + if (unlikely(strstr(str, ":") == NULL + || current_device->kni_addr_cnt >= NETIF_KNI_ADDR_MAX_NUM)) { + RTE_LOG(WARNING, NETIF, "%s: invalid kni v6 ip %s\n", current_device->name, str); + return; + } + + if (inet_pton(AF_INET6, str, ¤t_device->kni_ip[current_device->kni_addr_cnt].addr) <= 0) { + RTE_LOG(WARNING, NETIF, "%s: parse kni v6 addr %s failed\n", current_device->name, str); + return; + } + + current_device->kni_ip[current_device->kni_addr_cnt++].af = AF_INET6; + + RTE_LOG(INFO, NETIF, "%s: kni v6 addr = %s, count = %d\n", current_device->name, + str, current_device->kni_addr_cnt); +} + static void promisc_mode_handler(vector_t tokens) { struct port_conf_stream *current_device = list_entry(port_list.next, @@ -918,6 +1020,12 @@ void install_netif_keywords(void) install_keyword("promisc_mode", promisc_mode_handler, KW_TYPE_INIT); install_keyword("mtu", custom_mtu_handler,KW_TYPE_INIT); install_keyword("kni_name", kni_name_handler, KW_TYPE_INIT); + install_keyword("kni_isolate", kni_isolate_handler, KW_TYPE_INIT); + install_keyword("kni_ipaddress", NULL, KW_TYPE_INIT); + install_sublevel(); + install_keyword("ipv4", kni_ipv4_address_handler, KW_TYPE_INIT); + install_keyword("ipv6", kni_ipv6_address_handler, KW_TYPE_INIT); + install_sublevel_end(); install_sublevel_end(); install_keyword("bonding", bonding_handler, KW_TYPE_INIT); 
install_sublevel(); @@ -1165,6 +1273,31 @@ static int isol_rxq_add(lcoreid_t cid, portid_t pid, queueid_t qid, unsigned rb_sz, struct netif_queue_conf *rxq); static void isol_rxq_del(struct rx_partner *isol_rxq, bool force); +static void +classify_port_queues(struct netif_port *port, + dpvs_lcore_role_t cid_role, + queueid_t queue_id) +{ + if (unlikely(port == NULL)) { + RTE_LOG(INFO, NETIF, + "[%s]: invalid port info.\n", __func__); + return; + } + + switch (cid_role) { + case LCORE_ROLE_FWD_WORKER: + port->rss_queues[port->rss_queue_num++] = queue_id; + break; + case LCORE_ROLE_KNI_WORKER: + port->kni.rx_queue_id = queue_id; + port->kni.fwd_mode = KNI_FWD_MODE_ISOLATE_RX; + break; + default: + RTE_LOG(INFO, NETIF, "[%s]: ignore idle worker.\n", __func__); + break; + } +} + static void config_lcores(struct list_head *worker_list) { int ii, tk; @@ -1174,6 +1307,7 @@ static void config_lcores(struct list_head *worker_list) struct netif_port *port; struct queue_conf_stream *queue; struct worker_conf_stream *worker, *worker_next, *worker_min; + dpvs_lcore_role_t cid_role; memset(lcore_conf, 0, sizeof(lcore_conf)); @@ -1211,6 +1345,7 @@ static void config_lcores(struct list_head *worker_list) else lcore_conf[id].type = LCORE_ROLE_IDLE; + cid_role = lcore_conf[id].type; list_for_each_entry_reverse(queue, &worker_min->port_list, queue_list_node) { port = netif_port_get_by_name(queue->port_name); if (port) @@ -1222,6 +1357,10 @@ static void config_lcores(struct list_head *worker_list) for (ii = 0; queue->rx_queues[ii] != NETIF_MAX_QUEUES && ii < NETIF_MAX_QUEUES; ii++) { lcore_conf[id].pqs[tk].rxqs[ii].id = queue->rx_queues[ii]; + + /* classify various queues on port, kni and rss */ + classify_port_queues(port, cid_role, queue->rx_queues[ii]); + if (queue->isol_rxq_lcore_ids[ii] != NETIF_LCORE_ID_INVALID) { if (isol_rxq_add(queue->isol_rxq_lcore_ids[ii], port->id, queue->rx_queues[ii], @@ -2766,7 +2905,7 @@ static void kni_egress_process(void) /* * KNI rx rte_ring use mode as multi-producers and the single-consumer. */ -static void kni_ingress_process(void) +static void kni_ingress_process(kni_fwd_mode_t fwd_mode) { struct rte_mbuf *mbufs[NETIF_MAX_PKT_BURST]; struct netif_port *dev; @@ -2776,11 +2915,18 @@ static void kni_ingress_process(void) for (id = 0; id < g_nports; id++) { dev = netif_port_get(id); - if (!dev || !kni_dev_exist(dev)) + if (!dev || !kni_dev_exist(dev) || + !kni_fwd_valid(dev, fwd_mode)) continue; - nb_rb = rte_ring_dequeue_burst(dev->kni.rx_ring, (void**)mbufs, - NETIF_MAX_PKT_BURST, NULL); + if (fwd_mode == KNI_FWD_MODE_ISOLATE_RX) { + nb_rb = rte_eth_rx_burst(dev->id, dev->kni.rx_queue_id, + mbufs, NETIF_MAX_PKT_BURST); + } else { + nb_rb = rte_ring_dequeue_burst(dev->kni.rx_ring, (void**)mbufs, + NETIF_MAX_PKT_BURST, NULL); + } + if (nb_rb == 0) continue; lcore_stats[cid].ipackets += nb_rb; @@ -2806,7 +2952,8 @@ static void kni_ingress_process(void) */ void kni_lcore_loop(void *dummy) { - kni_ingress_process(); + kni_ingress_process(KNI_FWD_MODE_ISOLATE_RX); + kni_ingress_process(KNI_FWD_MODE_DEFAULT); kni_egress_process(); /* This is a lazy solution. @@ -3204,6 +3351,40 @@ static inline void setup_dev_of_flags(struct netif_port *port) port->flag |= NETIF_PORT_FLAG_RX_IP_CSUM_OFFLOAD; } +/* + * @brief load address from dpvs conf file. 
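+ *        i.e. the kni_ipaddress block of the matching device section.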
+ * + * @param dev - lan or wan link + * + * @return true on success, false on failure + */ +static bool +netif_kni_addr_init(struct netif_port *port) +{ + struct port_conf_stream *cfg_stream; + + if (unlikely(port == NULL || strlen(port->name) == 0)) { + RTE_LOG(ERR, NETIF, "[%s]: invalid port info\n", __func__); + return false; + } + + cfg_stream = get_port_conf_stream(port->name); + if (unlikely(cfg_stream == NULL || cfg_stream->kni_addr_cnt == 0)) { + RTE_LOG(ERR, NETIF, "[%s]: invalid port cfg stream\n", __func__); + return false; + } + + /* init kni info */ + rte_memcpy(&port->kni.ip, &cfg_stream->kni_ip, + sizeof(struct kni_addr) * cfg_stream->kni_addr_cnt); + port->kni.ip_addr_cnt = cfg_stream->kni_addr_cnt; + + RTE_LOG(INFO, NETIF, "[%s] success to init kni addr on port: %s, ip_addr_cnt: %d\n", + __func__, port->name, port->kni.ip_addr_cnt); + + return true; +} + /* TODO: refactor it with netif_alloc */ static struct netif_port* netif_rte_port_alloc(portid_t id, int nrxq, int ntxq, const struct rte_eth_conf *conf) @@ -3267,12 +3448,20 @@ static struct netif_port* netif_rte_port_alloc(portid_t id, int nrxq, INIT_LIST_HEAD(&port->in_ptr->ifm_list[ii]); } + port->rss_queue_num = 0; + port->kni.ip_addr_cnt = 0; + port->kni.rx_queue_id = NETIF_QUEUE_ID_INVALID; + port->kni.fwd_mode = KNI_FWD_MODE_DEFAULT; + INIT_LIST_HEAD(&port->hw_flow_info.flow_list); if (tc_init_dev(port) != EDPVS_OK) { RTE_LOG(ERR, NETIF, "%s: fail to init TC\n", __func__); rte_free(port); return NULL; } + /* load address from dpvs conf */ + netif_kni_addr_init(port); + return port; } @@ -3678,12 +3867,81 @@ static int fdir_filter_flush(const struct netif_port *port) return EDPVS_OK; } +/* + * @brief netif kni isolate rx filter config + * + * @note 1.config rss queue region excluding kni queue on port + * 2.should after fdir_filter_flush(), otherwise rte_flow + * will be flushed either on mellanox + * 3.use rte_flow on mellanox, dev_filter on others + * 4.signature mode is essential for ixgbe ipv6 filter, which + * match the first 4 bytes for ipv6 addr, please refer to + * chapter 7.1.2.7.15 of intel 85299 datasheet + * + * @param port - lan or wan physical device + * + * @return EDPVS_OK on success, negative on failure + */ +static int netif_kni_filter_config(struct netif_port *port) +{ + int j, ret; + queueid_t kni_queue_id; + + if (unlikely(port == NULL + || port->type != PORT_TYPE_GENERAL)) { + RTE_LOG(ERR, NETIF, + "[%s]: invalid netif info on port\n", + __func__); + return EDPVS_INVAL; + } + + ret = netdev_flow_init(port); + if (ret != EDPVS_OK) { + RTE_LOG(ERR, NETIF, + "[%s]: failed to init netdev flow for device %s\n", + __func__, port->name); + return ret; + } + + ret = netif_get_queue(port, g_kni_lcore_id, &kni_queue_id); + if (ret != EDPVS_OK) { + RTE_LOG(ERR, NETIF, + "[%s]: failed to get kni rx queue for device %s\n", + __func__, port->name); + return ret; + } + + for (j = 0; j < port->kni.ip_addr_cnt; j++) { + /* assert mellanox device */ + if (strstr(port->dev_info.driver_name, NETDEV_MLNX_DRIVER_NAME) != NULL) { + ret = netdev_flow_add_kni_filter(port, + &port->kni.ip[j].addr, + kni_queue_id, + port->kni.ip[j].af, + IPPROTO_IP); + } else { + ret = kni_fdir_filter_add(port, + &port->kni.ip[j].addr, + port->kni.ip[j].af); + } + + if (ret != EDPVS_OK) { + RTE_LOG(ERR, NETIF, + "[%s]: failed to add kni filter %d for device %s\n", + __func__, j, port->name); + return ret; + } + } + + return EDPVS_OK; +} + /* * Note: Invoke the function after port is allocated and lcores are configured. 
*/ int netif_port_start(struct netif_port *port) { - int ii, ret; + int ii, j, ret; queueid_t qid; char promisc_on; char buf[512]; @@ -3779,6 +4037,22 @@ int netif_port_start(struct netif_port *port) // build port-queue-lcore mapping array build_port_queue_lcore_map(); + // print rss queues info, which exclude kni queue + if (port->rss_queue_num <= 0 || port->rss_queue_num > port->nrxq) { + rte_exit(EXIT_FAILURE, "%s: invalid rss queues num on port %d," + " exiting ...\n", __func__, port->id); + } else { + RTE_LOG(INFO, NETIF, "%s: port %d rss queues num: %d\n", + __func__, port->id, port->rss_queue_num); + for (j = 0; j < port->rss_queue_num; j++) { + RTE_LOG(INFO, NETIF, + "%s: rss queue id: %d\n", + __func__, port->rss_queues[j]); + } + RTE_LOG(INFO, NETIF, "%s: kni queue id: %d\n", + __func__, port->kni.rx_queue_id); + } + // start the device ret = rte_eth_dev_start(port->id); if (ret < 0) { @@ -3835,6 +4109,16 @@ int netif_port_start(struct netif_port *port) return ret; } + /* add kni isolate rx filter */ + if (netif_kni_isolate_rx_enable(port)) { + ret = netif_kni_filter_config(port); + if (ret != EDPVS_OK) { + RTE_LOG(WARNING, NETIF, + "fail to add kni isolate rx filters for device %s\n", port->name); + return ret; + } + } + return EDPVS_OK; } diff --git a/src/sa_pool.c b/src/sa_pool.c index e115cc56..67647f06 100644 --- a/src/sa_pool.c +++ b/src/sa_pool.c @@ -87,6 +87,8 @@ static uint64_t sa_lcore_mask; static uint8_t sa_pool_hash_size = SAPOOL_DEF_HASH_SZ; +netif_filter_op_func g_netif_filter_func = NULL; + static int __add_del_filter(int af, struct netif_port *dev, lcoreid_t cid, const union inet_addr *dip, __be16 dport, uint32_t filter_id[MAX_FDIR_PROTO], bool add) @@ -845,6 +847,9 @@ int sa_pool_init(void) port_base++; } + /* set filter op func */ + g_netif_filter_func = __add_del_filter; + return EDPVS_OK; }
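Reviewer note (not part of the patch): the port-base/mask scheme set up by kni_fdir_init() is easiest to see in isolation. The sketch below is a minimal, standalone illustration with a hypothetical worker count of 6; it only mirrors the mask arithmetic of kni_fdir_init() and is not DPVS code. The worker count is rounded up to a power of two (2^shift), dst_port_mask keeps the low "shift" bits of the destination port, and one filter per port_base value in [0, 2^shift) therefore covers every destination port in [0, 65535]. The real code additionally stores htons(port_base), since the hardware compares ports in network byte order.

#include <stdio.h>

int main(void)
{
    unsigned total_nlcore = 6;      /* hypothetical number of fwd workers */
    unsigned shift, mask, port_base;
    unsigned dport = 8080;          /* any destination port */

    /* round the worker count up to a power of two, as kni_fdir_init() does */
    for (shift = 0; (1u << shift) < total_nlcore; shift++)
        ;
    mask = ~(~0u << shift);         /* 6 workers -> shift 3, mask 0x7 */

    /* exactly one of the 2^shift port_base filters matches any dport */
    for (port_base = 0; port_base < (1u << shift); port_base++) {
        if ((dport & mask) == port_base)
            printf("dport %u -> kni filter with port_base %u (mask 0x%x)\n",
                   dport, port_base, mask);
    }
    return 0;
}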