From 9ebb163369af867cd437dce0f6263947ce759154 Mon Sep 17 00:00:00 2001
From: gray
Date: Fri, 14 Jun 2024 18:13:28 +0800
Subject: [PATCH] Collect MTU from skb->_skb_refdst

According to the kernel source, the kernel takes the MTU from the dst entry
referenced by skb->_skb_refdst. Let pwru collect the MTU from there using the
same logic. This requires casting skb->_skb_refdst to a struct dst_entry *,
then reading dst_metric_raw(dst, RTAX_MTU) and, if that is zero, falling back
to dst->dev->mtu.

```
// https://elixir.bootlin.com/linux/v6.5/source/net/ipv4/ip_forward.c#L86
// net/ipv4/ip_forward.c
int ip_forward(struct sk_buff *skb)
{
	[...]
	rt = skb_rtable(skb);
	[...]
	mtu = ip_dst_mtu_maybe_forward(&rt->dst, true);
	if (ip_exceeds_mtu(skb, mtu)) {
		IP_INC_STATS(net, IPSTATS_MIB_FRAGFAILS);
		icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu));
		SKB_DR_SET(reason, PKT_TOO_BIG);
		goto drop;
	}
	[...]
}

// include/linux/skbuff.h
static inline struct rtable *skb_rtable(const struct sk_buff *skb)
{
	return (struct rtable *)skb_dst(skb);
}

// include/linux/skbuff.h
static inline struct dst_entry *skb_dst(const struct sk_buff *skb)
{
	[...]
	return (struct dst_entry *)(skb->_skb_refdst & SKB_DST_PTRMASK);
}

// include/net/ip.h
static inline unsigned int ip_dst_mtu_maybe_forward(const struct dst_entry *dst, bool forwarding)
{
	[...]
	mtu = dst_metric_raw(dst, RTAX_MTU);
	if (mtu)
		goto out;
	mtu = READ_ONCE(dst->dev->mtu);
	[...]
}

// include/net/dst.h
#define __DST_METRICS_PTR(Y) \
	((u32 *)((Y) & ~DST_METRICS_FLAGS))
```

With this patch, pwru can output the MTU that is actually used by the OS.

Case 1: Cilium can reduce the route MTU to 1423 inside a pod. pwru cannot
detect this because it only checks the link MTU.

Case 2: xfrm can reduce the route MTU to 1446 in ip_forward(). pwru must
inspect the route MTU via skb->_skb_refdst to understand that.

Signed-off-by: gray --- bpf/kprobe_pwru.c | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/bpf/kprobe_pwru.c b/bpf/kprobe_pwru.c index 43896598..e1fe41e4 100644 --- a/bpf/kprobe_pwru.c +++ b/bpf/kprobe_pwru.c @@ -14,6 +14,17 @@ #define ETH_P_IP 0x800 #define ETH_P_IPV6 0x86dd +#define RTAX_MTU 2 +#define SKB_DST_NOREF 1UL +#define SKB_DST_PTRMASK ~(SKB_DST_NOREF) +#define __SKB_DST_PTR(X) \ + ((struct dst_entry *)((X) & SKB_DST_PTRMASK)) + +#define DST_METRICS_FLAGS 0x3UL +#define __DST_METRICS_PTR(X) \ + ((u32 *)((X) & ~DST_METRICS_FLAGS)) + + const static bool TRUE = true; volatile const static __u64 BPF_PROG_ADDR = 0; @@ -230,6 +241,13 @@ set_meta(struct sk_buff *skb, struct skb_meta *meta) { meta->protocol = BPF_CORE_READ(skb, protocol); meta->ifindex = BPF_CORE_READ(skb, dev, ifindex); meta->mtu = BPF_CORE_READ(skb, dev, mtu); + struct dst_entry *dst = __SKB_DST_PTR(BPF_CORE_READ(skb, _skb_refdst)); + if (dst) { + u32 *metrics = __DST_METRICS_PTR(BPF_CORE_READ(dst, _metrics)); + bpf_probe_read_kernel(&meta->mtu, sizeof(meta->mtu), metrics + RTAX_MTU - 1); + if (!meta->mtu) + meta->mtu = BPF_CORE_READ(dst, dev, mtu); + } } static __always_inline void