From 121fdd2d8f2172c5ea77e0705c5f3d4cd28506f4 Mon Sep 17 00:00:00 2001 From: Vincent Li Date: Sun, 28 Jan 2024 00:03:49 +0000 Subject: [PATCH] ebpf: add XDP Syncookie program Add XDP Syncookie program to enable Suricata in af-packet IDS mode to stop host from SYN flooding attack. Signed-off-by: Vincent Li --- ebpf/Makefile.am | 6 +- ebpf/include/vmlinux/vmlinux_common.h | 11 + ebpf/include/vmlinux/vmlinux_net.h | 16 + ebpf/include/vmlinux/vmlinux_types.h | 14 + ebpf/include/vmlinux_local.h | 27 + ebpf/xdp_synproxy_kern.c | 818 ++++++++++++++++++++++++++ src/runmode-af-packet.c | 34 ++ src/source-af-packet.h | 2 + 8 files changed, 927 insertions(+), 1 deletion(-) create mode 100644 ebpf/include/vmlinux/vmlinux_common.h create mode 100644 ebpf/include/vmlinux/vmlinux_net.h create mode 100644 ebpf/include/vmlinux/vmlinux_types.h create mode 100644 ebpf/include/vmlinux_local.h create mode 100644 ebpf/xdp_synproxy_kern.c diff --git a/ebpf/Makefile.am b/ebpf/Makefile.am index 450bd19ff49d..b797aa1cc3ed 100644 --- a/ebpf/Makefile.am +++ b/ebpf/Makefile.am @@ -1,5 +1,5 @@ EXTRA_DIST= include bypass_filter.c filter.c lb.c vlan_filter.c xdp_filter.c \ - xdp_lb.c hash_func01.h + xdp_lb.c xdp_synproxy_kern.c hash_func01.h if BUILD_EBPF @@ -12,6 +12,7 @@ BPF_TARGETS += bypass_filter.bpf BPF_TARGETS += xdp_filter.bpf BPF_TARGETS += xdp_lb.bpf BPF_TARGETS += vlan_filter.bpf +BPF_TARGETS += xdp_synproxy_kern.bpf all: $(BPF_TARGETS) @@ -21,6 +22,9 @@ $(BPF_TARGETS): %.bpf: %.c ${CLANG} -Wall $(BPF_CFLAGS) -O2 -g \ -I/usr/include/$(build_cpu)-$(build_os)/ \ -D__KERNEL__ -D__ASM_SYSREG_H \ + -Wno-unused-value \ + -Wno-pointer-sign \ + -Wno-compare-distinct-pointer-types \ -target bpf -S -emit-llvm $< -o ${@:.bpf=.ll} # From LLVM-IR to BPF-bytecode in ELF-obj file ${LLC} -march=bpf -filetype=obj ${@:.bpf=.ll} -o $@ diff --git a/ebpf/include/vmlinux/vmlinux_common.h b/ebpf/include/vmlinux/vmlinux_common.h new file mode 100644 index 000000000000..4c8ea59892be --- /dev/null +++ b/ebpf/include/vmlinux/vmlinux_common.h @@ -0,0 +1,11 @@ +#ifndef __VMLINUX_COMMON_H__ +#define __VMLINUX_COMMON_H__ + +enum { + false = 0, + true = 1, +}; + +typedef _Bool bool; + +#endif /* __VMLINUX_COMMON_H__ */ diff --git a/ebpf/include/vmlinux/vmlinux_net.h b/ebpf/include/vmlinux/vmlinux_net.h new file mode 100644 index 000000000000..2db199bbb496 --- /dev/null +++ b/ebpf/include/vmlinux/vmlinux_net.h @@ -0,0 +1,16 @@ +#ifndef __VMLINUX_NET_H__ +#define __VMLINUX_NET_H__ + +typedef __u32 __wsum; + +struct nf_conn { + unsigned long status; +}; + +enum ip_conntrack_status { + /* Connection is confirmed: originating packet has left box */ + IPS_CONFIRMED_BIT = 3, + IPS_CONFIRMED = (1 << IPS_CONFIRMED_BIT), +}; + +#endif /* __VMLINUX_NET_H__ */ diff --git a/ebpf/include/vmlinux/vmlinux_types.h b/ebpf/include/vmlinux/vmlinux_types.h new file mode 100644 index 000000000000..d7b3bed0a1d6 --- /dev/null +++ b/ebpf/include/vmlinux/vmlinux_types.h @@ -0,0 +1,14 @@ +#ifndef __VMLINUX_TYPES_H__ +#define __VMLINUX_TYPES_H__ + +typedef __u8 u8; +typedef __s16 s16; +typedef __u16 u16; +typedef __s32 s32; +typedef __u32 u32; +typedef __s64 s64; +typedef __u64 u64; + +typedef s64 ktime_t; + +#endif /* __VMLINUX_TYPES_H__ */ diff --git a/ebpf/include/vmlinux_local.h b/ebpf/include/vmlinux_local.h new file mode 100644 index 000000000000..42c9a434dfc4 --- /dev/null +++ b/ebpf/include/vmlinux_local.h @@ -0,0 +1,27 @@ +/* + * WARNING: This file shadow vmlinux.h that you can generate yourself + * + * Cmdline to generate vmlinux.h + * bpftool btf dump file /sys/kernel/btf/vmlinux format c > vmlinux.h + * + * This vmlinux.h shadow contains kernel headers reduced to that were + * needed in this project. + */ +#ifndef __VMLINUX_H__ +#define __VMLINUX_H__ + +#include /* Needed for __uNN in vmlinux/vmlinux_types.h */ + +#ifndef BPF_NO_PRESERVE_ACCESS_INDEX +#pragma clang attribute push (__attribute__((preserve_access_index)), apply_to = record) +#endif + +#include "vmlinux/vmlinux_types.h" +#include "vmlinux/vmlinux_common.h" +#include "vmlinux/vmlinux_net.h" + +#ifndef BPF_NO_PRESERVE_ACCESS_INDEX +#pragma clang attribute pop +#endif + +#endif /* __VMLINUX_H__ */ diff --git a/ebpf/xdp_synproxy_kern.c b/ebpf/xdp_synproxy_kern.c new file mode 100644 index 000000000000..a9eec02772cd --- /dev/null +++ b/ebpf/xdp_synproxy_kern.c @@ -0,0 +1,818 @@ +// SPDX-License-Identifier: LGPL-2.1 OR BSD-2-Clause +/* Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. */ + +#include "vmlinux_local.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define NSEC_PER_SEC 1000000000L + +#define ETH_ALEN 6 +#define ETH_P_IP 0x0800 +#define ETH_P_IPV6 0x86DD + +#define tcp_flag_word(tp) (((union tcp_word_hdr *)(tp))->words[3]) + +#define IP_DF 0x4000 +#define IP_MF 0x2000 +#define IP_OFFSET 0x1fff + +#define NEXTHDR_TCP 6 + +#define TCPOPT_NOP 1 +#define TCPOPT_EOL 0 +#define TCPOPT_MSS 2 +#define TCPOPT_WINDOW 3 +#define TCPOPT_SACK_PERM 4 +#define TCPOPT_TIMESTAMP 8 + +#define TCPOLEN_MSS 4 +#define TCPOLEN_WINDOW 3 +#define TCPOLEN_SACK_PERM 2 +#define TCPOLEN_TIMESTAMP 10 + +#define TCP_TS_HZ 1000 +#define TS_OPT_WSCALE_MASK 0xf +#define TS_OPT_SACK (1 << 4) +#define TS_OPT_ECN (1 << 5) +#define TSBITS 6 +#define TSMASK (((__u32)1 << TSBITS) - 1) +#define TCP_MAX_WSCALE 14U + +#define IPV4_MAXLEN 60 +#define TCP_MAXLEN 60 + +#define DEFAULT_MSS4 1460 +#define DEFAULT_MSS6 1440 +#define DEFAULT_WSCALE 7 +#define DEFAULT_TTL 64 +#define MAX_ALLOWED_PORTS 8 + +#define MAX_PACKET_OFF 0xffff + +#define swap(a, b) \ + do { typeof(a) __tmp = (a); (a) = (b); (b) = __tmp; } while (0) + +#define __get_unaligned_t(type, ptr) ({ \ + const struct { type x; } __attribute__((__packed__)) *__pptr = (typeof(__pptr))(ptr); \ + __pptr->x; \ +}) + +#define get_unaligned(ptr) __get_unaligned_t(typeof(*(ptr)), (ptr)) + +struct { + __uint(type, BPF_MAP_TYPE_ARRAY); + __type(key, __u32); + __type(value, __u64); + __uint(max_entries, 2); +} values SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_ARRAY); + __type(key, __u32); + __type(value, __u16); + __uint(max_entries, MAX_ALLOWED_PORTS); +} allowed_ports SEC(".maps"); + +/* Some symbols defined in net/netfilter/nf_conntrack_bpf.c are unavailable in + * vmlinux.h if CONFIG_NF_CONNTRACK=m, so they are redefined locally. + */ + +struct bpf_ct_opts___local { + int netns_id; + int error; + __u8 l4proto; + __u8 dir; + __u8 reserved[2]; +} __attribute__((preserve_access_index)); + +extern struct nf_conn *bpf_xdp_ct_lookup(struct xdp_md *xdp_ctx, + struct bpf_sock_tuple *bpf_tuple, + __u32 len_tuple, + struct bpf_ct_opts___local *opts, + __u32 len_opts) __ksym; + +extern void bpf_ct_release(struct nf_conn *ct) __ksym; + +static __always_inline void swap_eth_addr(__u8 *a, __u8 *b) +{ + __u8 tmp[ETH_ALEN]; + + __builtin_memcpy(tmp, a, ETH_ALEN); + __builtin_memcpy(a, b, ETH_ALEN); + __builtin_memcpy(b, tmp, ETH_ALEN); +} + +static __always_inline __u16 csum_fold(__u32 csum) +{ + csum = (csum & 0xffff) + (csum >> 16); + csum = (csum & 0xffff) + (csum >> 16); + return (__u16)~csum; +} + +static __always_inline __u16 csum_tcpudp_magic(__u32 saddr, __u32 daddr, + __u32 len, __u8 proto, + __u32 csum) +{ + __u64 s = csum; + + s += (__u32)saddr; + s += (__u32)daddr; +#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ + s += proto + len; +#elif __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ + s += (proto + len) << 8; +#else +#error Unknown endian +#endif + s = (s & 0xffffffff) + (s >> 32); + s = (s & 0xffffffff) + (s >> 32); + + return csum_fold((__u32)s); +} + +static __always_inline __u16 csum_ipv6_magic(const struct in6_addr *saddr, + const struct in6_addr *daddr, + __u32 len, __u8 proto, __u32 csum) +{ + __u64 sum = csum; + int i; + +#pragma unroll + for (i = 0; i < 4; i++) + sum += (__u32)saddr->in6_u.u6_addr32[i]; + +#pragma unroll + for (i = 0; i < 4; i++) + sum += (__u32)daddr->in6_u.u6_addr32[i]; + + /* Don't combine additions to avoid 32-bit overflow. */ + sum += bpf_htonl(len); + sum += bpf_htonl(proto); + + sum = (sum & 0xffffffff) + (sum >> 32); + sum = (sum & 0xffffffff) + (sum >> 32); + + return csum_fold((__u32)sum); +} + +static __always_inline __u64 tcp_clock_ns(void) +{ + return bpf_ktime_get_ns(); +} + +static __always_inline __u32 tcp_ns_to_ts(__u64 ns) +{ + return ns / (NSEC_PER_SEC / TCP_TS_HZ); +} + +static __always_inline __u32 tcp_clock_ms(void) +{ + return tcp_ns_to_ts(tcp_clock_ns()); +} + +struct tcpopt_context { + void *data; + void *data_end; + __be32 *tsecr; + __u8 wscale; + bool option_timestamp; + bool option_sack; + __u32 off; +}; + +static __always_inline u8 *next(struct tcpopt_context *ctx, __u32 sz) +{ + __u64 off = ctx->off; + __u8 *data; + + /* Verifier forbids access to packet when offset exceeds MAX_PACKET_OFF */ + if (off > MAX_PACKET_OFF - sz) + return NULL; + + data = ctx->data + off; + barrier_var(data); + if (data + sz >= ctx->data_end) + return NULL; + + ctx->off += sz; + return data; +} + +static int tscookie_tcpopt_parse(struct tcpopt_context *ctx) +{ + __u8 *opcode, *opsize, *wscale, *tsecr; + __u32 off = ctx->off; + + opcode = next(ctx, 1); + if (!opcode) + return 1; + + if (*opcode == TCPOPT_EOL) + return 1; + if (*opcode == TCPOPT_NOP) + return 0; + + opsize = next(ctx, 1); + if (!opsize || *opsize < 2) + return 1; + + switch (*opcode) { + case TCPOPT_WINDOW: + wscale = next(ctx, 1); + if (!wscale) + return 1; + if (*opsize == TCPOLEN_WINDOW) + ctx->wscale = *wscale < TCP_MAX_WSCALE ? *wscale : TCP_MAX_WSCALE; + break; + case TCPOPT_TIMESTAMP: + tsecr = next(ctx, 4); + if (!tsecr) + return 1; + if (*opsize == TCPOLEN_TIMESTAMP) { + ctx->option_timestamp = true; + /* Client's tsval becomes our tsecr. */ + *ctx->tsecr = get_unaligned((__be32 *)tsecr); + } + break; + case TCPOPT_SACK_PERM: + if (*opsize == TCPOLEN_SACK_PERM) + ctx->option_sack = true; + break; + } + + ctx->off = off + *opsize; + + return 0; +} + +static int tscookie_tcpopt_parse_batch(__u32 index, void *context) +{ + int i; + + for (i = 0; i < 7; i++) + if (tscookie_tcpopt_parse(context)) + return 1; + return 0; +} + +static __always_inline bool tscookie_init(struct tcphdr *tcp_header, + __u16 tcp_len, __be32 *tsval, + __be32 *tsecr, void *data, void *data_end) +{ + struct tcpopt_context loop_ctx = { + .data = data, + .data_end = data_end, + .tsecr = tsecr, + .wscale = TS_OPT_WSCALE_MASK, + .option_timestamp = false, + .option_sack = false, + /* Note: currently verifier would track .off as unbound scalar. + * In case if verifier would at some point get smarter and + * compute bounded value for this var, beware that it might + * hinder bpf_loop() convergence validation. + */ + .off = (__u8 *)(tcp_header + 1) - (__u8 *)data, + }; + u32 cookie; + + bpf_loop(6, tscookie_tcpopt_parse_batch, &loop_ctx, 0); + + if (!loop_ctx.option_timestamp) + return false; + + cookie = tcp_clock_ms() & ~TSMASK; + cookie |= loop_ctx.wscale & TS_OPT_WSCALE_MASK; + if (loop_ctx.option_sack) + cookie |= TS_OPT_SACK; + if (tcp_header->ece && tcp_header->cwr) + cookie |= TS_OPT_ECN; + *tsval = bpf_htonl(cookie); + + return true; +} + +static __always_inline void values_get_tcpipopts(__u16 *mss, __u8 *wscale, + __u8 *ttl, bool ipv6) +{ + __u32 key = 0; + __u64 *value; + + value = bpf_map_lookup_elem(&values, &key); + if (value && *value != 0) { + if (ipv6) + *mss = (*value >> 32) & 0xffff; + else + *mss = *value & 0xffff; + *wscale = (*value >> 16) & 0xf; + *ttl = (*value >> 24) & 0xff; + return; + } + + *mss = ipv6 ? DEFAULT_MSS6 : DEFAULT_MSS4; + *wscale = DEFAULT_WSCALE; + *ttl = DEFAULT_TTL; +} + +static __always_inline void values_inc_synacks(void) +{ + __u32 key = 1; + __u64 *value; + + value = bpf_map_lookup_elem(&values, &key); + if (value) + __sync_fetch_and_add(value, 1); +} + +static __always_inline bool check_port_allowed(__u16 port) +{ + __u32 i; + + for (i = 0; i < MAX_ALLOWED_PORTS; i++) { + __u32 key = i; + __u16 *value; + + value = bpf_map_lookup_elem(&allowed_ports, &key); + + if (!value) + break; + /* 0 is a terminator value. Check it first to avoid matching on + * a forbidden port == 0 and returning true. + */ + if (*value == 0) + break; + + if (*value == port) + return true; + } + + return false; +} + +struct header_pointers { + struct ethhdr *eth; + struct iphdr *ipv4; + struct ipv6hdr *ipv6; + struct tcphdr *tcp; + __u16 tcp_len; +}; + +static __always_inline int tcp_dissect(void *data, void *data_end, + struct header_pointers *hdr) +{ + hdr->eth = data; + if (hdr->eth + 1 > data_end) + return XDP_DROP; + + switch (bpf_ntohs(hdr->eth->h_proto)) { + case ETH_P_IP: + hdr->ipv6 = NULL; + + hdr->ipv4 = (void *)hdr->eth + sizeof(*hdr->eth); + if (hdr->ipv4 + 1 > data_end) + return XDP_DROP; + if (hdr->ipv4->ihl * 4 < sizeof(*hdr->ipv4)) + return XDP_DROP; + if (hdr->ipv4->version != 4) + return XDP_DROP; + + if (hdr->ipv4->protocol != IPPROTO_TCP) + return XDP_PASS; + + hdr->tcp = (void *)hdr->ipv4 + hdr->ipv4->ihl * 4; + break; + case ETH_P_IPV6: + hdr->ipv4 = NULL; + + hdr->ipv6 = (void *)hdr->eth + sizeof(*hdr->eth); + if (hdr->ipv6 + 1 > data_end) + return XDP_DROP; + if (hdr->ipv6->version != 6) + return XDP_DROP; + + /* XXX: Extension headers are not supported and could circumvent + * XDP SYN flood protection. + */ + if (hdr->ipv6->nexthdr != NEXTHDR_TCP) + return XDP_PASS; + + hdr->tcp = (void *)hdr->ipv6 + sizeof(*hdr->ipv6); + break; + default: + /* XXX: VLANs will circumvent XDP SYN flood protection. */ + return XDP_PASS; + } + + if (hdr->tcp + 1 > data_end) + return XDP_DROP; + hdr->tcp_len = hdr->tcp->doff * 4; + if (hdr->tcp_len < sizeof(*hdr->tcp)) + return XDP_DROP; + + return XDP_TX; +} + +static __always_inline int tcp_lookup(void *ctx, struct header_pointers *hdr, bool xdp) +{ + struct bpf_ct_opts___local ct_lookup_opts = { + .netns_id = BPF_F_CURRENT_NETNS, + .l4proto = IPPROTO_TCP, + }; + struct bpf_sock_tuple tup = {}; + struct nf_conn *ct; + __u32 tup_size; + + if (hdr->ipv4) { + /* TCP doesn't normally use fragments, and XDP can't reassemble + * them. + */ + if ((hdr->ipv4->frag_off & bpf_htons(IP_DF | IP_MF | IP_OFFSET)) != bpf_htons(IP_DF)) + return XDP_DROP; + + tup.ipv4.saddr = hdr->ipv4->saddr; + tup.ipv4.daddr = hdr->ipv4->daddr; + tup.ipv4.sport = hdr->tcp->source; + tup.ipv4.dport = hdr->tcp->dest; + tup_size = sizeof(tup.ipv4); + } else if (hdr->ipv6) { + __builtin_memcpy(tup.ipv6.saddr, &hdr->ipv6->saddr, sizeof(tup.ipv6.saddr)); + __builtin_memcpy(tup.ipv6.daddr, &hdr->ipv6->daddr, sizeof(tup.ipv6.daddr)); + tup.ipv6.sport = hdr->tcp->source; + tup.ipv6.dport = hdr->tcp->dest; + tup_size = sizeof(tup.ipv6); + } else { + /* The verifier can't track that either ipv4 or ipv6 is not + * NULL. + */ + return XDP_ABORTED; + } + + ct = bpf_xdp_ct_lookup(ctx, &tup, tup_size, &ct_lookup_opts, sizeof(ct_lookup_opts)); + if (ct) { + unsigned long status = ct->status; + + bpf_ct_release(ct); + if (status & IPS_CONFIRMED) + return XDP_PASS; + } else if (ct_lookup_opts.error != -ENOENT) { + return XDP_ABORTED; + } + + /* error == -ENOENT || !(status & IPS_CONFIRMED) */ + return XDP_TX; +} + +static __always_inline __u8 tcp_mkoptions(__be32 *buf, __be32 *tsopt, __u16 mss, + __u8 wscale) +{ + __be32 *start = buf; + + *buf++ = bpf_htonl((TCPOPT_MSS << 24) | (TCPOLEN_MSS << 16) | mss); + + if (!tsopt) + return buf - start; + + if (tsopt[0] & bpf_htonl(1 << 4)) + *buf++ = bpf_htonl((TCPOPT_SACK_PERM << 24) | + (TCPOLEN_SACK_PERM << 16) | + (TCPOPT_TIMESTAMP << 8) | + TCPOLEN_TIMESTAMP); + else + *buf++ = bpf_htonl((TCPOPT_NOP << 24) | + (TCPOPT_NOP << 16) | + (TCPOPT_TIMESTAMP << 8) | + TCPOLEN_TIMESTAMP); + *buf++ = tsopt[0]; + *buf++ = tsopt[1]; + + if ((tsopt[0] & bpf_htonl(0xf)) != bpf_htonl(0xf)) + *buf++ = bpf_htonl((TCPOPT_NOP << 24) | + (TCPOPT_WINDOW << 16) | + (TCPOLEN_WINDOW << 8) | + wscale); + + return buf - start; +} + +static __always_inline void tcp_gen_synack(struct tcphdr *tcp_header, + __u32 cookie, __be32 *tsopt, + __u16 mss, __u8 wscale) +{ + void *tcp_options; + + tcp_flag_word(tcp_header) = TCP_FLAG_SYN | TCP_FLAG_ACK; + if (tsopt && (tsopt[0] & bpf_htonl(1 << 5))) + tcp_flag_word(tcp_header) |= TCP_FLAG_ECE; + tcp_header->doff = 5; /* doff is part of tcp_flag_word. */ + swap(tcp_header->source, tcp_header->dest); + tcp_header->ack_seq = bpf_htonl(bpf_ntohl(tcp_header->seq) + 1); + tcp_header->seq = bpf_htonl(cookie); + tcp_header->window = 0; + tcp_header->urg_ptr = 0; + tcp_header->check = 0; /* Calculate checksum later. */ + + tcp_options = (void *)(tcp_header + 1); + tcp_header->doff += tcp_mkoptions(tcp_options, tsopt, mss, wscale); +} + +static __always_inline void tcpv4_gen_synack(struct header_pointers *hdr, + __u32 cookie, __be32 *tsopt) +{ + __u8 wscale; + __u16 mss; + __u8 ttl; + + values_get_tcpipopts(&mss, &wscale, &ttl, false); + + swap_eth_addr(hdr->eth->h_source, hdr->eth->h_dest); + + swap(hdr->ipv4->saddr, hdr->ipv4->daddr); + hdr->ipv4->check = 0; /* Calculate checksum later. */ + hdr->ipv4->tos = 0; + hdr->ipv4->id = 0; + hdr->ipv4->ttl = ttl; + + tcp_gen_synack(hdr->tcp, cookie, tsopt, mss, wscale); + + hdr->tcp_len = hdr->tcp->doff * 4; + hdr->ipv4->tot_len = bpf_htons(sizeof(*hdr->ipv4) + hdr->tcp_len); +} + +static __always_inline void tcpv6_gen_synack(struct header_pointers *hdr, + __u32 cookie, __be32 *tsopt) +{ + __u8 wscale; + __u16 mss; + __u8 ttl; + + values_get_tcpipopts(&mss, &wscale, &ttl, true); + + swap_eth_addr(hdr->eth->h_source, hdr->eth->h_dest); + + swap(hdr->ipv6->saddr, hdr->ipv6->daddr); + *(__be32 *)hdr->ipv6 = bpf_htonl(0x60000000); + hdr->ipv6->hop_limit = ttl; + + tcp_gen_synack(hdr->tcp, cookie, tsopt, mss, wscale); + + hdr->tcp_len = hdr->tcp->doff * 4; + hdr->ipv6->payload_len = bpf_htons(hdr->tcp_len); +} + +static __always_inline int syncookie_handle_syn(struct header_pointers *hdr, + void *ctx, + void *data, void *data_end) +{ + __u32 old_pkt_size, new_pkt_size; + /* Unlike clang 10, clang 11 and 12 generate code that doesn't pass the + * BPF verifier if tsopt is not volatile. Volatile forces it to store + * the pointer value and use it directly, otherwise tcp_mkoptions is + * (mis)compiled like this: + * if (!tsopt) + * return buf - start; + * reg = stored_return_value_of_tscookie_init; + * if (reg) + * tsopt = tsopt_buf; + * else + * tsopt = NULL; + * ... + * *buf++ = tsopt[1]; + * It creates a dead branch where tsopt is assigned NULL, but the + * verifier can't prove it's dead and blocks the program. + */ + __be32 * volatile tsopt = NULL; + __be32 tsopt_buf[2] = {}; + __u16 ip_len; + __u32 cookie; + __s64 value; + + /* Checksum is not yet verified, but both checksum failure and TCP + * header checks return XDP_DROP, so the order doesn't matter. + */ + if (hdr->tcp->fin || hdr->tcp->rst) + return XDP_DROP; + + if (hdr->ipv4) { + /* Check the IPv4 and TCP checksums before creating a SYNACK. */ + value = bpf_csum_diff(0, 0, (void *)hdr->ipv4, hdr->ipv4->ihl * 4, 0); + if (value < 0) + return XDP_ABORTED; + if (csum_fold(value) != 0) + return XDP_DROP; /* Bad IPv4 checksum. */ + + value = bpf_csum_diff(0, 0, (void *)hdr->tcp, hdr->tcp_len, 0); + if (value < 0) + return XDP_ABORTED; + if (csum_tcpudp_magic(hdr->ipv4->saddr, hdr->ipv4->daddr, + hdr->tcp_len, IPPROTO_TCP, value) != 0) + return XDP_DROP; /* Bad TCP checksum. */ + + ip_len = sizeof(*hdr->ipv4); + + value = bpf_tcp_raw_gen_syncookie_ipv4(hdr->ipv4, hdr->tcp, + hdr->tcp_len); + } else if (hdr->ipv6) { + /* Check the TCP checksum before creating a SYNACK. */ + value = bpf_csum_diff(0, 0, (void *)hdr->tcp, hdr->tcp_len, 0); + if (value < 0) + return XDP_ABORTED; + if (csum_ipv6_magic(&hdr->ipv6->saddr, &hdr->ipv6->daddr, + hdr->tcp_len, IPPROTO_TCP, value) != 0) + return XDP_DROP; /* Bad TCP checksum. */ + + ip_len = sizeof(*hdr->ipv6); + + value = bpf_tcp_raw_gen_syncookie_ipv6(hdr->ipv6, hdr->tcp, + hdr->tcp_len); + } else { + return XDP_ABORTED; + } + + if (value < 0) + return XDP_ABORTED; + cookie = (__u32)value; + + if (tscookie_init((void *)hdr->tcp, hdr->tcp_len, + &tsopt_buf[0], &tsopt_buf[1], data, data_end)) + tsopt = tsopt_buf; + + /* Check that there is enough space for a SYNACK. It also covers + * the check that the destination of the __builtin_memmove below + * doesn't overflow. + */ + if (data + sizeof(*hdr->eth) + ip_len + TCP_MAXLEN > data_end) + return XDP_ABORTED; + + if (hdr->ipv4) { + if (hdr->ipv4->ihl * 4 > sizeof(*hdr->ipv4)) { + struct tcphdr *new_tcp_header; + + new_tcp_header = data + sizeof(*hdr->eth) + sizeof(*hdr->ipv4); + __builtin_memmove(new_tcp_header, hdr->tcp, sizeof(*hdr->tcp)); + hdr->tcp = new_tcp_header; + + hdr->ipv4->ihl = sizeof(*hdr->ipv4) / 4; + } + + tcpv4_gen_synack(hdr, cookie, tsopt); + } else if (hdr->ipv6) { + tcpv6_gen_synack(hdr, cookie, tsopt); + } else { + return XDP_ABORTED; + } + + /* Recalculate checksums. */ + hdr->tcp->check = 0; + value = bpf_csum_diff(0, 0, (void *)hdr->tcp, hdr->tcp_len, 0); + if (value < 0) + return XDP_ABORTED; + if (hdr->ipv4) { + hdr->tcp->check = csum_tcpudp_magic(hdr->ipv4->saddr, + hdr->ipv4->daddr, + hdr->tcp_len, + IPPROTO_TCP, + value); + + hdr->ipv4->check = 0; + value = bpf_csum_diff(0, 0, (void *)hdr->ipv4, sizeof(*hdr->ipv4), 0); + if (value < 0) + return XDP_ABORTED; + hdr->ipv4->check = csum_fold(value); + } else if (hdr->ipv6) { + hdr->tcp->check = csum_ipv6_magic(&hdr->ipv6->saddr, + &hdr->ipv6->daddr, + hdr->tcp_len, + IPPROTO_TCP, + value); + } else { + return XDP_ABORTED; + } + + /* Set the new packet size. */ + old_pkt_size = data_end - data; + new_pkt_size = sizeof(*hdr->eth) + ip_len + hdr->tcp->doff * 4; + if (bpf_xdp_adjust_tail(ctx, new_pkt_size - old_pkt_size)) + return XDP_ABORTED; + + values_inc_synacks(); + + return XDP_TX; +} + +static __always_inline int syncookie_handle_ack(struct header_pointers *hdr) +{ + int err; + + if (hdr->tcp->rst) + return XDP_DROP; + + if (hdr->ipv4) + err = bpf_tcp_raw_check_syncookie_ipv4(hdr->ipv4, hdr->tcp); + else if (hdr->ipv6) + err = bpf_tcp_raw_check_syncookie_ipv6(hdr->ipv6, hdr->tcp); + else + return XDP_ABORTED; + if (err) + return XDP_DROP; + + return XDP_PASS; +} + +static __always_inline int syncookie_part1(void *ctx, void *data, void *data_end, + struct header_pointers *hdr, bool xdp) +{ + int ret; + + ret = tcp_dissect(data, data_end, hdr); + if (ret != XDP_TX) + return ret; + + ret = tcp_lookup(ctx, hdr, xdp); + if (ret != XDP_TX) + return ret; + + /* Pass to upper stack if port requires no syncookie handling */ + if (!check_port_allowed(bpf_ntohs(hdr->tcp->dest))) + return XDP_PASS; + + /* Packet is TCP and doesn't belong to an established connection. */ + + if ((hdr->tcp->syn ^ hdr->tcp->ack) != 1) + return XDP_DROP; + + /* Grow the TCP header to TCP_MAXLEN to be able to pass any hdr->tcp_len + * to bpf_tcp_raw_gen_syncookie_ipv{4,6} and pass the verifier. + */ + if (xdp) { + if (bpf_xdp_adjust_tail(ctx, TCP_MAXLEN - hdr->tcp_len)) + return XDP_ABORTED; + } + + return XDP_TX; +} + +static __always_inline int syncookie_part2(void *ctx, void *data, void *data_end, + struct header_pointers *hdr) +{ + if (hdr->ipv4) { + hdr->eth = data; + hdr->ipv4 = (void *)hdr->eth + sizeof(*hdr->eth); + /* IPV4_MAXLEN is needed when calculating checksum. + * At least sizeof(struct iphdr) is needed here to access ihl. + */ + if ((void *)hdr->ipv4 + IPV4_MAXLEN > data_end) + return XDP_ABORTED; + hdr->tcp = (void *)hdr->ipv4 + hdr->ipv4->ihl * 4; + } else if (hdr->ipv6) { + hdr->eth = data; + hdr->ipv6 = (void *)hdr->eth + sizeof(*hdr->eth); + hdr->tcp = (void *)hdr->ipv6 + sizeof(*hdr->ipv6); + } else { + return XDP_ABORTED; + } + + if ((void *)hdr->tcp + TCP_MAXLEN > data_end) + return XDP_ABORTED; + + /* We run out of registers, tcp_len gets spilled to the stack, and the + * verifier forgets its min and max values checked above in tcp_dissect. + */ + hdr->tcp_len = hdr->tcp->doff * 4; + if (hdr->tcp_len < sizeof(*hdr->tcp)) + return XDP_ABORTED; + + return hdr->tcp->syn ? syncookie_handle_syn(hdr, ctx, data, data_end) : + syncookie_handle_ack(hdr); +} + +SEC("xdp") +int syncookie_xdp(struct xdp_md *ctx) +{ + void *data_end = (void *)(long)ctx->data_end; + void *data = (void *)(long)ctx->data; + struct header_pointers hdr; + int ret; + + ret = syncookie_part1(ctx, data, data_end, &hdr, true); + if (ret != XDP_TX) + return ret; + + data_end = (void *)(long)ctx->data_end; + data = (void *)(long)ctx->data; + + return syncookie_part2(ctx, data, data_end, &hdr); +} + +char _license[] SEC("license") = "GPL"; diff --git a/src/runmode-af-packet.c b/src/runmode-af-packet.c index b8ad0bfac0c3..3f94b2307307 100644 --- a/src/runmode-af-packet.c +++ b/src/runmode-af-packet.c @@ -546,6 +546,17 @@ static void *ParseAFPConfig(const char *iface) } #else SCLogWarning("%s: XDP filter set but XDP support is not built-in", iface); +#endif + } + + if (ConfGetChildValueWithDefault(if_root, if_default, "xdp-syncookie-file", &ebpf_file) != 1) { + aconf->xdp_syncookie_file = NULL; + } else { +#ifdef HAVE_PACKET_XDP + aconf->ebpf_t_config.flags |= EBPF_XDP_CODE; + aconf->xdp_filter_file = ebpf_file; +#else + SCLogWarning("%s: XDP filter set but XDP support is not built-in", iface); #endif #ifdef HAVE_PACKET_XDP const char *xdp_mode; @@ -618,6 +629,29 @@ static void *ParseAFPConfig(const char *iface) #endif } + if (aconf->xdp_syncookie_file) { +#ifdef HAVE_PACKET_XDP + int ret = EBPFLoadFile(aconf->iface, aconf->xdp_syncookie_file, "xdp", + &aconf->xdp_syncookie_fd, + &aconf->ebpf_t_config); + switch (ret) { + case 1: + SCLogInfo("%s: loaded pinned maps from sysfs", iface); + break; + case -1: + SCLogWarning("%s: failed to load XDP filter file", iface); + break; + case 0: + ret = EBPFSetupXDP(aconf->iface, aconf->xdp_filter_fd, aconf->xdp_mode); + if (ret != 0) { + SCLogWarning("%s: failed to set up XDP", iface); + } + } +#else + SCLogError("%s: XDP support is not built-in", iface); +#endif + } + if ((ConfGetChildValueIntWithDefault(if_root, if_default, "buffer-size", &value)) == 1) { aconf->buffer_size = value; } else { diff --git a/src/source-af-packet.h b/src/source-af-packet.h index d91d0cb25232..9f022a6e6eca 100644 --- a/src/source-af-packet.h +++ b/src/source-af-packet.h @@ -106,6 +106,8 @@ typedef struct AFPIfaceConfig_ int ebpf_filter_fd; const char *xdp_filter_file; int xdp_filter_fd; + const char *xdp_syncookie_file; + int xdp_syncookie_fd; uint8_t xdp_mode; const char *out_iface; #ifdef HAVE_PACKET_EBPF