From af85d1c63f5de6f2ce6e2f91e838fbe8b5aca1b3 Mon Sep 17 00:00:00 2001 From: Gray Liang Date: Wed, 24 Apr 2024 02:56:18 +0800 Subject: [PATCH 1/2] bpf: monitor use of splice to avoid kernel bug --- control/control_plane_core.go | 6 +++ control/kern/tproxy.c | 95 +++++++++++++++++++++++++++++++++-- 2 files changed, 97 insertions(+), 4 deletions(-) diff --git a/control/control_plane_core.go b/control/control_plane_core.go index 6be9ad80f..b36d8ffd4 100644 --- a/control/control_plane_core.go +++ b/control/control_plane_core.go @@ -417,6 +417,12 @@ func (c *controlPlaneCore) setupSkPidMonitor() error { } func (c *controlPlaneCore) setupLocalTcpFastRedirect() (err error) { + tp, err := link.Tracepoint("syscalls", "sys_enter_splice", c.bpf.TracepointSyscallsSysEnterSplice, nil) + if err != nil { + return fmt.Errorf("Attach tracepoint:sys_enter_splice: %w", err) + } + c.deferFuncs = append(c.deferFuncs, tp.Close) + cgroupPath, err := detectCgroupPath() if err != nil { return diff --git a/control/kern/tproxy.c b/control/kern/tproxy.c index 5b74c299e..1ef1347e3 100644 --- a/control/kern/tproxy.c +++ b/control/kern/tproxy.c @@ -387,6 +387,13 @@ struct { __uint(pinning, LIBBPF_PIN_BY_NAME); } cookie_pid_map SEC(".maps"); +struct { + __uint(type, BPF_MAP_TYPE_LRU_HASH); + __type(key, char[TASK_COMM_LEN]); + __type(value, __u8); + __uint(max_entries, MAX_COOKIE_PID_PNAME_MAPPING_NUM); +} fastsock_allowlist_map SEC(".maps"); + struct udp_conn_state { // pass @@ -1845,12 +1852,74 @@ SEC("sockops") int local_tcp_sockops(struct bpf_sock_ops *skops) { struct task_struct *task = (struct task_struct *)bpf_get_current_task(); - __u32 pid = BPF_CORE_READ(task, pid); + __u32 pid = BPF_CORE_READ(task, tgid); /* Only local TCP connection has non-zero pids. */ if (pid == 0) return 0; + /* We only care about 3 kinds of events, skip others */ + switch (skops->op) { + /* PASSIVE_ESTABLISHED_CB event is triggered when a new connection is + * established on a listening socket. 
In our case it's a dae TCP + * socket. + */ + case BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB: + /* ACTIVE_ESTABLISHED_CB event is triggered when a new connection is + * established on a client process. In our case it's a local client + * process whose traffic has been redirected to dae. + */ + case BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB: + /* STATE_CB event is triggered when a TCP status changes. It requires + * bpf_sock_ops_cb_flags_set(skops, BPF_SOCK_OPS_STATE_CB_FLAG) when + * connection is established. In our case it can only happen for a + * "probing" socket whose process is unknown to fastsock_allowlist_map. + * */ + case BPF_SOCK_OPS_STATE_CB: + break; + default: + return 0; + } + + char pname[16]; + + __builtin_memset(&pname, 0, sizeof(pname)); + BPF_CORE_READ_STR_INTO(&pname, task, comm); + + /* Let's handle BPF_SOCK_OPS_STATE_CB events here */ + if (skops->op == BPF_SOCK_OPS_STATE_CB) { + /* TCP connection is closing, let's check if splice(2) is called */ + if (skops->args[1] == BPF_TCP_CLOSE || skops->args[0] == BPF_TCP_ESTABLISHED) { + if (bpf_map_lookup_elem(&fastsock_allowlist_map, &pname)) { + /* Process has been recognized, return */ + return 0; + } + /* Still no record, meaning process didn't call + * splice(2), add it to the allowlist. */ + bpf_map_update_elem(&fastsock_allowlist_map, &pname, &one_key, BPF_ANY); + bpf_printk("fastsock_allowlist_map[%s] = 1", pname); + } + return 0; + } + + /* Now it's BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB or + * BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB, let's check if process is in the + * allowlist. + * */ + __u8 *allow = bpf_map_lookup_elem(&fastsock_allowlist_map, &pname); + + if (!allow) { + /* No entry, unknown process, let's probe it. */ + bpf_sock_ops_cb_flags_set(skops, BPF_SOCK_OPS_STATE_CB_FLAG); + bpf_printk("track TCP socket session: \"%s\"\n", pname); + return 0; + } else if (!*allow) { + /* Entry found, but it's forbidden, abort. 
*/ + bpf_printk("fastsock not allowed: %s", pname); + return 0; + } + + /* Okay this process is allowed to proceed with fast socket, let's add it to sockmap. */ struct tuples_key tuple = {}; tuple.l4proto = IPPROTO_TCP; @@ -1955,12 +2024,30 @@ int sk_msg_fast_redirect(struct sk_msg_md *msg) if (bpf_msg_redirect_hash(msg, &fast_sock, &rev_tuple, BPF_F_INGRESS) == SK_PASS) bpf_printk("tcp fast redirect: %pI4:%lu -> %pI4:%lu", - &rev_tuple.sip.u6_addr32[3], - bpf_ntohs(rev_tuple.sport), &rev_tuple.dip.u6_addr32[3], - bpf_ntohs(rev_tuple.dport)); + bpf_ntohs(rev_tuple.dport), + &rev_tuple.sip.u6_addr32[3], + bpf_ntohs(rev_tuple.sport)); return SK_PASS; } +SEC("tracepoint/syscalls/sys_enter_splice") +int tracepoint_syscalls_sys_enter_splice(void) +{ + char pname[16]; + struct task_struct *task = (struct task_struct *)bpf_get_current_task(); + + __builtin_memset(&pname, 0, sizeof(pname)); + BPF_CORE_READ_STR_INTO(&pname, task, comm); + + __u8 *allow = bpf_map_lookup_elem(&fastsock_allowlist_map, &pname); + + if (!allow || (allow && *allow)) { + bpf_map_update_elem(&fastsock_allowlist_map, &pname, &zero_key, BPF_ANY); + bpf_printk("fastsock_allowlist_map[%s] = 0", pname); + } + return 0; +} + SEC("license") const char __license[] = "Dual BSD/GPL"; From 29f3f506c88f4e1e23f9c51310026a205f11ab26 Mon Sep 17 00:00:00 2001 From: Gray Liang Date: Fri, 26 Apr 2024 02:03:36 +0800 Subject: [PATCH 2/2] docs: CONFIG_HAVE_SYSCALL_TRACEPOINTS=y is required --- docs/en/README.md | 3 ++- docs/zh/README.md | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/docs/en/README.md b/docs/en/README.md index a15b76f3e..429110375 100644 --- a/docs/en/README.md +++ b/docs/en/README.md @@ -60,12 +60,13 @@ CONFIG_DEBUG_INFO=y CONFIG_DEBUG_INFO_BTF=y CONFIG_KPROBE_EVENTS=y CONFIG_BPF_EVENTS=y +CONFIG_HAVE_SYSCALL_TRACEPOINTS=y ``` Check them using command like: ```shell -(zcat /proc/config.gz || cat /boot/{config,config-$(uname -r)}) | grep -E 
'CONFIG_(DEBUG_INFO|DEBUG_INFO_BTF|KPROBES|KPROBE_EVENTS|BPF|BPF_SYSCALL|BPF_JIT|BPF_STREAM_PARSER|NET_CLS_ACT|NET_SCH_INGRESS|NET_INGRESS|NET_EGRESS|NET_CLS_BPF|BPF_EVENTS|CGROUPS)=|# CONFIG_DEBUG_INFO_REDUCED is not set' +(zcat /proc/config.gz || cat /boot/{config,config-$(uname -r)}) | grep -E 'CONFIG_(DEBUG_INFO|DEBUG_INFO_BTF|KPROBES|KPROBE_EVENTS|BPF|BPF_SYSCALL|BPF_JIT|BPF_STREAM_PARSER|NET_CLS_ACT|NET_SCH_INGRESS|NET_INGRESS|NET_EGRESS|NET_CLS_BPF|BPF_EVENTS|CGROUPS|HAVE_SYSCALL_TRACEPOINTS)=|# CONFIG_DEBUG_INFO_REDUCED is not set' ``` > **Note**: `Armbian` users can follow the [**Upgrade Guide**](user-guide/kernel-upgrade.md) to upgrade the kernel to meet the kernel configuration requirement. diff --git a/docs/zh/README.md b/docs/zh/README.md index 9410882cf..e1a823dc5 100644 --- a/docs/zh/README.md +++ b/docs/zh/README.md @@ -56,12 +56,13 @@ CONFIG_DEBUG_INFO=y CONFIG_DEBUG_INFO_BTF=y CONFIG_KPROBE_EVENTS=y CONFIG_BPF_EVENTS=y +CONFIG_HAVE_SYSCALL_TRACEPOINTS=y ``` 你可以通过以下命令检查他们: ```shell -(zcat /proc/config.gz || cat /boot/{config,config-$(uname -r)}) | grep -E 'CONFIG_(DEBUG_INFO|DEBUG_INFO_BTF|KPROBES|KPROBE_EVENTS|BPF|BPF_SYSCALL|BPF_JIT|BPF_STREAM_PARSER|NET_CLS_ACT|NET_SCH_INGRESS|NET_INGRESS|NET_EGRESS|NET_CLS_BPF|BPF_EVENTS|CGROUPS)=|# CONFIG_DEBUG_INFO_REDUCED is not set' +(zcat /proc/config.gz || cat /boot/{config,config-$(uname -r)}) | grep -E 'CONFIG_(DEBUG_INFO|DEBUG_INFO_BTF|KPROBES|KPROBE_EVENTS|BPF|BPF_SYSCALL|BPF_JIT|BPF_STREAM_PARSER|NET_CLS_ACT|NET_SCH_INGRESS|NET_INGRESS|NET_EGRESS|NET_CLS_BPF|BPF_EVENTS|CGROUPS|HAVE_SYSCALL_TRACEPOINTS)=|# CONFIG_DEBUG_INFO_REDUCED is not set' ``` > **注意**: `Armbian` 用户可以参考 [**Upgrade Guide**](../en/user-guide/kernel-upgrade.md) 升级到支持的内核。