From 65d382584f58289bb0bc86d7fa9561ca15321a1b Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Wed, 7 May 2025 10:17:10 -0700 Subject: [PATCH 01/11] bpf: Introduce bpf_dynptr_from_mem_slice Add a new bpf_dynptr_from_mem_slice kfunc to create a dynptr from a PTR_TO_BTF_ID exposing a variable-length slice of memory, represented by the new bpf_mem_slice type. This slice is read-only, for a read-write slice we can expose a distinct type in the future. Since this is the first kfunc with potential local dynptr initialization, add it to the if-else list in check_kfunc_call. Signed-off-by: Kumar Kartikeya Dwivedi --- include/linux/bpf.h | 6 ++++++ kernel/bpf/helpers.c | 37 +++++++++++++++++++++++++++++++++++++ kernel/bpf/verifier.c | 6 +++++- 3 files changed, 48 insertions(+), 1 deletion(-) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 3f0cc89c0622c..b0ea0b71df900 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1344,6 +1344,12 @@ enum bpf_dynptr_type { BPF_DYNPTR_TYPE_XDP, }; +struct bpf_mem_slice { + void *ptr; + u32 len; + u32 reserved; +}; + int bpf_dynptr_check_size(u32 size); u32 __bpf_dynptr_size(const struct bpf_dynptr_kern *ptr); const void *__bpf_dynptr_data(const struct bpf_dynptr_kern *ptr, u32 len); diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 78cefb41266a1..89ab3481378d7 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -2873,6 +2873,42 @@ __bpf_kfunc int bpf_dynptr_copy(struct bpf_dynptr *dst_ptr, u32 dst_off, return 0; } +/** + * bpf_dynptr_from_mem_slice - Create a dynptr from a bpf_mem_slice + * @mem_slice: Source bpf_mem_slice, backing the underlying memory for dynptr + * @flags: Flags for dynptr construction, currently no supported flags. + * @dptr__uninit: Destination dynptr, which will be initialized. + * + * Creates a dynptr that points to variable-length read-only memory represented + * by a bpf_mem_slice fat pointer. + * Returns 0 on success; negative error, otherwise. + */ +__bpf_kfunc int bpf_dynptr_from_mem_slice(struct bpf_mem_slice *mem_slice, u64 flags, struct bpf_dynptr *dptr__uninit) +{ + struct bpf_dynptr_kern *dptr = (struct bpf_dynptr_kern *)dptr__uninit; + int err; + + /* mem_slice is never NULL, as we use KF_TRUSTED_ARGS. 
*/ + err = bpf_dynptr_check_size(mem_slice->len); + if (err) + goto error; + + /* flags is currently unsupported */ + if (flags) { + err = -EINVAL; + goto error; + } + + bpf_dynptr_init(dptr, mem_slice->ptr, BPF_DYNPTR_TYPE_LOCAL, 0, mem_slice->len); + bpf_dynptr_set_rdonly(dptr); + + return 0; + +error: + bpf_dynptr_set_null(dptr); + return err; +} + __bpf_kfunc void *bpf_cast_to_kern_ctx(void *obj) { return obj; @@ -3327,6 +3363,7 @@ BTF_ID_FLAGS(func, bpf_dynptr_is_rdonly) BTF_ID_FLAGS(func, bpf_dynptr_size) BTF_ID_FLAGS(func, bpf_dynptr_clone) BTF_ID_FLAGS(func, bpf_dynptr_copy) +BTF_ID_FLAGS(func, bpf_dynptr_from_mem_slice, KF_TRUSTED_ARGS) #ifdef CONFIG_NET BTF_ID_FLAGS(func, bpf_modify_return_test_tp) #endif diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 99aa2c890e7bd..ff34e68c92374 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -12116,6 +12116,7 @@ enum special_kfunc_type { KF_bpf_res_spin_unlock, KF_bpf_res_spin_lock_irqsave, KF_bpf_res_spin_unlock_irqrestore, + KF_bpf_dynptr_from_mem_slice, }; BTF_SET_START(special_kfunc_set) @@ -12219,6 +12220,7 @@ BTF_ID(func, bpf_res_spin_lock) BTF_ID(func, bpf_res_spin_unlock) BTF_ID(func, bpf_res_spin_lock_irqsave) BTF_ID(func, bpf_res_spin_unlock_irqrestore) +BTF_ID(func, bpf_dynptr_from_mem_slice) static bool is_kfunc_ret_null(struct bpf_kfunc_call_arg_meta *meta) { @@ -13140,7 +13142,9 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ if (is_kfunc_arg_uninit(btf, &args[i])) dynptr_arg_type |= MEM_UNINIT; - if (meta->func_id == special_kfunc_list[KF_bpf_dynptr_from_skb]) { + if (meta->func_id == special_kfunc_list[KF_bpf_dynptr_from_mem_slice]) { + dynptr_arg_type |= DYNPTR_TYPE_LOCAL; + } else if (meta->func_id == special_kfunc_list[KF_bpf_dynptr_from_skb]) { dynptr_arg_type |= DYNPTR_TYPE_SKB; } else if (meta->func_id == special_kfunc_list[KF_bpf_dynptr_from_xdp]) { dynptr_arg_type |= DYNPTR_TYPE_XDP;
From 22ef45da038752860756241e2503de9313d07721 Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Wed, 7 May 2025 10:17:11 -0700 Subject: [PATCH 02/11] bpf: Introduce BPF standard streams

Add support for a stream API to the kernel and expose related kfuncs to BPF programs. Two streams are exposed, BPF_STDOUT and BPF_STDERR. These can be used for printing messages that can be consumed from user space, making this similar in spirit to the existing trace_pipe interface. The kernel will use the BPF_STDERR stream to notify the program of any errors encountered at runtime. BPF programs themselves may use both streams for writing debug messages. BPF library-like code may use BPF_STDERR to print warnings or errors on misuse at runtime.

The implementation of a stream is as follows. Every time a message is emitted from the kernel (directly, or through a BPF program), a record is allocated by bump allocating from a per-cpu region backed by a page obtained using try_alloc_pages. This ensures that we can allocate memory from any context. The eventual plan is to discard this scheme in favor of Alexei's kmalloc_nolock() [0]. This record is then locklessly inserted into a list (llist_add()) so that the printing side doesn't require holding any locks, and works in any context. Each stream has a maximum capacity of 4MB of text, and each printed message is accounted against this limit.

Messages from a program are emitted using the bpf_stream_vprintk kfunc, which takes a stream argument but otherwise works similarly to bpf_trace_vprintk.
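For illustration, program-side usage looks roughly as follows (a sketch only: the stream handle comes from the bpf_stream_get kfunc described below, whose NULL aux argument is fixed up by the verifier, and err is just an illustrative variable):

	/* Sketch: write one formatted message to this program's stderr stream. */
	u64 args[1] = { err };	/* varargs array, 8 bytes per argument */
	struct bpf_stream *stream;

	stream = bpf_stream_get(BPF_STDERR, NULL);	/* may return NULL */
	if (stream)
		bpf_stream_vprintk(stream, "error code %d\n", args, sizeof(args));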
The stream itself can be obtained using two kfuncs: bpf_stream_get, for the current program, and bpf_prog_stream_get, to obtain the stream of a target program ID. The bprintf buffer helpers are extracted out for reuse, so that the string can be formatted into them before being copied into the stream; this way we can (within the defined max limit) format a string and know its true length before performing allocations for the stream element.

For consuming elements from a stream, bpf_stream_next_elem can be called, which returns a bpf_stream_elem object containing a bpf_mem_slice struct that represents the message contents. A dynptr can be created from this memory slice object to access the contents of the bpf_stream_elem. Once consumed, bpf_stream_free_elem can be used to release the message back to the memory allocator.

The internals of bpf_stream_next_elem merit some discussion. The lockless list bpf_stream::log is a LIFO stack: elements obtained using a llist_del_all() operation are in LIFO order, and would thus break the chronological ordering if printed directly. Hence, each batch of messages is first reversed, and then stashed into a separate backlog list in the stream. The head of this list is the message that should always be returned to the caller. We hold a lock around bpf_stream_backlog_pop(), since llist_del_first() (had we maintained a second lockless list for the backlog) wouldn't be safe from multiple threads anyway. If we fail to find anything in the backlog list, we splice out everything from the lockless log, place it in the backlog list, and return the head of the backlog. The next time a message is popped, the remaining elements in the backlog list are visited first. The backlog list is protected by rqspinlock, to ensure we can invoke bpf_stream_next_elem in any context.

With the exception of bpf_prog_stream_get, these kfuncs are available to all program types. bpf_prog_stream_get takes a spin_lock_bh, and is thus susceptible to deadlocks if invoked from arbitrary kernel contexts; hence, it is restricted to BPF_PROG_TYPE_SYSCALL. In the future, if the need arises, we can use rqspinlock to make it callable in any context.

From the kernel side, writing into a stream is a bit more involved than a typical printk. The kernel may print a collection of messages into the stream, and parallel writers could interleave their messages. To ensure each group of messages is visible atomically, we take advantage of the lockless list used for pushing in messages: we add a bpf_stream_stage() macro, and require kernel users to write into the stream using bpf_stream_printk statements within the passed expression. Underneath the macro is a message staging API, where a bpf_stream_stage object on the stack accumulates the printed messages in a local llist_head, and a commit operation then splices the whole batch into the stream's lockless log list. This is especially pertinent for rqspinlock deadlock messages printed to program streams: with staging, each deadlock report appears as a contiguous, non-interleaved message, without any confusion on the reader's part, improving their experience in debugging the fault.

While programs cannot benefit from this staged stream writing API, they could just as well hold an rqspinlock around their print statements to serialize messages, hence it is kept kernel-internal for now.
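For illustration, a kernel-side user stages and commits a group of messages roughly as follows (prog and lock stand in for whatever the caller has at hand; later patches in this series use exactly this pattern):

	bpf_stream_stage(prog, BPF_STDERR, ({
		bpf_stream_printk("ERROR: example report header\n");
		bpf_stream_printk("Attempted lock = 0x%px\n", lock);
	}));

Both messages reach the stream's log list as a single batch at commit time, so a concurrent writer cannot interleave its own output between them.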
Overall, this infrastructure provides NMI-safe any context printing of messages to two dedicated streams. Later patches will add support for printing splats in case of BPF arena page faults, rqspinlock deadlocks, and cond_break timeouts, and integration of this facility into bpftool for dumping messages to user space. [0]: https://lore.kernel.org/bpf/20250501032718.65476-1-alexei.starovoitov@gmail.com Signed-off-by: Kumar Kartikeya Dwivedi --- include/linux/bpf.h | 72 +++++- kernel/bpf/Makefile | 2 +- kernel/bpf/core.c | 12 + kernel/bpf/helpers.c | 26 +-- kernel/bpf/stream.c | 499 ++++++++++++++++++++++++++++++++++++++++++ kernel/bpf/syscall.c | 2 +- kernel/bpf/verifier.c | 15 +- 7 files changed, 605 insertions(+), 23 deletions(-) create mode 100644 kernel/bpf/stream.c diff --git a/include/linux/bpf.h b/include/linux/bpf.h index b0ea0b71df900..2c10ae62df2d4 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1524,6 +1524,40 @@ struct btf_mod_pair { struct bpf_kfunc_desc_tab; +enum bpf_stream_id { + BPF_STDOUT = 1, + BPF_STDERR = 2, +}; + +struct bpf_stream_elem { + struct llist_node node; + struct bpf_mem_slice mem_slice; + char str[]; +}; + +struct bpf_stream_elem_batch { + struct llist_node *node; +}; + +enum { + BPF_STREAM_MAX_CAPACITY = (4 * 1024U * 1024U), +}; + +struct bpf_stream { + enum bpf_stream_id stream_id; + atomic_t capacity; + struct llist_head log; + + rqspinlock_t lock; + struct llist_node *backlog_head; + struct llist_node *backlog_tail; +}; + +struct bpf_stream_stage { + struct llist_head log; + int len; +}; + struct bpf_prog_aux { atomic64_t refcnt; u32 used_map_cnt; @@ -1632,6 +1666,7 @@ struct bpf_prog_aux { struct work_struct work; struct rcu_head rcu; }; + struct bpf_stream stream[2]; }; struct bpf_prog { @@ -2391,6 +2426,8 @@ int generic_map_delete_batch(struct bpf_map *map, struct bpf_map *bpf_map_get_curr_or_next(u32 *id); struct bpf_prog *bpf_prog_get_curr_or_next(u32 *id); + +struct page *__bpf_alloc_page(int nid); int bpf_map_alloc_pages(const struct bpf_map *map, int nid, unsigned long nr_pages, struct page **page_array); #ifdef CONFIG_MEMCG @@ -3529,6 +3566,16 @@ bool btf_id_set_contains(const struct btf_id_set *set, u32 id); #define MAX_BPRINTF_VARARGS 12 #define MAX_BPRINTF_BUF 1024 +/* Per-cpu temp buffers used by printf-like helpers to store the bprintf binary + * arguments representation. + */ +#define MAX_BPRINTF_BIN_ARGS 512 + +struct bpf_bprintf_buffers { + char bin_args[MAX_BPRINTF_BIN_ARGS]; + char buf[MAX_BPRINTF_BUF]; +}; + struct bpf_bprintf_data { u32 *bin_args; char *buf; @@ -3536,9 +3583,32 @@ struct bpf_bprintf_data { bool get_buf; }; -int bpf_bprintf_prepare(char *fmt, u32 fmt_size, const u64 *raw_args, +int bpf_bprintf_prepare(const char *fmt, u32 fmt_size, const u64 *raw_args, u32 num_args, struct bpf_bprintf_data *data); void bpf_bprintf_cleanup(struct bpf_bprintf_data *data); +int bpf_try_get_buffers(struct bpf_bprintf_buffers **bufs); +void bpf_put_buffers(void); + +void bpf_prog_stream_init(struct bpf_prog *prog); +void bpf_prog_stream_free(struct bpf_prog *prog); + +void bpf_stream_stage_init(struct bpf_stream_stage *ss); +void bpf_stream_stage_free(struct bpf_stream_stage *ss); +__printf(2, 3) +int bpf_stream_stage_printk(struct bpf_stream_stage *ss, const char *fmt, ...); +int bpf_stream_stage_commit(struct bpf_stream_stage *ss, struct bpf_prog *prog, + enum bpf_stream_id stream_id); + +#define bpf_stream_printk(...) 
bpf_stream_stage_printk(&__ss, __VA_ARGS__) + +#define bpf_stream_stage(prog, stream_id, expr) \ + ({ \ + struct bpf_stream_stage __ss; \ + bpf_stream_stage_init(&__ss); \ + (expr); \ + bpf_stream_stage_commit(&__ss, prog, stream_id); \ + bpf_stream_stage_free(&__ss); \ + }) #ifdef CONFIG_BPF_LSM void bpf_cgroup_atype_get(u32 attach_btf_id, int cgroup_atype); diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile index 70502f038b921..a89575822b60c 100644 --- a/kernel/bpf/Makefile +++ b/kernel/bpf/Makefile @@ -14,7 +14,7 @@ obj-$(CONFIG_BPF_SYSCALL) += bpf_local_storage.o bpf_task_storage.o obj-${CONFIG_BPF_LSM} += bpf_inode_storage.o obj-$(CONFIG_BPF_SYSCALL) += disasm.o mprog.o obj-$(CONFIG_BPF_JIT) += trampoline.o -obj-$(CONFIG_BPF_SYSCALL) += btf.o memalloc.o rqspinlock.o +obj-$(CONFIG_BPF_SYSCALL) += btf.o memalloc.o rqspinlock.o stream.o ifeq ($(CONFIG_MMU)$(CONFIG_64BIT),yy) obj-$(CONFIG_BPF_SYSCALL) += arena.o range_tree.o endif diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index a3e5716884211..22c278c008ce2 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -134,6 +134,10 @@ struct bpf_prog *bpf_prog_alloc_no_stats(unsigned int size, gfp_t gfp_extra_flag mutex_init(&fp->aux->ext_mutex); mutex_init(&fp->aux->dst_mutex); +#ifdef CONFIG_BPF_SYSCALL + bpf_prog_stream_init(fp); +#endif + return fp; } @@ -2861,6 +2865,7 @@ static void bpf_prog_free_deferred(struct work_struct *work) aux = container_of(work, struct bpf_prog_aux, work); #ifdef CONFIG_BPF_SYSCALL bpf_free_kfunc_btf_tab(aux->kfunc_btf_tab); + bpf_prog_stream_free(aux->prog); #endif #ifdef CONFIG_CGROUP_BPF if (aux->cgroup_atype != CGROUP_BPF_ATTACH_TYPE_INVALID) @@ -2877,6 +2882,13 @@ static void bpf_prog_free_deferred(struct work_struct *work) if (aux->dst_trampoline) bpf_trampoline_put(aux->dst_trampoline); for (i = 0; i < aux->real_func_cnt; i++) { +#ifdef CONFIG_BPF_SYSCALL + /* Ensure we don't push to subprog lists. */ + if (bpf_is_subprog(aux->func[i])) { + WARN_ON_ONCE(!llist_empty(&aux->func[i]->aux->stream[0].log)); + WARN_ON_ONCE(!llist_empty(&aux->func[i]->aux->stream[1].log)); + } +#endif /* We can just unlink the subprog poke descriptor table as * it was originally linked to the main program and is also * released along with it. diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 89ab3481378d7..98806368121ed 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -761,22 +761,13 @@ static int bpf_trace_copy_string(char *buf, void *unsafe_ptr, char fmt_ptype, return -EINVAL; } -/* Per-cpu temp buffers used by printf-like helpers to store the bprintf binary - * arguments representation. 
- */ -#define MAX_BPRINTF_BIN_ARGS 512 - /* Support executing three nested bprintf helper calls on a given CPU */ #define MAX_BPRINTF_NEST_LEVEL 3 -struct bpf_bprintf_buffers { - char bin_args[MAX_BPRINTF_BIN_ARGS]; - char buf[MAX_BPRINTF_BUF]; -}; static DEFINE_PER_CPU(struct bpf_bprintf_buffers[MAX_BPRINTF_NEST_LEVEL], bpf_bprintf_bufs); static DEFINE_PER_CPU(int, bpf_bprintf_nest_level); -static int try_get_buffers(struct bpf_bprintf_buffers **bufs) +int bpf_try_get_buffers(struct bpf_bprintf_buffers **bufs) { int nest_level; @@ -792,16 +783,21 @@ static int try_get_buffers(struct bpf_bprintf_buffers **bufs) return 0; } -void bpf_bprintf_cleanup(struct bpf_bprintf_data *data) +void bpf_put_buffers(void) { - if (!data->bin_args && !data->buf) - return; if (WARN_ON_ONCE(this_cpu_read(bpf_bprintf_nest_level) == 0)) return; this_cpu_dec(bpf_bprintf_nest_level); preempt_enable(); } +void bpf_bprintf_cleanup(struct bpf_bprintf_data *data) +{ + if (!data->bin_args && !data->buf) + return; + bpf_put_buffers(); +} + /* * bpf_bprintf_prepare - Generic pass on format strings for bprintf-like helpers * @@ -816,7 +812,7 @@ void bpf_bprintf_cleanup(struct bpf_bprintf_data *data) * In argument preparation mode, if 0 is returned, safe temporary buffers are * allocated and bpf_bprintf_cleanup should be called to free them after use. */ -int bpf_bprintf_prepare(char *fmt, u32 fmt_size, const u64 *raw_args, +int bpf_bprintf_prepare(const char *fmt, u32 fmt_size, const u64 *raw_args, u32 num_args, struct bpf_bprintf_data *data) { bool get_buffers = (data->get_bin_args && num_args) || data->get_buf; @@ -832,7 +828,7 @@ int bpf_bprintf_prepare(char *fmt, u32 fmt_size, const u64 *raw_args, return -EINVAL; fmt_size = fmt_end - fmt; - if (get_buffers && try_get_buffers(&buffers)) + if (get_buffers && bpf_try_get_buffers(&buffers)) return -EBUSY; if (data->get_bin_args) { diff --git a/kernel/bpf/stream.c b/kernel/bpf/stream.c new file mode 100644 index 0000000000000..a9151a8575ec7 --- /dev/null +++ b/kernel/bpf/stream.c @@ -0,0 +1,499 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* Copyright (c) 2025 Meta Platforms, Inc. and affiliates. */ + +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * Simple per-CPU NMI-safe bump allocation mechanism, backed by the NMI-safe + * try_alloc_pages()/free_pages_nolock() primitives. We allocate a page and + * stash it in a local per-CPU variable, and bump allocate from the page + * whenever items need to be printed to a stream. Each page holds a global + * atomic refcount in its first 4 bytes, and then records of variable length + * that describe the printed messages. Once the global refcount has dropped to + * zero, it is a signal to free the page back to the kernel's page allocator, + * given all the individual records in it have been consumed. + * + * It is possible the same page is used to serve allocations across different + * programs, which may be consumed at different times individually, hence + * maintaining a reference count per-page is critical for correct lifetime + * tracking. + * + * The bpf_stream_page code will be replaced to use kmalloc_nolock() once it + * lands. + */ +struct bpf_stream_page { + refcount_t ref; + u32 consumed; + char buf[]; +}; + +/* Available room to add data to a refcounted page. 
*/ +#define BPF_STREAM_PAGE_SZ (PAGE_SIZE - offsetofend(struct bpf_stream_page, consumed)) + +static DEFINE_PER_CPU(local_trylock_t, stream_local_lock) = INIT_LOCAL_TRYLOCK(stream_local_lock); +static DEFINE_PER_CPU(struct bpf_stream_page *, stream_pcpu_page); + +static bool bpf_stream_page_local_lock(unsigned long *flags) +{ + return local_trylock_irqsave(&stream_local_lock, *flags); +} + +static void bpf_stream_page_local_unlock(unsigned long *flags) +{ + local_unlock_irqrestore(&stream_local_lock, *flags); +} + +static void bpf_stream_page_free(struct bpf_stream_page *stream_page) +{ + struct page *p; + + if (!stream_page) + return; + p = virt_to_page(stream_page); + free_pages_nolock(p, 0); +} + +static void bpf_stream_page_get(struct bpf_stream_page *stream_page) +{ + refcount_inc(&stream_page->ref); +} + +static void bpf_stream_page_put(struct bpf_stream_page *stream_page) +{ + if (refcount_dec_and_test(&stream_page->ref)) + bpf_stream_page_free(stream_page); +} + +static void bpf_stream_page_init(struct bpf_stream_page *stream_page) +{ + refcount_set(&stream_page->ref, 1); + stream_page->consumed = 0; +} + +static struct bpf_stream_page *bpf_stream_page_replace(void) +{ + struct bpf_stream_page *stream_page, *old_stream_page; + struct page *page; + + page = __bpf_alloc_page(NUMA_NO_NODE); + if (!page) + return NULL; + stream_page = page_address(page); + bpf_stream_page_init(stream_page); + + old_stream_page = this_cpu_read(stream_pcpu_page); + if (old_stream_page) + bpf_stream_page_put(old_stream_page); + this_cpu_write(stream_pcpu_page, stream_page); + return stream_page; +} + +static int bpf_stream_page_check_room(struct bpf_stream_page *stream_page, int len) +{ + int min = offsetof(struct bpf_stream_elem, str[0]); + int consumed = stream_page->consumed; + int total = BPF_STREAM_PAGE_SZ; + int rem = max(0, total - consumed - min); + + /* Let's give room of at least 8 bytes. */ + WARN_ON_ONCE(rem % 8 != 0); + rem = rem < 8 ? 
0 : rem; + return min(len, rem); +} + +static void bpf_stream_elem_init(struct bpf_stream_elem *elem, int len) +{ + init_llist_node(&elem->node); + elem->mem_slice.ptr = elem->str; + elem->mem_slice.len = len; +} + +static struct bpf_stream_page *bpf_stream_page_from_elem(struct bpf_stream_elem *elem) +{ + unsigned long addr = (unsigned long)elem; + + return (struct bpf_stream_page *)PAGE_ALIGN_DOWN(addr); +} + +static struct bpf_stream_elem *bpf_stream_page_push_elem(struct bpf_stream_page *stream_page, int len) +{ + u32 consumed = stream_page->consumed; + + stream_page->consumed += round_up(offsetof(struct bpf_stream_elem, str[len]), 8); + return (struct bpf_stream_elem *)&stream_page->buf[consumed]; +} + +static noinline struct bpf_stream_elem *bpf_stream_page_reserve_elem(int len) +{ + struct bpf_stream_elem *elem = NULL; + struct bpf_stream_page *page; + int room = 0; + + page = this_cpu_read(stream_pcpu_page); + if (!page) + page = bpf_stream_page_replace(); + if (!page) + return NULL; + + room = bpf_stream_page_check_room(page, len); + if (room != len) + page = bpf_stream_page_replace(); + if (!page) + return NULL; + bpf_stream_page_get(page); + room = bpf_stream_page_check_room(page, len); + WARN_ON_ONCE(room != len); + + elem = bpf_stream_page_push_elem(page, room); + bpf_stream_elem_init(elem, room); + return elem; +} + +static struct bpf_stream_elem *bpf_stream_elem_alloc(int len) +{ + const int max_len = ARRAY_SIZE((struct bpf_bprintf_buffers){}.buf); + struct bpf_stream_elem *elem; + unsigned long flags; + + /* + * We may overflow, but we should never need more than one page size + * worth of memory. This can be lifted, but we'd need to adjust the + * other code to keep allocating more pages to overflow messages. + */ + BUILD_BUG_ON(max_len > BPF_STREAM_PAGE_SZ); + /* + * Length denotes the amount of data to be written as part of stream element, + * thus includes '\0' byte. We're capped by how much bpf_bprintf_buffers can + * accomodate, therefore deny allocations that won't fit into them. + */ + if (len < 0 || len > max_len) + return NULL; + + if (!bpf_stream_page_local_lock(&flags)) + return NULL; + elem = bpf_stream_page_reserve_elem(len); + bpf_stream_page_local_unlock(&flags); + return elem; +} + +__bpf_kfunc_start_defs(); + +static int __bpf_stream_push_str(struct llist_head *log, const char *str, int len) +{ + struct bpf_stream_elem *elem = NULL; + + /* + * Allocate a bpf_prog_stream_elem and push it to the bpf_prog_stream + * log, elements will be popped at once and reversed to print the log. 
+ */ + elem = bpf_stream_elem_alloc(len); + if (!elem) + return -ENOMEM; + + memcpy(elem->str, str, len); + llist_add(&elem->node, log); + + return 0; +} + +static int bpf_stream_consume_capacity(struct bpf_stream *stream, int len) +{ + if (atomic_read(&stream->capacity) >= BPF_STREAM_MAX_CAPACITY) + return -ENOSPC; + if (atomic_add_return(len, &stream->capacity) >= BPF_STREAM_MAX_CAPACITY) { + atomic_sub(len, &stream->capacity); + return -ENOSPC; + } + return 0; +} + +static void bpf_stream_release_capacity(struct bpf_stream *stream, struct bpf_stream_elem *elem) +{ + int len = elem->mem_slice.len; + + atomic_sub(len, &stream->capacity); +} + +static int bpf_stream_push_str(struct bpf_stream *stream, const char *str, int len) +{ + int ret = bpf_stream_consume_capacity(stream, len); + + return ret ?: __bpf_stream_push_str(&stream->log, str, len); +} + +__bpf_kfunc int bpf_stream_vprintk(struct bpf_stream *stream, const char *fmt__str, const void *args, u32 len__sz) +{ + struct bpf_bprintf_data data = { + .get_bin_args = true, + .get_buf = true, + }; + u32 fmt_size = strlen(fmt__str) + 1; + u32 data_len = len__sz; + int ret, num_args; + + if (data_len & 7 || data_len > MAX_BPRINTF_VARARGS * 8 || + (data_len && !args)) + return -EINVAL; + num_args = data_len / 8; + + ret = bpf_bprintf_prepare(fmt__str, fmt_size, args, num_args, &data); + if (ret < 0) + return ret; + + ret = bstr_printf(data.buf, MAX_BPRINTF_BUF, fmt__str, data.bin_args); + /* If the string was truncated, we only wrote until the size of buffer. */ + ret = min_t(u32, ret + 1, MAX_BPRINTF_BUF); + ret = bpf_stream_push_str(stream, data.buf, ret); + bpf_bprintf_cleanup(&data); + + return ret; +} + +__bpf_kfunc struct bpf_stream *bpf_stream_get(enum bpf_stream_id stream_id, void *aux__ign) +{ + struct bpf_prog_aux *aux = aux__ign; + + if (stream_id != BPF_STDOUT && stream_id != BPF_STDERR) + return NULL; + return &aux->stream[stream_id - 1]; +} + +__bpf_kfunc void bpf_stream_free_elem(struct bpf_stream_elem *elem) +{ + struct bpf_stream_page *p; + + p = bpf_stream_page_from_elem(elem); + bpf_stream_page_put(p); +} + +static void bpf_stream_free_list(struct llist_node *list) +{ + struct bpf_stream_elem *elem, *tmp; + + llist_for_each_entry_safe(elem, tmp, list, node) + bpf_stream_free_elem(elem); +} + +static struct llist_node *bpf_stream_backlog_pop(struct bpf_stream *stream) +{ + struct llist_node *node; + + node = stream->backlog_head; + if (stream->backlog_head == stream->backlog_tail) + stream->backlog_head = stream->backlog_tail = NULL; + else + stream->backlog_head = node->next; + return node; +} + +static struct llist_node *bpf_stream_log_pop(struct bpf_stream *stream) +{ + struct llist_node *node, *head, *tail; + unsigned long flags; + + if (llist_empty(&stream->log)) + return NULL; + tail = llist_del_all(&stream->log); + if (!tail) + return NULL; + head = llist_reverse_order(tail); + + if (raw_res_spin_lock_irqsave(&stream->lock, flags)) { + bpf_stream_free_list(head); + return NULL; + } + + if (!stream->backlog_head) { + stream->backlog_head = head; + stream->backlog_tail = tail; + } else { + stream->backlog_tail->next = head; + stream->backlog_tail = tail; + } + + node = bpf_stream_backlog_pop(stream); + raw_res_spin_unlock_irqrestore(&stream->lock, flags); + + return node; +} + +__bpf_kfunc struct bpf_stream_elem *bpf_stream_next_elem(struct bpf_stream *stream) +{ + struct bpf_stream_elem *elem = NULL; + struct llist_node *node; + unsigned long flags; + + if (raw_res_spin_lock_irqsave(&stream->lock, flags)) + return 
NULL; + node = bpf_stream_backlog_pop(stream); + if (!node) + goto unlock; +unlock: + raw_res_spin_unlock_irqrestore(&stream->lock, flags); + + if (node) + goto end; + + node = bpf_stream_log_pop(stream); + if (!node) + return NULL; +end: + elem = container_of(node, typeof(*elem), node); + bpf_stream_release_capacity(stream, elem); + return elem; +} + +__bpf_kfunc struct bpf_stream *bpf_prog_stream_get(enum bpf_stream_id stream_id, u32 prog_id) +{ + struct bpf_stream *stream; + struct bpf_prog *prog; + + prog = bpf_prog_by_id(prog_id); + if (IS_ERR_OR_NULL(prog)) + return NULL; + stream = bpf_stream_get(stream_id, prog->aux); + if (!stream) + bpf_prog_put(prog); + return stream; +} + +__bpf_kfunc void bpf_prog_stream_put(struct bpf_stream *stream) +{ + enum bpf_stream_id stream_id = stream->stream_id; + struct bpf_prog *prog; + + prog = container_of(stream, struct bpf_prog_aux, stream[stream_id - 1])->prog; + bpf_prog_put(prog); +} + +__bpf_kfunc_end_defs(); + +BTF_KFUNCS_START(stream_kfunc_set) +BTF_ID_FLAGS(func, bpf_stream_get, KF_RET_NULL) +BTF_ID_FLAGS(func, bpf_stream_vprintk, KF_TRUSTED_ARGS) +BTF_ID_FLAGS(func, bpf_stream_next_elem, KF_ACQUIRE | KF_RET_NULL | KF_TRUSTED_ARGS) +BTF_ID_FLAGS(func, bpf_stream_free_elem, KF_RELEASE) +BTF_KFUNCS_END(stream_kfunc_set) + +BTF_KFUNCS_START(stream_syscall_kfunc_set) +BTF_ID_FLAGS(func, bpf_prog_stream_get, KF_ACQUIRE | KF_RET_NULL) +BTF_ID_FLAGS(func, bpf_prog_stream_put, KF_RELEASE) +BTF_KFUNCS_END(stream_syscall_kfunc_set) + +static const struct btf_kfunc_id_set bpf_stream_kfunc_set = { + .owner = THIS_MODULE, + .set = &stream_kfunc_set, +}; + +static const struct btf_kfunc_id_set bpf_stream_syscall_kfunc_set = { + .owner = THIS_MODULE, + .set = &stream_syscall_kfunc_set, +}; + +static int __init bpf_stream_kfunc_init(void) +{ + int ret; + + ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_UNSPEC, &bpf_stream_kfunc_set); + if (ret) + return ret; + return register_btf_kfunc_id_set(BPF_PROG_TYPE_SYSCALL, &bpf_stream_syscall_kfunc_set); +} +late_initcall(bpf_stream_kfunc_init); + +void bpf_prog_stream_init(struct bpf_prog *prog) +{ + int i; + + prog->aux->stream[0].stream_id = BPF_STDOUT; + prog->aux->stream[1].stream_id = BPF_STDERR; + + for (i = 0; i < ARRAY_SIZE(prog->aux->stream); i++) { + atomic_set(&prog->aux->stream[i].capacity, 0); + init_llist_head(&prog->aux->stream[i].log); + raw_res_spin_lock_init(&prog->aux->stream[i].lock); + prog->aux->stream[i].backlog_head = NULL; + prog->aux->stream[i].backlog_tail = NULL; + } +} + +void bpf_prog_stream_free(struct bpf_prog *prog) +{ + struct llist_node *list; + int i; + + for (i = 0; i < ARRAY_SIZE(prog->aux->stream); i++) { + list = llist_del_all(&prog->aux->stream[i].log); + bpf_stream_free_list(list); + bpf_stream_free_list(prog->aux->stream[i].backlog_head); + } +} + +void bpf_stream_stage_init(struct bpf_stream_stage *ss) +{ + init_llist_head(&ss->log); + ss->len = 0; +} + +void bpf_stream_stage_free(struct bpf_stream_stage *ss) +{ + struct llist_node *node; + + node = llist_del_all(&ss->log); + bpf_stream_free_list(node); +} + +int bpf_stream_stage_printk(struct bpf_stream_stage *ss, const char *fmt, ...) +{ + struct bpf_bprintf_buffers *buf; + va_list args; + int ret; + + if (bpf_try_get_buffers(&buf)) + return -EBUSY; + + va_start(args, fmt); + ret = vsnprintf(buf->buf, ARRAY_SIZE(buf->buf), fmt, args); + va_end(args); + /* If the string was truncated, we only wrote until the size of buffer. 
*/ + ret = min_t(u32, ret + 1, ARRAY_SIZE(buf->buf)); + ss->len += ret; + ret = __bpf_stream_push_str(&ss->log, buf->buf, ret); + bpf_put_buffers(); + return ret; +} + +int bpf_stream_stage_commit(struct bpf_stream_stage *ss, struct bpf_prog *prog, + enum bpf_stream_id stream_id) +{ + struct llist_node *list, *head, *tail; + struct bpf_stream *stream; + int ret; + + stream = bpf_stream_get(stream_id, prog->aux); + if (!stream) + return -EINVAL; + + ret = bpf_stream_consume_capacity(stream, ss->len); + if (ret) + return ret; + + list = llist_del_all(&ss->log); + head = list; + + if (!list) + return 0; + while (llist_next(list)) { + tail = llist_next(list); + list = tail; + } + llist_add_batch(head, tail, &stream->log); + return 0; +} diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index df33d19c5c3b3..60778be870e3f 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -576,7 +576,7 @@ static bool can_alloc_pages(void) !IS_ENABLED(CONFIG_PREEMPT_RT); } -static struct page *__bpf_alloc_page(int nid) +struct page *__bpf_alloc_page(int nid) { if (!can_alloc_pages()) return try_alloc_pages(nid, 0); diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index ff34e68c92374..aba0b38733bcd 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -12117,6 +12117,7 @@ enum special_kfunc_type { KF_bpf_res_spin_lock_irqsave, KF_bpf_res_spin_unlock_irqrestore, KF_bpf_dynptr_from_mem_slice, + KF_bpf_stream_get, }; BTF_SET_START(special_kfunc_set) @@ -12221,6 +12222,7 @@ BTF_ID(func, bpf_res_spin_unlock) BTF_ID(func, bpf_res_spin_lock_irqsave) BTF_ID(func, bpf_res_spin_unlock_irqrestore) BTF_ID(func, bpf_dynptr_from_mem_slice) +BTF_ID(func, bpf_stream_get) static bool is_kfunc_ret_null(struct bpf_kfunc_call_arg_meta *meta) { @@ -13886,10 +13888,11 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, regs[BPF_REG_0].type = PTR_TO_BTF_ID; regs[BPF_REG_0].btf_id = ptr_type_id; - if (meta.func_id == special_kfunc_list[KF_bpf_get_kmem_cache]) + if (meta.func_id == special_kfunc_list[KF_bpf_get_kmem_cache]) { regs[BPF_REG_0].type |= PTR_UNTRUSTED; - - if (is_iter_next_kfunc(&meta)) { + } else if (meta.func_id == special_kfunc_list[KF_bpf_stream_get]) { + regs[BPF_REG_0].type |= PTR_TRUSTED; + } else if (is_iter_next_kfunc(&meta)) { struct bpf_reg_state *cur_iter; cur_iter = get_iter_from_state(env->cur_state, &meta); @@ -21521,8 +21524,10 @@ static int fixup_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, desc->func_id == special_kfunc_list[KF_bpf_rdonly_cast]) { insn_buf[0] = BPF_MOV64_REG(BPF_REG_0, BPF_REG_1); *cnt = 1; - } else if (is_bpf_wq_set_callback_impl_kfunc(desc->func_id)) { - struct bpf_insn ld_addrs[2] = { BPF_LD_IMM64(BPF_REG_4, (long)env->prog->aux) }; + } else if (is_bpf_wq_set_callback_impl_kfunc(desc->func_id) || + desc->func_id == special_kfunc_list[KF_bpf_stream_get]) { + u32 regno = is_bpf_wq_set_callback_impl_kfunc(desc->func_id) ? BPF_REG_4 : BPF_REG_2; + struct bpf_insn ld_addrs[2] = { BPF_LD_IMM64(regno, (long)env->prog->aux) }; insn_buf[0] = ld_addrs[0]; insn_buf[1] = ld_addrs[1]; From f308ad0e18ffa686b3bd2eab42d7f75551c5f112 Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Wed, 7 May 2025 10:17:12 -0700 Subject: [PATCH 03/11] bpf: Add function to extract program source info Prepare a function for use in future patches that can extract the file info, line info, and the source line number for a given BPF program provided it's program counter. 
Only the basename of the file path is provided, given it can be excessively long in some cases. This will be used in later patches to print source info to the BPF stream. The source line number is indicated by the return value, and the file and line info are provided through out parameters. Signed-off-by: Kumar Kartikeya Dwivedi --- include/linux/bpf.h | 2 ++ kernel/bpf/core.c | 40 ++++++++++++++++++++++++++++++++++++++++ 2 files changed, 42 insertions(+) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 2c10ae62df2d4..f12a0bf536c05 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -3644,4 +3644,6 @@ static inline bool bpf_is_subprog(const struct bpf_prog *prog) return prog->aux->func_idx != 0; } +int bpf_prog_get_file_line(struct bpf_prog *prog, unsigned long ip, const char **filep, const char **linep); + #endif /* _LINUX_BPF_H */ diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 22c278c008ce2..df1bae084abdd 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -3204,3 +3204,43 @@ EXPORT_SYMBOL(bpf_stats_enabled_key); EXPORT_TRACEPOINT_SYMBOL_GPL(xdp_exception); EXPORT_TRACEPOINT_SYMBOL_GPL(xdp_bulk_tx); + +int bpf_prog_get_file_line(struct bpf_prog *prog, unsigned long ip, const char **filep, const char **linep) +{ + int idx = -1, insn_start, insn_end, len; + struct bpf_line_info *linfo; + void **jited_linfo; + struct btf *btf; + + btf = prog->aux->btf; + linfo = prog->aux->linfo; + jited_linfo = prog->aux->jited_linfo; + + if (!btf || !linfo || !prog->aux->jited_linfo) + return -EINVAL; + len = prog->aux->func ? prog->aux->func[prog->aux->func_idx]->len : prog->len; + + linfo = &prog->aux->linfo[prog->aux->linfo_idx]; + jited_linfo = &prog->aux->jited_linfo[prog->aux->linfo_idx]; + + insn_start = linfo[0].insn_off; + insn_end = insn_start + len; + + for (int i = 0; linfo[i].insn_off >= insn_start && linfo[i].insn_off < insn_end; i++) { + if (jited_linfo[i] >= (void *)ip) + break; + idx = i; + } + + if (idx == -1) + return -ENOENT; + + /* Get base component of the file path. */ + *filep = btf_name_by_offset(btf, linfo[idx].file_name_off); + *filep = kbasename(*filep); + /* Obtain the source line, and strip whitespace in prefix. */ + *linep = btf_name_by_offset(btf, linfo[idx].line_off); + while (isspace(**linep)) + *linep += 1; + return BPF_LINE_INFO_LINE_NUM(linfo[idx].line_col); +} From 7139d8cf152f479bb83c6f3060cf2672889a31c6 Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Wed, 7 May 2025 10:17:13 -0700 Subject: [PATCH 04/11] bpf: Add function to find program from stack trace In preparation of figuring out the closest program that led to the current point in the kernel, implement a function that scans through the stack trace and finds out the closest BPF program when walking down the stack trace. Special care needs to be taken to skip over kernel and BPF subprog frames. We basically scan until we find a BPF main prog frame. The assumption is that if a program calls into us transitively, we'll hit it along the way. If not, we end up returning NULL. Contextually the function will be used in places where we know the program may have called into us. Due to reliance on arch_bpf_stack_walk(), this function only works on x86 with CONFIG_UNWINDER_ORC, arm64, and s390. Remove the warning from arch_bpf_stack_walk as well since we call it outside bpf_throw() context. 
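To sketch the intended usage (the stream staging and stack dump helpers come from surrounding patches in this series, and the error text mirrors the may_goto report added later):

	struct bpf_prog *prog = bpf_prog_find_from_stack();

	if (!prog)
		return;	/* no main BPF program frame on this stack */
	bpf_stream_stage(prog, BPF_STDERR, ({
		bpf_stream_printk("ERROR: Timeout detected for may_goto instruction\n");
		bpf_stream_dump_stack();
	}));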
Signed-off-by: Kumar Kartikeya Dwivedi Acked-by: Eduard Zingerman --- arch/x86/net/bpf_jit_comp.c | 1 - include/linux/bpf.h | 1 + kernel/bpf/core.c | 26 ++++++++++++++++++++++++++ 3 files changed, 27 insertions(+), 1 deletion(-) diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index 9e5fe2ba858f0..17693ee6bb1a6 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -3791,7 +3791,6 @@ void arch_bpf_stack_walk(bool (*consume_fn)(void *cookie, u64 ip, u64 sp, u64 bp } return; #endif - WARN(1, "verification of programs using bpf_throw should have failed\n"); } void bpf_arch_poke_desc_update(struct bpf_jit_poke_descriptor *poke, diff --git a/include/linux/bpf.h b/include/linux/bpf.h index f12a0bf536c05..b57d8a1a77581 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -3645,5 +3645,6 @@ static inline bool bpf_is_subprog(const struct bpf_prog *prog) } int bpf_prog_get_file_line(struct bpf_prog *prog, unsigned long ip, const char **filep, const char **linep); +struct bpf_prog *bpf_prog_find_from_stack(void); #endif /* _LINUX_BPF_H */ diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index df1bae084abdd..dcb665bff22fc 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -3244,3 +3244,29 @@ int bpf_prog_get_file_line(struct bpf_prog *prog, unsigned long ip, const char * *linep += 1; return BPF_LINE_INFO_LINE_NUM(linfo[idx].line_col); } + +struct walk_stack_ctx { + struct bpf_prog *prog; +}; + +static bool find_from_stack_cb(void *cookie, u64 ip, u64 sp, u64 bp) +{ + struct walk_stack_ctx *ctxp = cookie; + struct bpf_prog *prog; + + if (!is_bpf_text_address(ip)) + return true; + prog = bpf_prog_ksym_find(ip); + if (bpf_is_subprog(prog)) + return true; + ctxp->prog = prog; + return false; +} + +struct bpf_prog *bpf_prog_find_from_stack(void) +{ + struct walk_stack_ctx ctx = {}; + + arch_bpf_stack_walk(find_from_stack_cb, &ctx); + return ctx.prog; +} From eab484ea91ce74b9d7943e0f1b1b268a5c9bc224 Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Wed, 7 May 2025 10:17:14 -0700 Subject: [PATCH 05/11] bpf: Add dump_stack() analogue to print to BPF stderr Introduce a kernel function which is the analogue of dump_stack() printing some useful information and the stack trace. This is not exposed to BPF programs yet, but can be made available in the future. When we have a program counter for a BPF program in the stack trace, also additionally output the filename and line number to make the trace helpful. The rest of the trace can be passed into ./decode_stacktrace.sh to obtain the line numbers for kernel symbols. Signed-off-by: Kumar Kartikeya Dwivedi --- include/linux/bpf.h | 2 ++ kernel/bpf/stream.c | 42 ++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 44 insertions(+) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index b57d8a1a77581..46ce05aad0ede 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -3598,8 +3598,10 @@ __printf(2, 3) int bpf_stream_stage_printk(struct bpf_stream_stage *ss, const char *fmt, ...); int bpf_stream_stage_commit(struct bpf_stream_stage *ss, struct bpf_prog *prog, enum bpf_stream_id stream_id); +int bpf_stream_stage_dump_stack(struct bpf_stream_stage *ss); #define bpf_stream_printk(...) 
bpf_stream_stage_printk(&__ss, __VA_ARGS__) +#define bpf_stream_dump_stack() bpf_stream_stage_dump_stack(&__ss) #define bpf_stream_stage(prog, stream_id, expr) \ ({ \ diff --git a/kernel/bpf/stream.c b/kernel/bpf/stream.c index a9151a8575ec7..a921fb1de3199 100644 --- a/kernel/bpf/stream.c +++ b/kernel/bpf/stream.c @@ -2,6 +2,7 @@ /* Copyright (c) 2025 Meta Platforms, Inc. and affiliates. */ #include +#include #include #include #include @@ -497,3 +498,44 @@ int bpf_stream_stage_commit(struct bpf_stream_stage *ss, struct bpf_prog *prog, llist_add_batch(head, tail, &stream->log); return 0; } + +struct dump_stack_ctx { + struct bpf_stream_stage *ss; + int err; +}; + +static bool dump_stack_cb(void *cookie, u64 ip, u64 sp, u64 bp) +{ + struct dump_stack_ctx *ctxp = cookie; + const char *file = "", *line = ""; + struct bpf_prog *prog; + int num; + + if (is_bpf_text_address(ip)) { + prog = bpf_prog_ksym_find(ip); + num = bpf_prog_get_file_line(prog, ip, &file, &line); + if (num == -1) + goto end; + ctxp->err = bpf_stream_stage_printk(ctxp->ss, "%pS\n %s @ %s:%d\n", + (void *)ip, line, file, num); + return !ctxp->err; + } +end: + ctxp->err = bpf_stream_stage_printk(ctxp->ss, "%pS\n", (void *)ip); + return !ctxp->err; +} + +int bpf_stream_stage_dump_stack(struct bpf_stream_stage *ss) +{ + struct dump_stack_ctx ctx = { .ss = ss }; + int ret; + + ret = bpf_stream_stage_printk(ss, "CPU: %d UID: %d PID: %d Comm: %s\n", + raw_smp_processor_id(), __kuid_val(current_real_cred()->euid), + current->pid, current->comm); + ret = ret ?: bpf_stream_stage_printk(ss, "Call trace:\n"); + if (!ret) + arch_bpf_stack_walk(dump_stack_cb, &ctx); + ret = ret ?: ctx.err; + return ret ?: bpf_stream_stage_printk(ss, "\n"); +} From 8b0b643b07e7c17fb5d697ac9e8abbd89f3884c3 Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Wed, 7 May 2025 10:17:15 -0700 Subject: [PATCH 06/11] bpf: Report may_goto timeout to BPF stderr Begin reporting may_goto timeouts to BPF program's stderr stream. Make sure that we don't end up spamming too many errors if the program keeps failing repeatedly and filling up the stream, hence emit at most 512 error messages from the kernel for a given stream. Signed-off-by: Kumar Kartikeya Dwivedi --- include/linux/bpf.h | 21 ++++++++++++++------- kernel/bpf/core.c | 17 ++++++++++++++++- kernel/bpf/stream.c | 5 +++++ 3 files changed, 35 insertions(+), 8 deletions(-) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 46ce05aad0ede..daf95333be789 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1667,6 +1667,7 @@ struct bpf_prog_aux { struct rcu_head rcu; }; struct bpf_stream stream[2]; + atomic_t stream_error_cnt; }; struct bpf_prog { @@ -3589,6 +3590,8 @@ void bpf_bprintf_cleanup(struct bpf_bprintf_data *data); int bpf_try_get_buffers(struct bpf_bprintf_buffers **bufs); void bpf_put_buffers(void); +#define BPF_PROG_STREAM_ERROR_CNT 512 + void bpf_prog_stream_init(struct bpf_prog *prog); void bpf_prog_stream_free(struct bpf_prog *prog); @@ -3600,16 +3603,20 @@ int bpf_stream_stage_commit(struct bpf_stream_stage *ss, struct bpf_prog *prog, enum bpf_stream_id stream_id); int bpf_stream_stage_dump_stack(struct bpf_stream_stage *ss); +bool bpf_prog_stream_error_limit(struct bpf_prog *prog); + #define bpf_stream_printk(...) 
bpf_stream_stage_printk(&__ss, __VA_ARGS__) #define bpf_stream_dump_stack() bpf_stream_stage_dump_stack(&__ss) -#define bpf_stream_stage(prog, stream_id, expr) \ - ({ \ - struct bpf_stream_stage __ss; \ - bpf_stream_stage_init(&__ss); \ - (expr); \ - bpf_stream_stage_commit(&__ss, prog, stream_id); \ - bpf_stream_stage_free(&__ss); \ +#define bpf_stream_stage(prog, stream_id, expr) \ + ({ \ + struct bpf_stream_stage __ss; \ + if (!bpf_prog_stream_error_limit(prog)) { \ + bpf_stream_stage_init(&__ss); \ + (expr); \ + bpf_stream_stage_commit(&__ss, prog, stream_id); \ + bpf_stream_stage_free(&__ss); \ + } \ }) #ifdef CONFIG_BPF_LSM diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index dcb665bff22fc..d21c304fe829f 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -3156,6 +3156,19 @@ u64 __weak arch_bpf_timed_may_goto(void) return 0; } +static noinline void bpf_prog_report_may_goto_violation(void) +{ + struct bpf_prog *prog; + + prog = bpf_prog_find_from_stack(); + if (!prog) + return; + bpf_stream_stage(prog, BPF_STDERR, ({ + bpf_stream_printk("ERROR: Timeout detected for may_goto instruction\n"); + bpf_stream_dump_stack(); + })); +} + u64 bpf_check_timed_may_goto(struct bpf_timed_may_goto *p) { u64 time = ktime_get_mono_fast_ns(); @@ -3166,8 +3179,10 @@ u64 bpf_check_timed_may_goto(struct bpf_timed_may_goto *p) return BPF_MAX_TIMED_LOOPS; } /* Check if we've exhausted our time slice, and zero count. */ - if (time - p->timestamp >= (NSEC_PER_SEC / 4)) + if (unlikely(time - p->timestamp >= (NSEC_PER_SEC / 4))) { + bpf_prog_report_may_goto_violation(); return 0; + } /* Refresh the count for the stack frame. */ return BPF_MAX_TIMED_LOOPS; } diff --git a/kernel/bpf/stream.c b/kernel/bpf/stream.c index a921fb1de3199..eaf0574866b19 100644 --- a/kernel/bpf/stream.c +++ b/kernel/bpf/stream.c @@ -539,3 +539,8 @@ int bpf_stream_stage_dump_stack(struct bpf_stream_stage *ss) ret = ret ?: ctx.err; return ret ?: bpf_stream_stage_printk(ss, "\n"); } + +bool bpf_prog_stream_error_limit(struct bpf_prog *prog) +{ + return atomic_fetch_add(1, &prog->aux->stream_error_cnt) >= BPF_PROG_STREAM_ERROR_CNT; +} From 677a643a1e66861aedaa6ad102d6f0582f9e0df0 Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Wed, 7 May 2025 10:17:16 -0700 Subject: [PATCH 07/11] bpf: Report rqspinlock deadlocks/timeout to BPF stderr Begin reporting rqspinlock deadlocks and timeout to BPF program's stderr. Signed-off-by: Kumar Kartikeya Dwivedi --- kernel/bpf/rqspinlock.c | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/kernel/bpf/rqspinlock.c b/kernel/bpf/rqspinlock.c index 338305c8852cf..888c8e2f90615 100644 --- a/kernel/bpf/rqspinlock.c +++ b/kernel/bpf/rqspinlock.c @@ -666,6 +666,26 @@ EXPORT_SYMBOL_GPL(resilient_queued_spin_lock_slowpath); __bpf_kfunc_start_defs(); +static void bpf_prog_report_rqspinlock_violation(const char *str, void *lock, bool irqsave) +{ + struct rqspinlock_held *rqh = this_cpu_ptr(&rqspinlock_held_locks); + struct bpf_prog *prog; + + prog = bpf_prog_find_from_stack(); + if (!prog) + return; + bpf_stream_stage(prog, BPF_STDERR, ({ + bpf_stream_printk("ERROR: %s for bpf_res_spin_lock%s\n", str, irqsave ? "_irqsave" : ""); + bpf_stream_printk("Attempted lock = 0x%px\n", lock); + bpf_stream_printk("Total held locks = %d\n", rqh->cnt); + for (int i = 0; i < min(RES_NR_HELD, rqh->cnt); i++) + bpf_stream_printk("Held lock[%2d] = 0x%px\n", i, rqh->locks[i]); + bpf_stream_dump_stack(); + })); +} + +#define REPORT_STR(ret) ({ (ret) == -ETIMEDOUT ? 
"Timeout detected" : "AA or ABBA deadlock detected"; }) + __bpf_kfunc int bpf_res_spin_lock(struct bpf_res_spin_lock *lock) { int ret; @@ -676,6 +696,7 @@ __bpf_kfunc int bpf_res_spin_lock(struct bpf_res_spin_lock *lock) preempt_disable(); ret = res_spin_lock((rqspinlock_t *)lock); if (unlikely(ret)) { + bpf_prog_report_rqspinlock_violation(REPORT_STR(ret), lock, false); preempt_enable(); return ret; } @@ -698,6 +719,7 @@ __bpf_kfunc int bpf_res_spin_lock_irqsave(struct bpf_res_spin_lock *lock, unsign local_irq_save(flags); ret = res_spin_lock((rqspinlock_t *)lock); if (unlikely(ret)) { + bpf_prog_report_rqspinlock_violation(REPORT_STR(ret), lock, true); local_irq_restore(flags); preempt_enable(); return ret; From ca484f1a7daf10128991700823c29b7626f13555 Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Wed, 7 May 2025 10:17:17 -0700 Subject: [PATCH 08/11] bpf: Report arena faults to BPF stderr Begin reporting arena page faults and the faulting address to BPF program's stderr, for now limited to x86, but arm64 support should be easy to add. Signed-off-by: Kumar Kartikeya Dwivedi --- arch/x86/net/bpf_jit_comp.c | 21 ++++++++++++++++++--- include/linux/bpf.h | 1 + kernel/bpf/arena.c | 14 ++++++++++++++ 3 files changed, 33 insertions(+), 3 deletions(-) diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index 17693ee6bb1a6..dbb0feeec7015 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -1384,15 +1384,27 @@ static int emit_atomic_ld_st_index(u8 **pprog, u32 atomic_op, u32 size, } #define DONT_CLEAR 1 +#define ARENA_FAULT (1 << 8) bool ex_handler_bpf(const struct exception_table_entry *x, struct pt_regs *regs) { - u32 reg = x->fixup >> 8; + u32 arena_reg = (x->fixup >> 8) & 0xff; + bool is_arena = !!arena_reg; + u32 reg = x->fixup >> 16; + unsigned long addr; + + /* Read here, if src_reg is dst_reg for load, we'll write 0 to it. */ + if (is_arena) + addr = *(unsigned long *)((void *)regs + arena_reg); /* jump over faulting load and clear dest register */ if (reg != DONT_CLEAR) *(unsigned long *)((void *)regs + reg) = 0; regs->ip += x->fixup & 0xff; + + if (is_arena) + bpf_prog_report_arena_violation(reg == DONT_CLEAR, addr); + return true; } @@ -2043,7 +2055,10 @@ st: if (is_imm8(insn->off)) ex->data = EX_TYPE_BPF; ex->fixup = (prog - start_of_ldx) | - ((BPF_CLASS(insn->code) == BPF_LDX ? reg2pt_regs[dst_reg] : DONT_CLEAR) << 8); + ((BPF_CLASS(insn->code) == BPF_LDX ? reg2pt_regs[dst_reg] : DONT_CLEAR) << 16) + | ((BPF_CLASS(insn->code) == BPF_LDX ? reg2pt_regs[src_reg] : reg2pt_regs[dst_reg])<< 8); + /* Ensure src_reg offset fits in 1 byte. */ + BUILD_BUG_ON(sizeof(struct pt_regs) > U8_MAX); } break; @@ -2161,7 +2176,7 @@ st: if (is_imm8(insn->off)) * End result: x86 insn "mov rbx, qword ptr [rax+0x14]" * of 4 bytes will be ignored and rbx will be zero inited. */ - ex->fixup = (prog - start_of_ldx) | (reg2pt_regs[dst_reg] << 8); + ex->fixup = (prog - start_of_ldx) | (reg2pt_regs[dst_reg] << 16); } break; diff --git a/include/linux/bpf.h b/include/linux/bpf.h index daf95333be789..9e086ca160288 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -3604,6 +3604,7 @@ int bpf_stream_stage_commit(struct bpf_stream_stage *ss, struct bpf_prog *prog, int bpf_stream_stage_dump_stack(struct bpf_stream_stage *ss); bool bpf_prog_stream_error_limit(struct bpf_prog *prog); +void bpf_prog_report_arena_violation(bool write, unsigned long addr); #define bpf_stream_printk(...) 
bpf_stream_stage_printk(&__ss, __VA_ARGS__) #define bpf_stream_dump_stack() bpf_stream_stage_dump_stack(&__ss) diff --git a/kernel/bpf/arena.c b/kernel/bpf/arena.c index 0d56cea716022..d4baa98de7d81 100644 --- a/kernel/bpf/arena.c +++ b/kernel/bpf/arena.c @@ -590,3 +590,17 @@ static int __init kfunc_init(void) return register_btf_kfunc_id_set(BPF_PROG_TYPE_UNSPEC, &common_kfunc_set); } late_initcall(kfunc_init); + +void bpf_prog_report_arena_violation(bool write, unsigned long addr) +{ + struct bpf_prog *prog; + + prog = bpf_prog_find_from_stack(); + if (!prog) + return; + bpf_stream_stage(prog, BPF_STDERR, ({ + bpf_stream_printk("ERROR: Arena %s access at unmapped address 0x%lx\n", + write ? "WRITE" : "READ", addr); + bpf_stream_dump_stack(); + })); +} From 5e72b7821e7b7355ce251b6697bf81726c95382a Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Wed, 7 May 2025 10:17:18 -0700 Subject: [PATCH 09/11] libbpf: Add bpf_stream_printk() macro Introduce a new macro that allows printing data similar to bpf_printk(), but to BPF streams. The first argument is the stream ID, the rest of the arguments are same as what one would pass to bpf_printk(). Signed-off-by: Kumar Kartikeya Dwivedi --- kernel/bpf/stream.c | 10 +++++++-- tools/lib/bpf/bpf_helpers.h | 44 +++++++++++++++++++++++++++++++------ 2 files changed, 45 insertions(+), 9 deletions(-) diff --git a/kernel/bpf/stream.c b/kernel/bpf/stream.c index eaf0574866b19..d64975486ad12 100644 --- a/kernel/bpf/stream.c +++ b/kernel/bpf/stream.c @@ -257,7 +257,12 @@ __bpf_kfunc int bpf_stream_vprintk(struct bpf_stream *stream, const char *fmt__s return ret; } -__bpf_kfunc struct bpf_stream *bpf_stream_get(enum bpf_stream_id stream_id, void *aux__ign) +/* Use int vs enum stream_id here, we use this kfunc in bpf_helpers.h, and + * keeping enum stream_id necessitates a complete definition of enum, but we + * can't copy it in the header as it may conflict with the definition in + * vmlinux.h. + */ +__bpf_kfunc struct bpf_stream *bpf_stream_get(int stream_id, void *aux__ign) { struct bpf_prog_aux *aux = aux__ign; @@ -351,7 +356,8 @@ __bpf_kfunc struct bpf_stream_elem *bpf_stream_next_elem(struct bpf_stream *stre return elem; } -__bpf_kfunc struct bpf_stream *bpf_prog_stream_get(enum bpf_stream_id stream_id, u32 prog_id) +/* Use int vs enum bpf_stream_id for consistency with bpf_stream_get. */ +__bpf_kfunc struct bpf_stream *bpf_prog_stream_get(int stream_id, u32 prog_id) { struct bpf_stream *stream; struct bpf_prog *prog; diff --git a/tools/lib/bpf/bpf_helpers.h b/tools/lib/bpf/bpf_helpers.h index a50773d4616e6..1a748c21e358a 100644 --- a/tools/lib/bpf/bpf_helpers.h +++ b/tools/lib/bpf/bpf_helpers.h @@ -314,17 +314,47 @@ enum libbpf_tristate { ___param, sizeof(___param)); \ }) +struct bpf_stream; + +extern struct bpf_stream *bpf_stream_get(int stream_id, void *aux__ign) __weak __ksym; +extern int bpf_stream_vprintk(struct bpf_stream *stream, const char *fmt__str, const void *args, + __u32 len__sz) __weak __ksym; + +#define __bpf_stream_vprintk(stream, fmt, args...) 
\ +({ \ + static const char ___fmt[] = fmt; \ + unsigned long long ___param[___bpf_narg(args)]; \ + \ + _Pragma("GCC diagnostic push") \ + _Pragma("GCC diagnostic ignored \"-Wint-conversion\"") \ + ___bpf_fill(___param, args); \ + _Pragma("GCC diagnostic pop") \ + \ + int ___id = stream; \ + struct bpf_stream *___sptr = bpf_stream_get(___id, NULL); \ + if (___sptr) \ + bpf_stream_vprintk(___sptr, ___fmt, ___param, sizeof(___param));\ +}) + /* Use __bpf_printk when bpf_printk call has 3 or fewer fmt args - * Otherwise use __bpf_vprintk + * Otherwise use __bpf_vprintk. Virtualize choices so stream printk + * can override it to bpf_stream_vprintk. */ -#define ___bpf_pick_printk(...) \ - ___bpf_nth(_, ##__VA_ARGS__, __bpf_vprintk, __bpf_vprintk, __bpf_vprintk, \ - __bpf_vprintk, __bpf_vprintk, __bpf_vprintk, __bpf_vprintk, \ - __bpf_vprintk, __bpf_vprintk, __bpf_printk /*3*/, __bpf_printk /*2*/,\ - __bpf_printk /*1*/, __bpf_printk /*0*/) +#define ___bpf_pick_printk(choice, choice_3, ...) \ + ___bpf_nth(_, ##__VA_ARGS__, choice, choice, choice, \ + choice, choice, choice, choice, \ + choice, choice, choice_3 /*3*/, choice_3 /*2*/, \ + choice_3 /*1*/, choice_3 /*0*/) /* Helper macro to print out debug messages */ -#define bpf_printk(fmt, args...) ___bpf_pick_printk(args)(fmt, ##args) +#define __bpf_trace_printk(fmt, args...) \ + ___bpf_pick_printk(__bpf_vprintk, __bpf_printk, args)(fmt, ##args) +#define __bpf_stream_printk(stream, fmt, args...) \ + ___bpf_pick_printk(__bpf_stream_vprintk, __bpf_stream_vprintk, args)(stream, fmt, ##args) + +#define bpf_stream_printk(stream, fmt, args...) __bpf_stream_printk(stream, fmt, ##args) + +#define bpf_printk(arg, args...) __bpf_trace_printk(arg, ##args) struct bpf_iter_num; From 8e9f9af981eb7268bc5ad4d2e0df198d77ce7eea Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Wed, 7 May 2025 10:17:19 -0700 Subject: [PATCH 10/11] bpftool: Add support for dumping streams Add bpftool support for dumping streams of a given BPF program. The syntax is `bpftool prog tracelog { stdout | stderr } PROG`. The stdout is dumped to stdout, stderr is dumped to stderr. Cc: Quentin Monnet Signed-off-by: Kumar Kartikeya Dwivedi --- .../bpftool/Documentation/bpftool-prog.rst | 6 ++ tools/bpf/bpftool/Makefile | 2 +- tools/bpf/bpftool/bash-completion/bpftool | 16 +++- tools/bpf/bpftool/prog.c | 88 ++++++++++++++++++- tools/bpf/bpftool/skeleton/stream.bpf.c | 69 +++++++++++++++ 5 files changed, 178 insertions(+), 3 deletions(-) create mode 100644 tools/bpf/bpftool/skeleton/stream.bpf.c diff --git a/tools/bpf/bpftool/Documentation/bpftool-prog.rst b/tools/bpf/bpftool/Documentation/bpftool-prog.rst index d6304e01afe00..258e16ee8def5 100644 --- a/tools/bpf/bpftool/Documentation/bpftool-prog.rst +++ b/tools/bpf/bpftool/Documentation/bpftool-prog.rst @@ -173,6 +173,12 @@ bpftool prog tracelog purposes. For streaming data from BPF programs to user space, one can use perf events (see also **bpftool-map**\ (8)). +bpftool prog tracelog { stdout | stderr } *PROG* + Dump the BPF stream of the program. BPF programs can write to these streams + at runtime with the **bpf_stream_vprintk**\ () kfunc. The kernel may write + error messages to the standard error stream. This facility should be used + only for debugging purposes. 
From 8e9f9af981eb7268bc5ad4d2e0df198d77ce7eea Mon Sep 17 00:00:00 2001
From: Kumar Kartikeya Dwivedi
Date: Wed, 7 May 2025 10:17:19 -0700
Subject: [PATCH 10/11] bpftool: Add support for dumping streams

Add bpftool support for dumping streams of a given BPF program. The
syntax is `bpftool prog tracelog { stdout | stderr } PROG`. The
program's BPF_STDOUT stream is dumped to bpftool's stdout, and its
BPF_STDERR stream to bpftool's stderr.

Cc: Quentin Monnet
Signed-off-by: Kumar Kartikeya Dwivedi
---
 .../bpftool/Documentation/bpftool-prog.rst    |  6 ++
 tools/bpf/bpftool/Makefile                    |  2 +-
 tools/bpf/bpftool/bash-completion/bpftool     | 16 +++-
 tools/bpf/bpftool/prog.c                      | 88 ++++++++++++++++++-
 tools/bpf/bpftool/skeleton/stream.bpf.c       | 69 +++++++++++++++
 5 files changed, 178 insertions(+), 3 deletions(-)
 create mode 100644 tools/bpf/bpftool/skeleton/stream.bpf.c

diff --git a/tools/bpf/bpftool/Documentation/bpftool-prog.rst b/tools/bpf/bpftool/Documentation/bpftool-prog.rst
index d6304e01afe00..258e16ee8def5 100644
--- a/tools/bpf/bpftool/Documentation/bpftool-prog.rst
+++ b/tools/bpf/bpftool/Documentation/bpftool-prog.rst
@@ -173,6 +173,12 @@ bpftool prog tracelog
     purposes. For streaming data from BPF programs to user space, one can use
     perf events (see also **bpftool-map**\ (8)).
 
+bpftool prog tracelog { stdout | stderr } *PROG*
+    Dump the given BPF stream of the program. BPF programs can write to these
+    streams at runtime with the **bpf_stream_vprintk**\ () kfunc. The kernel
+    may also write error messages to the standard error stream. This facility
+    should be used only for debugging purposes.
+
 bpftool prog run *PROG* data_in *FILE* [data_out *FILE* [data_size_out *L*]] [ctx_in *FILE* [ctx_out *FILE* [ctx_size_out *M*]]] [repeat *N*]
     Run BPF program *PROG* in the kernel testing infrastructure for BPF,
     meaning that the program works on the data and context provided by the
diff --git a/tools/bpf/bpftool/Makefile b/tools/bpf/bpftool/Makefile
index 9e9a5f006cd2a..eb908223c3bb0 100644
--- a/tools/bpf/bpftool/Makefile
+++ b/tools/bpf/bpftool/Makefile
@@ -234,7 +234,7 @@ $(OUTPUT)%.bpf.o: skeleton/%.bpf.c $(OUTPUT)vmlinux.h $(LIBBPF_BOOTSTRAP)
 $(OUTPUT)%.skel.h: $(OUTPUT)%.bpf.o $(BPFTOOL_BOOTSTRAP)
 	$(QUIET_GEN)$(BPFTOOL_BOOTSTRAP) gen skeleton $< > $@
 
-$(OUTPUT)prog.o: $(OUTPUT)profiler.skel.h
+$(OUTPUT)prog.o: $(OUTPUT)profiler.skel.h $(OUTPUT)stream.skel.h
 
 $(OUTPUT)pids.o: $(OUTPUT)pid_iter.skel.h
diff --git a/tools/bpf/bpftool/bash-completion/bpftool b/tools/bpf/bpftool/bash-completion/bpftool
index 1ce409a6cbd91..c7c0bf3aee249 100644
--- a/tools/bpf/bpftool/bash-completion/bpftool
+++ b/tools/bpf/bpftool/bash-completion/bpftool
@@ -518,7 +518,21 @@ _bpftool()
             esac
             ;;
         tracelog)
-            return 0
+            case $prev in
+                $command)
+                    COMPREPLY+=( $( compgen -W "stdout stderr" -- \
+                        "$cur" ) )
+                    return 0
+                    ;;
+                stdout|stderr)
+                    COMPREPLY=( $( compgen -W "$PROG_TYPE" -- \
+                        "$cur" ) )
+                    return 0
+                    ;;
+                *)
+                    return 0
+                    ;;
+            esac
             ;;
         profile)
             case $cword in
diff --git a/tools/bpf/bpftool/prog.c b/tools/bpf/bpftool/prog.c
index f010295350be5..7abe4698c86cd 100644
--- a/tools/bpf/bpftool/prog.c
+++ b/tools/bpf/bpftool/prog.c
@@ -35,6 +35,8 @@
 #include "main.h"
 #include "xlated_dumper.h"
 
+#include "stream.skel.h"
+
 #define BPF_METADATA_PREFIX "bpf_metadata_"
 #define BPF_METADATA_PREFIX_LEN (sizeof(BPF_METADATA_PREFIX) - 1)
 
@@ -697,6 +699,15 @@ static int do_show(int argc, char **argv)
 	return err;
 }
 
+static int process_stream_sample(void *ctx, void *data, size_t len)
+{
+	FILE *file = ctx;
+
+	fprintf(file, "%s", (char *)data);
+	fflush(file);
+	return 0;
+}
+
 static int prog_dump(struct bpf_prog_info *info, enum dump_mode mode,
 		     char *filepath, bool opcodes, bool visual, bool linum)
 {
@@ -1113,6 +1124,80 @@ static int do_detach(int argc, char **argv)
 	return 0;
 }
 
+enum prog_tracelog_mode {
+	TRACE_STDOUT,
+	TRACE_STDERR,
+};
+
+static int
+prog_tracelog_stream(struct bpf_prog_info *info, enum prog_tracelog_mode mode)
+{
+	FILE *file = mode == TRACE_STDOUT ? stdout : stderr;
+	LIBBPF_OPTS(bpf_test_run_opts, opts);
+	struct ring_buffer *ringbuf;
+	struct stream_bpf *skel;
+	int map_fd, ret = -1;
+	__u32 prog_id = info->id;
+	__u32 stream_id = mode == TRACE_STDOUT ? 1 : 2; /* BPF_STDOUT : BPF_STDERR */
+
+	skel = stream_bpf__open_and_load();
+	if (!skel)
+		return -errno;
+	skel->bss->prog_id = prog_id;
+	skel->bss->stream_id = stream_id;
+
+	map_fd = bpf_map__fd(skel->maps.ringbuf);
+	ringbuf = ring_buffer__new(map_fd, process_stream_sample, file, NULL);
+	if (!ringbuf) {
+		ret = -errno;
+		goto end;
+	}
+	do {
+		skel->bss->written_count = skel->bss->written_size = 0;
+		ret = bpf_prog_test_run_opts(bpf_program__fd(skel->progs.bpftool_dump_prog_stream), &opts);
+		if (ring_buffer__consume_n(ringbuf, skel->bss->written_count) != skel->bss->written_count) {
+			ret = -EINVAL;
+			goto end;
+		}
+	} while (!ret && opts.retval == EAGAIN);
+
+	if (opts.retval != 0)
+		ret = -EINVAL;
+end:
+	stream_bpf__destroy(skel);
+	return ret;
+}
+
+
+static int do_tracelog_any(int argc, char **argv)
+{
+	enum prog_tracelog_mode mode;
+	struct bpf_prog_info info = {};
+	__u32 info_len = sizeof(info);
+	int fd, err;
+
+	if (argc == 0)
+		return do_tracelog(argc, argv);
+	if (!is_prefix(*argv, "stdout") && !is_prefix(*argv, "stderr"))
+		usage();
+	mode = is_prefix(*argv, "stdout") ? TRACE_STDOUT : TRACE_STDERR;
+	NEXT_ARG();
+
+	if (!REQ_ARGS(2))
+		return -1;
+
+	fd = prog_parse_fd(&argc, &argv);
+	if (fd < 0)
+		return -1;
+
+	err = bpf_prog_get_info_by_fd(fd, &info, &info_len);
+	if (err < 0)
+		return -1;
+
+	return prog_tracelog_stream(&info, mode);
+}
+
 static int check_single_stdin(char *file_data_in, char *file_ctx_in)
 {
 	if (file_data_in && file_ctx_in &&
@@ -2483,6 +2568,7 @@ static int do_help(int argc, char **argv)
 		"                         [repeat N]\n"
 		"       %1$s %2$s profile PROG [duration DURATION] METRICs\n"
 		"       %1$s %2$s tracelog\n"
+		"       %1$s %2$s tracelog { stdout | stderr } PROG\n"
 		"       %1$s %2$s help\n"
 		"\n"
 		"       " HELP_SPEC_MAP "\n"
@@ -2522,7 +2608,7 @@ static const struct cmd cmds[] = {
 	{ "loadall",	do_loadall },
 	{ "attach",	do_attach },
 	{ "detach",	do_detach },
-	{ "tracelog",	do_tracelog },
+	{ "tracelog",	do_tracelog_any },
 	{ "run",	do_run },
 	{ "profile",	do_profile },
 	{ 0 }
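
The loop above encodes a small protocol between bpftool and the dump program: each test_run invocation emits one batch of records and reports how many were written, and a retval of EAGAIN means the ring buffer filled up before the stream was drained. Condensed into a standalone helper (a sketch: drain_stream and its parameter names are hypothetical, the libbpf calls are real):

#include <errno.h>
#include <bpf/bpf.h>
#include <bpf/libbpf.h>

/* Drive a "syscall" dumper program until the stream is drained.
 * dump_fd: fd of the BPF program; rb: the ring buffer it writes to;
 * count: pointer to its memory-mapped written_count global.
 */
static int drain_stream(int dump_fd, struct ring_buffer *rb, int *count)
{
	LIBBPF_OPTS(bpf_test_run_opts, opts);
	int err;

	do {
		*count = 0;
		err = bpf_prog_test_run_opts(dump_fd, &opts);
		if (err)
			return err;
		/* consume exactly the reported batch; don't block on empty */
		if (ring_buffer__consume_n(rb, *count) != *count)
			return -EINVAL;
	} while (opts.retval == EAGAIN); /* ring buffer was full, go again */

	return opts.retval ? -EINVAL : 0;
}
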
diff --git a/tools/bpf/bpftool/skeleton/stream.bpf.c b/tools/bpf/bpftool/skeleton/stream.bpf.c
new file mode 100644
index 0000000000000..9103159591448
--- /dev/null
+++ b/tools/bpf/bpftool/skeleton/stream.bpf.c
@@ -0,0 +1,69 @@
+// SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+/* Copyright (c) 2025 Meta Platforms, Inc. and affiliates. */
+#include <vmlinux.h>
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_tracing.h>
+
+struct {
+	__uint(type, BPF_MAP_TYPE_RINGBUF);
+	__uint(max_entries, 1024 * 1024);
+} ringbuf SEC(".maps");
+
+int written_size;
+int written_count;
+int stream_id;
+int prog_id;
+
+#define ENOENT 2
+#define EAGAIN 11
+#define EFAULT 14
+
+SEC("syscall")
+int bpftool_dump_prog_stream(void *ctx)
+{
+	struct bpf_stream_elem *elem;
+	struct bpf_stream *stream;
+	bool cont = false;
+	int ret = 0;
+
+	stream = bpf_prog_stream_get(stream_id, prog_id);
+	if (!stream)
+		return ENOENT;
+
+	bpf_repeat(BPF_MAX_LOOPS) {
+		struct bpf_dynptr dst_dptr, src_dptr;
+		int size;
+
+		elem = bpf_stream_next_elem(stream);
+		if (!elem)
+			break;
+		size = elem->mem_slice.len;
+
+		if (bpf_dynptr_from_mem_slice(&elem->mem_slice, 0, &src_dptr))
+			ret = EFAULT;
+		if (bpf_ringbuf_reserve_dynptr(&ringbuf, size, 0, &dst_dptr))
+			ret = EFAULT;
+		if (bpf_dynptr_copy(&dst_dptr, 0, &src_dptr, 0, size))
+			ret = EFAULT;
+		bpf_ringbuf_submit_dynptr(&dst_dptr, 0);
+
+		written_count++;
+		written_size += size;
+
+		bpf_stream_free_elem(elem);
+
+		/* Probe and exit if no more space; probe for twice the typical size. */
+		if (bpf_ringbuf_reserve_dynptr(&ringbuf, 2048, 0, &dst_dptr))
+			cont = true;
+		bpf_ringbuf_discard_dynptr(&dst_dptr, 0);
+
+		if (ret || cont)
+			break;
+	}
+
+	bpf_prog_stream_put(stream);
+
+	return ret ? ret : (cont ? EAGAIN : 0);
+}
+
+char _license[] SEC("license") = "Dual BSD/GPL";
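
One detail worth calling out from the skeleton above is the space probe: reserving and immediately discarding a dynptr record is a side-effect-free way to check for remaining ring buffer capacity, since discarding a failed (null) reservation is a no-op. As an isolated sketch (ringbuf_has_room is a hypothetical name; the bpf_ringbuf_*_dynptr helpers are the real ones used above):

/* Returns true if the ring buffer can still hold a record of sz bytes. */
static __always_inline bool ringbuf_has_room(void *ringbuf, __u32 sz)
{
	struct bpf_dynptr ptr;
	bool room;

	room = !bpf_ringbuf_reserve_dynptr(ringbuf, sz, 0, &ptr);
	/* the dynptr must always be released; a no-op if reservation failed */
	bpf_ringbuf_discard_dynptr(&ptr, 0);
	return room;
}
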
From 48c389162810712096fb115ff08e142bb67b2b0b Mon Sep 17 00:00:00 2001
From: Kumar Kartikeya Dwivedi
Date: Wed, 7 May 2025 10:17:20 -0700
Subject: [PATCH 11/11] selftests/bpf: Add tests for prog streams

Add selftests to stress test the various facets of the stream API and
its memory allocation pattern, and to ensure the dumping support is
functional. Create a symlink to bpftool's stream.bpf.c and use it to
test dumping stream messages into a ring buffer in user space,
verifying the output.

Signed-off-by: Kumar Kartikeya Dwivedi
---
 .../testing/selftests/bpf/prog_tests/stream.c |  95 +++++++++++++
 tools/testing/selftests/bpf/progs/stream.c    | 127 ++++++++++++++++++
 .../selftests/bpf/progs/stream_bpftool.c      |   1 +
 .../testing/selftests/bpf/progs/stream_fail.c |  90 +++++++++++++
 4 files changed, 313 insertions(+)
 create mode 100644 tools/testing/selftests/bpf/prog_tests/stream.c
 create mode 100644 tools/testing/selftests/bpf/progs/stream.c
 create mode 120000 tools/testing/selftests/bpf/progs/stream_bpftool.c
 create mode 100644 tools/testing/selftests/bpf/progs/stream_fail.c

diff --git a/tools/testing/selftests/bpf/prog_tests/stream.c b/tools/testing/selftests/bpf/prog_tests/stream.c
new file mode 100644
index 0000000000000..7b97b783ff1f3
--- /dev/null
+++ b/tools/testing/selftests/bpf/prog_tests/stream.c
@@ -0,0 +1,95 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2025 Meta Platforms, Inc. and affiliates. */
+#include <test_progs.h>
+#include <string.h>
+
+#include "stream.skel.h"
+#include "stream_fail.skel.h"
+
+#include "stream_bpftool.skel.h"
+
+void test_stream_failure(void)
+{
+	RUN_TESTS(stream_fail);
+}
+
+void test_stream_success(void)
+{
+	RUN_TESTS(stream);
+	RUN_TESTS(stream_bpftool);
+	return;
+}
+
+typedef int (*sample_cb_t)(void *, void *, size_t);
+
+static void stream_ringbuf_output(int prog_id, sample_cb_t sample_cb)
+{
+	LIBBPF_OPTS(bpf_test_run_opts, opts);
+	struct ring_buffer *ringbuf;
+	struct stream_bpftool *skel;
+	int fd, ret;
+
+	skel = stream_bpftool__open_and_load();
+	if (!ASSERT_OK_PTR(skel, "stream_bpftool_open_and_load"))
+		return;
+
+	fd = bpf_map__fd(skel->maps.ringbuf);
+
+	ringbuf = ring_buffer__new(fd, sample_cb, NULL, NULL);
+	if (!ASSERT_OK_PTR(ringbuf, "ringbuf_new"))
+		goto end;
+
+	skel->bss->prog_id = prog_id;
+	skel->bss->stream_id = 1; /* BPF_STDOUT */
+	do {
+		skel->bss->written_count = skel->bss->written_size = 0;
+		ret = bpf_prog_test_run_opts(bpf_program__fd(skel->progs.bpftool_dump_prog_stream), &opts);
+		if (ret)
+			break;
+		ret = ring_buffer__consume_n(ringbuf, skel->bss->written_count);
+		if (!ASSERT_EQ(ret, skel->bss->written_count, "consume"))
+			break;
+		ret = 0;
+	} while (opts.retval == EAGAIN);
+
+	ASSERT_OK(ret, "ret");
+	ASSERT_EQ(opts.retval, 0, "retval");
+
+end:
+	stream_bpftool__destroy(skel);
+}
+
+static int cnt = 0;
+
+static int process_sample(void *ctx, void *data, size_t len)
+{
+	char buf[64];
+
+	snprintf(buf, sizeof(buf), "num=%d\n", cnt++);
+	ASSERT_TRUE(strcmp(buf, (char *)data) == 0, "sample strcmp");
+	return 0;
+}
+
+void test_stream_output(void)
+{
+	LIBBPF_OPTS(bpf_test_run_opts, opts);
+	struct bpf_prog_info info = {};
+	__u32 info_len = sizeof(info);
+	struct stream *skel;
+	int ret;
+
+	skel = stream__open_and_load();
+	if (!ASSERT_OK_PTR(skel, "stream__open_and_load"))
+		return;
+
+	ASSERT_OK(bpf_prog_get_info_by_fd(bpf_program__fd(skel->progs.stream_test_output), &info, &info_len), "get info");
+	ret = bpf_prog_test_run_opts(bpf_program__fd(skel->progs.stream_test_output), &opts);
+	ASSERT_OK(ret, "ret");
+	ASSERT_OK(opts.retval, "retval");
+	stream_ringbuf_output(info.id, process_sample);
+
+	ASSERT_EQ(cnt, 1000, "cnt");
+
+	stream__destroy(skel);
+	return;
+}
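
The stream_exercise() routine in the program below stresses the producer and consumer halves of the kfunc API in tandem. Reduced to a single element, the lifecycle it exercises looks like this (a sketch, assuming the kfunc declarations and BPF_STDOUT are in scope as in the selftest programs):

#include <vmlinux.h>
#include <bpf/bpf_helpers.h>

SEC("syscall")
int stream_roundtrip(void *ctx)
{
	struct bpf_stream_elem *elem;
	struct bpf_stream *stream;

	stream = bpf_stream_get(BPF_STDOUT, NULL);
	if (!stream)
		return 1;
	/* each successful vprintk queues one element on the stream */
	if (bpf_stream_vprintk(stream, "ping\n", NULL, 0) < 0)
		return 2;
	/* pop it back off; every popped element must be freed */
	elem = bpf_stream_next_elem(stream);
	if (!elem)
		return 3;
	bpf_stream_free_elem(elem);
	return 0;
}

char _license[] SEC("license") = "GPL";
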
diff --git a/tools/testing/selftests/bpf/progs/stream.c b/tools/testing/selftests/bpf/progs/stream.c
new file mode 100644
index 0000000000000..14cb8690824fa
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/stream.c
@@ -0,0 +1,127 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2025 Meta Platforms, Inc. and affiliates. */
+#include <vmlinux.h>
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_tracing.h>
+#include "bpf_misc.h"
+
+#define _STR "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"
+
+#define STREAM_STR (u64)(_STR _STR _STR _STR)
+
+static __noinline int stream_exercise(int id, int N)
+{
+	struct bpf_stream_elem *elem, *earr[56] = {};
+	struct bpf_stream *stream;
+	int ret;
+	u32 i;
+
+	if (N > 56)
+		return 56;
+
+	stream = bpf_stream_get(id, NULL);
+	if (!stream)
+		return 1;
+	for (i = 0; i < N; i++)
+		if ((ret = bpf_stream_vprintk(stream, "%llu%s", &(u64[]){i, STREAM_STR}, 16)) < 0) {
+			bpf_printk("bpf_stream_vprintk ret=%d", ret);
+			return 2;
+		}
+	ret = 0;
+	for (i = 0; i < N; i++) {
+		elem = bpf_stream_next_elem(stream);
+		if (!elem) {
+			ret = 4;
+			break;
+		}
+		earr[i] = elem;
+	}
+	elem = bpf_stream_next_elem(stream);
+	if (elem) {
+		bpf_stream_free_elem(elem);
+		ret = 5;
+	}
+	for (i = 0; i < N; i++)
+		if (earr[i])
+			bpf_stream_free_elem(earr[i]);
+	return ret;
+}
+
+static __noinline int stream_exercise_nums(int id)
+{
+	int ret = 0;
+
+	ret = ret ?: stream_exercise(id, 56);
+	ret = ret ?: stream_exercise(id, 42);
+	ret = ret ?: stream_exercise(id, 28);
+	ret = ret ?: stream_exercise(id, 10);
+	ret = ret ?: stream_exercise(id, 1);
+
+	return ret;
+}
+
+SEC("syscall")
+__success __retval(0)
+int stream_test(void *ctx)
+{
+	unsigned long flags;
+	int ret = 0;
+
+	bpf_local_irq_save(&flags);
+	bpf_repeat(50) {
+		ret = stream_exercise_nums(BPF_STDOUT);
+		if (ret)
+			break;
+	}
+	if (ret) {
+		bpf_local_irq_restore(&flags);
+		return ret;
+	}
+	bpf_repeat(100) {
+		ret = stream_exercise_nums(BPF_STDERR);
+		if (ret)
+			break;
+	}
+	bpf_local_irq_restore(&flags);
+
+	if (ret)
+		return ret;
+
+	ret = stream_exercise_nums(BPF_STDOUT);
+	if (ret)
+		return ret;
+	return stream_exercise_nums(BPF_STDERR);
+}
+
+SEC("syscall")
+__success __retval(0)
+int stream_test_output(void *ctx)
+{
+	for (int i = 0; i < 1000; i++)
+		bpf_stream_printk(BPF_STDOUT, "num=%d\n", i);
+	return 0;
+}
+
+SEC("syscall")
+__success __retval(0)
+int stream_test_limit(void *ctx)
+{
+	struct bpf_stream *stream;
+	bool failed = false;
+
+	stream = bpf_stream_get(BPF_STDOUT, NULL);
+	if (!stream)
+		return 2;
+
+	bpf_repeat(BPF_MAX_LOOPS) {
+		failed = bpf_stream_vprintk(stream, "%s%s", &(u64[]){STREAM_STR, STREAM_STR}, 16) != 0;
+		if (failed)
+			break;
+	}
+
+	if (failed)
+		return 0;
+	return 1;
+}
+
+char _license[] SEC("license") = "GPL";
diff --git a/tools/testing/selftests/bpf/progs/stream_bpftool.c b/tools/testing/selftests/bpf/progs/stream_bpftool.c
new file mode 120000
index 0000000000000..5904c0d92edc5
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/stream_bpftool.c
@@ -0,0 +1 @@
+../../../../bpf/bpftool/skeleton/stream.bpf.c
\ No newline at end of file
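
The negative tests that follow pin down the verifier contract: stream pointers must be NULL-checked and trusted, and pointers from bpf_prog_stream_get() are additionally reference-counted. For contrast, a well-formed use that the verifier should accept looks like this (a sketch; prog_id 0 presumably just fails the lookup at runtime):

#include <vmlinux.h>
#include <bpf/bpf_helpers.h>

SEC("syscall")
int stream_get_put_ok(void *ctx)
{
	struct bpf_stream *stream;

	stream = bpf_prog_stream_get(BPF_STDOUT, 0);
	if (!stream)
		return 0;
	/* referenced pointer: must be released before the program exits */
	bpf_prog_stream_put(stream);
	return 0;
}

char _license[] SEC("license") = "GPL";
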
diff --git a/tools/testing/selftests/bpf/progs/stream_fail.c b/tools/testing/selftests/bpf/progs/stream_fail.c
new file mode 100644
index 0000000000000..50f70b9878b8b
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/stream_fail.c
@@ -0,0 +1,90 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2025 Meta Platforms, Inc. and affiliates. */
+#include <vmlinux.h>
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_tracing.h>
+#include <bpf/bpf_core_read.h>
+#include "bpf_misc.h"
+
+SEC("syscall")
+__failure __msg("R1 type=trusted_ptr_or_null_ expected=")
+int stream_get_trusted(void *ctx) {
+	struct bpf_stream *stream;
+
+	stream = bpf_stream_get(BPF_STDOUT, NULL);
+	bpf_this_cpu_ptr(stream);
+	return 0;
+}
+
+SEC("tc")
+__failure __msg("calling kernel function bpf_prog_stream_get is not allowed")
+int stream_get_prog_fail(void *ctx) {
+	struct bpf_stream *stream;
+
+	stream = bpf_prog_stream_get(BPF_STDOUT, 0);
+	if (!stream)
+		return 0;
+	bpf_this_cpu_ptr(stream);
+	return 0;
+}
+
+SEC("syscall")
+__failure __msg("R1 type=ptr_or_null_ expected=")
+int stream_get_prog_trusted(void *ctx) {
+	struct bpf_stream *stream;
+
+	stream = bpf_prog_stream_get(BPF_STDOUT, 0);
+	bpf_this_cpu_ptr(stream);
+	return 0;
+}
+
+SEC("syscall")
+__failure __msg("Unreleased reference")
+int stream_get_put_missing(void *ctx) {
+	struct bpf_stream *stream;
+
+	stream = bpf_prog_stream_get(BPF_STDOUT, 0);
+	if (!stream)
+		return 0;
+	return 0;
+}
+
+SEC("syscall")
+__failure __msg("R1 must be referenced or trusted")
+int stream_next_untrusted_arg(void *ctx)
+{
+	struct bpf_stream *stream;
+
+	stream = bpf_core_cast((void *)0xdeadbeef, typeof(*stream));
+	bpf_stream_next_elem(stream);
+	return 0;
+}
+
+SEC("syscall")
+__failure __msg("Possibly NULL pointer passed")
+int stream_next_null_arg(void *ctx)
+{
+	bpf_stream_next_elem(NULL);
+	return 0;
+}
+
+SEC("syscall")
+__failure __msg("R1 must be referenced or trusted")
+int stream_vprintk_untrusted_arg(void *ctx)
+{
+	struct bpf_stream *stream;
+
+	stream = bpf_core_cast((void *)0xfaceb00c, typeof(*stream));
+	bpf_stream_vprintk(stream, "", NULL, 0);
+	return 0;
+}
+
+SEC("syscall")
+__failure __msg("Possibly NULL pointer passed")
+int stream_vprintk_null_arg(void *ctx)
+{
+	bpf_stream_vprintk(NULL, "", NULL, 0);
+	return 0;
+}
+
+char _license[] SEC("license") = "GPL";
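
Finally, a note on the fd-to-id lookup used by do_tracelog_any() in patch 10: struct bpf_prog_info contains input fields (buffer pointers and lengths) that the kernel reads on the get_info path, so the structure must be zeroed before the call or the kernel may attempt to write through garbage pointers. A standalone sketch (prog_fd_to_id is a hypothetical helper; bpf_prog_get_info_by_fd is the real libbpf API):

#include <bpf/bpf.h>

static int prog_fd_to_id(int fd, __u32 *id)
{
	struct bpf_prog_info info = {};	/* zero the input fields */
	__u32 len = sizeof(info);
	int err;

	err = bpf_prog_get_info_by_fd(fd, &info, &len);
	if (err)
		return err;
	*id = info.id;
	return 0;
}
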