diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c
index 9e5fe2ba858f0..dbb0feeec7015 100644
--- a/arch/x86/net/bpf_jit_comp.c
+++ b/arch/x86/net/bpf_jit_comp.c
@@ -1384,15 +1384,27 @@ static int emit_atomic_ld_st_index(u8 **pprog, u32 atomic_op, u32 size,
 }
 
 #define DONT_CLEAR 1
+#define ARENA_FAULT (1 << 8)
 
 bool ex_handler_bpf(const struct exception_table_entry *x, struct pt_regs *regs)
 {
-	u32 reg = x->fixup >> 8;
+	u32 arena_reg = (x->fixup >> 8) & 0xff;
+	bool is_arena = !!arena_reg;
+	u32 reg = x->fixup >> 16;
+	unsigned long addr;
+
+	/* Read here, if src_reg is dst_reg for load, we'll write 0 to it. */
+	if (is_arena)
+		addr = *(unsigned long *)((void *)regs + arena_reg);
 
 	/* jump over faulting load and clear dest register */
 	if (reg != DONT_CLEAR)
 		*(unsigned long *)((void *)regs + reg) = 0;
 	regs->ip += x->fixup & 0xff;
+
+	if (is_arena)
+		bpf_prog_report_arena_violation(reg == DONT_CLEAR, addr);
+
 	return true;
 }
 
@@ -2043,7 +2055,10 @@ st:			if (is_imm8(insn->off))
 				ex->data = EX_TYPE_BPF;
 
 				ex->fixup = (prog - start_of_ldx) |
-					((BPF_CLASS(insn->code) == BPF_LDX ? reg2pt_regs[dst_reg] : DONT_CLEAR) << 8);
+					((BPF_CLASS(insn->code) == BPF_LDX ? reg2pt_regs[dst_reg] : DONT_CLEAR) << 16)
+					| ((BPF_CLASS(insn->code) == BPF_LDX ? reg2pt_regs[src_reg] : reg2pt_regs[dst_reg]) << 8);
+				/* Ensure src_reg offset fits in 1 byte. */
+				BUILD_BUG_ON(sizeof(struct pt_regs) > U8_MAX);
 			}
 			break;
 
@@ -2161,7 +2176,7 @@ st:			if (is_imm8(insn->off))
 			 * End result: x86 insn "mov rbx, qword ptr [rax+0x14]"
 			 * of 4 bytes will be ignored and rbx will be zero inited.
 			 */
-			ex->fixup = (prog - start_of_ldx) | (reg2pt_regs[dst_reg] << 8);
+			ex->fixup = (prog - start_of_ldx) | (reg2pt_regs[dst_reg] << 16);
 		}
 		break;
 
@@ -3791,7 +3806,6 @@ void arch_bpf_stack_walk(bool (*consume_fn)(void *cookie, u64 ip, u64 sp, u64 bp
 	}
 	return;
 #endif
-	WARN(1, "verification of programs using bpf_throw should have failed\n");
 }
 
 void bpf_arch_poke_desc_update(struct bpf_jit_poke_descriptor *poke,
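A note on the exception-table encoding above: ex->fixup now packs three fields
instead of two. A minimal decoding sketch (helper names are illustrative and
not part of the patch; the BUILD_BUG_ON() above guarantees pt_regs offsets fit
in a byte, and the handler treats a zero arena field as "not an arena access"):

/* Illustrative decode of the new ex->fixup layout (not part of the patch). */
static inline u32 fixup_skip_bytes(u32 fixup) { return fixup & 0xff; }        /* insn bytes to jump over */
static inline u32 fixup_arena_off(u32 fixup)  { return (fixup >> 8) & 0xff; } /* pt_regs offset of the arena address register */
static inline u32 fixup_clear_off(u32 fixup)  { return fixup >> 16; }         /* pt_regs offset to zero, or DONT_CLEAR */
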
diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 3f0cc89c0622c..9e086ca160288 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -1344,6 +1344,12 @@ enum bpf_dynptr_type {
 	BPF_DYNPTR_TYPE_XDP,
 };
 
+struct bpf_mem_slice {
+	void *ptr;
+	u32 len;
+	u32 reserved;
+};
+
 int bpf_dynptr_check_size(u32 size);
 u32 __bpf_dynptr_size(const struct bpf_dynptr_kern *ptr);
 const void *__bpf_dynptr_data(const struct bpf_dynptr_kern *ptr, u32 len);
@@ -1518,6 +1524,40 @@ struct btf_mod_pair {
 
 struct bpf_kfunc_desc_tab;
 
+enum bpf_stream_id {
+	BPF_STDOUT = 1,
+	BPF_STDERR = 2,
+};
+
+struct bpf_stream_elem {
+	struct llist_node node;
+	struct bpf_mem_slice mem_slice;
+	char str[];
+};
+
+struct bpf_stream_elem_batch {
+	struct llist_node *node;
+};
+
+enum {
+	BPF_STREAM_MAX_CAPACITY = (4 * 1024U * 1024U),
+};
+
+struct bpf_stream {
+	enum bpf_stream_id stream_id;
+	atomic_t capacity;
+	struct llist_head log;
+
+	rqspinlock_t lock;
+	struct llist_node *backlog_head;
+	struct llist_node *backlog_tail;
+};
+
+struct bpf_stream_stage {
+	struct llist_head log;
+	int len;
+};
+
 struct bpf_prog_aux {
 	atomic64_t refcnt;
 	u32 used_map_cnt;
@@ -1626,6 +1666,8 @@ struct bpf_prog_aux {
 		struct work_struct work;
 		struct rcu_head rcu;
 	};
+	struct bpf_stream stream[2];
+	atomic_t stream_error_cnt;
 };
 
 struct bpf_prog {
@@ -2385,6 +2427,8 @@ int generic_map_delete_batch(struct bpf_map *map,
 
 struct bpf_map *bpf_map_get_curr_or_next(u32 *id);
 struct bpf_prog *bpf_prog_get_curr_or_next(u32 *id);
+
+struct page *__bpf_alloc_page(int nid);
 int bpf_map_alloc_pages(const struct bpf_map *map, int nid,
			unsigned long nr_pages, struct page **page_array);
 
 #ifdef CONFIG_MEMCG
@@ -3523,6 +3567,16 @@ bool btf_id_set_contains(const struct btf_id_set *set, u32 id);
 #define MAX_BPRINTF_VARARGS	12
 #define MAX_BPRINTF_BUF		1024
 
+/* Per-cpu temp buffers used by printf-like helpers to store the bprintf binary
+ * arguments representation.
+ */
+#define MAX_BPRINTF_BIN_ARGS	512
+
+struct bpf_bprintf_buffers {
+	char bin_args[MAX_BPRINTF_BIN_ARGS];
+	char buf[MAX_BPRINTF_BUF];
+};
+
 struct bpf_bprintf_data {
 	u32 *bin_args;
 	char *buf;
@@ -3530,9 +3584,41 @@ struct bpf_bprintf_data {
 	bool get_buf;
 };
 
-int bpf_bprintf_prepare(char *fmt, u32 fmt_size, const u64 *raw_args,
+int bpf_bprintf_prepare(const char *fmt, u32 fmt_size, const u64 *raw_args,
 			u32 num_args, struct bpf_bprintf_data *data);
 void bpf_bprintf_cleanup(struct bpf_bprintf_data *data);
+int bpf_try_get_buffers(struct bpf_bprintf_buffers **bufs);
+void bpf_put_buffers(void);
+
+#define BPF_PROG_STREAM_ERROR_CNT 512
+
+void bpf_prog_stream_init(struct bpf_prog *prog);
+void bpf_prog_stream_free(struct bpf_prog *prog);
+
+void bpf_stream_stage_init(struct bpf_stream_stage *ss);
+void bpf_stream_stage_free(struct bpf_stream_stage *ss);
+__printf(2, 3)
+int bpf_stream_stage_printk(struct bpf_stream_stage *ss, const char *fmt, ...);
+int bpf_stream_stage_commit(struct bpf_stream_stage *ss, struct bpf_prog *prog,
+			    enum bpf_stream_id stream_id);
+int bpf_stream_stage_dump_stack(struct bpf_stream_stage *ss);
+
+bool bpf_prog_stream_error_limit(struct bpf_prog *prog);
+void bpf_prog_report_arena_violation(bool write, unsigned long addr);
+
+#define bpf_stream_printk(...) bpf_stream_stage_printk(&__ss, __VA_ARGS__)
+#define bpf_stream_dump_stack() bpf_stream_stage_dump_stack(&__ss)
+
+#define bpf_stream_stage(prog, stream_id, expr)				\
+	({								\
+		struct bpf_stream_stage __ss;				\
+		if (!bpf_prog_stream_error_limit(prog)) {		\
+			bpf_stream_stage_init(&__ss);			\
+			(expr);						\
+			bpf_stream_stage_commit(&__ss, prog, stream_id);\
+			bpf_stream_stage_free(&__ss);			\
+		}							\
+	})
 
 #ifdef CONFIG_BPF_LSM
 void bpf_cgroup_atype_get(u32 attach_btf_id, int cgroup_atype);
@@ -3568,4 +3654,7 @@ static inline bool bpf_is_subprog(const struct bpf_prog *prog)
 	return prog->aux->func_idx != 0;
 }
 
+int bpf_prog_get_file_line(struct bpf_prog *prog, unsigned long ip, const char **filep, const char **linep);
+struct bpf_prog *bpf_prog_find_from_stack(void);
+
 #endif /* _LINUX_BPF_H */
diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile
index 70502f038b921..a89575822b60c 100644
--- a/kernel/bpf/Makefile
+++ b/kernel/bpf/Makefile
@@ -14,7 +14,7 @@ obj-$(CONFIG_BPF_SYSCALL) += bpf_local_storage.o bpf_task_storage.o
 obj-${CONFIG_BPF_LSM} += bpf_inode_storage.o
 obj-$(CONFIG_BPF_SYSCALL) += disasm.o mprog.o
 obj-$(CONFIG_BPF_JIT) += trampoline.o
-obj-$(CONFIG_BPF_SYSCALL) += btf.o memalloc.o rqspinlock.o
+obj-$(CONFIG_BPF_SYSCALL) += btf.o memalloc.o rqspinlock.o stream.o
 ifeq ($(CONFIG_MMU)$(CONFIG_64BIT),yy)
 obj-$(CONFIG_BPF_SYSCALL) += arena.o range_tree.o
 endif
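For context, a hypothetical in-kernel user of the staging API declared above
would look like the sketch below; the arena.c and rqspinlock.c hunks that
follow are the real instances of this pattern (everything except the
bpf_stream_* names is illustrative):

/* Hypothetical example of the staging API; mirrors the real users below. */
static void example_report(struct bpf_prog *prog)
{
	bpf_stream_stage(prog, BPF_STDERR, ({
		bpf_stream_printk("ERROR: example condition hit\n");
		bpf_stream_dump_stack();
	}));
}
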
diff --git a/kernel/bpf/arena.c b/kernel/bpf/arena.c
index 0d56cea716022..d4baa98de7d81 100644
--- a/kernel/bpf/arena.c
+++ b/kernel/bpf/arena.c
@@ -590,3 +590,17 @@ static int __init kfunc_init(void)
 	return register_btf_kfunc_id_set(BPF_PROG_TYPE_UNSPEC, &common_kfunc_set);
 }
 late_initcall(kfunc_init);
+
+void bpf_prog_report_arena_violation(bool write, unsigned long addr)
+{
+	struct bpf_prog *prog;
+
+	prog = bpf_prog_find_from_stack();
+	if (!prog)
+		return;
+	bpf_stream_stage(prog, BPF_STDERR, ({
+		bpf_stream_printk("ERROR: Arena %s access at unmapped address 0x%lx\n",
+				  write ? "WRITE" : "READ", addr);
+		bpf_stream_dump_stack();
+	}));
+}
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index a3e5716884211..d21c304fe829f 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -134,6 +134,10 @@ struct bpf_prog *bpf_prog_alloc_no_stats(unsigned int size, gfp_t gfp_extra_flag
 	mutex_init(&fp->aux->ext_mutex);
 	mutex_init(&fp->aux->dst_mutex);
 
+#ifdef CONFIG_BPF_SYSCALL
+	bpf_prog_stream_init(fp);
+#endif
+
 	return fp;
 }
 
@@ -2861,6 +2865,7 @@ static void bpf_prog_free_deferred(struct work_struct *work)
 	aux = container_of(work, struct bpf_prog_aux, work);
 #ifdef CONFIG_BPF_SYSCALL
 	bpf_free_kfunc_btf_tab(aux->kfunc_btf_tab);
+	bpf_prog_stream_free(aux->prog);
 #endif
 #ifdef CONFIG_CGROUP_BPF
 	if (aux->cgroup_atype != CGROUP_BPF_ATTACH_TYPE_INVALID)
@@ -2877,6 +2882,13 @@ static void bpf_prog_free_deferred(struct work_struct *work)
 	if (aux->dst_trampoline)
 		bpf_trampoline_put(aux->dst_trampoline);
 	for (i = 0; i < aux->real_func_cnt; i++) {
+#ifdef CONFIG_BPF_SYSCALL
+		/* Ensure we don't push to subprog lists. */
+		if (bpf_is_subprog(aux->func[i])) {
+			WARN_ON_ONCE(!llist_empty(&aux->func[i]->aux->stream[0].log));
+			WARN_ON_ONCE(!llist_empty(&aux->func[i]->aux->stream[1].log));
+		}
+#endif
 		/* We can just unlink the subprog poke descriptor table as
 		 * it was originally linked to the main program and is also
 		 * released along with it.
@@ -3144,6 +3156,19 @@ u64 __weak arch_bpf_timed_may_goto(void)
 	return 0;
 }
 
+static noinline void bpf_prog_report_may_goto_violation(void)
+{
+	struct bpf_prog *prog;
+
+	prog = bpf_prog_find_from_stack();
+	if (!prog)
+		return;
+	bpf_stream_stage(prog, BPF_STDERR, ({
+		bpf_stream_printk("ERROR: Timeout detected for may_goto instruction\n");
+		bpf_stream_dump_stack();
+	}));
+}
+
 u64 bpf_check_timed_may_goto(struct bpf_timed_may_goto *p)
 {
 	u64 time = ktime_get_mono_fast_ns();
@@ -3154,8 +3179,10 @@ u64 bpf_check_timed_may_goto(struct bpf_timed_may_goto *p)
 		return BPF_MAX_TIMED_LOOPS;
 	}
 	/* Check if we've exhausted our time slice, and zero count. */
-	if (time - p->timestamp >= (NSEC_PER_SEC / 4))
+	if (unlikely(time - p->timestamp >= (NSEC_PER_SEC / 4))) {
+		bpf_prog_report_may_goto_violation();
 		return 0;
+	}
 	/* Refresh the count for the stack frame. */
 	return BPF_MAX_TIMED_LOOPS;
 }
@@ -3192,3 +3219,69 @@ EXPORT_SYMBOL(bpf_stats_enabled_key);
 
 EXPORT_TRACEPOINT_SYMBOL_GPL(xdp_exception);
 EXPORT_TRACEPOINT_SYMBOL_GPL(xdp_bulk_tx);
+
+int bpf_prog_get_file_line(struct bpf_prog *prog, unsigned long ip, const char **filep, const char **linep)
+{
+	int idx = -1, insn_start, insn_end, len;
+	struct bpf_line_info *linfo;
+	void **jited_linfo;
+	struct btf *btf;
+
+	btf = prog->aux->btf;
+	linfo = prog->aux->linfo;
+	jited_linfo = prog->aux->jited_linfo;
+
+	if (!btf || !linfo || !jited_linfo)
+		return -EINVAL;
+	len = prog->aux->func ? prog->aux->func[prog->aux->func_idx]->len : prog->len;
+
+	linfo = &prog->aux->linfo[prog->aux->linfo_idx];
+	jited_linfo = &prog->aux->jited_linfo[prog->aux->linfo_idx];
+
+	insn_start = linfo[0].insn_off;
+	insn_end = insn_start + len;
+
+	for (int i = 0; linfo[i].insn_off >= insn_start && linfo[i].insn_off < insn_end; i++) {
+		if (jited_linfo[i] >= (void *)ip)
+			break;
+		idx = i;
+	}
+
+	if (idx == -1)
+		return -ENOENT;
+
+	/* Get base component of the file path. */
+	*filep = btf_name_by_offset(btf, linfo[idx].file_name_off);
+	*filep = kbasename(*filep);
+	/* Obtain the source line, and strip whitespace in prefix. */
+	*linep = btf_name_by_offset(btf, linfo[idx].line_off);
+	while (isspace(**linep))
+		*linep += 1;
+	return BPF_LINE_INFO_LINE_NUM(linfo[idx].line_col);
+}
+
+struct walk_stack_ctx {
+	struct bpf_prog *prog;
+};
+
+static bool find_from_stack_cb(void *cookie, u64 ip, u64 sp, u64 bp)
+{
+	struct walk_stack_ctx *ctxp = cookie;
+	struct bpf_prog *prog;
+
+	if (!is_bpf_text_address(ip))
+		return true;
+	prog = bpf_prog_ksym_find(ip);
+	if (bpf_is_subprog(prog))
+		return true;
+	ctxp->prog = prog;
+	return false;
+}
+
+struct bpf_prog *bpf_prog_find_from_stack(void)
+{
+	struct walk_stack_ctx ctx = {};
+
+	arch_bpf_stack_walk(find_from_stack_cb, &ctx);
+	return ctx.prog;
+}
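To make the scan in bpf_prog_get_file_line() above concrete, here is a worked
example with made-up JITed addresses:

/*
 * Hypothetical jited_linfo for one (sub)program, ip = 0x1058:
 *   jited_linfo[] = { 0x1000, 0x1040, 0x1090 }
 *   i = 0: 0x1000 < ip  -> idx = 0
 *   i = 1: 0x1040 < ip  -> idx = 1
 *   i = 2: 0x1090 >= ip -> break
 * idx = 1, i.e. the last line-info record whose JITed address precedes ip.
 */
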
diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index 78cefb41266a1..98806368121ed 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -761,22 +761,13 @@ static int bpf_trace_copy_string(char *buf, void *unsafe_ptr, char fmt_ptype,
 	return -EINVAL;
 }
 
-/* Per-cpu temp buffers used by printf-like helpers to store the bprintf binary
- * arguments representation.
- */
-#define MAX_BPRINTF_BIN_ARGS	512
-
 /* Support executing three nested bprintf helper calls on a given CPU */
 #define MAX_BPRINTF_NEST_LEVEL	3
-struct bpf_bprintf_buffers {
-	char bin_args[MAX_BPRINTF_BIN_ARGS];
-	char buf[MAX_BPRINTF_BUF];
-};
 
 static DEFINE_PER_CPU(struct bpf_bprintf_buffers[MAX_BPRINTF_NEST_LEVEL], bpf_bprintf_bufs);
 static DEFINE_PER_CPU(int, bpf_bprintf_nest_level);
 
-static int try_get_buffers(struct bpf_bprintf_buffers **bufs)
+int bpf_try_get_buffers(struct bpf_bprintf_buffers **bufs)
 {
 	int nest_level;
 
@@ -792,16 +783,21 @@ static int try_get_buffers(struct bpf_bprintf_buffers **bufs)
 	return 0;
 }
 
-void bpf_bprintf_cleanup(struct bpf_bprintf_data *data)
+void bpf_put_buffers(void)
 {
-	if (!data->bin_args && !data->buf)
-		return;
 	if (WARN_ON_ONCE(this_cpu_read(bpf_bprintf_nest_level) == 0))
 		return;
 	this_cpu_dec(bpf_bprintf_nest_level);
 	preempt_enable();
 }
 
+void bpf_bprintf_cleanup(struct bpf_bprintf_data *data)
+{
+	if (!data->bin_args && !data->buf)
+		return;
+	bpf_put_buffers();
+}
+
 /*
  * bpf_bprintf_prepare - Generic pass on format strings for bprintf-like helpers
  *
@@ -816,7 +812,7 @@ void bpf_bprintf_cleanup(struct bpf_bprintf_data *data)
  * In argument preparation mode, if 0 is returned, safe temporary buffers are
  * allocated and bpf_bprintf_cleanup should be called to free them after use.
  */
-int bpf_bprintf_prepare(char *fmt, u32 fmt_size, const u64 *raw_args,
+int bpf_bprintf_prepare(const char *fmt, u32 fmt_size, const u64 *raw_args,
 			u32 num_args, struct bpf_bprintf_data *data)
 {
 	bool get_buffers = (data->get_bin_args && num_args) || data->get_buf;
@@ -832,7 +828,7 @@ int bpf_bprintf_prepare(char *fmt, u32 fmt_size, const u64 *raw_args,
 		return -EINVAL;
 	fmt_size = fmt_end - fmt;
 
-	if (get_buffers && try_get_buffers(&buffers))
+	if (get_buffers && bpf_try_get_buffers(&buffers))
 		return -EBUSY;
 
 	if (data->get_bin_args) {
@@ -2873,6 +2869,42 @@ __bpf_kfunc int bpf_dynptr_copy(struct bpf_dynptr *dst_ptr, u32 dst_off,
 	return 0;
 }
 
+/**
+ * bpf_dynptr_from_mem_slice - Create a dynptr from a bpf_mem_slice
+ * @mem_slice: Source bpf_mem_slice, backing the underlying memory for dynptr
+ * @flags: Flags for dynptr construction, currently no supported flags.
+ * @dptr__uninit: Destination dynptr, which will be initialized.
+ *
+ * Creates a dynptr that points to variable-length read-only memory represented
+ * by a bpf_mem_slice fat pointer.
+ *
+ * Returns 0 on success; negative error, otherwise.
+ */
+__bpf_kfunc int bpf_dynptr_from_mem_slice(struct bpf_mem_slice *mem_slice, u64 flags, struct bpf_dynptr *dptr__uninit)
+{
+	struct bpf_dynptr_kern *dptr = (struct bpf_dynptr_kern *)dptr__uninit;
+	int err;
+
+	/* mem_slice is never NULL, as we use KF_TRUSTED_ARGS. */
+	err = bpf_dynptr_check_size(mem_slice->len);
+	if (err)
+		goto error;
+
+	/* flags is currently unsupported */
+	if (flags) {
+		err = -EINVAL;
+		goto error;
+	}
+
+	bpf_dynptr_init(dptr, mem_slice->ptr, BPF_DYNPTR_TYPE_LOCAL, 0, mem_slice->len);
+	bpf_dynptr_set_rdonly(dptr);
+
+	return 0;
+
+error:
+	bpf_dynptr_set_null(dptr);
+	return err;
+}
+
 __bpf_kfunc void *bpf_cast_to_kern_ctx(void *obj)
 {
 	return obj;
@@ -3327,6 +3359,7 @@ BTF_ID_FLAGS(func, bpf_dynptr_is_rdonly)
 BTF_ID_FLAGS(func, bpf_dynptr_size)
 BTF_ID_FLAGS(func, bpf_dynptr_clone)
 BTF_ID_FLAGS(func, bpf_dynptr_copy)
+BTF_ID_FLAGS(func, bpf_dynptr_from_mem_slice, KF_TRUSTED_ARGS)
#ifdef CONFIG_NET
 BTF_ID_FLAGS(func, bpf_modify_return_test_tp)
 #endif
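A sketch of how the new kfunc is meant to be consumed from a BPF program,
assuming a trusted stream element as argument (the bpftool skeleton later in
this series is the real consumer; the function name here is hypothetical):

/* Sketch: wrap a stream element's payload in a read-only local dynptr. */
static int peek_elem_len(struct bpf_stream_elem *elem)
{
	struct bpf_dynptr src;

	if (bpf_dynptr_from_mem_slice(&elem->mem_slice, 0, &src))
		return -1;
	return bpf_dynptr_size(&src);	/* == elem->mem_slice.len */
}
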
diff --git a/kernel/bpf/rqspinlock.c b/kernel/bpf/rqspinlock.c
index 338305c8852cf..888c8e2f90615 100644
--- a/kernel/bpf/rqspinlock.c
+++ b/kernel/bpf/rqspinlock.c
@@ -666,6 +666,26 @@ EXPORT_SYMBOL_GPL(resilient_queued_spin_lock_slowpath);
 
 __bpf_kfunc_start_defs();
 
+static void bpf_prog_report_rqspinlock_violation(const char *str, void *lock, bool irqsave)
+{
+	struct rqspinlock_held *rqh = this_cpu_ptr(&rqspinlock_held_locks);
+	struct bpf_prog *prog;
+
+	prog = bpf_prog_find_from_stack();
+	if (!prog)
+		return;
+	bpf_stream_stage(prog, BPF_STDERR, ({
+		bpf_stream_printk("ERROR: %s for bpf_res_spin_lock%s\n", str, irqsave ? "_irqsave" : "");
+		bpf_stream_printk("Attempted lock = 0x%px\n", lock);
+		bpf_stream_printk("Total held locks = %d\n", rqh->cnt);
+		for (int i = 0; i < min(RES_NR_HELD, rqh->cnt); i++)
+			bpf_stream_printk("Held lock[%2d] = 0x%px\n", i, rqh->locks[i]);
+		bpf_stream_dump_stack();
+	}));
+}
+
+#define REPORT_STR(ret) ({ (ret) == -ETIMEDOUT ? "Timeout detected" : "AA or ABBA deadlock detected"; })
+
 __bpf_kfunc int bpf_res_spin_lock(struct bpf_res_spin_lock *lock)
 {
 	int ret;
@@ -676,6 +696,7 @@ __bpf_kfunc int bpf_res_spin_lock(struct bpf_res_spin_lock *lock)
 	preempt_disable();
 	ret = res_spin_lock((rqspinlock_t *)lock);
 	if (unlikely(ret)) {
+		bpf_prog_report_rqspinlock_violation(REPORT_STR(ret), lock, false);
 		preempt_enable();
 		return ret;
 	}
@@ -698,6 +719,7 @@ __bpf_kfunc int bpf_res_spin_lock_irqsave(struct bpf_res_spin_lock *lock, unsign
 	local_irq_save(flags);
 	ret = res_spin_lock((rqspinlock_t *)lock);
 	if (unlikely(ret)) {
+		bpf_prog_report_rqspinlock_violation(REPORT_STR(ret), lock, true);
 		local_irq_restore(flags);
 		preempt_enable();
 		return ret;
diff --git a/kernel/bpf/stream.c b/kernel/bpf/stream.c
new file mode 100644
index 0000000000000..d64975486ad12
--- /dev/null
+++ b/kernel/bpf/stream.c
@@ -0,0 +1,552 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* Copyright (c) 2025 Meta Platforms, Inc. and affiliates. */
+
+#include <linux/bpf.h>
+#include <linux/bpf_mem_alloc.h>
+#include <linux/percpu.h>
+#include <linux/refcount.h>
+#include <linux/gfp.h>
+#include <linux/memory.h>
+#include <linux/local_lock.h>
+#include <linux/mutex.h>
+#include <linux/llist.h>
+
+/*
+ * Simple per-CPU NMI-safe bump allocation mechanism, backed by the NMI-safe
+ * try_alloc_pages()/free_pages_nolock() primitives. We allocate a page and
+ * stash it in a local per-CPU variable, and bump allocate from the page
+ * whenever items need to be printed to a stream. Each page holds a global
+ * atomic refcount in its first 4 bytes, and then records of variable length
+ * that describe the printed messages. Once the global refcount has dropped to
+ * zero, it is a signal to free the page back to the kernel's page allocator,
+ * given all the individual records in it have been consumed.
+ *
+ * It is possible the same page is used to serve allocations across different
+ * programs, which may be consumed at different times individually, hence
+ * maintaining a reference count per-page is critical for correct lifetime
+ * tracking.
+ *
+ * The bpf_stream_page code will be replaced to use kmalloc_nolock() once it
+ * lands.
+ */
+struct bpf_stream_page {
+	refcount_t ref;
+	u32 consumed;
+	char buf[];
+};
+
+/* Available room to add data to a refcounted page. */
+#define BPF_STREAM_PAGE_SZ (PAGE_SIZE - offsetofend(struct bpf_stream_page, consumed))
+
+static DEFINE_PER_CPU(local_trylock_t, stream_local_lock) = INIT_LOCAL_TRYLOCK(stream_local_lock);
+static DEFINE_PER_CPU(struct bpf_stream_page *, stream_pcpu_page);
+
+static bool bpf_stream_page_local_lock(unsigned long *flags)
+{
+	return local_trylock_irqsave(&stream_local_lock, *flags);
+}
+
+static void bpf_stream_page_local_unlock(unsigned long *flags)
+{
+	local_unlock_irqrestore(&stream_local_lock, *flags);
+}
+
+static void bpf_stream_page_free(struct bpf_stream_page *stream_page)
+{
+	struct page *p;
+
+	if (!stream_page)
+		return;
+	p = virt_to_page(stream_page);
+	free_pages_nolock(p, 0);
+}
+
+static void bpf_stream_page_get(struct bpf_stream_page *stream_page)
+{
+	refcount_inc(&stream_page->ref);
+}
+
+static void bpf_stream_page_put(struct bpf_stream_page *stream_page)
+{
+	if (refcount_dec_and_test(&stream_page->ref))
+		bpf_stream_page_free(stream_page);
+}
+
+static void bpf_stream_page_init(struct bpf_stream_page *stream_page)
+{
+	refcount_set(&stream_page->ref, 1);
+	stream_page->consumed = 0;
+}
+
+static struct bpf_stream_page *bpf_stream_page_replace(void)
+{
+	struct bpf_stream_page *stream_page, *old_stream_page;
+	struct page *page;
+
+	page = __bpf_alloc_page(NUMA_NO_NODE);
+	if (!page)
+		return NULL;
+	stream_page = page_address(page);
+	bpf_stream_page_init(stream_page);
+
+	old_stream_page = this_cpu_read(stream_pcpu_page);
+	if (old_stream_page)
+		bpf_stream_page_put(old_stream_page);
+	this_cpu_write(stream_pcpu_page, stream_page);
+	return stream_page;
+}
+
+static int bpf_stream_page_check_room(struct bpf_stream_page *stream_page, int len)
+{
+	int min = offsetof(struct bpf_stream_elem, str[0]);
+	int consumed = stream_page->consumed;
+	int total = BPF_STREAM_PAGE_SZ;
+	int rem = max(0, total - consumed - min);
+
+	/* Let's give room of at least 8 bytes. */
+	WARN_ON_ONCE(rem % 8 != 0);
+	rem = rem < 8 ? 0 : rem;
+	return min(len, rem);
+}
+
+static void bpf_stream_elem_init(struct bpf_stream_elem *elem, int len)
+{
+	init_llist_node(&elem->node);
+	elem->mem_slice.ptr = elem->str;
+	elem->mem_slice.len = len;
+}
+
+static struct bpf_stream_page *bpf_stream_page_from_elem(struct bpf_stream_elem *elem)
+{
+	unsigned long addr = (unsigned long)elem;
+
+	return (struct bpf_stream_page *)PAGE_ALIGN_DOWN(addr);
+}
+
+static struct bpf_stream_elem *bpf_stream_page_push_elem(struct bpf_stream_page *stream_page, int len)
+{
+	u32 consumed = stream_page->consumed;
+
+	stream_page->consumed += round_up(offsetof(struct bpf_stream_elem, str[len]), 8);
+	return (struct bpf_stream_elem *)&stream_page->buf[consumed];
+}
+
+static noinline struct bpf_stream_elem *bpf_stream_page_reserve_elem(int len)
+{
+	struct bpf_stream_elem *elem = NULL;
+	struct bpf_stream_page *page;
+	int room = 0;
+
+	page = this_cpu_read(stream_pcpu_page);
+	if (!page)
+		page = bpf_stream_page_replace();
+	if (!page)
+		return NULL;
+
+	room = bpf_stream_page_check_room(page, len);
+	if (room != len)
+		page = bpf_stream_page_replace();
+	if (!page)
+		return NULL;
+	bpf_stream_page_get(page);
+	room = bpf_stream_page_check_room(page, len);
+	WARN_ON_ONCE(room != len);
+
+	elem = bpf_stream_page_push_elem(page, room);
+	bpf_stream_elem_init(elem, room);
+	return elem;
+}
+
+static struct bpf_stream_elem *bpf_stream_elem_alloc(int len)
+{
+	const int max_len = ARRAY_SIZE((struct bpf_bprintf_buffers){}.buf);
+	struct bpf_stream_elem *elem;
+	unsigned long flags;
+
+	/*
+	 * We may overflow, but we should never need more than one page size
+	 * worth of memory. This can be lifted, but we'd need to adjust the
+	 * other code to keep allocating more pages to overflow messages.
+	 */
+	BUILD_BUG_ON(max_len > BPF_STREAM_PAGE_SZ);
+	/*
+	 * Length denotes the amount of data to be written as part of the stream
+	 * element, thus includes the '\0' byte. We're capped by how much the
+	 * bpf_bprintf_buffers can accommodate, therefore deny allocations that
+	 * won't fit into them.
+	 */
+	if (len < 0 || len > max_len)
+		return NULL;
+
+	if (!bpf_stream_page_local_lock(&flags))
+		return NULL;
+	elem = bpf_stream_page_reserve_elem(len);
+	bpf_stream_page_local_unlock(&flags);
+	return elem;
+}
+
+__bpf_kfunc_start_defs();
+
+static int __bpf_stream_push_str(struct llist_head *log, const char *str, int len)
+{
+	struct bpf_stream_elem *elem = NULL;
+
+	/*
+	 * Allocate a bpf_stream_elem and push it to the stream's log; elements
+	 * will be popped at once and reversed to print the log.
+	 */
+	elem = bpf_stream_elem_alloc(len);
+	if (!elem)
+		return -ENOMEM;
+
+	memcpy(elem->str, str, len);
+	llist_add(&elem->node, log);
+
+	return 0;
+}
+
+static int bpf_stream_consume_capacity(struct bpf_stream *stream, int len)
+{
+	if (atomic_read(&stream->capacity) >= BPF_STREAM_MAX_CAPACITY)
+		return -ENOSPC;
+	if (atomic_add_return(len, &stream->capacity) >= BPF_STREAM_MAX_CAPACITY) {
+		atomic_sub(len, &stream->capacity);
+		return -ENOSPC;
+	}
+	return 0;
+}
+
+static void bpf_stream_release_capacity(struct bpf_stream *stream, struct bpf_stream_elem *elem)
+{
+	int len = elem->mem_slice.len;
+
+	atomic_sub(len, &stream->capacity);
+}
+
+static int bpf_stream_push_str(struct bpf_stream *stream, const char *str, int len)
+{
+	int ret = bpf_stream_consume_capacity(stream, len);
+
+	return ret ?: __bpf_stream_push_str(&stream->log, str, len);
+}
+
+__bpf_kfunc int bpf_stream_vprintk(struct bpf_stream *stream, const char *fmt__str, const void *args, u32 len__sz)
+{
+	struct bpf_bprintf_data data = {
+		.get_bin_args = true,
+		.get_buf = true,
+	};
+	u32 fmt_size = strlen(fmt__str) + 1;
+	u32 data_len = len__sz;
+	int ret, num_args;
+
+	if (data_len & 7 || data_len > MAX_BPRINTF_VARARGS * 8 ||
+	    (data_len && !args))
+		return -EINVAL;
+	num_args = data_len / 8;
+
+	ret = bpf_bprintf_prepare(fmt__str, fmt_size, args, num_args, &data);
+	if (ret < 0)
+		return ret;
+
+	ret = bstr_printf(data.buf, MAX_BPRINTF_BUF, fmt__str, data.bin_args);
+	/* If the string was truncated, we only wrote until the size of buffer. */
+	ret = min_t(u32, ret + 1, MAX_BPRINTF_BUF);
+	ret = bpf_stream_push_str(stream, data.buf, ret);
+	bpf_bprintf_cleanup(&data);
+
+	return ret;
+}
+
+/* Use int instead of enum bpf_stream_id here, as we use this kfunc in
+ * bpf_helpers.h; keeping enum bpf_stream_id would necessitate a complete
+ * definition of the enum, which we can't copy into the header as it may
+ * conflict with the definition in vmlinux.h.
+ */
+__bpf_kfunc struct bpf_stream *bpf_stream_get(int stream_id, void *aux__ign)
+{
+	struct bpf_prog_aux *aux = aux__ign;
+
+	if (stream_id != BPF_STDOUT && stream_id != BPF_STDERR)
+		return NULL;
+	return &aux->stream[stream_id - 1];
+}
+
+__bpf_kfunc void bpf_stream_free_elem(struct bpf_stream_elem *elem)
+{
+	struct bpf_stream_page *p;
+
+	p = bpf_stream_page_from_elem(elem);
+	bpf_stream_page_put(p);
+}
+
+static void bpf_stream_free_list(struct llist_node *list)
+{
+	struct bpf_stream_elem *elem, *tmp;
+
+	llist_for_each_entry_safe(elem, tmp, list, node)
+		bpf_stream_free_elem(elem);
+}
+
+static struct llist_node *bpf_stream_backlog_pop(struct bpf_stream *stream)
+{
+	struct llist_node *node;
+
+	node = stream->backlog_head;
+	if (stream->backlog_head == stream->backlog_tail)
+		stream->backlog_head = stream->backlog_tail = NULL;
+	else
+		stream->backlog_head = node->next;
+	return node;
+}
+
+static struct llist_node *bpf_stream_log_pop(struct bpf_stream *stream)
+{
+	struct llist_node *node, *head, *tail;
+	unsigned long flags;
+
+	if (llist_empty(&stream->log))
+		return NULL;
+	tail = llist_del_all(&stream->log);
+	if (!tail)
+		return NULL;
+	head = llist_reverse_order(tail);
+
+	if (raw_res_spin_lock_irqsave(&stream->lock, flags)) {
+		bpf_stream_free_list(head);
+		return NULL;
+	}
+
+	if (!stream->backlog_head) {
+		stream->backlog_head = head;
+		stream->backlog_tail = tail;
+	} else {
+		stream->backlog_tail->next = head;
+		stream->backlog_tail = tail;
+	}
+
+	node = bpf_stream_backlog_pop(stream);
+	raw_res_spin_unlock_irqrestore(&stream->lock, flags);
+
+	return node;
+}
+
+__bpf_kfunc struct bpf_stream_elem *bpf_stream_next_elem(struct bpf_stream *stream)
+{
+	struct bpf_stream_elem *elem = NULL;
+	struct llist_node *node;
+	unsigned long flags;
+
+	if (raw_res_spin_lock_irqsave(&stream->lock, flags))
+		return NULL;
+	node = bpf_stream_backlog_pop(stream);
+	raw_res_spin_unlock_irqrestore(&stream->lock, flags);
+
+	if (node)
+		goto end;
+
+	node = bpf_stream_log_pop(stream);
+	if (!node)
+		return NULL;
+end:
+	elem = container_of(node, typeof(*elem), node);
+	bpf_stream_release_capacity(stream, elem);
+	return elem;
+}
+
+/* Use int instead of enum bpf_stream_id for consistency with bpf_stream_get. */
+__bpf_kfunc struct bpf_stream *bpf_prog_stream_get(int stream_id, u32 prog_id)
+{
+	struct bpf_stream *stream;
+	struct bpf_prog *prog;
+
+	prog = bpf_prog_by_id(prog_id);
+	if (IS_ERR_OR_NULL(prog))
+		return NULL;
+	stream = bpf_stream_get(stream_id, prog->aux);
+	if (!stream)
+		bpf_prog_put(prog);
+	return stream;
+}
+
+__bpf_kfunc void bpf_prog_stream_put(struct bpf_stream *stream)
+{
+	enum bpf_stream_id stream_id = stream->stream_id;
+	struct bpf_prog *prog;
+
+	prog = container_of(stream, struct bpf_prog_aux, stream[stream_id - 1])->prog;
+	bpf_prog_put(prog);
+}
+
+__bpf_kfunc_end_defs();
+
+BTF_KFUNCS_START(stream_kfunc_set)
+BTF_ID_FLAGS(func, bpf_stream_get, KF_RET_NULL)
+BTF_ID_FLAGS(func, bpf_stream_vprintk, KF_TRUSTED_ARGS)
+BTF_ID_FLAGS(func, bpf_stream_next_elem, KF_ACQUIRE | KF_RET_NULL | KF_TRUSTED_ARGS)
+BTF_ID_FLAGS(func, bpf_stream_free_elem, KF_RELEASE)
+BTF_KFUNCS_END(stream_kfunc_set)
+
+BTF_KFUNCS_START(stream_syscall_kfunc_set)
+BTF_ID_FLAGS(func, bpf_prog_stream_get, KF_ACQUIRE | KF_RET_NULL)
+BTF_ID_FLAGS(func, bpf_prog_stream_put, KF_RELEASE)
+BTF_KFUNCS_END(stream_syscall_kfunc_set)
+
+static const struct btf_kfunc_id_set bpf_stream_kfunc_set = {
+	.owner = THIS_MODULE,
+	.set   = &stream_kfunc_set,
+};
+
+static const struct btf_kfunc_id_set bpf_stream_syscall_kfunc_set = {
+	.owner = THIS_MODULE,
+	.set   = &stream_syscall_kfunc_set,
+};
+
+static int __init bpf_stream_kfunc_init(void)
+{
+	int ret;
+
+	ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_UNSPEC, &bpf_stream_kfunc_set);
+	if (ret)
+		return ret;
+	return register_btf_kfunc_id_set(BPF_PROG_TYPE_SYSCALL, &bpf_stream_syscall_kfunc_set);
+}
+late_initcall(bpf_stream_kfunc_init);
+
+void bpf_prog_stream_init(struct bpf_prog *prog)
+{
+	int i;
+
+	prog->aux->stream[0].stream_id = BPF_STDOUT;
+	prog->aux->stream[1].stream_id = BPF_STDERR;
+
+	for (i = 0; i < ARRAY_SIZE(prog->aux->stream); i++) {
+		atomic_set(&prog->aux->stream[i].capacity, 0);
+		init_llist_head(&prog->aux->stream[i].log);
+		raw_res_spin_lock_init(&prog->aux->stream[i].lock);
+		prog->aux->stream[i].backlog_head = NULL;
+		prog->aux->stream[i].backlog_tail = NULL;
+	}
+}
+
+void bpf_prog_stream_free(struct bpf_prog *prog)
+{
+	struct llist_node *list;
+	int i;
+
+	for (i = 0; i < ARRAY_SIZE(prog->aux->stream); i++) {
+		list = llist_del_all(&prog->aux->stream[i].log);
+		bpf_stream_free_list(list);
+		bpf_stream_free_list(prog->aux->stream[i].backlog_head);
+	}
+}
+
+void bpf_stream_stage_init(struct bpf_stream_stage *ss)
+{
+	init_llist_head(&ss->log);
+	ss->len = 0;
+}
+
+void bpf_stream_stage_free(struct bpf_stream_stage *ss)
+{
+	struct llist_node *node;
+
+	node = llist_del_all(&ss->log);
+	bpf_stream_free_list(node);
+}
+
+int bpf_stream_stage_printk(struct bpf_stream_stage *ss, const char *fmt, ...)
+{
+	struct bpf_bprintf_buffers *buf;
+	va_list args;
+	int ret;
+
+	if (bpf_try_get_buffers(&buf))
+		return -EBUSY;
+
+	va_start(args, fmt);
+	ret = vsnprintf(buf->buf, ARRAY_SIZE(buf->buf), fmt, args);
+	va_end(args);
+	/* If the string was truncated, we only wrote until the size of buffer. */
+	ret = min_t(u32, ret + 1, ARRAY_SIZE(buf->buf));
+	ss->len += ret;
+	ret = __bpf_stream_push_str(&ss->log, buf->buf, ret);
+	bpf_put_buffers();
+	return ret;
+}
+
+int bpf_stream_stage_commit(struct bpf_stream_stage *ss, struct bpf_prog *prog,
+			    enum bpf_stream_id stream_id)
+{
+	struct llist_node *list, *head, *tail;
+	struct bpf_stream *stream;
+	int ret;
+
+	stream = bpf_stream_get(stream_id, prog->aux);
+	if (!stream)
+		return -EINVAL;
+
+	ret = bpf_stream_consume_capacity(stream, ss->len);
+	if (ret)
+		return ret;
+
+	list = llist_del_all(&ss->log);
+	if (!list)
+		return 0;
+
+	head = tail = list;
+	while (llist_next(tail))
+		tail = llist_next(tail);
+	llist_add_batch(head, tail, &stream->log);
+	return 0;
+}
+
+struct dump_stack_ctx {
+	struct bpf_stream_stage *ss;
+	int err;
+};
+
+static bool dump_stack_cb(void *cookie, u64 ip, u64 sp, u64 bp)
+{
+	struct dump_stack_ctx *ctxp = cookie;
+	const char *file = "", *line = "";
+	struct bpf_prog *prog;
+	int num;
+
+	if (is_bpf_text_address(ip)) {
+		prog = bpf_prog_ksym_find(ip);
+		num = bpf_prog_get_file_line(prog, ip, &file, &line);
+		if (num < 0)
+			goto end;
+		ctxp->err = bpf_stream_stage_printk(ctxp->ss, "%pS\n %s @ %s:%d\n",
+						    (void *)ip, line, file, num);
+		return !ctxp->err;
+	}
+end:
+	ctxp->err = bpf_stream_stage_printk(ctxp->ss, "%pS\n", (void *)ip);
+	return !ctxp->err;
+}
+
+int bpf_stream_stage_dump_stack(struct bpf_stream_stage *ss)
+{
+	struct dump_stack_ctx ctx = { .ss = ss };
+	int ret;
+
+	ret = bpf_stream_stage_printk(ss, "CPU: %d UID: %d PID: %d Comm: %s\n",
+				      raw_smp_processor_id(), __kuid_val(current_real_cred()->euid),
+				      current->pid, current->comm);
+	ret = ret ?: bpf_stream_stage_printk(ss, "Call trace:\n");
+	if (!ret)
+		arch_bpf_stack_walk(dump_stack_cb, &ctx);
+	ret = ret ?: ctx.err;
+	return ret ?: bpf_stream_stage_printk(ss, "\n");
+}
+
+bool bpf_prog_stream_error_limit(struct bpf_prog *prog)
+{
+	return atomic_fetch_add(1, &prog->aux->stream_error_cnt) >= BPF_PROG_STREAM_ERROR_CNT;
+}
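Size accounting in the allocator above, worked through for a 100-byte message
(assuming 4 KiB pages and a 64-bit build):

/*
 * Record layout math (illustrative):
 *   header = offsetof(struct bpf_stream_elem, str[0])
 *          = sizeof(struct llist_node) + sizeof(struct bpf_mem_slice)
 *          = 8 + 16 = 24 bytes
 *   A 100-byte message consumes round_up(24 + 100, 8) = 128 bytes of the
 *   per-CPU bump page, while stream->capacity is charged only the 100
 *   payload bytes, up to BPF_STREAM_MAX_CAPACITY (4 MiB) per stream.
 */
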
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index df33d19c5c3b3..60778be870e3f 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -576,7 +576,7 @@ static bool can_alloc_pages(void)
 		!IS_ENABLED(CONFIG_PREEMPT_RT);
 }
 
-static struct page *__bpf_alloc_page(int nid)
+struct page *__bpf_alloc_page(int nid)
 {
 	if (!can_alloc_pages())
 		return try_alloc_pages(nid, 0);
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 99aa2c890e7bd..aba0b38733bcd 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -12116,6 +12116,8 @@ enum special_kfunc_type {
 	KF_bpf_res_spin_unlock,
 	KF_bpf_res_spin_lock_irqsave,
 	KF_bpf_res_spin_unlock_irqrestore,
+	KF_bpf_dynptr_from_mem_slice,
+	KF_bpf_stream_get,
 };
 
 BTF_SET_START(special_kfunc_set)
@@ -12219,6 +12221,8 @@ BTF_ID(func, bpf_res_spin_lock)
 BTF_ID(func, bpf_res_spin_unlock)
 BTF_ID(func, bpf_res_spin_lock_irqsave)
 BTF_ID(func, bpf_res_spin_unlock_irqrestore)
+BTF_ID(func, bpf_dynptr_from_mem_slice)
+BTF_ID(func, bpf_stream_get)
 
 static bool is_kfunc_ret_null(struct bpf_kfunc_call_arg_meta *meta)
 {
@@ -13140,7 +13144,9 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_
 			if (is_kfunc_arg_uninit(btf, &args[i]))
 				dynptr_arg_type |= MEM_UNINIT;
 
-			if (meta->func_id == special_kfunc_list[KF_bpf_dynptr_from_skb]) {
+			if (meta->func_id == special_kfunc_list[KF_bpf_dynptr_from_mem_slice]) {
+				dynptr_arg_type |= DYNPTR_TYPE_LOCAL;
+			} else if (meta->func_id == special_kfunc_list[KF_bpf_dynptr_from_skb]) {
 				dynptr_arg_type |= DYNPTR_TYPE_SKB;
 			} else if (meta->func_id == special_kfunc_list[KF_bpf_dynptr_from_xdp]) {
 				dynptr_arg_type |= DYNPTR_TYPE_XDP;
@@ -13882,10 +13888,11 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
 			regs[BPF_REG_0].type = PTR_TO_BTF_ID;
 			regs[BPF_REG_0].btf_id = ptr_type_id;
 
-			if (meta.func_id == special_kfunc_list[KF_bpf_get_kmem_cache])
+			if (meta.func_id == special_kfunc_list[KF_bpf_get_kmem_cache]) {
 				regs[BPF_REG_0].type |= PTR_UNTRUSTED;
-
-			if (is_iter_next_kfunc(&meta)) {
+			} else if (meta.func_id == special_kfunc_list[KF_bpf_stream_get]) {
+				regs[BPF_REG_0].type |= PTR_TRUSTED;
+			} else if (is_iter_next_kfunc(&meta)) {
 				struct bpf_reg_state *cur_iter;
 
 				cur_iter = get_iter_from_state(env->cur_state, &meta);
@@ -21517,8 +21524,10 @@ static int fixup_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
 		   desc->func_id == special_kfunc_list[KF_bpf_rdonly_cast]) {
 		insn_buf[0] = BPF_MOV64_REG(BPF_REG_0, BPF_REG_1);
 		*cnt = 1;
-	} else if (is_bpf_wq_set_callback_impl_kfunc(desc->func_id)) {
-		struct bpf_insn ld_addrs[2] = { BPF_LD_IMM64(BPF_REG_4, (long)env->prog->aux) };
+	} else if (is_bpf_wq_set_callback_impl_kfunc(desc->func_id) ||
+		   desc->func_id == special_kfunc_list[KF_bpf_stream_get]) {
+		u32 regno = is_bpf_wq_set_callback_impl_kfunc(desc->func_id) ? BPF_REG_4 : BPF_REG_2;
+		struct bpf_insn ld_addrs[2] = { BPF_LD_IMM64(regno, (long)env->prog->aux) };
 
 		insn_buf[0] = ld_addrs[0];
 		insn_buf[1] = ld_addrs[1];
diff --git a/tools/bpf/bpftool/Documentation/bpftool-prog.rst b/tools/bpf/bpftool/Documentation/bpftool-prog.rst
index d6304e01afe00..258e16ee8def5 100644
--- a/tools/bpf/bpftool/Documentation/bpftool-prog.rst
+++ b/tools/bpf/bpftool/Documentation/bpftool-prog.rst
@@ -173,6 +173,12 @@ bpftool prog tracelog
     purposes. For streaming data from BPF programs to user space, one can use
     perf events (see also **bpftool-map**\ (8)).
 
+bpftool prog tracelog { stdout | stderr } *PROG*
+    Dump the BPF stream of the program. BPF programs can write to these streams
+    at runtime with the **bpf_stream_vprintk**\ () kfunc. The kernel may write
+    error messages to the standard error stream. This facility should be used
+    only for debugging purposes.
+
 bpftool prog run *PROG* data_in *FILE* [data_out *FILE* [data_size_out *L*]] [ctx_in *FILE* [ctx_out *FILE* [ctx_size_out *M*]]] [repeat *N*]
     Run BPF program *PROG* in the kernel testing infrastructure for BPF,
     meaning that the program works on the data and context provided by the
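Example invocations of the new mode documented above (program identifiers are
illustrative; *PROG* follows the usual bpftool program specification):

    # bpftool prog tracelog stderr id 42
    # bpftool prog tracelog stdout name my_prog
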
diff --git a/tools/bpf/bpftool/Makefile b/tools/bpf/bpftool/Makefile
index 9e9a5f006cd2a..eb908223c3bb0 100644
--- a/tools/bpf/bpftool/Makefile
+++ b/tools/bpf/bpftool/Makefile
@@ -234,7 +234,7 @@ $(OUTPUT)%.bpf.o: skeleton/%.bpf.c $(OUTPUT)vmlinux.h $(LIBBPF_BOOTSTRAP)
 $(OUTPUT)%.skel.h: $(OUTPUT)%.bpf.o $(BPFTOOL_BOOTSTRAP)
 	$(QUIET_GEN)$(BPFTOOL_BOOTSTRAP) gen skeleton $< > $@
 
-$(OUTPUT)prog.o: $(OUTPUT)profiler.skel.h
+$(OUTPUT)prog.o: $(OUTPUT)profiler.skel.h $(OUTPUT)stream.skel.h
 
 $(OUTPUT)pids.o: $(OUTPUT)pid_iter.skel.h
diff --git a/tools/bpf/bpftool/bash-completion/bpftool b/tools/bpf/bpftool/bash-completion/bpftool
index 1ce409a6cbd91..c7c0bf3aee249 100644
--- a/tools/bpf/bpftool/bash-completion/bpftool
+++ b/tools/bpf/bpftool/bash-completion/bpftool
@@ -518,7 +518,21 @@ _bpftool()
                     esac
                     ;;
                 tracelog)
-                    return 0
+                    case $prev in
+                        $command)
+                            COMPREPLY+=( $( compgen -W "stdout stderr" -- \
+                                "$cur" ) )
+                            return 0
+                            ;;
+                        stdout|stderr)
+                            COMPREPLY=( $( compgen -W "$PROG_TYPE" -- \
+                                "$cur" ) )
+                            return 0
+                            ;;
+                        *)
+                            return 0
+                            ;;
+                    esac
                     ;;
                 profile)
                     case $cword in
diff --git a/tools/bpf/bpftool/prog.c b/tools/bpf/bpftool/prog.c
index f010295350be5..7abe4698c86cd 100644
--- a/tools/bpf/bpftool/prog.c
+++ b/tools/bpf/bpftool/prog.c
@@ -35,6 +35,8 @@
 #include "main.h"
 #include "xlated_dumper.h"
 
+#include "stream.skel.h"
+
 #define BPF_METADATA_PREFIX "bpf_metadata_"
 #define BPF_METADATA_PREFIX_LEN (sizeof(BPF_METADATA_PREFIX) - 1)
 
@@ -697,6 +699,15 @@ static int do_show(int argc, char **argv)
 	return err;
 }
 
+static int process_stream_sample(void *ctx, void *data, size_t len)
+{
+	FILE *file = ctx;
+
+	fprintf(file, "%s", (char *)data);
+	fflush(file);
+	return 0;
+}
+
 static int prog_dump(struct bpf_prog_info *info, enum dump_mode mode,
		     char *filepath, bool opcodes, bool visual, bool linum)
 {
@@ -1113,6 +1124,80 @@ static int do_detach(int argc, char **argv)
 	return 0;
 }
 
+enum prog_tracelog_mode {
+	TRACE_STDOUT,
+	TRACE_STDERR,
+};
+
+static int
+prog_tracelog_stream(struct bpf_prog_info *info, enum prog_tracelog_mode mode)
+{
+	FILE *file = mode == TRACE_STDOUT ? stdout : stderr;
+	LIBBPF_OPTS(bpf_test_run_opts, opts);
+	struct ring_buffer *ringbuf;
+	struct stream_bpf *skel;
+	int map_fd, ret = -1;
+	__u32 prog_id = info->id;
+	__u32 stream_id = mode == TRACE_STDOUT ? 1 : 2;
+
+	skel = stream_bpf__open_and_load();
+	if (!skel)
+		return -errno;
+	skel->bss->prog_id = prog_id;
+	skel->bss->stream_id = stream_id;
+
+	map_fd = bpf_map__fd(skel->maps.ringbuf);
+	ringbuf = ring_buffer__new(map_fd, process_stream_sample, file, NULL);
+	if (!ringbuf) {
+		ret = -errno;
+		goto end;
+	}
+	do {
+		skel->bss->written_count = skel->bss->written_size = 0;
+		ret = bpf_prog_test_run_opts(bpf_program__fd(skel->progs.bpftool_dump_prog_stream), &opts);
+		if (ring_buffer__consume_n(ringbuf, skel->bss->written_count) != skel->bss->written_count) {
+			ret = -EINVAL;
+			goto end;
+		}
+	} while (!ret && opts.retval == EAGAIN);
+
+	if (opts.retval != 0)
+		ret = -EINVAL;
+end:
+	stream_bpf__destroy(skel);
+	return ret;
+}
+
+static int do_tracelog_any(int argc, char **argv)
+{
+	enum prog_tracelog_mode mode;
+	struct bpf_prog_info info = {};
+	__u32 info_len = sizeof(info);
+	int fd, err;
+
+	if (argc == 0)
+		return do_tracelog(argc, argv);
+	if (!is_prefix(*argv, "stdout") && !is_prefix(*argv, "stderr"))
+		usage();
+	mode = is_prefix(*argv, "stdout") ? TRACE_STDOUT : TRACE_STDERR;
+	NEXT_ARG();
+
+	if (!REQ_ARGS(2))
+		return -1;
+
+	fd = prog_parse_fd(&argc, &argv);
+	if (fd < 0)
+		return -1;
+
+	err = bpf_prog_get_info_by_fd(fd, &info, &info_len);
+	if (err < 0)
+		return -1;
+
+	return prog_tracelog_stream(&info, mode);
+}
+
 static int check_single_stdin(char *file_data_in, char *file_ctx_in)
 {
 	if (file_data_in && file_ctx_in &&
@@ -2483,6 +2568,7 @@ static int do_help(int argc, char **argv)
 		"                         [repeat N]\n"
 		"       %1$s %2$s profile PROG [duration DURATION] METRICs\n"
 		"       %1$s %2$s tracelog\n"
+		"       %1$s %2$s tracelog { stdout | stderr } PROG\n"
 		"       %1$s %2$s help\n"
 		"\n"
 		"       " HELP_SPEC_MAP "\n"
@@ -2522,7 +2608,7 @@ static const struct cmd cmds[] = {
 	{ "loadall",	do_loadall },
 	{ "attach",	do_attach },
 	{ "detach",	do_detach },
-	{ "tracelog",	do_tracelog },
+	{ "tracelog",	do_tracelog_any },
 	{ "run",	do_run },
 	{ "profile",	do_profile },
 	{ 0 }
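The user-space loop above and the skeleton below share a small protocol: the
syscall program returns EAGAIN when the ring buffer may run out of space, and
written_count tells user space how many samples to consume before re-running.
In sketch form (pseudocode summarizing both sides of this patch):

/*
 * Dump protocol sketch:
 *   do {
 *       written_count = written_size = 0;
 *       run bpftool_dump_prog_stream;              // fills ringbuf, sets counters
 *       ring_buffer__consume_n(rb, written_count); // drain exactly that many
 *   } while (retval == EAGAIN);                    // elements remained in stream
 */
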
diff --git a/tools/bpf/bpftool/skeleton/stream.bpf.c b/tools/bpf/bpftool/skeleton/stream.bpf.c
new file mode 100644
index 0000000000000..9103159591448
--- /dev/null
+++ b/tools/bpf/bpftool/skeleton/stream.bpf.c
@@ -0,0 +1,69 @@
+// SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+/* Copyright (c) 2025 Meta Platforms, Inc. and affiliates. */
+#include <vmlinux.h>
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_tracing.h>
+
+struct {
+	__uint(type, BPF_MAP_TYPE_RINGBUF);
+	__uint(max_entries, 1024 * 1024);
+} ringbuf SEC(".maps");
+
+int written_size;
+int written_count;
+int stream_id;
+int prog_id;
+
+#define ENOENT 2
+#define EAGAIN 11
+#define EFAULT 14
+
+SEC("syscall")
+int bpftool_dump_prog_stream(void *ctx)
+{
+	struct bpf_stream_elem *elem;
+	struct bpf_stream *stream;
+	bool cont = false;
+	int ret = 0;
+
+	stream = bpf_prog_stream_get(stream_id, prog_id);
+	if (!stream)
+		return ENOENT;
+
+	bpf_repeat(BPF_MAX_LOOPS) {
+		struct bpf_dynptr dst_dptr, src_dptr;
+		int size;
+
+		elem = bpf_stream_next_elem(stream);
+		if (!elem)
+			break;
+		size = elem->mem_slice.len;
+
+		if (bpf_dynptr_from_mem_slice(&elem->mem_slice, 0, &src_dptr))
+			ret = EFAULT;
+		if (bpf_ringbuf_reserve_dynptr(&ringbuf, size, 0, &dst_dptr))
+			ret = EFAULT;
+		if (bpf_dynptr_copy(&dst_dptr, 0, &src_dptr, 0, size))
+			ret = EFAULT;
+		bpf_ringbuf_submit_dynptr(&dst_dptr, 0);
+
+		written_count++;
+		written_size += size;
+
+		bpf_stream_free_elem(elem);
+
+		/* Probe and exit if no more space, probe for twice the typical size. */
+		if (bpf_ringbuf_reserve_dynptr(&ringbuf, 2048, 0, &dst_dptr))
+			cont = true;
+		bpf_ringbuf_discard_dynptr(&dst_dptr, 0);
+
+		if (ret || cont)
+			break;
+	}
+
+	bpf_prog_stream_put(stream);
+
+	return ret ? ret : (cont ? EAGAIN : 0);
+}
+
+char _license[] SEC("license") = "Dual BSD/GPL";
diff --git a/tools/lib/bpf/bpf_helpers.h b/tools/lib/bpf/bpf_helpers.h
index a50773d4616e6..1a748c21e358a 100644
--- a/tools/lib/bpf/bpf_helpers.h
+++ b/tools/lib/bpf/bpf_helpers.h
@@ -314,17 +314,47 @@ enum libbpf_tristate {
 			    ___param, sizeof(___param));		\
 })
 
+struct bpf_stream;
+
+extern struct bpf_stream *bpf_stream_get(int stream_id, void *aux__ign) __weak __ksym;
+extern int bpf_stream_vprintk(struct bpf_stream *stream, const char *fmt__str, const void *args,
+			      __u32 len__sz) __weak __ksym;
+
+#define __bpf_stream_vprintk(stream, fmt, args...)			\
+({									\
+	static const char ___fmt[] = fmt;				\
+	unsigned long long ___param[___bpf_narg(args)];			\
+									\
+	_Pragma("GCC diagnostic push")					\
+	_Pragma("GCC diagnostic ignored \"-Wint-conversion\"")		\
+	___bpf_fill(___param, args);					\
+	_Pragma("GCC diagnostic pop")					\
+									\
+	int ___id = stream;						\
+	struct bpf_stream *___sptr = bpf_stream_get(___id, NULL);	\
+	if (___sptr)							\
+		bpf_stream_vprintk(___sptr, ___fmt, ___param, sizeof(___param));\
+})
+
 /* Use __bpf_printk when bpf_printk call has 3 or fewer fmt args
- * Otherwise use __bpf_vprintk
+ * Otherwise use __bpf_vprintk. Virtualize choices so stream printk
+ * can override it to bpf_stream_vprintk.
  */
-#define ___bpf_pick_printk(...) \
-	___bpf_nth(_, ##__VA_ARGS__, __bpf_vprintk, __bpf_vprintk, __bpf_vprintk,	\
-		   __bpf_vprintk, __bpf_vprintk, __bpf_vprintk, __bpf_vprintk,		\
-		   __bpf_vprintk, __bpf_vprintk, __bpf_printk /*3*/, __bpf_printk /*2*/,\
-		   __bpf_printk /*1*/, __bpf_printk /*0*/)
+#define ___bpf_pick_printk(choice, choice_3, ...)			\
+	___bpf_nth(_, ##__VA_ARGS__, choice, choice, choice,		\
+		   choice, choice, choice, choice,			\
+		   choice, choice, choice_3 /*3*/, choice_3 /*2*/,	\
+		   choice_3 /*1*/, choice_3 /*0*/)
 
 /* Helper macro to print out debug messages */
-#define bpf_printk(fmt, args...) ___bpf_pick_printk(args)(fmt, ##args)
+#define __bpf_trace_printk(fmt, args...)				\
+	___bpf_pick_printk(__bpf_vprintk, __bpf_printk, args)(fmt, ##args)
+#define __bpf_stream_printk(stream, fmt, args...)			\
+	___bpf_pick_printk(__bpf_stream_vprintk, __bpf_stream_vprintk, args)(stream, fmt, ##args)
+
+#define bpf_stream_printk(stream, fmt, args...) __bpf_stream_printk(stream, fmt, ##args)
+
+#define bpf_printk(arg, args...) __bpf_trace_printk(arg, ##args)
 
 struct bpf_iter_num;
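With the helper declarations above in place, BPF-side usage is a one-liner; a
minimal sketch (program and section names are illustrative, the pattern
mirrors the selftests below):

/* Minimal sketch of the new macro; see also progs/stream.c below. */
SEC("syscall")
int use_stream(void *ctx)
{
	bpf_stream_printk(BPF_STDOUT, "hello %d\n", 42);
	return 0;
}
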
diff --git a/tools/testing/selftests/bpf/prog_tests/stream.c b/tools/testing/selftests/bpf/prog_tests/stream.c
new file mode 100644
index 0000000000000..7b97b783ff1f3
--- /dev/null
+++ b/tools/testing/selftests/bpf/prog_tests/stream.c
@@ -0,0 +1,95 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2025 Meta Platforms, Inc. and affiliates. */
+#include <test_progs.h>
+#include <sys/mman.h>
+
+#include "stream.skel.h"
+#include "stream_fail.skel.h"
+
+#include "stream_bpftool.skel.h"
+
+void test_stream_failure(void)
+{
+	RUN_TESTS(stream_fail);
+}
+
+void test_stream_success(void)
+{
+	RUN_TESTS(stream);
+	RUN_TESTS(stream_bpftool);
+	return;
+}
+
+typedef int (*sample_cb_t)(void *, void *, size_t);
+
+static void stream_ringbuf_output(int prog_id, sample_cb_t sample_cb)
+{
+	LIBBPF_OPTS(bpf_test_run_opts, opts);
+	struct ring_buffer *ringbuf;
+	struct stream_bpftool *skel;
+	int fd, ret;
+
+	skel = stream_bpftool__open_and_load();
+	if (!ASSERT_OK_PTR(skel, "stream_bpftool_open_and_load"))
+		return;
+
+	fd = bpf_map__fd(skel->maps.ringbuf);
+
+	ringbuf = ring_buffer__new(fd, sample_cb, NULL, NULL);
+	if (!ASSERT_OK_PTR(ringbuf, "ringbuf_new"))
+		goto end;
+
+	skel->bss->prog_id = prog_id;
+	skel->bss->stream_id = 1;
+	do {
+		skel->bss->written_count = skel->bss->written_size = 0;
+		ret = bpf_prog_test_run_opts(bpf_program__fd(skel->progs.bpftool_dump_prog_stream), &opts);
+		if (ret)
+			break;
+		ret = ring_buffer__consume_n(ringbuf, skel->bss->written_count);
+		if (!ASSERT_EQ(ret, skel->bss->written_count, "consume"))
+			break;
+		ret = 0;
+	} while (opts.retval == EAGAIN);
+
+	ASSERT_OK(ret, "ret");
+	ASSERT_EQ(opts.retval, 0, "retval");
+
+end:
+	stream_bpftool__destroy(skel);
+}
+
+static int cnt;
+
+static int process_sample(void *ctx, void *data, size_t len)
+{
+	char buf[64];
+
+	snprintf(buf, sizeof(buf), "num=%d\n", cnt++);
+	ASSERT_TRUE(strcmp(buf, (char *)data) == 0, "sample strcmp");
+	return 0;
+}
+
+void test_stream_output(void)
+{
+	LIBBPF_OPTS(bpf_test_run_opts, opts);
+	struct bpf_prog_info info = {};
+	__u32 info_len = sizeof(info);
+	struct stream *skel;
+	int ret;
+
+	skel = stream__open_and_load();
+	if (!ASSERT_OK_PTR(skel, "stream__open_and_load"))
+		return;
+
+	ASSERT_OK(bpf_prog_get_info_by_fd(bpf_program__fd(skel->progs.stream_test_output), &info, &info_len), "get info");
+	ret = bpf_prog_test_run_opts(bpf_program__fd(skel->progs.stream_test_output), &opts);
+	ASSERT_OK(ret, "ret");
+	ASSERT_OK(opts.retval, "retval");
+	stream_ringbuf_output(info.id, process_sample);
+
+	ASSERT_EQ(cnt, 1000, "cnt");
+
+	stream__destroy(skel);
+	return;
+}
diff --git a/tools/testing/selftests/bpf/progs/stream.c b/tools/testing/selftests/bpf/progs/stream.c
new file mode 100644
index 0000000000000..14cb8690824fa
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/stream.c
@@ -0,0 +1,127 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2025 Meta Platforms, Inc. and affiliates. */
+#include <vmlinux.h>
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_tracing.h>
+#include "bpf_misc.h"
+
+#define _STR "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"
+
+#define STREAM_STR (u64)(_STR _STR _STR _STR)
+
+static __noinline int stream_exercise(int id, int N)
+{
+	struct bpf_stream_elem *elem, *earr[56] = {};
+	struct bpf_stream *stream;
+	int ret;
+	u32 i;
+
+	if (N > 56)
+		return 56;
+
+	stream = bpf_stream_get(id, NULL);
+	if (!stream)
+		return 1;
+	for (i = 0; i < N; i++) {
+		ret = bpf_stream_vprintk(stream, "%llu%s", &(u64[]){i, STREAM_STR}, 16);
+		if (ret < 0) {
+			bpf_printk("bpf_stream_vprintk ret=%d", ret);
+			return 2;
+		}
+	}
+	ret = 0;
+	for (i = 0; i < N; i++) {
+		elem = bpf_stream_next_elem(stream);
+		if (!elem) {
+			ret = 4;
+			break;
+		}
+		earr[i] = elem;
+	}
+	elem = bpf_stream_next_elem(stream);
+	if (elem) {
+		bpf_stream_free_elem(elem);
+		ret = 5;
+	}
+	for (i = 0; i < N; i++)
+		if (earr[i])
+			bpf_stream_free_elem(earr[i]);
+	return ret;
+}
+
+static __noinline int stream_exercise_nums(int id)
+{
+	int ret = 0;
+
+	ret = ret ?: stream_exercise(id, 56);
+	ret = ret ?: stream_exercise(id, 42);
+	ret = ret ?: stream_exercise(id, 28);
+	ret = ret ?: stream_exercise(id, 10);
+	ret = ret ?: stream_exercise(id, 1);
+
+	return ret;
+}
+
+SEC("syscall")
+__success __retval(0)
+int stream_test(void *ctx)
+{
+	unsigned long flags;
+	int ret;
+
+	bpf_local_irq_save(&flags);
+	bpf_repeat(50) {
+		ret = stream_exercise_nums(BPF_STDOUT);
+		if (ret)
+			break;
+	}
+	if (ret) {
+		bpf_local_irq_restore(&flags);
+		return ret;
+	}
+	bpf_repeat(100) {
+		ret = stream_exercise_nums(BPF_STDERR);
+		if (ret)
+			break;
+	}
+	bpf_local_irq_restore(&flags);
+
+	if (ret)
+		return ret;
+
+	ret = stream_exercise_nums(BPF_STDOUT);
+	if (ret)
+		return ret;
+	return stream_exercise_nums(BPF_STDERR);
+}
+
+SEC("syscall")
+__success __retval(0)
+int stream_test_output(void *ctx)
+{
+	for (int i = 0; i < 1000; i++)
+		bpf_stream_printk(BPF_STDOUT, "num=%d\n", i);
+	return 0;
+}
+
+SEC("syscall")
+__success __retval(0)
+int stream_test_limit(void *ctx)
+{
+	struct bpf_stream *stream;
+	bool failed = false;
+
+	stream = bpf_stream_get(BPF_STDOUT, NULL);
+	if (!stream)
+		return 2;
+
+	bpf_repeat(BPF_MAX_LOOPS) {
+		failed = bpf_stream_vprintk(stream, "%s%s", &(u64[]){STREAM_STR, STREAM_STR}, 16) != 0;
+		if (failed)
+			break;
+	}
+
+	if (failed)
+		return 0;
+	return 1;
+}
+
+char _license[] SEC("license") = "GPL";
diff --git a/tools/testing/selftests/bpf/progs/stream_bpftool.c b/tools/testing/selftests/bpf/progs/stream_bpftool.c
new file mode 120000
index 0000000000000..5904c0d92edc5
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/stream_bpftool.c
@@ -0,0 +1 @@
+../../../../bpf/bpftool/skeleton/stream.bpf.c
\ No newline at end of file
diff --git a/tools/testing/selftests/bpf/progs/stream_fail.c b/tools/testing/selftests/bpf/progs/stream_fail.c
new file mode 100644
index 0000000000000..50f70b9878b8b
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/stream_fail.c
@@ -0,0 +1,90 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2025 Meta Platforms, Inc. and affiliates. */
+#include <vmlinux.h>
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_tracing.h>
+#include <bpf/bpf_core_read.h>
+#include "bpf_misc.h"
+
+SEC("syscall")
+__failure __msg("R1 type=trusted_ptr_or_null_ expected=")
+int stream_get_trusted(void *ctx)
+{
+	struct bpf_stream *stream;
+
+	stream = bpf_stream_get(BPF_STDOUT, NULL);
+	bpf_this_cpu_ptr(stream);
+	return 0;
+}
+
+SEC("tc")
+__failure __msg("calling kernel function bpf_prog_stream_get is not allowed")
+int stream_get_prog_fail(void *ctx)
+{
+	struct bpf_stream *stream;
+
+	stream = bpf_prog_stream_get(BPF_STDOUT, 0);
+	if (!stream)
+		return 0;
+	bpf_this_cpu_ptr(stream);
+	return 0;
+}
+
+SEC("syscall")
+__failure __msg("R1 type=ptr_or_null_ expected=")
+int stream_get_prog_trusted(void *ctx)
+{
+	struct bpf_stream *stream;
+
+	stream = bpf_prog_stream_get(BPF_STDOUT, 0);
+	bpf_this_cpu_ptr(stream);
+	return 0;
+}
+
+SEC("syscall")
+__failure __msg("Unreleased reference")
+int stream_get_put_missing(void *ctx)
+{
+	struct bpf_stream *stream;
+
+	stream = bpf_prog_stream_get(BPF_STDOUT, 0);
+	if (!stream)
+		return 0;
+	return 0;
+}
+
+SEC("syscall")
+__failure __msg("R1 must be referenced or trusted")
+int stream_next_untrusted_arg(void *ctx)
+{
+	struct bpf_stream *stream;
+
+	stream = bpf_core_cast((void *)0xdeadbeef, typeof(*stream));
+	bpf_stream_next_elem(stream);
+	return 0;
+}
+
+SEC("syscall")
+__failure __msg("Possibly NULL pointer passed")
+int stream_next_null_arg(void *ctx)
+{
+	bpf_stream_next_elem(NULL);
+	return 0;
+}
+
+SEC("syscall")
+__failure __msg("R1 must be referenced or trusted")
+int stream_vprintk_untrusted_arg(void *ctx)
+{
+	struct bpf_stream *stream;
+
+	stream = bpf_core_cast((void *)0xfaceb00c, typeof(*stream));
+	bpf_stream_vprintk(stream, "", NULL, 0);
+	return 0;
+}
+
+SEC("syscall")
+__failure __msg("Possibly NULL pointer passed")
+int stream_vprintk_null_arg(void *ctx)
+{
+	bpf_stream_vprintk(NULL, "", NULL, 0);
+	return 0;
+}
+
+char _license[] SEC("license") = "GPL";