From 1ac134c9236946aebf51e4aad9fccf5238d9d649 Mon Sep 17 00:00:00 2001
From: Kyle Huey
Date: Sun, 21 Apr 2024 17:25:56 -0700
Subject: [PATCH] Add a path for BPF-accelerated async signal emulation.

Starting in kernel 6.10, BPF filters can choose whether or not to trigger
the SIGIO behavior for a perf event that becomes readable. We combine that
with a hardware breakpoint and a BPF filter that matches the GPRs to produce
an accelerated internal breakpoint type that can fast-forward through loop
iterations to deliver async signals. On one trace this reduced rr's replay
overhead by 94%.

This adds a runtime dependency on libbpf and a compile-time dependency on
clang with --target bpf. rr also needs CAP_BPF and CAP_PERFMON to use this
feature. Because of all of that, this isn't really suitable for wide use at
this point and is instead gated behind a CMake option, bpf. Set -Dbpf=ON to
test it.
---
 CMakeLists.txt               |  26 ++++-
 src/PerfCounters.cc          | 186 +++++++++++++++++++++++++++++++++++
 src/PerfCounters.h           |  22 +++++
 src/ReplaySession.cc         |  37 ++++---
 src/Task.cc                  |   3 +
 src/bpf/async_event_filter.c |  65 ++++++++++++
 src/fast_forward.cc          |   8 +-
 src/fast_forward.h           |   8 ++
 8 files changed, 339 insertions(+), 16 deletions(-)
 create mode 100644 src/bpf/async_event_filter.c

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 0327d605641..15158548e33 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -237,6 +237,16 @@ if(NOT ANDROID)
   add_definitions(-DZSTD=1)
 endif()
 
+option(bpf "Enable bpf acceleration")
+
+if(bpf)
+  add_definitions(-DBPF=1)
+  set(REQUIRED_LIBS
+    ${REQUIRED_LIBS}
+    libbpf
+  )
+endif(bpf)
+
 foreach(required_lib ${REQUIRED_LIBS})
   string(TOUPPER ${required_lib} PKG)
   if(NOT SKIP_PKGCONFIG)
@@ -692,6 +702,19 @@ post_build_executable(rr)
 set(RR_BIN rr)
 add_dependencies(rr Generated)
 
+if(bpf)
+  add_custom_command(OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/share/rr/async_event_filter.o
+                     DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/src/bpf/async_event_filter.c
+                     COMMAND clang -g -target bpf -Wall -O2 -c ${CMAKE_CURRENT_SOURCE_DIR}/src/bpf/async_event_filter.c -o ${CMAKE_CURRENT_BINARY_DIR}/share/rr/async_event_filter.o)
+
+  install(FILES ${CMAKE_CURRENT_BINARY_DIR}/share/rr/async_event_filter.o
+          DESTINATION ${CMAKE_INSTALL_DATADIR}/rr)
+
+  add_custom_target(BPF DEPENDS ${CMAKE_CURRENT_BINARY_DIR}/share/rr/async_event_filter.o)
+
+  add_dependencies(rr BPF)
+endif()
+
 option(strip "Strip debug info from rr binary")
 
 set(RR_MAIN_LINKER_FLAGS ${LINKER_FLAGS})
@@ -724,6 +747,7 @@
 target_link_libraries(rr
   ${CMAKE_DL_LIBS}
   ${ZLIB_LDFLAGS}
+  ${LIBBPF_LDFLAGS}
   brotli
 )
 
@@ -733,7 +757,7 @@ endif()
 
 if(staticlibs)
   # Urgh ... this might not work for everyone, but there doesn't seem to be
-  # a way to persuade pkg-confing/pkg_check_modules to produce the right flags
+  # a way to persuade pkg-config/pkg_check_modules to produce the right flags
   target_link_libraries(rr -L/home/roc/lib -l:libcapnp.a -l:libkj.a)
   # Note that this works for both clang++ and g++
   set(RR_MAIN_LINKER_FLAGS "-static-libstdc++ ${RR_MAIN_LINKER_FLAGS}")
diff --git a/src/PerfCounters.cc b/src/PerfCounters.cc
index 85ef2fd9c7b..294776e6558 100644
--- a/src/PerfCounters.cc
+++ b/src/PerfCounters.cc
@@ -17,6 +17,11 @@
 #include
 #include
 
+#ifdef BPF
+#include <bpf/bpf.h>
+#include <bpf/libbpf.h>
+#endif
+
 #include
 #include
 #include
@@ -981,6 +986,7 @@ void PerfCounters::close() {
   fd_minus_ticks_measure.close();
   fd_useless_counter.close();
   fd_ticks_in_transaction.close();
+  fd_async_signal_accelerator.close();
 }
 
 Ticks PerfCounters::stop(Task* t, Error* error) {
@@ -1007,6 +1013,7 @@
     if (pt_state) {
       infallible_perf_event_disable_if_open(pt_state->pt_perf_event_fd);
     }
+    infallible_perf_event_disable_if_open(fd_async_signal_accelerator);
   }
   return ticks;
 }
@@ -1120,4 +1127,183 @@ Ticks PerfCounters::read_ticks(Task* t, Error* error) {
   return ret;
 }
 
+#ifdef BPF
+class BpfAccelerator {
+public:
+  static std::shared_ptr<BpfAccelerator> get_or_create();
+
+  ScopedFd create_counter(pid_t tid);
+  void match_regs_and_open_counter(const Registers& regs, ScopedFd& counter);
+  uint64_t skips() const {
+    return *bpf_skips;
+  }
+
+  // Can't be private because of make_shared.
+  BpfAccelerator(struct bpf_object* bpf_obj, int bpf_prog_fd,
+                 user_regs_struct* bpf_regs, uint64_t* bpf_skips)
+    : bpf_obj(bpf_obj), bpf_prog_fd(bpf_prog_fd), bpf_regs(bpf_regs), bpf_skips(bpf_skips)
+  {}
+
+  ~BpfAccelerator() {
+    munmap(bpf_skips, 4096);
+    munmap(bpf_regs, 4096);
+    bpf_object__close(bpf_obj);
+  }
+
+private:
+  static std::shared_ptr<BpfAccelerator> singleton;
+
+  struct perf_event_attr attr;
+  struct bpf_object* bpf_obj;
+  // Not a ScopedFd because the bpf_object maintains ownership.
+  int bpf_prog_fd;
+  user_regs_struct* bpf_regs;
+  uint64_t* bpf_skips;
+};
+
+std::shared_ptr<BpfAccelerator> BpfAccelerator::singleton;
+
+/* static */ std::shared_ptr<BpfAccelerator> BpfAccelerator::get_or_create() {
+  static int initialized;
+  if (BpfAccelerator::singleton) {
+    return BpfAccelerator::singleton;
+  }
+
+  if (!initialized) {
+    initialized = -1;
+
+    libbpf_set_strict_mode(LIBBPF_STRICT_DIRECT_ERRS);
+    string path = resource_path() + "share/rr/async_event_filter.o";
+    struct bpf_object* obj = bpf_object__open(path.c_str());
+    if ((intptr_t)obj <= 0) {
+      LOG(error) << "Failed to find bpf at " << path;
+      return nullptr;
+    }
+    if (bpf_object__load(obj) < 0) {
+      LOG(error) << "Failed to load bpf at " << path << " into the kernel. Do we have permissions?";
+      bpf_object__close(obj);
+      return nullptr;
+    }
+    int bpf_map_fd = bpf_object__find_map_fd_by_name(obj, "registers");
+    if (bpf_map_fd < 0) {
+      CLEAN_FATAL() << "rr's bpf at " << path << " is corrupt";
+      return nullptr;
+    }
+    struct bpf_program* prog = bpf_program__next(NULL, obj);
+    if (!prog) {
+      CLEAN_FATAL() << "rr's bpf at " << path << " is corrupt";
+      return nullptr;
+    }
+    int bpf_prog_fd = bpf_program__fd(prog);
+    if (bpf_prog_fd < 0) {
+      CLEAN_FATAL() << "rr's bpf at " << path << " is corrupt";
+      return nullptr;
+    }
+
+    auto bpf_regs = (struct user_regs_struct*)
+        mmap(NULL, 4096, PROT_READ | PROT_WRITE,
+             MAP_SHARED, bpf_map_fd, 0);
+    if (bpf_regs == MAP_FAILED) {
+      CLEAN_FATAL() << "Failed to mmap bpf maps";
+      return nullptr;
+    }
+
+    bpf_map_fd = bpf_object__find_map_fd_by_name(obj, "skips");
+    if (bpf_map_fd < 0) {
+      CLEAN_FATAL() << "rr's bpf at " << path << " is corrupt";
+      return nullptr;
+    }
+
+    auto bpf_skips = (uint64_t*)
+        mmap(NULL, 4096, PROT_READ | PROT_WRITE,
+             MAP_SHARED, bpf_map_fd, 0);
+    if (bpf_skips == MAP_FAILED) {
+      CLEAN_FATAL() << "Failed to mmap bpf maps";
+      return nullptr;
+    }
+
+    BpfAccelerator::singleton =
+        std::make_shared<BpfAccelerator>(obj, bpf_prog_fd, bpf_regs, bpf_skips);
+    memset(&singleton->attr, 0, sizeof(singleton->attr));
+    singleton->attr.type = PERF_TYPE_BREAKPOINT;
+    singleton->attr.size = sizeof(attr);
+    singleton->attr.bp_type = HW_BREAKPOINT_X;
+    singleton->attr.bp_len = sizeof(long);
+    singleton->attr.sample_period = 1;
+    singleton->attr.sample_type = PERF_SAMPLE_IP;
+    singleton->attr.pinned = 1;
+    singleton->attr.exclude_kernel = 1;
+    singleton->attr.exclude_hv = 1;
+    singleton->attr.wakeup_events = 1;
+    singleton->attr.precise_ip = 3;
+    singleton->attr.disabled = 1;
+    initialized = 1;
+  }
+
+  return BpfAccelerator::singleton;
+}
+
+ScopedFd BpfAccelerator::create_counter(pid_t tid) {
+  attr.bp_addr = 0;
+  ScopedFd fd = start_counter(tid, -1, &attr);
+
+  struct f_owner_ex own;
+  own.type = F_OWNER_TID;
+  own.pid = tid;
+  if (fcntl(fd, F_SETOWN_EX, &own)) {
+    FATAL() << "Failed to SETOWN_EX bpf-accelerated breakpoint fd";
+  }
+
+  make_counter_async(fd, SIGTRAP);
+
+  if (ioctl(fd, PERF_EVENT_IOC_SET_BPF, bpf_prog_fd)) {
+    FATAL() << "Failed PERF_EVENT_IOC_SET_BPF";
+  }
+
+  return fd;
+}
+
+void BpfAccelerator::match_regs_and_open_counter(const Registers& regs, ScopedFd& fd) {
+  attr.bp_addr = regs.ip().register_value();
+  if (ioctl(fd, PERF_EVENT_IOC_MODIFY_ATTRIBUTES, &attr)) {
+    FATAL() << "Failed PERF_EVENT_IOC_MODIFY_ATTRIBUTES";
+  }
+
+  auto r = regs.get_ptrace();
+  memcpy(bpf_regs, &r, sizeof(struct user_regs_struct));
+  *bpf_skips = 0;
+
+  infallible_perf_event_enable_if_open(fd);
+}
+
+bool PerfCounters::accelerate_async_signal(const Registers& regs) {
+  if (!fd_async_signal_accelerator.is_open()) {
+    if (!bpf) {
+      bpf = BpfAccelerator::get_or_create();
+    }
+
+    if (!bpf) {
+      return false;
+    }
+
+    fd_async_signal_accelerator = bpf->create_counter(tid);
+  }
+
+  if (!fd_async_signal_accelerator.is_open()) {
+    return false;
+  }
+
+  bpf->match_regs_and_open_counter(regs, fd_async_signal_accelerator);
+  return true;
+}
+
+uint64_t PerfCounters::bpf_skips() const {
+  if (!bpf) {
+    return 0;
+  }
+
+  return bpf->skips();
+}
+#endif
+
 } // namespace rr
diff --git a/src/PerfCounters.h b/src/PerfCounters.h
index 744c9d6daad..c9982f6a1dd 100644
--- a/src/PerfCounters.h
+++ b/src/PerfCounters.h
@@ -21,7 +21,9 @@
 
 namespace rr {
 
+class Registers;
 class Task;
+class BpfAccelerator;
 
 enum TicksSemantics {
   TICKS_RETIRED_CONDITIONAL_BRANCHES,
@@ -175,6 +177,21 @@ class PerfCounters {
    */
   static void start_pt_copy_thread();
 
+  /**
+   * Try to use BPF to accelerate async signal processing.
+   */
+#ifdef BPF
+  bool accelerate_async_signal(const Registers& regs);
+  uint64_t bpf_skips() const;
+#else
+  bool accelerate_async_signal(const Registers&) {
+    return false;
+  }
+  uint64_t bpf_skips() const {
+    return 0;
+  }
+#endif
+
 private:
   template <typename Arch> void reset_arch_extras();
 
@@ -212,6 +229,11 @@ class PerfCounters {
   // aarch64 specific counter to detect use of ll/sc instructions
   ScopedFd fd_strex_counter;
 
+  // BPF-enabled hardware breakpoint for fast async signal emulation.
+  ScopedFd fd_async_signal_accelerator;
+
+  std::shared_ptr<BpfAccelerator> bpf;
+
   std::unique_ptr<PTState> pt_state;
 
   TicksSemantics ticks_semantics_;
diff --git a/src/ReplaySession.cc b/src/ReplaySession.cc
index 40da2d5d009..6c42c896a56 100644
--- a/src/ReplaySession.cc
+++ b/src/ReplaySession.cc
@@ -982,6 +982,7 @@ Completion ReplaySession::emulate_async_signal(
    * be dealt with. */
   bool pending_SIGTRAP = false;
   bool did_set_internal_breakpoints = false;
+  bool did_set_bpf_breakpoint = false;
   RunCommand SIGTRAP_run_command = RUN_CONTINUE;
 
   /* Step 2: more slowly, find our way to the target ticks and
@@ -1042,17 +1043,21 @@
         // breakpoint instruction in the tracee would have triggered a
         // deterministic signal instead of an async one.
         // So we must have hit our internal breakpoint.
-        ASSERT(t, did_set_internal_breakpoints);
+        ASSERT(t, did_set_internal_breakpoints || did_set_bpf_breakpoint);
         // We didn't do an internal singlestep, and if we'd done a
         // user-requested singlestep we would have hit the above case.
         ASSERT(t, !trap_reasons.singlestep);
-        if (t->ip().undo_executed_bkpt(t->arch()) == in_syscallbuf_syscall_hook) {
-          t->vm()->remove_breakpoint(ip, BKPT_INTERNAL);
-          t->vm()->remove_breakpoint(in_syscallbuf_syscall_hook, BKPT_INTERNAL);
-          t->move_ip_before_breakpoint();
-          return COMPLETE;
+        if (did_set_internal_breakpoints) {
+          if (t->ip().undo_executed_bkpt(t->arch()) == in_syscallbuf_syscall_hook) {
+            t->vm()->remove_breakpoint(ip, BKPT_INTERNAL);
+            t->vm()->remove_breakpoint(in_syscallbuf_syscall_hook, BKPT_INTERNAL);
+            t->move_ip_before_breakpoint();
+            return COMPLETE;
+          }
+          ASSERT(t, regs.ip() == t->ip().undo_executed_bkpt(t->arch()));
+        } else {
+          LOG(debug) << "  fast-forwarded through " << t->hpc.bpf_skips() << " breakpoint hits with bpf";
         }
-        ASSERT(t, regs.ip() == t->ip().undo_executed_bkpt(t->arch()));
         /* Case (1) above: cover the tracks of
          * our internal breakpoint, and go
          * check again if we're at the
@@ -1060,7 +1065,9 @@
         LOG(debug) << "  trap was for target $ip";
         pending_SIGTRAP = false;
 
-        t->move_ip_before_breakpoint();
+        if (did_set_internal_breakpoints) {
+          t->move_ip_before_breakpoint();
+        }
         /* We just backed up the $ip, but
          * rewound it over an |int $3|
          * instruction, which couldn't have
@@ -1093,6 +1100,7 @@
         }
         did_set_internal_breakpoints = false;
       }
+      did_set_bpf_breakpoint = false;
 
       if (at_target) {
         /* Case (2) above: done. */
@@ -1117,11 +1125,16 @@
        * no slower than single-stepping our way to
        * the target execution point. */
       LOG(debug) << "  breaking on target $ip";
-      t->vm()->add_breakpoint(ip, BKPT_INTERNAL);
-      if (in_syscallbuf_syscall_hook) {
-        t->vm()->add_breakpoint(in_syscallbuf_syscall_hook, BKPT_INTERNAL);
+      if (is_x86_string_instruction_at(t, ip) || !t->hpc.accelerate_async_signal(regs)) {
+        t->vm()->add_breakpoint(ip, BKPT_INTERNAL);
+
+        if (in_syscallbuf_syscall_hook) {
+          t->vm()->add_breakpoint(in_syscallbuf_syscall_hook, BKPT_INTERNAL);
+        }
+        did_set_internal_breakpoints = true;
+      } else {
+        did_set_bpf_breakpoint = true;
       }
-      did_set_internal_breakpoints = true;
       continue_or_step(t, constraints, RESUME_UNLIMITED_TICKS);
       SIGTRAP_run_command = constraints.command;
     } else {
diff --git a/src/Task.cc b/src/Task.cc
index 5ffae6915ea..728668133b8 100644
--- a/src/Task.cc
+++ b/src/Task.cc
@@ -1387,6 +1387,9 @@ TrapReasons Task::compute_trap_reasons() {
           << " expected breakpoint at " << ip_at_breakpoint
           << ", got siginfo " << si;
     }
+    // If we got a SIGTRAP via a FASYNC signal it must be our bpf-enabled
+    // hardware breakpoint.
+    reasons.breakpoint |= si.si_code == SI_SIGIO;
   }
   return reasons;
 }
diff --git a/src/bpf/async_event_filter.c b/src/bpf/async_event_filter.c
new file mode 100644
index 00000000000..7ee2f997b49
--- /dev/null
+++ b/src/bpf/async_event_filter.c
@@ -0,0 +1,65 @@
+/* -*- Mode: C++; tab-width: 8; c-basic-offset: 2; indent-tabs-mode: nil; -*- */
+
+#include <linux/bpf.h>
+#include <linux/bpf_perf_event.h>
+#include <bpf/bpf_helpers.h>
+#include <stdint.h>
+
+const uint32_t REGISTER_COUNT = sizeof(struct pt_regs)/sizeof(uint64_t);
+
+struct {
+  __uint(type, BPF_MAP_TYPE_ARRAY);
+  __uint(max_entries, REGISTER_COUNT);
+  __uint(map_flags, BPF_F_MMAPABLE);
+  __type(key, uint32_t);
+  __type(value, uint64_t);
+} registers SEC(".maps");
+
+struct {
+  __uint(type, BPF_MAP_TYPE_ARRAY);
+  __uint(max_entries, 1);
+  __uint(map_flags, BPF_F_MMAPABLE);
+  __type(key, uint32_t);
+  __type(value, uint64_t);
+} skips SEC(".maps");
+
+SEC("perf_event")
+int match_registers(struct bpf_perf_event_data* event) {
+#define CHECK_REG(name)                                                      \
+  do {                                                                       \
+    const uint32_t i = offsetof(struct pt_regs, name) / sizeof(uint64_t);    \
+    uint64_t* reg = bpf_map_lookup_elem(&registers, &i);                     \
+    if (!reg) {                                                              \
+      return 1;                                                              \
+    }                                                                        \
+    if (event->regs.name != *reg) {                                          \
+      const uint32_t j = 0;                                                  \
+      uint64_t* s = bpf_map_lookup_elem(&skips, &j);                         \
+      if (s) {                                                               \
+        *s += 1;                                                             \
+      }                                                                      \
+      return 0;                                                              \
+    }                                                                        \
+  } while(0)
+
+  CHECK_REG(r15);
+  CHECK_REG(r14);
+  CHECK_REG(r13);
+  CHECK_REG(r12);
+  CHECK_REG(rbp);
+  CHECK_REG(rbx);
+  CHECK_REG(r11);
+  CHECK_REG(r10);
+  CHECK_REG(r9);
+  CHECK_REG(r8);
+  CHECK_REG(rax);
+  CHECK_REG(rcx);
+  CHECK_REG(rdx);
+  CHECK_REG(rsi);
+  CHECK_REG(rdi);
+  CHECK_REG(rsp);
+
+  return 1;
+}
+
+char _license[] SEC("license") = "Dual MIT/GPL";
diff --git a/src/fast_forward.cc b/src/fast_forward.cc
index 1eeeb86dc1e..4bc371055eb 100644
--- a/src/fast_forward.cc
+++ b/src/fast_forward.cc
@@ -404,7 +404,8 @@ static int fallible_read_byte(Task* t, remote_ptr<uint8_t> ip) {
   return byte;
 }
 
-bool is_string_instruction_at(Task* t, remote_code_ptr ip) {
+#if defined(__i386__) || defined(__x86_64__)
+bool is_x86_string_instruction_at(Task* t, remote_code_ptr ip) {
   bool found_rep = false;
   remote_ptr<uint8_t> bare_ip = ip.to_data_ptr();
   while (true) {
@@ -421,6 +422,7 @@
     ++bare_ip;
   }
 }
+#endif
 
 static bool is_string_instruction_before(Task* t, remote_code_ptr ip) {
   remote_ptr<uint8_t> bare_ip = ip.to_data_ptr();
@@ -447,7 +449,7 @@ bool maybe_at_or_after_x86_string_instruction(Task* t) {
     return false;
   }
 
-  return is_string_instruction_at(t, t->ip()) ||
+  return is_x86_string_instruction_at(t, t->ip()) ||
          is_string_instruction_before(t, t->ip());
 }
 
@@ -456,7 +458,7 @@ bool at_x86_string_instruction(Task* t) {
     return false;
   }
 
-  return is_string_instruction_at(t, t->ip());
+  return is_x86_string_instruction_at(t, t->ip());
 }
 
 } // namespace rr
diff --git a/src/fast_forward.h b/src/fast_forward.h
index 944d93c8ffa..18620689b98 100644
--- a/src/fast_forward.h
+++ b/src/fast_forward.h
@@ -60,6 +60,14 @@ bool maybe_at_or_after_x86_string_instruction(Task* t);
 /* Return true if the instruction at t->ip() is a string instruction */
 bool at_x86_string_instruction(Task* t);
 
+#if defined(__i386__) || defined(__x86_64__)
+bool is_x86_string_instruction_at(Task* t, remote_code_ptr ip);
+#else
+inline bool is_x86_string_instruction_at(Task*, remote_code_ptr) {
+  return false;
+}
+#endif
+
 } // namespace rr
 
 #endif // RR_FAST_FORWARD_H_
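
For experimenting with the kernel mechanism outside of rr, the sketch below performs
the same perf-event plumbing as BpfAccelerator::create_counter above, but with raw
syscalls instead of rr's start_counter()/make_counter_async() helpers. It is only an
illustration under stated assumptions: the function name bpf_breakpoint_counter is
invented here, error handling is minimal, and prog_fd is assumed to be an already
loaded BPF_PROG_TYPE_PERF_EVENT program such as the async_event_filter.o object
built by the CMake rule above.

/* Standalone sketch (not part of the patch): open a hardware execution
 * breakpoint on thread `tid` at `addr`, arrange for SIGTRAP to be delivered
 * to that thread when the breakpoint fires, and attach an already-loaded
 * BPF_PROG_TYPE_PERF_EVENT program (`prog_fd`) that decides per hit whether
 * the signal is actually raised (kernel 6.10+ per the commit message). */
#define _GNU_SOURCE
#include <fcntl.h>
#include <linux/hw_breakpoint.h>
#include <linux/perf_event.h>
#include <signal.h>
#include <stdint.h>
#include <string.h>
#include <sys/ioctl.h>
#include <sys/syscall.h>
#include <sys/types.h>
#include <unistd.h>

static int bpf_breakpoint_counter(pid_t tid, uint64_t addr, int prog_fd) {
  struct perf_event_attr attr;
  memset(&attr, 0, sizeof(attr));
  attr.type = PERF_TYPE_BREAKPOINT;
  attr.size = sizeof(attr);
  attr.bp_type = HW_BREAKPOINT_X;   /* break on instruction execution */
  attr.bp_addr = addr;
  attr.bp_len = sizeof(long);
  attr.sample_period = 1;           /* overflow on every hit */
  attr.sample_type = PERF_SAMPLE_IP;
  attr.wakeup_events = 1;
  attr.precise_ip = 3;
  attr.exclude_kernel = 1;
  attr.exclude_hv = 1;

  int fd = syscall(SYS_perf_event_open, &attr, tid, -1, -1, 0);
  if (fd < 0) {
    return -1;
  }

  /* Route the async notification to the traced thread itself as SIGTRAP. */
  struct f_owner_ex own = { .type = F_OWNER_TID, .pid = tid };
  if (fcntl(fd, F_SETOWN_EX, &own) || fcntl(fd, F_SETSIG, SIGTRAP) ||
      fcntl(fd, F_SETFL, fcntl(fd, F_GETFL) | O_ASYNC)) {
    close(fd);
    return -1;
  }

  /* The attached filter runs on every overflow; when it returns 0 the event
   * is dropped, so no SIGTRAP is sent and the tracee keeps running. */
  if (ioctl(fd, PERF_EVENT_IOC_SET_BPF, prog_fd)) {
    close(fd);
    return -1;
  }

  return fd;
}

With a filter like async_event_filter.c attached, every breakpoint hit whose GPRs do
not match the recorded register file is swallowed in the kernel and merely bumps the
skips counter, while a full match lets the SIGTRAP through so replay stops on exactly
the right loop iteration; that is the fast path emulate_async_signal() takes when it
sets did_set_bpf_breakpoint instead of the int3-based internal breakpoints.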