From 743aafe93dda2095ec61f11aced7521297916f73 Mon Sep 17 00:00:00 2001 From: Kyle Huey Date: Sun, 21 Apr 2024 17:25:56 -0700 Subject: [PATCH] Add a path for BPF-accelerated async signal emulation. Starting in kernel 6.10 BPF filters can choose whether or not to trigger the SIGIO behavior for a perf event that becomes readable. We combine that with a hardware breakpoint and a BPF filter that matches the GPRs to produce an accelerated internal breakpoint type that can fast forward through loop iterations to deliver async signals. On one trace this reduced rr's replay overhead by 94%. This adds a runtime dependency on libbpf and a compile time dependency on clang --target bpf. rr also needs CAP_BPF and CAP_PERFMON to use this feature. Because of all of that, this isn't really suitable for wide use at this point and is instead a CMake feature usebpf. Set -Dusebpf=ON to test it. --- CMakeLists.txt | 26 +++++++++- src/PerfCounters.cc | 99 ++++++++++++++++++++++++++++++++++++ src/PerfCounters.h | 15 ++++++ src/ReplaySession.cc | 35 ++++++++----- src/Task.cc | 3 ++ src/bpf/async_event_filter.c | 50 ++++++++++++++++++ src/fast_forward.cc | 6 +-- src/fast_forward.h | 8 +++ 8 files changed, 226 insertions(+), 16 deletions(-) create mode 100644 src/bpf/async_event_filter.c diff --git a/CMakeLists.txt b/CMakeLists.txt index 5bca571ed01..de5873ba185 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -225,6 +225,16 @@ set(REQUIRED_LIBS zlib ) +option(usebpf "Enable bpf acceleration") + +if(usebpf) + add_definitions(-DUSEBPF=1) + set(REQUIRED_LIBS + ${REQUIRED_LIBS} + libbpf + ) +endif(usebpf) + foreach(required_lib ${REQUIRED_LIBS}) string(TOUPPER ${required_lib} PKG) if(NOT SKIP_PKGCONFIG) @@ -679,6 +689,19 @@ post_build_executable(rr) set(RR_BIN rr) add_dependencies(rr Generated) +if(usebpf) + add_custom_command(OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/share/rr/async_event_filter.o + DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/src/bpf/async_event_filter.c + COMMAND clang -g -target bpf -Wall -O2 -c ${CMAKE_CURRENT_SOURCE_DIR}/src/bpf/async_event_filter.c -o ${CMAKE_CURRENT_BINARY_DIR}/share/rr/async_event_filter.o) + + install(FILES ${CMAKE_CURRENT_BINARY_DIR}/share/rr/async_event_filter.o + DESTINATION ${CMAKE_INSTALL_DATADIR}/rr) + + add_custom_target(BPF DEPENDS ${CMAKE_CURRENT_BINARY_DIR}/share/rr/async_event_filter.o) + + add_dependencies(rr BPF) +endif() + option(strip "Strip debug info from rr binary") set(RR_MAIN_LINKER_FLAGS ${LINKER_FLAGS}) @@ -711,12 +734,13 @@ endif() target_link_libraries(rr ${CMAKE_DL_LIBS} ${ZLIB_LDFLAGS} + ${LIBBPF_LDFLAGS} brotli ) if(staticlibs) # Urgh ... 
this might not work for everyone, but there doesn't seem to be - # a way to persuade pkg-confing/pkg_check_modules to produce the right flags + # a way to persuade pkg-config/pkg_check_modules to produce the right flags target_link_libraries(rr -L/home/roc/lib -l:libcapnp.a -l:libkj.a) # Note that this works for both clang++ and g++ set(RR_MAIN_LINKER_FLAGS "-static-libstdc++ ${RR_MAIN_LINKER_FLAGS}") diff --git a/src/PerfCounters.cc b/src/PerfCounters.cc index 31d2ae7f665..f98d43d22f3 100644 --- a/src/PerfCounters.cc +++ b/src/PerfCounters.cc @@ -17,6 +17,11 @@ #include #include +#ifdef USEBPF +#include +#include +#endif + #include #include #include @@ -954,6 +959,7 @@ void PerfCounters::close() { fd_minus_ticks_measure.close(); fd_useless_counter.close(); fd_ticks_in_transaction.close(); + fd_async_signal_accelerator.close(); } Ticks PerfCounters::stop(Task* t, Error* error) { @@ -980,6 +986,7 @@ Ticks PerfCounters::stop(Task* t, Error* error) { if (pt_state) { infallible_perf_event_disable_if_open(pt_state->pt_perf_event_fd); } + infallible_perf_event_disable_if_open(fd_async_signal_accelerator); } return ticks; } @@ -1090,4 +1097,96 @@ Ticks PerfCounters::read_ticks(Task* t, Error* error) { return ret; } +#ifdef USEBPF +bool PerfCounters::accelerate_async_signal(const Registers& regs) { + static int initialized; + static struct perf_event_attr attr; + static int bpf_prog_fd; + static struct user_regs_struct* bpf_regs; + + if (!fd_async_signal_accelerator.is_open()) { + if (!initialized) { + initialized = -1; + + attr.type = PERF_TYPE_BREAKPOINT; + attr.size = sizeof(attr); + attr.bp_type = HW_BREAKPOINT_X; + attr.bp_len = sizeof(long); + attr.sample_period = 1; + attr.sample_type = PERF_SAMPLE_IP; + attr.pinned = 1; + attr.exclude_kernel = 1; + attr.exclude_hv = 1; + attr.wakeup_events = 1; + attr.precise_ip = 3; + attr.disabled = 1; + + libbpf_set_strict_mode(LIBBPF_STRICT_DIRECT_ERRS); + string path = resource_path() + "share/rr/async_event_filter.o"; + struct bpf_object* obj = bpf_object__open(path.c_str()); + if ((intptr_t)obj <= 0) { + return false; + } + if (bpf_object__load(obj) < 0) { + return false; + } + int bpf_map_fd = ScopedFd(bpf_object__find_map_fd_by_name(obj, "registers")); + if (bpf_map_fd < 0) { + return false; + } + struct bpf_program* prog = bpf_program__next(NULL, obj); + if (!prog) { + return false; + } + bpf_prog_fd = bpf_program__fd(prog); + if (bpf_prog_fd < 0) { + return false; + } + + bpf_regs = (struct user_regs_struct*) + mmap(NULL, 4096, PROT_READ | PROT_WRITE, + MAP_SHARED, bpf_map_fd, 0); + if (!bpf_regs) { + return false; + } + + initialized = 1; + } else if (initialized < 0) { + return false; + } + + attr.bp_addr = 0; + fd_async_signal_accelerator = start_counter(tid, -1, &attr); + + struct f_owner_ex own; + own.type = F_OWNER_TID; + own.pid = tid; + if (fcntl(fd_async_signal_accelerator, F_SETOWN_EX, &own)) { + FATAL() << "Failed to SETOWN_EX bpf-accelerated breakpoint fd"; + } + + make_counter_async(fd_async_signal_accelerator, SIGTRAP); + + if (ioctl(fd_async_signal_accelerator, PERF_EVENT_IOC_SET_BPF, bpf_prog_fd)) { + FATAL() << "Failed PERF_EVENT_IOC_SET_BPF"; + } + } + + if (!fd_async_signal_accelerator.is_open()) { + return false; + } + + attr.bp_addr = regs.ip().register_value(); + if (ioctl(fd_async_signal_accelerator, PERF_EVENT_IOC_MODIFY_ATTRIBUTES, &attr)) { + FATAL() << "Failed PERF_EVENT_IOC_MODIFY_ATTRIBUTES"; + } + + auto r = regs.get_ptrace(); + memcpy(bpf_regs, &r, sizeof(struct user_regs_struct)); + + 
infallible_perf_event_enable_if_open(fd_async_signal_accelerator); + return true; +} +#endif + } // namespace rr diff --git a/src/PerfCounters.h b/src/PerfCounters.h index 744c9d6daad..f8bdc64cad6 100644 --- a/src/PerfCounters.h +++ b/src/PerfCounters.h @@ -21,6 +21,7 @@ namespace rr { +class Registers; class Task; enum TicksSemantics { @@ -175,6 +176,17 @@ class PerfCounters { */ static void start_pt_copy_thread(); + /** + * Try to use BPF to accelerate async signal processing + */ +#ifdef USEBPF + bool accelerate_async_signal(const Registers& regs); +#else + bool accelerate_async_signal(const Registers&) { + return false; + } +#endif + private: template void reset_arch_extras(); @@ -212,6 +224,9 @@ class PerfCounters { // aarch64 specific counter to detect use of ll/sc instructions ScopedFd fd_strex_counter; + // BPF-enabled hardware breakpoint for fast async signal emulation. + ScopedFd fd_async_signal_accelerator; + std::unique_ptr pt_state; TicksSemantics ticks_semantics_; diff --git a/src/ReplaySession.cc b/src/ReplaySession.cc index 0f041270e31..7fa20683e9e 100644 --- a/src/ReplaySession.cc +++ b/src/ReplaySession.cc @@ -982,6 +982,7 @@ Completion ReplaySession::emulate_async_signal( * be dealt with. */ bool pending_SIGTRAP = false; bool did_set_internal_breakpoints = false; + bool did_set_bpf_breakpoint = false; RunCommand SIGTRAP_run_command = RUN_CONTINUE; /* Step 2: more slowly, find our way to the target ticks and @@ -1042,17 +1043,19 @@ Completion ReplaySession::emulate_async_signal( // breakpoint instruction in the tracee would have triggered a // deterministic signal instead of an async one. // So we must have hit our internal breakpoint. - ASSERT(t, did_set_internal_breakpoints); + ASSERT(t, did_set_internal_breakpoints || did_set_bpf_breakpoint); // We didn't do an internal singlestep, and if we'd done a // user-requested singlestep we would have hit the above case. ASSERT(t, !trap_reasons.singlestep); - if (t->ip().undo_executed_bkpt(t->arch()) == in_syscallbuf_syscall_hook) { - t->vm()->remove_breakpoint(ip, BKPT_INTERNAL); - t->vm()->remove_breakpoint(in_syscallbuf_syscall_hook, BKPT_INTERNAL); - t->move_ip_before_breakpoint(); - return COMPLETE; + if (did_set_internal_breakpoints) { + if (t->ip().undo_executed_bkpt(t->arch()) == in_syscallbuf_syscall_hook) { + t->vm()->remove_breakpoint(ip, BKPT_INTERNAL); + t->vm()->remove_breakpoint(in_syscallbuf_syscall_hook, BKPT_INTERNAL); + t->move_ip_before_breakpoint(); + return COMPLETE; + } + ASSERT(t, regs.ip() == t->ip().undo_executed_bkpt(t->arch())); } - ASSERT(t, regs.ip() == t->ip().undo_executed_bkpt(t->arch())); /* Case (1) above: cover the tracks of * our internal breakpoint, and go * check again if we're at the @@ -1060,7 +1063,9 @@ Completion ReplaySession::emulate_async_signal( LOG(debug) << " trap was for target $ip"; pending_SIGTRAP = false; - t->move_ip_before_breakpoint(); + if (did_set_internal_breakpoints) { + t->move_ip_before_breakpoint(); + } /* We just backed up the $ip, but * rewound it over an |int $3| * instruction, which couldn't have @@ -1093,6 +1098,7 @@ Completion ReplaySession::emulate_async_signal( } did_set_internal_breakpoints = false; } + did_set_bpf_breakpoint = false; if (at_target) { /* Case (2) above: done. */ @@ -1117,11 +1123,16 @@ Completion ReplaySession::emulate_async_signal( * no slower than single-stepping our way to * the target execution point. 
*/ LOG(debug) << " breaking on target $ip"; - t->vm()->add_breakpoint(ip, BKPT_INTERNAL); - if (in_syscallbuf_syscall_hook) { - t->vm()->add_breakpoint(in_syscallbuf_syscall_hook, BKPT_INTERNAL); + if (is_x86_string_instruction_at(t, ip) || !t->hpc.accelerate_async_signal(regs)) { + t->vm()->add_breakpoint(ip, BKPT_INTERNAL); + + if (in_syscallbuf_syscall_hook) { + t->vm()->add_breakpoint(in_syscallbuf_syscall_hook, BKPT_INTERNAL); + } + did_set_internal_breakpoints = true; + } else { + did_set_bpf_breakpoint = true; } - did_set_internal_breakpoints = true; continue_or_step(t, constraints, RESUME_UNLIMITED_TICKS); SIGTRAP_run_command = constraints.command; } else { diff --git a/src/Task.cc b/src/Task.cc index 5ffae6915ea..728668133b8 100644 --- a/src/Task.cc +++ b/src/Task.cc @@ -1387,6 +1387,9 @@ TrapReasons Task::compute_trap_reasons() { << " expected breakpoint at " << ip_at_breakpoint << ", got siginfo " << si; } + // If we got a SIGTRAP via a FASYNC signal it must be our bpf-enabled + // hardware breakpoint. + reasons.breakpoint |= si.si_code == SI_SIGIO; } return reasons; } diff --git a/src/bpf/async_event_filter.c b/src/bpf/async_event_filter.c new file mode 100644 index 00000000000..bd5d5cbb3c0 --- /dev/null +++ b/src/bpf/async_event_filter.c @@ -0,0 +1,50 @@ +#include +#include +#include +#include + +const uint32_t REGISTER_COUNT = sizeof(struct pt_regs)/sizeof(uint64_t); + +struct { + __uint(type, BPF_MAP_TYPE_ARRAY); + __uint(max_entries, REGISTER_COUNT); + __uint(map_flags, BPF_F_MMAPABLE); + __type(key, uint32_t); + __type(value, uint64_t); +} registers SEC(".maps"); + +SEC("perf_event") +int match_registers(struct bpf_perf_event_data* event) { +#define CHECK_REG(name) \ + { \ + const uint32_t i = offsetof(struct pt_regs, name) / sizeof(uint64_t); \ + uint64_t* reg = bpf_map_lookup_elem(®isters, &i); \ + if (!reg) { \ + return 1; \ + } \ + if (event->regs.name != *reg) { \ + return 0; \ + } \ + } + + CHECK_REG(r15) + CHECK_REG(r14) + CHECK_REG(r13) + CHECK_REG(r12) + CHECK_REG(rbp) + CHECK_REG(rbx) + CHECK_REG(r11) + CHECK_REG(r10) + CHECK_REG(r9) + CHECK_REG(r8) + CHECK_REG(rax) + CHECK_REG(rcx) + CHECK_REG(rdx) + CHECK_REG(rsi) + CHECK_REG(rdi) + CHECK_REG(rip) + + return 1; +} + +char _license[] SEC("license") = "GPL"; diff --git a/src/fast_forward.cc b/src/fast_forward.cc index 1eeeb86dc1e..30531c60ee2 100644 --- a/src/fast_forward.cc +++ b/src/fast_forward.cc @@ -404,7 +404,7 @@ static int fallible_read_byte(Task* t, remote_ptr ip) { return byte; } -bool is_string_instruction_at(Task* t, remote_code_ptr ip) { +bool is_x86_string_instruction_at(Task* t, remote_code_ptr ip) { bool found_rep = false; remote_ptr bare_ip = ip.to_data_ptr(); while (true) { @@ -447,7 +447,7 @@ bool maybe_at_or_after_x86_string_instruction(Task* t) { return false; } - return is_string_instruction_at(t, t->ip()) || + return is_x86_string_instruction_at(t, t->ip()) || is_string_instruction_before(t, t->ip()); } @@ -456,7 +456,7 @@ bool at_x86_string_instruction(Task* t) { return false; } - return is_string_instruction_at(t, t->ip()); + return is_x86_string_instruction_at(t, t->ip()); } } // namespace rr diff --git a/src/fast_forward.h b/src/fast_forward.h index 944d93c8ffa..c0398a6889d 100644 --- a/src/fast_forward.h +++ b/src/fast_forward.h @@ -60,6 +60,14 @@ bool maybe_at_or_after_x86_string_instruction(Task* t); /* Return true if the instruction at t->ip() is a string instruction */ bool at_x86_string_instruction(Task* t); +#if defined(__i386__) || defined(__x86_64__) +bool 
is_x86_string_instruction_at(Task* t, remote_code_ptr ip); +#else +bool is_x86_string_instruction_at(Task*, remote_code_ptr) { + return false; +} +#endif + } // namespace rr #endif // RR_FAST_FORWARD_H_
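
For reviewers less familiar with the kernel interfaces involved, the following is a minimal standalone sketch of the plumbing that the new accelerate_async_signal() path relies on: a hardware execution breakpoint opened with perf_event_open, a BPF filter attached with PERF_EVENT_IOC_SET_BPF (whose return value, on kernels >= 6.10, decides whether the overflow/SIGIO machinery fires at all), and a BPF_F_MMAPABLE array map that the tracer mmaps and fills with the register values the filter must match. It is an illustration, not rr code: the helper name, the hard-coded "async_event_filter.o" path, the fixed 4096-byte mapping, the x86-64-only register layout, and the bare-bones error handling are assumptions made for the example.

#define _GNU_SOURCE
#include <fcntl.h>
#include <signal.h>
#include <stdint.h>
#include <string.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <sys/syscall.h>
#include <sys/types.h>
#include <sys/user.h>
#include <unistd.h>
#include <linux/hw_breakpoint.h>
#include <linux/perf_event.h>
#include <bpf/libbpf.h>

/* Arm a hardware execution breakpoint on |bp_addr| for thread |tid| that only
 * delivers a signal when the tracee's GPRs match |want|. Returns the perf
 * event fd, or -1 on failure. Requires CAP_BPF and CAP_PERFMON. */
static int breakpoint_with_bpf_filter(pid_t tid, uint64_t bp_addr,
                                      const struct user_regs_struct* want) {
  /* Load the compiled filter and find its program and "registers" map. */
  struct bpf_object* obj = bpf_object__open("async_event_filter.o");
  if (!obj || bpf_object__load(obj) < 0) return -1;
  int map_fd = bpf_object__find_map_fd_by_name(obj, "registers");
  struct bpf_program* prog = bpf_program__next(NULL, obj);
  if (map_fd < 0 || !prog) return -1;
  int prog_fd = bpf_program__fd(prog);

  /* BPF_F_MMAPABLE lets the tracer write the expected register values
   * straight into the map; no bpf() syscalls are needed when re-arming. */
  struct user_regs_struct* regs = (struct user_regs_struct*)
      mmap(NULL, 4096, PROT_READ | PROT_WRITE, MAP_SHARED, map_fd, 0);
  if (regs == MAP_FAILED) return -1;
  memcpy(regs, want, sizeof(*regs));

  /* Hardware execution breakpoint, one sample per hit, userspace only. */
  struct perf_event_attr attr;
  memset(&attr, 0, sizeof(attr));
  attr.size = sizeof(attr);
  attr.type = PERF_TYPE_BREAKPOINT;
  attr.bp_type = HW_BREAKPOINT_X;
  attr.bp_addr = bp_addr;
  attr.bp_len = sizeof(long);
  attr.sample_period = 1;
  attr.exclude_kernel = 1;
  attr.exclude_hv = 1;
  attr.wakeup_events = 1;
  int fd = syscall(SYS_perf_event_open, &attr, tid, -1, -1, 0);
  if (fd < 0) return -1;

  /* Attach the filter. On kernels >= 6.10 a zero return from the program
   * suppresses the overflow side effects, so non-matching breakpoint hits
   * are discarded in the kernel instead of interrupting the tracee. */
  if (ioctl(fd, PERF_EVENT_IOC_SET_BPF, prog_fd)) return -1;

  /* Route the (matching) overflow to the tracee thread as an async SIGTRAP. */
  struct f_owner_ex own = { .type = F_OWNER_TID, .pid = tid };
  fcntl(fd, F_SETOWN_EX, &own);
  fcntl(fd, F_SETSIG, SIGTRAP);
  fcntl(fd, F_SETFL, fcntl(fd, F_GETFL) | O_ASYNC);
  return fd;
}

The patch wires these same pieces into PerfCounters::accelerate_async_signal() (caching the loaded program and mapped "registers" map across calls, and re-targeting the breakpoint with PERF_EVENT_IOC_MODIFY_ATTRIBUTES instead of reopening the event), while ReplaySession::emulate_async_signal() falls back to the existing software-breakpoint path whenever the target instruction is an x86 string instruction or the BPF setup is unavailable.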