From 13fb44422d415cebe9dcaec33894210de060b7c2 Mon Sep 17 00:00:00 2001 From: Robert O'Callahan Date: Wed, 14 Aug 2024 00:01:02 +1200 Subject: [PATCH] Use `PERF_RECORD_SWITCH` events when `perf_event_paranoid` is 2 and the kernel supports it --- CMakeLists.txt | 6 +- src/ContextSwitchEvent.cc | 175 ++++++++++++++++++++++++++++ src/ContextSwitchEvent.h | 70 +++++++++++ src/PerfCounterBuffers.cc | 7 +- src/RecordSession.cc | 23 ---- src/RecordTask.cc | 11 +- src/RecordTask.h | 3 +- src/preload/preload_interface.h | 6 + src/preload/syscallbuf.c | 17 ++- src/record_signal.cc | 8 +- src/test/check_environment_test.run | 5 - src/test/perf_event.c | 6 + src/test/perf_event_mmap.c | 1 + 13 files changed, 297 insertions(+), 41 deletions(-) create mode 100644 src/ContextSwitchEvent.cc create mode 100644 src/ContextSwitchEvent.h delete mode 100644 src/test/check_environment_test.run diff --git a/CMakeLists.txt b/CMakeLists.txt index 23d878af735..2f05ff8b5d9 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -581,6 +581,7 @@ set(RR_SOURCES src/Command.cc src/CompressedReader.cc src/CompressedWriter.cc + src/ContextSwitchEvent.cc src/CPUFeaturesCommand.cc src/CPUIDBugDetector.cc src/DiversionSession.cc @@ -1803,11 +1804,6 @@ if(BUILD_TESTS) \$ENV{DESTDIR}\${CMAKE_INSTALL_PREFIX}/${CMAKE_INSTALL_LIBDIR}/rr/testsuite/obj/bin_dir)") endif(INSTALL_TESTSUITE) - add_test(check_environment - bash source_dir/src/test/check_environment_test.run) - set_tests_properties(check_environment - PROPERTIES FAIL_REGULAR_EXPRESSION "rr needs /proc/sys/kernel/perf_event_paranoid <= 1") - foreach(test ${BASIC_TESTS} ${TESTS_WITH_PROGRAM}) if (NOT x86ish AND ${test} MATCHES "^x86/.*") continue() diff --git a/src/ContextSwitchEvent.cc b/src/ContextSwitchEvent.cc new file mode 100644 index 00000000000..a1d71fd1c06 --- /dev/null +++ b/src/ContextSwitchEvent.cc @@ -0,0 +1,175 @@ +/* -*- Mode: C++; tab-width: 8; c-basic-offset: 2; indent-tabs-mode: nil; -*- */ + +#include "ContextSwitchEvent.h" + +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include "log.h" +#include "util.h" + +using namespace std; + +namespace rr { + +static optional read_perf_event_paranoid() { + ScopedFd fd("/proc/sys/kernel/perf_event_paranoid", O_RDONLY); + if (fd.is_open()) { + char buf[100]; + ssize_t size = read(fd, buf, sizeof(buf) - 1); + if (size >= 0) { + buf[size] = 0; + return atoi(buf); + } + } + return nullopt; +} + +static volatile int sigio_count; + +static void sigio_handler(int, siginfo_t*, void*) { + ++sigio_count; +} + +static bool can_use_switch_records() { + struct perf_event_attr attr; + memset(&attr, 0, sizeof(attr)); + attr.size = sizeof(attr); + attr.type = PERF_TYPE_SOFTWARE; + attr.config = PERF_COUNT_SW_DUMMY; + attr.sample_period = 1; + attr.watermark = 1; + // We can't easily check PERF_RECORD_SWITCH directly + // because there's no reliable way (as far as I know) to + // force a context switch but still recover if no signal is + // generated. So we test that generating a PERF_RECORD_MMAP + // raises a signal instead. + attr.mmap_data = 1; + attr.wakeup_watermark = 1; + attr.exclude_kernel = 1; + attr.exclude_guest = 1; + attr.disabled = 1; + + ScopedFd fd(syscall(SYS_perf_event_open, &attr, 0, -1, -1, 0)); + if (!fd.is_open()) { + LOG(warn) << "Couldn't open a dummy event"; + return false; + } + + PerfCounterBuffers buffers; + buffers.allocate(fd, page_size(), 0); + + int ret = fcntl(fd, F_SETFL, FASYNC); + if (ret < 0) { + FATAL() << "Can't make fd async"; + } + struct f_owner_ex own; + own.type = F_OWNER_TID; + own.pid = gettid(); + ret = fcntl(fd, F_SETOWN_EX, &own); + if (ret < 0) { + FATAL() << "Failed to fcntl(SETOWN_EX)"; + } + + struct sigaction sa; + struct sigaction old_sa; + sa.sa_sigaction = sigio_handler; + sigemptyset(&sa.sa_mask); + sa.sa_flags = SA_SIGINFO; + ret = sigaction(SIGIO, &sa, &old_sa); + if (ret < 0) { + FATAL() << "Failed to install sighandler"; + } + + sigio_count = 0; + ret = ioctl(fd, PERF_EVENT_IOC_ENABLE, 0); + if (ret < 0) { + FATAL() << "Failed to enable event"; + } + void* p = mmap(nullptr, 1, PROT_READ, MAP_ANONYMOUS | MAP_PRIVATE, -1, 0); + if (p == MAP_FAILED) { + FATAL() << "Failed to mmap"; + } + ret = ioctl(fd, PERF_EVENT_IOC_DISABLE, 0); + if (ret < 0) { + FATAL() << "Failed to disable event"; + } + + ret = munmap(p, 1); + if (ret < 0) { + FATAL() << "Failed to munmap"; + } + ret = sigaction(SIGIO, &old_sa, nullptr); + if (ret < 0) { + FATAL() << "Failed to clean up sighandler"; + } + + if (sigio_count == 0) { + // Old kernel + LOG(info) << "PERF_RECORD_MMAP watermark failed to deliver signal"; + return false; + } + if (sigio_count > 1) { + FATAL() << "Invalid SIGIO count"; + } + + return true; +} + +static ContextSwitchEventStrategy init_strategy() { + if (has_effective_caps(uint64_t(1) << CAP_SYS_ADMIN) || + has_effective_caps(uint64_t(1) << CAP_PERFMON)) { + return ContextSwitchEventStrategy::STRATEGY_SW_CONTEXT_SWITCHES; + } + optional perf_event_paranoid = read_perf_event_paranoid(); + if (perf_event_paranoid.has_value() && *perf_event_paranoid < 2) { + return ContextSwitchEventStrategy::STRATEGY_SW_CONTEXT_SWITCHES; + } + + if (can_use_switch_records()) { + return ContextSwitchEventStrategy::STRATEGY_RECORD_SWITCH; + } + + string paranoid_value = "unknown"; + if (perf_event_paranoid.has_value()) { + paranoid_value = *perf_event_paranoid; + } + CLEAN_FATAL() << + "rr needs /proc/sys/kernel/perf_event_paranoid <= 1, but it is " + << paranoid_value << ".\n" + << "Change it to 1, or use 'rr record -n' (slow).\n" + << "Consider putting 'kernel.perf_event_paranoid = 1' in /etc/sysctl.d/10-rr.conf.\n" + << "See 'man 8 sysctl', 'man 5 sysctl.d' (systemd systems)\n" + << "and 'man 5 sysctl.conf' (non-systemd systems) for more details."; + return ContextSwitchEventStrategy::STRATEGY_SW_CONTEXT_SWITCHES; +} + +ContextSwitchEventStrategy ContextSwitchEvent::strategy() { + static ContextSwitchEventStrategy strat = init_strategy(); + return strat; +} + +void ContextSwitchEvent::init(ScopedFd tracee_fd) { + tracee_fd_ = std::move(tracee_fd); + if (strategy() == ContextSwitchEventStrategy::STRATEGY_RECORD_SWITCH) { + mmap_buffer = make_unique(); + mmap_buffer->allocate(tracee_fd_, page_size(), 0); + } +} + +void ContextSwitchEvent::drain_events() { + if (mmap_buffer) { + while (auto packet = mmap_buffer->next_packet()) { + } + } +} + +} // namespace rr diff --git a/src/ContextSwitchEvent.h b/src/ContextSwitchEvent.h new file mode 100644 index 00000000000..7753059f818 --- /dev/null +++ b/src/ContextSwitchEvent.h @@ -0,0 +1,70 @@ +/* -*- Mode: C++; tab-width: 8; c-basic-offset: 2; indent-tabs-mode: nil; -*- */ + +#ifndef RR_CONTEXT_SWITCH_EVENT_H_ +#define RR_CONTEXT_SWITCH_EVENT_H_ + +#include + +#include + +#include "PerfCounterBuffers.h" +#include "ScopedFd.h" +#include "preload/preload_interface.h" + +namespace rr { + +/** + * For syscall buffering, we need to interrupt a tracee when it would block. + * We do this by configuring a perf event to detect when the tracee is subject + * to a context switch. When the perf event fires, it delivers a signal to the + * tracee. The tracee syscallbuf code allocates the event fd and rr retrieves + * it. We do it that way because both the tracee and rr need access to the + * event fd. + * + * We can use `PERF_COUNT_SW_CONTEXT_SWITCHES` as the event. This is easy but + * since it's a kernel event, unprivileged rr can't use it when + * `perf_event_paranoid` is >= 2. + * + * Alternatively we can configure a dummy event and observe `PERF_RECORD_SWITCH` + * records. This works with unprivileged rr when `perf_event_paranoid` == 2. + * To trigger a signal when we get a `PERF_RECORD_SWITCH`, we set + * `wakeup_watermark` so that appending any record to the ring buffer triggers + * a wakeup. This requries configuring a ring buffer per tracee task; we can't + * use a single ring buffer for multiple tracees, since when a tracee blocks + * we need to send a signal directly to that specific tracee, not any others + * and not rr. (We could deliver to rr and have rr interrupt the right tracee + * but that would be slow.) + * Unfortunately, in Linux kernels before 6.10, `watermark_wakeup` doesn't + * trigger signals associated with the event fd. This bug was fixed in 6.10. + * + * So this class manages all the necessary logic. In particular we have to figure + * out which strategy to use. We prefer to use `PERF_COUNT_SW_CONTEXT_SWITCHES` + * if possible since we don't have to allocate ring buffers for those, so we'll + * first check if that works. If it doesn't, we'll test if `PERF_RECORD_SWITCH` + * works properly. If it doesn't, we produce the right error message and abort. + * Then, if we're using `PERF_RECORD_SWITCH`, we need to allocate the ring buffer + * and configure `wakeup_watermark`. + */ +class ContextSwitchEvent { +public: + void init(ScopedFd tracee_fd); + + ScopedFd& tracee_fd() { return tracee_fd_; } + + // We need to determine the strategy before we configure syscallbuf to create + // its tracee perf event fds. + static ContextSwitchEventStrategy strategy(); + + void drain_events(); + +private: + // The fd retrieved from the tracee task that created it. + ScopedFd tracee_fd_; + // If we're using `PERF_RECORD_SWITCH` records, the + // buffer we're using to trigger the watermark-wakeups. + std::unique_ptr mmap_buffer; +}; + +} // namespace rr + +#endif /* RR_CONTEXT_SWITCH_EVENT_H_ */ diff --git a/src/PerfCounterBuffers.cc b/src/PerfCounterBuffers.cc index 2aa05b2ca47..68dc16b7ad4 100644 --- a/src/PerfCounterBuffers.cc +++ b/src/PerfCounterBuffers.cc @@ -51,10 +51,15 @@ optional PerfCounterBuffers::next_packet() { FATAL() << "Can't offer more than one packet at a time"; } - uint64_t data_end = *reinterpret_cast(mmap_header->data_head); + // Equivalent of kernel's READ_ONCE. This value is written + // by the kernel. + uint64_t data_end = + *reinterpret_cast(&mmap_header->data_head); if (mmap_header->data_tail >= data_end) { return nullopt; } + // Force memory barrier to ensure that we see all memory updates that were + // performed before `data_head `was updated. __sync_synchronize(); char* data_buf = reinterpret_cast(mmap_header) + mmap_header->data_offset; diff --git a/src/RecordSession.cc b/src/RecordSession.cc index 8488b549a19..146d2091433 100644 --- a/src/RecordSession.cc +++ b/src/RecordSession.cc @@ -2395,29 +2395,6 @@ static string lookup_by_path(const string& name) { unsetenv(SYSCALLBUF_ENABLED_ENV_VAR); } else { setenv(SYSCALLBUF_ENABLED_ENV_VAR, "1", 1); - - if (!has_effective_caps(uint64_t(1) << CAP_SYS_ADMIN) && - !has_effective_caps(uint64_t(1) << CAP_PERFMON)) { - ScopedFd fd("/proc/sys/kernel/perf_event_paranoid", O_RDONLY); - if (fd.is_open()) { - char buf[100]; - ssize_t size = read(fd, buf, sizeof(buf) - 1); - if (size >= 0) { - buf[size] = 0; - int val = atoi(buf); - if (val > 1) { - fprintf(stderr, - "rr needs /proc/sys/kernel/perf_event_paranoid <= 1, but it is %d.\n" - "Change it to 1, or use 'rr record -n' (slow).\n" - "Consider putting 'kernel.perf_event_paranoid = 1' in /etc/sysctl.d/10-rr.conf.\n" - "See 'man 8 sysctl', 'man 5 sysctl.d' (systemd systems)\n" - "and 'man 5 sysctl.conf' (non-systemd systems) for more details.\n", - val); - exit(1); - } - } - } - } } vector env = current_env(); diff --git a/src/RecordTask.cc b/src/RecordTask.cc index a0bdb8d84b4..06f2a0c4b3a 100644 --- a/src/RecordTask.cc +++ b/src/RecordTask.cc @@ -11,6 +11,7 @@ #include #include "AutoRemoteSyscalls.h" +#include "ContextSwitchEvent.h" #include "PreserveFileMonitor.h" #include "RecordSession.h" #include "WaitManager.h" @@ -420,6 +421,12 @@ template static void do_preload_init_arch(RecordTask* t) { auto cpu_binding_ptr = REMOTE_PTR_FIELD(params.globals.rptr(), cpu_binding); t->write_mem(cpu_binding_ptr, cpu_binding); t->record_local(cpu_binding_ptr, &cpu_binding); + + auto context_switch_event_strategy = ContextSwitchEvent::strategy(); + auto context_switch_event_strategy_ptr = + REMOTE_PTR_FIELD(params.globals.rptr(), context_switch_event_strategy); + t->write_mem(context_switch_event_strategy_ptr, context_switch_event_strategy); + t->record_local(context_switch_event_strategy_ptr, &context_switch_event_strategy); } void RecordTask::push_syscall_event(int syscallno) { @@ -519,7 +526,7 @@ template void RecordTask::init_buffers_arch() { desched_fd_child = args.desched_counter_fd; // Prevent the child from closing this fd fds->add_monitor(this, desched_fd_child, new PreserveFileMonitor()); - desched_fd = remote.retrieve_fd(desched_fd_child); + desched_fd.init(remote.retrieve_fd(desched_fd_child)); if (trace_writer().supports_file_data_cloning() && session().use_read_cloning()) { @@ -712,6 +719,8 @@ void RecordTask::will_resume_execution(ResumeRequest, WaitRequest, } } } + + desched_fd.drain_events(); } vector RecordTask::syscallbuf_syscall_entry_breakpoints() { diff --git a/src/RecordTask.h b/src/RecordTask.h index 6a0976e6492..7f42e678259 100644 --- a/src/RecordTask.h +++ b/src/RecordTask.h @@ -3,6 +3,7 @@ #ifndef RR_RECORD_TASK_H_ #define RR_RECORD_TASK_H_ +#include "ContextSwitchEvent.h" #include "Registers.h" #include "Task.h" #include "TraceFrame.h" @@ -737,7 +738,7 @@ class RecordTask final : public Task { // Syscallbuf state SyscallbufCodeLayout syscallbuf_code_layout; - ScopedFd desched_fd; + ContextSwitchEvent desched_fd; /* Value of hdr->num_rec_bytes when the buffer was flushed */ uint32_t flushed_num_rec_bytes; /* Nonzero after the trace recorder has flushed the diff --git a/src/preload/preload_interface.h b/src/preload/preload_interface.h index 6395666f811..1946b8c718a 100644 --- a/src/preload/preload_interface.h +++ b/src/preload/preload_interface.h @@ -230,6 +230,11 @@ struct mprotect_record { int32_t padding; }; +enum ContextSwitchEventStrategy { + STRATEGY_SW_CONTEXT_SWITCHES, + STRATEGY_RECORD_SWITCH +}; + /** * Must be arch-independent. * Variables used to communicate between preload and rr. @@ -298,6 +303,7 @@ struct preload_globals { unsigned char fdt_uniform; /* The CPU we're bound to, if any; -1 if not bound. Not read during replay. */ int32_t cpu_binding; + enum ContextSwitchEventStrategy context_switch_event_strategy; }; /** diff --git a/src/preload/syscallbuf.c b/src/preload/syscallbuf.c index 40c2bf60252..918e7555950 100644 --- a/src/preload/syscallbuf.c +++ b/src/preload/syscallbuf.c @@ -663,7 +663,22 @@ static int open_desched_event_counter(pid_t tid) { local_memset(&attr, 0, sizeof(attr)); attr.size = sizeof(attr); attr.type = PERF_TYPE_SOFTWARE; - attr.config = PERF_COUNT_SW_CONTEXT_SWITCHES; + switch (globals.context_switch_event_strategy) { + case STRATEGY_SW_CONTEXT_SWITCHES: + attr.config = PERF_COUNT_SW_CONTEXT_SWITCHES; + break; + case STRATEGY_RECORD_SWITCH: + attr.config = PERF_COUNT_SW_DUMMY; + attr.watermark = 1; + attr.context_switch = 1; + attr.wakeup_watermark = 1; + attr.exclude_kernel = 1; + attr.exclude_guest = 1; + break; + default: + fatal("Unknown strategy"); + break; + } attr.disabled = 1; attr.sample_period = 1; diff --git a/src/record_signal.cc b/src/record_signal.cc index 7759d662cfb..12c8cf8d85e 100644 --- a/src/record_signal.cc +++ b/src/record_signal.cc @@ -162,15 +162,15 @@ static bool try_grow_map(RecordTask* t, siginfo_t* si) { } void disarm_desched_event(RecordTask* t) { - if (t->desched_fd.is_open() && - ioctl(t->desched_fd, PERF_EVENT_IOC_DISABLE, 0)) { + ScopedFd& fd = t->desched_fd.tracee_fd(); + if (fd.is_open() && ioctl(fd, PERF_EVENT_IOC_DISABLE, 0)) { FATAL() << "Failed to disarm desched event"; } } void arm_desched_event(RecordTask* t) { - if (t->desched_fd.is_open() && - ioctl(t->desched_fd, PERF_EVENT_IOC_ENABLE, 0)) { + ScopedFd& fd = t->desched_fd.tracee_fd(); + if (fd.is_open() && ioctl(fd, PERF_EVENT_IOC_ENABLE, 0)) { FATAL() << "Failed to arm desched event"; } } diff --git a/src/test/check_environment_test.run b/src/test/check_environment_test.run deleted file mode 100644 index 768b6c262fb..00000000000 --- a/src/test/check_environment_test.run +++ /dev/null @@ -1,5 +0,0 @@ -paranoid=`cat /proc/sys/kernel/perf_event_paranoid` -if [ $paranoid -gt 1 ]; then - echo "rr needs /proc/sys/kernel/perf_event_paranoid <= 1, but it is $paranoid." - exit 1 -fi diff --git a/src/test/perf_event.c b/src/test/perf_event.c index 895ef3c7e9d..4f68275542f 100644 --- a/src/test/perf_event.c +++ b/src/test/perf_event.c @@ -28,6 +28,12 @@ int main(void) { attr.config = PERF_COUNT_SW_CONTEXT_SWITCHES; counter_fd = sys_perf_event_open(&attr, 0 /*self*/, -1 /*any cpu*/, -1, 0); + if (counter_fd < 0) { + test_assert(errno == EACCES); + atomic_puts("Skipping test because counter is not available"); + atomic_puts("EXIT-SUCCESS"); + return 0; + } test_assert(0 <= counter_fd); atomic_printf("num descheds: %" PRIu64 "\n", get_desched()); diff --git a/src/test/perf_event_mmap.c b/src/test/perf_event_mmap.c index 0a256065156..956460e73cc 100644 --- a/src/test/perf_event_mmap.c +++ b/src/test/perf_event_mmap.c @@ -17,6 +17,7 @@ int main(void) { attr.size = sizeof(attr); attr.type = PERF_TYPE_SOFTWARE; attr.config = PERF_COUNT_SW_CPU_CLOCK; + attr.exclude_kernel = 1; attr.sample_period = 1000000; attr.sample_type = PERF_SAMPLE_IP;