Skip to content

Commit

Permalink
Use PERF_RECORD_SWITCH events when perf_event_paranoid is 2 and t…
Browse files Browse the repository at this point in the history
…he kernel supports it
  • Loading branch information
rocallahan committed Aug 13, 2024
1 parent cfad2e3 commit 13fb444
Show file tree
Hide file tree
Showing 13 changed files with 297 additions and 41 deletions.
6 changes: 1 addition & 5 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -581,6 +581,7 @@ set(RR_SOURCES
src/Command.cc
src/CompressedReader.cc
src/CompressedWriter.cc
src/ContextSwitchEvent.cc
src/CPUFeaturesCommand.cc
src/CPUIDBugDetector.cc
src/DiversionSession.cc
Expand Down Expand Up @@ -1803,11 +1804,6 @@ if(BUILD_TESTS)
\$ENV{DESTDIR}\${CMAKE_INSTALL_PREFIX}/${CMAKE_INSTALL_LIBDIR}/rr/testsuite/obj/bin_dir)")
endif(INSTALL_TESTSUITE)

add_test(check_environment
bash source_dir/src/test/check_environment_test.run)
set_tests_properties(check_environment
PROPERTIES FAIL_REGULAR_EXPRESSION "rr needs /proc/sys/kernel/perf_event_paranoid <= 1")

foreach(test ${BASIC_TESTS} ${TESTS_WITH_PROGRAM})
if (NOT x86ish AND ${test} MATCHES "^x86/.*")
continue()
Expand Down
175 changes: 175 additions & 0 deletions src/ContextSwitchEvent.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,175 @@
/* -*- Mode: C++; tab-width: 8; c-basic-offset: 2; indent-tabs-mode: nil; -*- */

#include "ContextSwitchEvent.h"

#include <fcntl.h>
#include <linux/perf_event.h>
#include <signal.h>
#include <stdint.h>
#include <sys/mman.h>
#include <sys/syscall.h>
#include <unistd.h>

#include <optional>
#include <string>

#include "log.h"
#include "util.h"

using namespace std;

namespace rr {

// Reads the current value of /proc/sys/kernel/perf_event_paranoid.
// Returns nullopt when the file can't be opened or read; otherwise returns
// whatever atoi() makes of the (NUL-terminated) file contents.
static optional<int> read_perf_event_paranoid() {
  ScopedFd paranoid_fd("/proc/sys/kernel/perf_event_paranoid", O_RDONLY);
  if (!paranoid_fd.is_open()) {
    return nullopt;
  }

  char text[100];
  // Leave room for the terminator we append below.
  const ssize_t nread = read(paranoid_fd, text, sizeof(text) - 1);
  if (nread < 0) {
    return nullopt;
  }

  text[nread] = 0;
  return atoi(text);
}

// Number of SIGIO deliveries seen by `sigio_handler` since the counter was
// last reset. Volatile because it is written from a signal handler.
static volatile int sigio_count;

// SA_SIGINFO-style handler that just counts deliveries; all arguments are
// ignored.
static void sigio_handler(int /*signo*/, siginfo_t* /*info*/,
                          void* /*context*/) {
  sigio_count = sigio_count + 1;
}

// Probes whether this kernel raises SIGIO when a record is appended to a
// perf ring buffer configured with `wakeup_watermark`.
// Returns true when exactly one SIGIO is observed during the probe, and
// false when the dummy event can't be opened or no SIGIO arrives. Every other
// setup or teardown failure, and more than one SIGIO, is FATAL.
static bool can_use_switch_records() {
struct perf_event_attr attr;
memset(&attr, 0, sizeof(attr));
attr.size = sizeof(attr);
// A software "dummy" event: it counts nothing itself and is used only as a
// carrier for side-band records.
attr.type = PERF_TYPE_SOFTWARE;
attr.config = PERF_COUNT_SW_DUMMY;
attr.sample_period = 1;
// Interpret the wakeup threshold as a byte watermark (`wakeup_watermark`).
attr.watermark = 1;
// We can't easily check PERF_RECORD_SWITCH directly
// because there's no reliable way (as far as I know) to
// force a context switch but still recover if no signal is
// generated. So we test that generating a PERF_RECORD_MMAP
// raises a signal instead.
attr.mmap_data = 1;
// Wake up as soon as a single byte has been written to the ring buffer.
attr.wakeup_watermark = 1;
attr.exclude_kernel = 1;
attr.exclude_guest = 1;
// Start disabled; the event is enabled explicitly once the signal plumbing
// below is in place.
attr.disabled = 1;

// pid 0 / cpu -1: monitor the calling thread on any CPU, no group, no flags.
ScopedFd fd(syscall(SYS_perf_event_open, &attr, 0, -1, -1, 0));
if (!fd.is_open()) {
LOG(warn) << "Couldn't open a dummy event";
return false;
}

// Presumably maps a one-page ring buffer for the event so records have
// somewhere to land -- confirm against PerfCounterBuffers::allocate.
PerfCounterBuffers buffers;
buffers.allocate(fd, page_size(), 0);

// Ask for signal-driven notification on the event fd...
int ret = fcntl(fd, F_SETFL, FASYNC);
if (ret < 0) {
FATAL() << "Can't make fd async";
}
// ...and direct that signal at this specific thread.
struct f_owner_ex own;
own.type = F_OWNER_TID;
own.pid = gettid();
ret = fcntl(fd, F_SETOWN_EX, &own);
if (ret < 0) {
FATAL() << "Failed to fcntl(SETOWN_EX)";
}

// Temporarily install our counting SIGIO handler, remembering the previous
// disposition so it can be restored afterwards.
struct sigaction sa;
struct sigaction old_sa;
sa.sa_sigaction = sigio_handler;
sigemptyset(&sa.sa_mask);
sa.sa_flags = SA_SIGINFO;
ret = sigaction(SIGIO, &sa, &old_sa);
if (ret < 0) {
FATAL() << "Failed to install sighandler";
}

sigio_count = 0;
ret = ioctl(fd, PERF_EVENT_IOC_ENABLE, 0);
if (ret < 0) {
FATAL() << "Failed to enable event";
}
// A non-executable mapping made while the event is enabled; with `mmap_data`
// set this is what should produce the PERF_RECORD_MMAP record.
void* p = mmap(nullptr, 1, PROT_READ, MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
if (p == MAP_FAILED) {
FATAL() << "Failed to mmap";
}
// Disable before cleaning up so the munmap below can't add further records.
ret = ioctl(fd, PERF_EVENT_IOC_DISABLE, 0);
if (ret < 0) {
FATAL() << "Failed to disable event";
}

ret = munmap(p, 1);
if (ret < 0) {
FATAL() << "Failed to munmap";
}
// Put back whatever SIGIO disposition was in effect before the probe.
ret = sigaction(SIGIO, &old_sa, nullptr);
if (ret < 0) {
FATAL() << "Failed to clean up sighandler";
}

if (sigio_count == 0) {
// Old kernel
LOG(info) << "PERF_RECORD_MMAP watermark failed to deliver signal";
return false;
}
// Exactly one record was expected, so more than one signal is unexpected.
if (sigio_count > 1) {
FATAL() << "Invalid SIGIO count";
}

return true;
}

// Decides how context switches of tracees will be detected.
//
// Preference order:
//  1. STRATEGY_SW_CONTEXT_SWITCHES if we hold CAP_SYS_ADMIN or CAP_PERFMON,
//     or if perf_event_paranoid is readable and below 2.
//  2. STRATEGY_RECORD_SWITCH if the kernel passes the watermark-signal probe.
//  3. Otherwise report the perf_event_paranoid problem via CLEAN_FATAL.
static ContextSwitchEventStrategy init_strategy() {
  if (has_effective_caps(uint64_t(1) << CAP_SYS_ADMIN) ||
      has_effective_caps(uint64_t(1) << CAP_PERFMON)) {
    return ContextSwitchEventStrategy::STRATEGY_SW_CONTEXT_SWITCHES;
  }
  optional<int> perf_event_paranoid = read_perf_event_paranoid();
  if (perf_event_paranoid.has_value() && *perf_event_paranoid < 2) {
    return ContextSwitchEventStrategy::STRATEGY_SW_CONTEXT_SWITCHES;
  }

  if (can_use_switch_records()) {
    return ContextSwitchEventStrategy::STRATEGY_RECORD_SWITCH;
  }

  string paranoid_value = "unknown";
  if (perf_event_paranoid.has_value()) {
    // Must be formatted explicitly: assigning the int directly would pick
    // string::operator=(char) and store a single control character instead
    // of the decimal number.
    paranoid_value = std::to_string(*perf_event_paranoid);
  }
  CLEAN_FATAL() <<
      "rr needs /proc/sys/kernel/perf_event_paranoid <= 1, but it is "
      << paranoid_value << ".\n"
      << "Change it to 1, or use 'rr record -n' (slow).\n"
      << "Consider putting 'kernel.perf_event_paranoid = 1' in /etc/sysctl.d/10-rr.conf.\n"
      << "See 'man 8 sysctl', 'man 5 sysctl.d' (systemd systems)\n"
      << "and 'man 5 sysctl.conf' (non-systemd systems) for more details.";
  // Not expected to be reached; keeps every control path returning a value.
  return ContextSwitchEventStrategy::STRATEGY_SW_CONTEXT_SWITCHES;
}

// Returns the context-switch detection strategy. The decision is made by
// init_strategy() on the first call and the same value is returned afterwards.
ContextSwitchEventStrategy ContextSwitchEvent::strategy() {
  static const ContextSwitchEventStrategy chosen = init_strategy();
  return chosen;
}

// Takes ownership of the event fd retrieved from the tracee. When the
// PERF_RECORD_SWITCH strategy is in use, also allocates the ring buffer
// attached to that fd.
void ContextSwitchEvent::init(ScopedFd tracee_fd) {
  tracee_fd_ = std::move(tracee_fd);
  if (strategy() != ContextSwitchEventStrategy::STRATEGY_RECORD_SWITCH) {
    // No ring buffer is allocated for the other strategy.
    return;
  }
  mmap_buffer = make_unique<PerfCounterBuffers>();
  mmap_buffer->allocate(tracee_fd_, page_size(), 0);
}

// Discards every packet currently available in the ring buffer, if one was
// allocated. Does nothing otherwise.
void ContextSwitchEvent::drain_events() {
  if (!mmap_buffer) {
    return;
  }
  for (;;) {
    // Each packet goes out of scope before the next one is requested.
    auto packet = mmap_buffer->next_packet();
    if (!packet) {
      break;
    }
  }
}

} // namespace rr
70 changes: 70 additions & 0 deletions src/ContextSwitchEvent.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,70 @@
/* -*- Mode: C++; tab-width: 8; c-basic-offset: 2; indent-tabs-mode: nil; -*- */

#ifndef RR_CONTEXT_SWITCH_EVENT_H_
#define RR_CONTEXT_SWITCH_EVENT_H_

#include <stdint.h>

#include <memory>

#include "PerfCounterBuffers.h"
#include "ScopedFd.h"
#include "preload/preload_interface.h"

namespace rr {

/**
* For syscall buffering, we need to interrupt a tracee when it would block.
* We do this by configuring a perf event to detect when the tracee is subject
* to a context switch. When the perf event fires, it delivers a signal to the
* tracee. The tracee syscallbuf code allocates the event fd and rr retrieves
* it. We do it that way because both the tracee and rr need access to the
* event fd.
*
* We can use `PERF_COUNT_SW_CONTEXT_SWITCHES` as the event. This is easy but
* since it's a kernel event, unprivileged rr can't use it when
* `perf_event_paranoid` is >= 2.
*
* Alternatively we can configure a dummy event and observe `PERF_RECORD_SWITCH`
* records. This works with unprivileged rr when `perf_event_paranoid` == 2.
* To trigger a signal when we get a `PERF_RECORD_SWITCH`, we set
* `wakeup_watermark` so that appending any record to the ring buffer triggers
* a wakeup. This requires configuring a ring buffer per tracee task; we can't
* use a single ring buffer for multiple tracees, since when a tracee blocks
* we need to send a signal directly to that specific tracee, not any others
* and not rr. (We could deliver to rr and have rr interrupt the right tracee
* but that would be slow.)
* Unfortunately, in Linux kernels before 6.10, `wakeup_watermark` doesn't
* trigger signals associated with the event fd. This bug was fixed in 6.10.
*
* So this class manages all the necessary logic. In particular we have to figure
* out which strategy to use. We prefer to use `PERF_COUNT_SW_CONTEXT_SWITCHES`
* if possible since we don't have to allocate ring buffers for those, so we'll
* first check if that works. If it doesn't, we'll test if `PERF_RECORD_SWITCH`
* works properly. If it doesn't, we produce the right error message and abort.
* Then, if we're using `PERF_RECORD_SWITCH`, we need to allocate the ring buffer
* and configure `wakeup_watermark`.
*/
// Owns the per-task context-switch perf event fd and, when the
// `PERF_RECORD_SWITCH` strategy is in use, the ring buffer attached to it.
class ContextSwitchEvent {
public:
// Takes ownership of `tracee_fd`. If strategy() is STRATEGY_RECORD_SWITCH,
// also allocates the ring buffer for that fd.
void init(ScopedFd tracee_fd);

// Mutable access to the owned event fd.
ScopedFd& tracee_fd() { return tracee_fd_; }

// We need to determine the strategy before we configure syscallbuf to create
// its tracee perf event fds.
// The result is computed on the first call and reused afterwards.
static ContextSwitchEventStrategy strategy();

// Consumes and discards all packets currently in the ring buffer; a no-op
// when no ring buffer was allocated.
void drain_events();

private:
// The fd retrieved from the tracee task that created it.
ScopedFd tracee_fd_;
// If we're using `PERF_RECORD_SWITCH` records, the
// buffer we're using to trigger the watermark-wakeups.
// Null until init() allocates it, and left null for the other strategy.
std::unique_ptr<PerfCounterBuffers> mmap_buffer;
};

} // namespace rr

#endif /* RR_CONTEXT_SWITCH_EVENT_H_ */
7 changes: 6 additions & 1 deletion src/PerfCounterBuffers.cc
Original file line number Diff line number Diff line change
Expand Up @@ -51,10 +51,15 @@ optional<PerfCounterBuffers::Packet> PerfCounterBuffers::next_packet() {
FATAL() << "Can't offer more than one packet at a time";
}

uint64_t data_end = *reinterpret_cast<volatile uint64_t*>(mmap_header->data_head);
// Equivalent of kernel's READ_ONCE. This value is written
// by the kernel.
uint64_t data_end =
*reinterpret_cast<volatile unsigned long long*>(&mmap_header->data_head);
if (mmap_header->data_tail >= data_end) {
return nullopt;
}
// Force memory barrier to ensure that we see all memory updates that were
// performed before `data_head` was updated.
__sync_synchronize();

char* data_buf = reinterpret_cast<char*>(mmap_header) + mmap_header->data_offset;
Expand Down
23 changes: 0 additions & 23 deletions src/RecordSession.cc
Original file line number Diff line number Diff line change
Expand Up @@ -2395,29 +2395,6 @@ static string lookup_by_path(const string& name) {
unsetenv(SYSCALLBUF_ENABLED_ENV_VAR);
} else {
setenv(SYSCALLBUF_ENABLED_ENV_VAR, "1", 1);

if (!has_effective_caps(uint64_t(1) << CAP_SYS_ADMIN) &&
!has_effective_caps(uint64_t(1) << CAP_PERFMON)) {
ScopedFd fd("/proc/sys/kernel/perf_event_paranoid", O_RDONLY);
if (fd.is_open()) {
char buf[100];
ssize_t size = read(fd, buf, sizeof(buf) - 1);
if (size >= 0) {
buf[size] = 0;
int val = atoi(buf);
if (val > 1) {
fprintf(stderr,
"rr needs /proc/sys/kernel/perf_event_paranoid <= 1, but it is %d.\n"
"Change it to 1, or use 'rr record -n' (slow).\n"
"Consider putting 'kernel.perf_event_paranoid = 1' in /etc/sysctl.d/10-rr.conf.\n"
"See 'man 8 sysctl', 'man 5 sysctl.d' (systemd systems)\n"
"and 'man 5 sysctl.conf' (non-systemd systems) for more details.\n",
val);
exit(1);
}
}
}
}
}

vector<string> env = current_env();
Expand Down
11 changes: 10 additions & 1 deletion src/RecordTask.cc
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
#include <sys/syscall.h>

#include "AutoRemoteSyscalls.h"
#include "ContextSwitchEvent.h"
#include "PreserveFileMonitor.h"
#include "RecordSession.h"
#include "WaitManager.h"
Expand Down Expand Up @@ -420,6 +421,12 @@ template <typename Arch> static void do_preload_init_arch(RecordTask* t) {
auto cpu_binding_ptr = REMOTE_PTR_FIELD(params.globals.rptr(), cpu_binding);
t->write_mem(cpu_binding_ptr, cpu_binding);
t->record_local(cpu_binding_ptr, &cpu_binding);

auto context_switch_event_strategy = ContextSwitchEvent::strategy();
auto context_switch_event_strategy_ptr =
REMOTE_PTR_FIELD(params.globals.rptr(), context_switch_event_strategy);
t->write_mem(context_switch_event_strategy_ptr, context_switch_event_strategy);
t->record_local(context_switch_event_strategy_ptr, &context_switch_event_strategy);
}

void RecordTask::push_syscall_event(int syscallno) {
Expand Down Expand Up @@ -519,7 +526,7 @@ template <typename Arch> void RecordTask::init_buffers_arch() {
desched_fd_child = args.desched_counter_fd;
// Prevent the child from closing this fd
fds->add_monitor(this, desched_fd_child, new PreserveFileMonitor());
desched_fd = remote.retrieve_fd(desched_fd_child);
desched_fd.init(remote.retrieve_fd(desched_fd_child));

if (trace_writer().supports_file_data_cloning() &&
session().use_read_cloning()) {
Expand Down Expand Up @@ -712,6 +719,8 @@ void RecordTask::will_resume_execution(ResumeRequest, WaitRequest,
}
}
}

desched_fd.drain_events();
}

vector<remote_code_ptr> RecordTask::syscallbuf_syscall_entry_breakpoints() {
Expand Down
3 changes: 2 additions & 1 deletion src/RecordTask.h
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
#ifndef RR_RECORD_TASK_H_
#define RR_RECORD_TASK_H_

#include "ContextSwitchEvent.h"
#include "Registers.h"
#include "Task.h"
#include "TraceFrame.h"
Expand Down Expand Up @@ -737,7 +738,7 @@ class RecordTask final : public Task {
// Syscallbuf state

SyscallbufCodeLayout syscallbuf_code_layout;
ScopedFd desched_fd;
ContextSwitchEvent desched_fd;
/* Value of hdr->num_rec_bytes when the buffer was flushed */
uint32_t flushed_num_rec_bytes;
/* Nonzero after the trace recorder has flushed the
Expand Down
6 changes: 6 additions & 0 deletions src/preload/preload_interface.h
Original file line number Diff line number Diff line change
Expand Up @@ -230,6 +230,11 @@ struct mprotect_record {
int32_t padding;
};

/**
 * Which perf mechanism is used to notice that a tracee has been
 * context-switched. rr writes the chosen value into
 * `preload_globals::context_switch_event_strategy`.
 */
enum ContextSwitchEventStrategy {
/* Use a PERF_COUNT_SW_CONTEXT_SWITCHES event. */
STRATEGY_SW_CONTEXT_SWITCHES,
/* Use a dummy event and observe PERF_RECORD_SWITCH records in its
 * ring buffer. */
STRATEGY_RECORD_SWITCH
};

/**
* Must be arch-independent.
* Variables used to communicate between preload and rr.
Expand Down Expand Up @@ -298,6 +303,7 @@ struct preload_globals {
unsigned char fdt_uniform;
/* The CPU we're bound to, if any; -1 if not bound. Not read during replay. */
int32_t cpu_binding;
enum ContextSwitchEventStrategy context_switch_event_strategy;
};

/**
Expand Down
Loading

2 comments on commit 13fb444

@GitMensch
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

In which environments should rr record (without slow) work now when perf_event_paranoid > 1?
Does it make any difference in performance on those systems if perf_event_paranoid == -1?

@rocallahan
Copy link
Collaborator Author

@rocallahan rocallahan commented on 13fb444 Aug 17, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Any kernel >= 6.10 should work.

The PERF_RECORD_SWITCH mechanism is a bit less efficient so we prefer to use the current mechanism if perf_event_paranoid allows it (but only a little bit; I don't think anyone will notice).

I'll write some documentation about this soon.

Please sign in to comment.