-
Notifications
You must be signed in to change notification settings - Fork 588
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Use
PERF_RECORD_SWITCH
events when perf_event_paranoid
is 2 and t…
…he kernel supports it
- Loading branch information
1 parent
cfad2e3
commit 13fb444
Showing
13 changed files
with
297 additions
and
41 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,175 @@ | ||
/* -*- Mode: C++; tab-width: 8; c-basic-offset: 2; indent-tabs-mode: nil; -*- */ | ||
|
||
#include "ContextSwitchEvent.h" | ||
|
||
#include <fcntl.h> | ||
#include <linux/perf_event.h> | ||
#include <signal.h> | ||
#include <stdint.h> | ||
#include <sys/mman.h> | ||
#include <sys/syscall.h> | ||
#include <unistd.h> | ||
|
||
#include <optional> | ||
#include <string> | ||
|
||
#include "log.h" | ||
#include "util.h" | ||
|
||
using namespace std; | ||
|
||
namespace rr { | ||
|
||
// Reads /proc/sys/kernel/perf_event_paranoid and returns its contents parsed
// as an integer. Returns nullopt when the file can't be opened or the read
// fails.
static optional<int> read_perf_event_paranoid() {
  ScopedFd paranoid_fd("/proc/sys/kernel/perf_event_paranoid", O_RDONLY);
  if (!paranoid_fd.is_open()) {
    return nullopt;
  }

  char text[100];
  // Leave room for the terminating NUL we append below.
  const ssize_t nread = read(paranoid_fd, text, sizeof(text) - 1);
  if (nread < 0) {
    return nullopt;
  }

  text[nread] = 0;
  return atoi(text);
}
|
||
// Number of SIGIO deliveries seen by `sigio_handler`. It is written from the
// signal handler and read back by `can_use_switch_records`, hence `volatile`.
static volatile int sigio_count;

// SIGIO handler used while probing: it only counts how many times it ran.
static void sigio_handler(int /*signo*/, siginfo_t* /*info*/,
                          void* /*ucontext*/) {
  sigio_count = sigio_count + 1;
}
|
||
static bool can_use_switch_records() { | ||
struct perf_event_attr attr; | ||
memset(&attr, 0, sizeof(attr)); | ||
attr.size = sizeof(attr); | ||
attr.type = PERF_TYPE_SOFTWARE; | ||
attr.config = PERF_COUNT_SW_DUMMY; | ||
attr.sample_period = 1; | ||
attr.watermark = 1; | ||
// We can't easily check PERF_RECORD_SWITCH directly | ||
// because there's no reliable way (as far as I know) to | ||
// force a context switch but still recover if no signal is | ||
// generated. So we test that generating a PERF_RECORD_MMAP | ||
// raises a signal instead. | ||
attr.mmap_data = 1; | ||
attr.wakeup_watermark = 1; | ||
attr.exclude_kernel = 1; | ||
attr.exclude_guest = 1; | ||
attr.disabled = 1; | ||
|
||
ScopedFd fd(syscall(SYS_perf_event_open, &attr, 0, -1, -1, 0)); | ||
if (!fd.is_open()) { | ||
LOG(warn) << "Couldn't open a dummy event"; | ||
return false; | ||
} | ||
|
||
PerfCounterBuffers buffers; | ||
buffers.allocate(fd, page_size(), 0); | ||
|
||
int ret = fcntl(fd, F_SETFL, FASYNC); | ||
if (ret < 0) { | ||
FATAL() << "Can't make fd async"; | ||
} | ||
struct f_owner_ex own; | ||
own.type = F_OWNER_TID; | ||
own.pid = gettid(); | ||
ret = fcntl(fd, F_SETOWN_EX, &own); | ||
if (ret < 0) { | ||
FATAL() << "Failed to fcntl(SETOWN_EX)"; | ||
} | ||
|
||
struct sigaction sa; | ||
struct sigaction old_sa; | ||
sa.sa_sigaction = sigio_handler; | ||
sigemptyset(&sa.sa_mask); | ||
sa.sa_flags = SA_SIGINFO; | ||
ret = sigaction(SIGIO, &sa, &old_sa); | ||
if (ret < 0) { | ||
FATAL() << "Failed to install sighandler"; | ||
} | ||
|
||
sigio_count = 0; | ||
ret = ioctl(fd, PERF_EVENT_IOC_ENABLE, 0); | ||
if (ret < 0) { | ||
FATAL() << "Failed to enable event"; | ||
} | ||
void* p = mmap(nullptr, 1, PROT_READ, MAP_ANONYMOUS | MAP_PRIVATE, -1, 0); | ||
if (p == MAP_FAILED) { | ||
FATAL() << "Failed to mmap"; | ||
} | ||
ret = ioctl(fd, PERF_EVENT_IOC_DISABLE, 0); | ||
if (ret < 0) { | ||
FATAL() << "Failed to disable event"; | ||
} | ||
|
||
ret = munmap(p, 1); | ||
if (ret < 0) { | ||
FATAL() << "Failed to munmap"; | ||
} | ||
ret = sigaction(SIGIO, &old_sa, nullptr); | ||
if (ret < 0) { | ||
FATAL() << "Failed to clean up sighandler"; | ||
} | ||
|
||
if (sigio_count == 0) { | ||
// Old kernel | ||
LOG(info) << "PERF_RECORD_MMAP watermark failed to deliver signal"; | ||
return false; | ||
} | ||
if (sigio_count > 1) { | ||
FATAL() << "Invalid SIGIO count"; | ||
} | ||
|
||
return true; | ||
} | ||
|
||
/**
 * Decides which context-switch detection strategy this process will use.
 *
 * STRATEGY_SW_CONTEXT_SWITCHES is chosen when we hold CAP_SYS_ADMIN or
 * CAP_PERFMON, or when perf_event_paranoid could be read and is < 2.
 * Otherwise STRATEGY_RECORD_SWITCH is chosen if `can_use_switch_records()`
 * succeeds. If neither works, a CLEAN_FATAL() error tells the user how to
 * lower perf_event_paranoid.
 */
static ContextSwitchEventStrategy init_strategy() {
  if (has_effective_caps(uint64_t(1) << CAP_SYS_ADMIN) ||
      has_effective_caps(uint64_t(1) << CAP_PERFMON)) {
    return ContextSwitchEventStrategy::STRATEGY_SW_CONTEXT_SWITCHES;
  }
  optional<int> perf_event_paranoid = read_perf_event_paranoid();
  if (perf_event_paranoid.has_value() && *perf_event_paranoid < 2) {
    return ContextSwitchEventStrategy::STRATEGY_SW_CONTEXT_SWITCHES;
  }

  if (can_use_switch_records()) {
    return ContextSwitchEventStrategy::STRATEGY_RECORD_SWITCH;
  }

  string paranoid_value = "unknown";
  if (perf_event_paranoid.has_value()) {
    // Format the number explicitly: assigning an int straight to a
    // std::string selects operator=(char) and stores a single control
    // character instead of the decimal value.
    paranoid_value = to_string(*perf_event_paranoid);
  }
  CLEAN_FATAL() <<
      "rr needs /proc/sys/kernel/perf_event_paranoid <= 1, but it is "
      << paranoid_value << ".\n"
      << "Change it to 1, or use 'rr record -n' (slow).\n"
      << "Consider putting 'kernel.perf_event_paranoid = 1' in /etc/sysctl.d/10-rr.conf.\n"
      << "See 'man 8 sysctl', 'man 5 sysctl.d' (systemd systems)\n"
      << "and 'man 5 sysctl.conf' (non-systemd systems) for more details.";
  // Keeps the function well-formed on every path after the fatal error.
  return ContextSwitchEventStrategy::STRATEGY_SW_CONTEXT_SWITCHES;
}
|
||
// Returns the strategy chosen by init_strategy(). The choice is made on the
// first call and the same value is handed back on every later call.
ContextSwitchEventStrategy ContextSwitchEvent::strategy() {
  static const ContextSwitchEventStrategy chosen = init_strategy();
  return chosen;
}
|
||
// Takes ownership of `tracee_fd`. When the PERF_RECORD_SWITCH strategy is in
// use, also allocates a one-page PerfCounterBuffers for that fd.
void ContextSwitchEvent::init(ScopedFd tracee_fd) {
  tracee_fd_ = std::move(tracee_fd);

  const bool needs_ring_buffer =
      strategy() == ContextSwitchEventStrategy::STRATEGY_RECORD_SWITCH;
  if (!needs_ring_buffer) {
    return;
  }

  mmap_buffer = make_unique<PerfCounterBuffers>();
  mmap_buffer->allocate(tracee_fd_, page_size(), 0);
}
|
||
void ContextSwitchEvent::drain_events() { | ||
if (mmap_buffer) { | ||
while (auto packet = mmap_buffer->next_packet()) { | ||
} | ||
} | ||
} | ||
|
||
} // namespace rr |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,70 @@ | ||
/* -*- Mode: C++; tab-width: 8; c-basic-offset: 2; indent-tabs-mode: nil; -*- */ | ||
|
||
#ifndef RR_CONTEXT_SWITCH_EVENT_H_ | ||
#define RR_CONTEXT_SWITCH_EVENT_H_ | ||
|
||
#include <stdint.h> | ||
|
||
#include <memory> | ||
|
||
#include "PerfCounterBuffers.h" | ||
#include "ScopedFd.h" | ||
#include "preload/preload_interface.h" | ||
|
||
namespace rr { | ||
|
||
/** | ||
* For syscall buffering, we need to interrupt a tracee when it would block. | ||
* We do this by configuring a perf event to detect when the tracee is subject | ||
* to a context switch. When the perf event fires, it delivers a signal to the | ||
* tracee. The tracee syscallbuf code allocates the event fd and rr retrieves | ||
* it. We do it that way because both the tracee and rr need access to the | ||
* event fd. | ||
* | ||
* We can use `PERF_COUNT_SW_CONTEXT_SWITCHES` as the event. This is easy but | ||
* since it's a kernel event, unprivileged rr can't use it when | ||
* `perf_event_paranoid` is >= 2. | ||
* | ||
* Alternatively we can configure a dummy event and observe `PERF_RECORD_SWITCH` | ||
* records. This works with unprivileged rr when `perf_event_paranoid` == 2. | ||
* To trigger a signal when we get a `PERF_RECORD_SWITCH`, we set | ||
* `wakeup_watermark` so that appending any record to the ring buffer triggers | ||
* a wakeup. This requires configuring a ring buffer per tracee task; we can't | ||
* use a single ring buffer for multiple tracees, since when a tracee blocks | ||
* we need to send a signal directly to that specific tracee, not any others | ||
* and not rr. (We could deliver to rr and have rr interrupt the right tracee | ||
* but that would be slow.) | ||
* Unfortunately, in Linux kernels before 6.10, `wakeup_watermark` doesn't | ||
* trigger signals associated with the event fd. This bug was fixed in 6.10. | ||
* | ||
* So this class manages all the necessary logic. In particular we have to figure | ||
* out which strategy to use. We prefer to use `PERF_COUNT_SW_CONTEXT_SWITCHES` | ||
* if possible since we don't have to allocate ring buffers for those, so we'll | ||
* first check if that works. If it doesn't, we'll test if `PERF_RECORD_SWITCH` | ||
* works properly. If it doesn't, we produce the right error message and abort. | ||
* Then, if we're using `PERF_RECORD_SWITCH`, we need to allocate the ring buffer | ||
* and configure `wakeup_watermark`. | ||
*/ | ||
class ContextSwitchEvent { | ||
public: | ||
void init(ScopedFd tracee_fd); | ||
|
||
ScopedFd& tracee_fd() { return tracee_fd_; } | ||
|
||
// We need to determine the strategy before we configure syscallbuf to create | ||
// its tracee perf event fds. | ||
static ContextSwitchEventStrategy strategy(); | ||
|
||
void drain_events(); | ||
|
||
private: | ||
// The fd retrieved from the tracee task that created it. | ||
ScopedFd tracee_fd_; | ||
// If we're using `PERF_RECORD_SWITCH` records, the | ||
// buffer we're using to trigger the watermark-wakeups. | ||
std::unique_ptr<PerfCounterBuffers> mmap_buffer; | ||
}; | ||
|
||
} // namespace rr | ||
|
||
#endif /* RR_CONTEXT_SWITCH_EVENT_H_ */ |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.
13fb444
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
In which environments should
rr record
(without slow) work now when perf_event_paranoid > 1? Does it make any difference in performance on those systems if perf_event_paranoid == -1?
13fb444
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Any kernel >= 6.10 should work.
The
PERF_RECORD_SWITCH
mechanism is a bit less efficient so we prefer to use the current mechanism if perf_event_paranoid
allows it (but only a little bit; I don't think anyone will notice). I'll write some documentation about this soon.