From d5ed041100e12ca06a48db81145b27aecde4048e Mon Sep 17 00:00:00 2001 From: Robert O'Callahan Date: Thu, 1 Aug 2024 23:52:44 +1200 Subject: [PATCH] Reset `cpu_id(_start)` rseq fields every time a new task is scheduled. Some software assumes the kernel does this, and depends on it by caching user data in these fields, on the assumption that the kernel will clear out the cached data when the task is newly scheduled onto a core. --- CMakeLists.txt | 1 + src/RecordSession.cc | 3 ++ src/ReplaySession.cc | 5 +++ src/ReplaySession.h | 1 + src/Task.cc | 13 +++++++ src/Task.h | 5 +++ src/record_syscall.cc | 3 +- src/test/rseq_cpu_id_reset.c | 74 ++++++++++++++++++++++++++++++++++++ 8 files changed, 104 insertions(+), 1 deletion(-) create mode 100644 src/test/rseq_cpu_id_reset.c diff --git a/CMakeLists.txt b/CMakeLists.txt index 0897c188d11..e35253db52c 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1283,6 +1283,7 @@ set(BASIC_TESTS redzone_integrity rename rlimit + rseq_cpu_id_reset rusage samask save_data_fd diff --git a/src/RecordSession.cc b/src/RecordSession.cc index 63c788fc5be..8488b549a19 100644 --- a/src/RecordSession.cc +++ b/src/RecordSession.cc @@ -2647,6 +2647,9 @@ RecordSession::RecordResult RecordSession::record_step() { } prev_task->pop_event(EV_SCHED); } + if (t->tuid() != prev_task_tuid) { + t->will_schedule(); + } // Have to disable context-switching until we know it's safe // to allow switching the context. diff --git a/src/ReplaySession.cc b/src/ReplaySession.cc index 6c42c896a56..f0f7d1f7409 100644 --- a/src/ReplaySession.cc +++ b/src/ReplaySession.cc @@ -1820,6 +1820,11 @@ ReplayTask* ReplaySession::setup_replay_one_trace_frame(ReplayTask* t) { t = revive_task_for_exec(); } + if (t->tuid() != last_task_tuid) { + t->will_schedule(); + last_task_tuid = t->tuid(); + } + LOG(debug) << "[event " << trace_frame.time() << "] " << t->rec_tid << ": replaying " << Event(ev) << "; state " << (ev.is_syscall_event() ? state_name(ev.Syscall().state) diff --git a/src/ReplaySession.h b/src/ReplaySession.h index 74433739883..9dce24e6a9b 100644 --- a/src/ReplaySession.h +++ b/src/ReplaySession.h @@ -430,6 +430,7 @@ class ReplaySession final : public Session { siginfo_t last_siginfo_; Flags flags_; FastForwardStatus fast_forward_status; + TaskUid last_task_tuid; bool skip_next_execution_event; bool replay_stops_at_first_execve_; bool detected_transient_error_; diff --git a/src/Task.cc b/src/Task.cc index ea3ec348378..407e422e55e 100644 --- a/src/Task.cc +++ b/src/Task.cc @@ -3331,6 +3331,19 @@ void Task::write_zeroes(unique_ptr* remote, remote_ptr vm()->notify_written(initial_addr, initial_size, 0); } +void Task::will_schedule() { + if (rseq_state) { + // Relying on rseq_t being the same across architectures. + int cpu = session().trace_stream()->bound_to_cpu(); + uint32_t cpu_id = cpu >= 0 ? cpu : 0; + auto addr = REMOTE_PTR_FIELD(rseq_state->ptr.cast(), cpu_id_start); + bool ok = true; + write_mem(addr, cpu_id, &ok); + addr = REMOTE_PTR_FIELD(rseq_state->ptr.cast(), cpu_id); + write_mem(addr, cpu_id, &ok); + } +} + const TraceStream* Task::trace_stream() const { if (session().as_record()) { return &session().as_record()->trace_writer(); diff --git a/src/Task.h b/src/Task.h index 21a86e80658..58316aae2c1 100644 --- a/src/Task.h +++ b/src/Task.h @@ -827,6 +827,11 @@ class Task { const void* buf, bool* ok = nullptr, uint32_t flags = 0); + /** + * This task has been selected to run next. + */ + void will_schedule(); + SupportedArch detect_syscall_arch(); /** diff --git a/src/record_syscall.cc b/src/record_syscall.cc index 34e45f8d24b..102f75219fc 100644 --- a/src/record_syscall.cc +++ b/src/record_syscall.cc @@ -4317,7 +4317,8 @@ static Switchable rec_prepare_syscall_arch(RecordTask* t, t->write_mem(addr, cpu_id); t->record_local(addr, &cpu_id); addr = REMOTE_PTR_FIELD(rseq, cpu_id_start); - uint32_t cpu_id_start = 0; + int cpu = t->session().trace_stream()->bound_to_cpu(); + uint32_t cpu_id_start = cpu >= 0 ? cpu : 0; t->write_mem(addr, cpu_id_start); t->record_local(addr, &cpu_id_start); t->rseq_state = nullptr; diff --git a/src/test/rseq_cpu_id_reset.c b/src/test/rseq_cpu_id_reset.c new file mode 100644 index 00000000000..fca17942477 --- /dev/null +++ b/src/test/rseq_cpu_id_reset.c @@ -0,0 +1,74 @@ +/* -*- Mode: C; tab-width: 8; c-basic-offset: 2; indent-tabs-mode: nil; -*- */ + +#include "util.h" + +static const uint32_t RSEQ_SIG = 0x12345678; + +static int to_main_fds[2]; +static int from_main_fds[2]; + +static const int PING_PONG_ITERATIONS = 5; + +static const uint32_t CPU_INVALID = 10000000; + +static int main_child(void) { + struct rseq* rs_ptr = + (struct rseq*)mmap(NULL, sizeof(struct rseq), PROT_READ | PROT_WRITE, MAP_ANONYMOUS | MAP_PRIVATE, -1, 0); + test_assert(rs_ptr != MAP_FAILED); + rs_ptr->cpu_id_start = CPU_INVALID; + rs_ptr->cpu_id = CPU_INVALID; + + int ret = syscall(RR_rseq, rs_ptr, sizeof(*rs_ptr), 0, RSEQ_SIG); + if (ret == -1 && errno == ENOSYS) { + atomic_puts("rseq not supported; ignoring test"); + atomic_puts("EXIT-SUCCESS"); + return 0; + } + test_assert(ret == 0); + test_assert(rs_ptr->cpu_id_start < CPU_INVALID); + test_assert(rs_ptr->cpu_id < CPU_INVALID); + + for (int i = 0; i < PING_PONG_ITERATIONS; ++i) { + char ch; + rs_ptr->cpu_id_start = CPU_INVALID; + rs_ptr->cpu_id = CPU_INVALID; + test_assert(1 == write(to_main_fds[1], "y", 1)); + // Under rr (or taskset to a single core) there must be a + // context switch here. Some code exists that expects + // rseq::cpu_id(_start) to be reset to the current core index + // every time a task is scheduled onto a core. + test_assert(1 == read(from_main_fds[0], &ch, 1)); + test_assert(rs_ptr->cpu_id_start < CPU_INVALID); + test_assert(rs_ptr->cpu_id < CPU_INVALID); + } + + return 0; +} + +static int main_child_wrapper(__attribute__((unused)) void* arg) { + exit(main_child()); +} + +int main(void) { + test_assert(0 == pipe(to_main_fds)); + test_assert(0 == pipe(from_main_fds)); + + const size_t stack_size = 1 << 20; + void* stack = mmap(NULL, stack_size, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + test_assert(stack != MAP_FAILED); + + /* Do the real work in a thread that doesn't have glibc's rseq setup installed */ + clone(main_child_wrapper, stack + stack_size, + CLONE_VM | CLONE_FS | CLONE_FILES | CLONE_THREAD | CLONE_SIGHAND, + NULL, NULL, NULL, NULL); + + for (int i = 0; i < PING_PONG_ITERATIONS; ++i) { + char ch; + test_assert(1 == read(to_main_fds[0], &ch, 1)); + test_assert(1 == write(from_main_fds[1], "x", 1)); + } + + atomic_puts("EXIT-SUCCESS"); + return 0; +}