diff --git a/CMakeLists.txt b/CMakeLists.txt
index 0897c188d11..e35253db52c 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1283,6 +1283,7 @@ set(BASIC_TESTS
   redzone_integrity
   rename
   rlimit
+  rseq_cpu_id_reset
   rusage
   samask
   save_data_fd
diff --git a/src/RecordSession.cc b/src/RecordSession.cc
index 63c788fc5be..8488b549a19 100644
--- a/src/RecordSession.cc
+++ b/src/RecordSession.cc
@@ -2647,6 +2647,9 @@ RecordSession::RecordResult RecordSession::record_step() {
       }
       prev_task->pop_event(EV_SCHED);
     }
+    if (t->tuid() != prev_task_tuid) {
+      t->will_schedule();
+    }
 
     // Have to disable context-switching until we know it's safe
     // to allow switching the context.
diff --git a/src/ReplaySession.cc b/src/ReplaySession.cc
index 6c42c896a56..f0f7d1f7409 100644
--- a/src/ReplaySession.cc
+++ b/src/ReplaySession.cc
@@ -1820,6 +1820,11 @@ ReplayTask* ReplaySession::setup_replay_one_trace_frame(ReplayTask* t) {
     t = revive_task_for_exec();
   }
 
+  if (t->tuid() != last_task_tuid) {
+    t->will_schedule();
+    last_task_tuid = t->tuid();
+  }
+
   LOG(debug) << "[event " << trace_frame.time() << "] " << t->rec_tid
              << ": replaying " << Event(ev) << "; state "
              << (ev.is_syscall_event() ? state_name(ev.Syscall().state)
diff --git a/src/ReplaySession.h b/src/ReplaySession.h
index 74433739883..9dce24e6a9b 100644
--- a/src/ReplaySession.h
+++ b/src/ReplaySession.h
@@ -430,6 +430,7 @@ class ReplaySession final : public Session {
   siginfo_t last_siginfo_;
   Flags flags_;
   FastForwardStatus fast_forward_status;
+  TaskUid last_task_tuid;
   bool skip_next_execution_event;
   bool replay_stops_at_first_execve_;
   bool detected_transient_error_;
diff --git a/src/Task.cc b/src/Task.cc
index ea3ec348378..407e422e55e 100644
--- a/src/Task.cc
+++ b/src/Task.cc
@@ -3331,6 +3331,19 @@ void Task::write_zeroes(unique_ptr* remote, remote_ptr
   vm()->notify_written(initial_addr, initial_size, 0);
 }
 
+void Task::will_schedule() {
+  if (rseq_state) {
+    // Relying on rseq_t being the same across architectures.
+    int cpu = session().trace_stream()->bound_to_cpu();
+    uint32_t cpu_id = cpu >= 0 ? cpu : 0;
+    auto addr = REMOTE_PTR_FIELD(rseq_state->ptr.cast(), cpu_id_start);
+    bool ok = true;
+    write_mem(addr, cpu_id, &ok);
+    addr = REMOTE_PTR_FIELD(rseq_state->ptr.cast(), cpu_id);
+    write_mem(addr, cpu_id, &ok);
+  }
+}
+
 const TraceStream* Task::trace_stream() const {
   if (session().as_record()) {
     return &session().as_record()->trace_writer();
diff --git a/src/Task.h b/src/Task.h
index 21a86e80658..58316aae2c1 100644
--- a/src/Task.h
+++ b/src/Task.h
@@ -827,6 +827,11 @@ class Task {
                   const void* buf, bool* ok = nullptr,
                   uint32_t flags = 0);
 
+  /**
+   * This task has been selected to run next.
+   */
+  void will_schedule();
+
   SupportedArch detect_syscall_arch();
 
   /**
diff --git a/src/record_syscall.cc b/src/record_syscall.cc
index 34e45f8d24b..102f75219fc 100644
--- a/src/record_syscall.cc
+++ b/src/record_syscall.cc
@@ -4317,7 +4317,8 @@ static Switchable rec_prepare_syscall_arch(RecordTask* t,
       t->write_mem(addr, cpu_id);
       t->record_local(addr, &cpu_id);
       addr = REMOTE_PTR_FIELD(rseq, cpu_id_start);
-      uint32_t cpu_id_start = 0;
+      int cpu = t->session().trace_stream()->bound_to_cpu();
+      uint32_t cpu_id_start = cpu >= 0 ? cpu : 0;
       t->write_mem(addr, cpu_id_start);
       t->record_local(addr, &cpu_id_start);
       t->rseq_state = nullptr;
diff --git a/src/test/rseq_cpu_id_reset.c b/src/test/rseq_cpu_id_reset.c
new file mode 100644
index 00000000000..fca17942477
--- /dev/null
+++ b/src/test/rseq_cpu_id_reset.c
@@ -0,0 +1,74 @@
+/* -*- Mode: C; tab-width: 8; c-basic-offset: 2; indent-tabs-mode: nil; -*- */
+
+#include "util.h"
+
+static const uint32_t RSEQ_SIG = 0x12345678;
+
+static int to_main_fds[2];
+static int from_main_fds[2];
+
+static const int PING_PONG_ITERATIONS = 5;
+
+static const uint32_t CPU_INVALID = 10000000;
+
+static int main_child(void) {
+  struct rseq* rs_ptr =
+      (struct rseq*)mmap(NULL, sizeof(struct rseq), PROT_READ | PROT_WRITE, MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
+  test_assert(rs_ptr != MAP_FAILED);
+  rs_ptr->cpu_id_start = CPU_INVALID;
+  rs_ptr->cpu_id = CPU_INVALID;
+
+  int ret = syscall(RR_rseq, rs_ptr, sizeof(*rs_ptr), 0, RSEQ_SIG);
+  if (ret == -1 && errno == ENOSYS) {
+    atomic_puts("rseq not supported; ignoring test");
+    atomic_puts("EXIT-SUCCESS");
+    return 0;
+  }
+  test_assert(ret == 0);
+  test_assert(rs_ptr->cpu_id_start < CPU_INVALID);
+  test_assert(rs_ptr->cpu_id < CPU_INVALID);
+
+  for (int i = 0; i < PING_PONG_ITERATIONS; ++i) {
+    char ch;
+    rs_ptr->cpu_id_start = CPU_INVALID;
+    rs_ptr->cpu_id = CPU_INVALID;
+    test_assert(1 == write(to_main_fds[1], "y", 1));
+    // Under rr (or taskset to a single core) there must be a
+    // context switch here. Some code exists that expects
+    // rseq::cpu_id(_start) to be reset to the current core index
+    // every time a task is scheduled onto a core.
+    test_assert(1 == read(from_main_fds[0], &ch, 1));
+    test_assert(rs_ptr->cpu_id_start < CPU_INVALID);
+    test_assert(rs_ptr->cpu_id < CPU_INVALID);
+  }
+
+  return 0;
+}
+
+static int main_child_wrapper(__attribute__((unused)) void* arg) {
+  exit(main_child());
+}
+
+int main(void) {
+  test_assert(0 == pipe(to_main_fds));
+  test_assert(0 == pipe(from_main_fds));
+
+  const size_t stack_size = 1 << 20;
+  void* stack = mmap(NULL, stack_size, PROT_READ | PROT_WRITE,
+                     MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+  test_assert(stack != MAP_FAILED);
+
+  /* Do the real work in a thread that doesn't have glibc's rseq setup installed */
+  clone(main_child_wrapper, stack + stack_size,
+        CLONE_VM | CLONE_FS | CLONE_FILES | CLONE_THREAD | CLONE_SIGHAND,
+        NULL, NULL, NULL, NULL);
+
+  for (int i = 0; i < PING_PONG_ITERATIONS; ++i) {
+    char ch;
+    test_assert(1 == read(to_main_fds[0], &ch, 1));
+    test_assert(1 == write(from_main_fds[1], "x", 1));
+  }
+
+  atomic_puts("EXIT-SUCCESS");
+  return 0;
+}