From d2e03fd796b44cd2f31b7884fb82fc3d743d81d0 Mon Sep 17 00:00:00 2001 From: Robert O'Callahan Date: Fri, 10 May 2024 17:33:23 +1200 Subject: [PATCH] Support PR_SET_VMA_ANON_NAME --- CMakeLists.txt | 1 + src/AddressSpace.cc | 112 ++++++++++++++++++++++++--------- src/AddressSpace.h | 10 +++ src/Task.cc | 33 +++++++++- src/record_syscall.cc | 6 +- src/replay_syscall.cc | 7 ++- src/test/prctl_anon_vma_name.c | 44 +++++++++++++ 7 files changed, 177 insertions(+), 36 deletions(-) create mode 100644 src/test/prctl_anon_vma_name.c diff --git a/CMakeLists.txt b/CMakeLists.txt index ed31fdd58b8..5bca571ed01 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1177,6 +1177,7 @@ set(BASIC_TESTS ppoll ppoll_deliver prctl + prctl_anon_vma_name prctl_caps prctl_deathsig prctl_name diff --git a/src/AddressSpace.cc b/src/AddressSpace.cc index c052a5bae0c..bfeb7440dbd 100644 --- a/src/AddressSpace.cc +++ b/src/AddressSpace.cc @@ -243,6 +243,21 @@ AddressSpace::Mapping::Mapping(const Mapping& other) AddressSpace::Mapping::~Mapping() {} +AddressSpace::Mapping AddressSpace::Mapping::subrange(MemoryRange range, + std::function f) { + Mapping mapping( + f(map.subrange(range.start(), range.end())), + f(recorded_map.subrange(range.start(), range.end())), + emu_file, clone_stat(mapped_file_stat), + local_addr ? local_addr + (range.start() - map.start()) : 0, + monitored_shared_memory + ? monitored_shared_memory->subrange(range.start() - map.start(), + range.size()) + : nullptr); + mapping.flags = flags; + return mapping; +} + AddressSpace::~AddressSpace() { for (auto& m : mem) { if (m.second.local_addr) { @@ -953,47 +968,26 @@ void AddressSpace::protect(Task* t, remote_ptr addr, size_t num_bytes, // If the first segment we protect underflows the // region, remap the underflow region with previous // prot. 
- auto monitored = m.monitored_shared_memory; if (m.map.start() < new_start) { - Mapping underflow( - m.map.subrange(m.map.start(), rem.start()), - m.recorded_map.subrange(m.recorded_map.start(), rem.start()), - m.emu_file, clone_stat(m.mapped_file_stat), m.local_addr, - std::move(monitored)); - underflow.flags = m.flags; + Mapping underflow = m.subrange(MemoryRange(m.map.start(), new_start), + [](const KernelMapping& km) { return km; }); add_to_map(underflow); } // Remap the overlapping region with the new prot. remote_ptr new_end = min(rem.end(), m.map.end()); int new_prot = prot & (PROT_READ | PROT_WRITE | PROT_EXEC); - Mapping overlap( - m.map.subrange(new_start, new_end).set_prot(new_prot), - m.recorded_map.subrange(new_start, new_end).set_prot(new_prot), - m.emu_file, clone_stat(m.mapped_file_stat), - m.local_addr ? m.local_addr + (new_start - m.map.start()) : 0, - m.monitored_shared_memory - ? m.monitored_shared_memory->subrange(new_start - m.map.start(), - new_end - new_start) - : nullptr); - overlap.flags = m.flags; + Mapping overlap = m.subrange(MemoryRange(new_start, new_end), + [new_prot](const KernelMapping& km) { return km.set_prot(new_prot); }); add_to_map(overlap); last_overlap = overlap.map; // If the last segment we protect overflows the // region, remap the overflow region with previous // prot. - if (rem.end() < m.map.end()) { - Mapping overflow( - m.map.subrange(rem.end(), m.map.end()), - m.recorded_map.subrange(rem.end(), m.map.end()), m.emu_file, - clone_stat(m.mapped_file_stat), - m.local_addr ? m.local_addr + (rem.end() - m.map.start()) : 0, - m.monitored_shared_memory - ? 
m.monitored_shared_memory->subrange(rem.end() - m.map.start(), - m.map.end() - rem.end()) - : nullptr); - overflow.flags = m.flags; + if (new_end < m.map.end()) { + Mapping overflow = m.subrange(MemoryRange(new_end, m.map.end()), + [](const KernelMapping& km) { return km; }); add_to_map(overflow); } }; @@ -1485,6 +1479,68 @@ void AddressSpace::did_fork_into(Task* t) { } } +void AddressSpace::set_anon_name(Task* t, MemoryRange range, const std::string* name) { + bool saw_only_anonymous = true; + MemoryRange last_overlap; + auto setter = [this, t, &name, &saw_only_anonymous, &last_overlap](Mapping m, MemoryRange rem) { + if (!(m.map.flags() & MAP_ANONYMOUS) || !saw_only_anonymous) { + saw_only_anonymous = false; + return; + } + remove_from_map(m.map); + + if (m.map.start() < rem.start()) { + Mapping underflow = m.subrange(MemoryRange(m.map.start(), rem.start()), + [](const KernelMapping& km) { return km; }); + add_to_map(underflow); + } + // Remap the overlapping region with the new name. + remote_ptr new_end = min(rem.end(), m.map.end()); + + Mapping overlap; + if (!name && (m.map.flags() & MAP_SHARED)) { + // We're resetting the name to whatever the original name was. + string new_name = read_kernel_mapping(t, rem.start()).fsname(); + overlap = m.subrange(MemoryRange(rem.start(), new_end), + [&new_name, t](const KernelMapping& km) { + if (km.fsname().empty() && !t->session().is_recording()) { + // record_map case + return km; + } + return km.set_fsname(new_name); + }); + } else { + string new_name; + if (name) { + if (m.map.flags() & MAP_SHARED) { + new_name = "[anon_shmem:" + *name + "]"; + } else { + new_name = "[anon:" + *name + "]"; + } + } + overlap = m.subrange(MemoryRange(rem.start(), new_end), + [&new_name](const KernelMapping& km) { return km.set_fsname(new_name); }); + } + add_to_map(overlap); + last_overlap = overlap.map; + + // If the last segment we rename overflows the + // region, remap the overflow region with previous + // name. 
+ if (new_end < m.map.end()) { + Mapping overflow = m.subrange(MemoryRange(new_end, m.map.end()), + [](const KernelMapping& km) { return km; }); + add_to_map(overflow); + } + }; + for_each_in_range(range.start(), range.size(), setter, ITERATE_CONTIGUOUS); + if (last_overlap.size()) { + // All mappings that we altered which might need coalescing + // are adjacent to |last_overlap|. + coalesce_around(t, mem.find(last_overlap)); + } +} + static string strip_deleted(const string& s) { static const char deleted[] = " (deleted)"; ssize_t find_deleted = s.size() - (sizeof(deleted) - 1); diff --git a/src/AddressSpace.h b/src/AddressSpace.h index 3ce4a32ac86..16bba4854fe 100644 --- a/src/AddressSpace.h +++ b/src/AddressSpace.h @@ -115,6 +115,10 @@ class KernelMapping : public MemoryRange { return KernelMapping(start(), end(), fsname_, device_, inode_, prot, flags_, offset); } + KernelMapping set_fsname(const std::string& name) const { + return KernelMapping(start(), end(), name, device_, inode_, prot_, flags_, + offset); + } /** * Dump a representation of |this| to a string in a format @@ -273,6 +277,8 @@ class AddressSpace : public HasTaskSet { new (this) Mapping(other); return *this; } + Mapping subrange(MemoryRange range, + std::function f); const KernelMapping map; // The corresponding KernelMapping in the recording. During recording, @@ -369,6 +375,10 @@ class AddressSpace : public HasTaskSet { remote_ptr interp_base() const { return interp_base_; } void set_interp_base(remote_ptr base) { interp_base_ = base; } + // Set anonymous region name as per PR_SET_VMA_ANON_NAME. + // Stops at the first unmapped memory page. 
+ void set_anon_name(Task* t, MemoryRange range, const std::string* name); + /** * Assuming the last retired instruction has raised a SIGTRAP * and might be a breakpoint trap instruction, return the type diff --git a/src/Task.cc b/src/Task.cc index 8952929dc94..5ffae6915ea 100644 --- a/src/Task.cc +++ b/src/Task.cc @@ -625,7 +625,8 @@ void Task::on_syscall_exit_arch(int syscallno, const Registers& regs) { // SYS_rrcall_mprotect_record always fails with ENOSYS, though we want to // note its usage here. if (regs.syscall_failed() && !is_mprotect_syscall(syscallno, regs.arch()) - && !is_pkey_mprotect_syscall(syscallno, regs.arch())) { + && !is_pkey_mprotect_syscall(syscallno, regs.arch()) + && !is_prctl_syscall(syscallno, regs.arch())) { return; } @@ -676,13 +677,43 @@ void Task::on_syscall_exit_arch(int syscallno, const Registers& regs) { case Arch::prctl: switch ((int)regs.orig_arg1_signed()) { case PR_SET_SECCOMP: + if (regs.syscall_failed()) { + return; + } if (regs.arg2() == SECCOMP_MODE_FILTER && session().is_recording()) { seccomp_bpf_enabled = true; } break; case PR_SET_NAME: + if (regs.syscall_failed()) { + return; + } did_prctl_set_prname(regs.arg2()); break; + case PR_SET_VMA: { + switch ((unsigned long)regs.arg2()) { + case PR_SET_VMA_ANON_NAME: { + if (regs.syscall_failed() && + regs.syscall_result_signed() != -ENOMEM && + regs.syscall_result_signed() != -EBADF) { + return; + } + remote_ptr start = regs.arg3(); + size_t size = regs.arg4(); + remote_ptr name_ptr = regs.arg5(); + if (!name_ptr.is_null()) { + string name = read_c_str(name_ptr); + vm()->set_anon_name(this, MemoryRange(start, size), &name); + } else { + vm()->set_anon_name(this, MemoryRange(start, size), nullptr); + } + break; + } + default: + break; + } + break; + } } return; diff --git a/src/record_syscall.cc b/src/record_syscall.cc index a0ded2a56b1..f136d3d1990 100644 --- a/src/record_syscall.cc +++ b/src/record_syscall.cc @@ -4784,12 +4784,8 @@ static Switchable 
rec_prepare_syscall_arch(RecordTask* t, } case PR_SET_VMA: { - switch (regs.arg2()) { + switch ((unsigned long)regs.arg2()) { case PR_SET_VMA_ANON_NAME: - // PR_SET_VMA_ANON_NAME is used to communicate additional details - // about the VMA to the kernel. VMAs with different anonymous - // names are not merged by the kernel. None of this affects rr, - // and this prctl has no outparams. break; default: syscall_state.expect_errno = EINVAL; diff --git a/src/replay_syscall.cc b/src/replay_syscall.cc index 7317ab94266..5f1f5ed60d8 100644 --- a/src/replay_syscall.cc +++ b/src/replay_syscall.cc @@ -1138,6 +1138,7 @@ static void rep_process_syscall_arch(ReplayTask* t, ReplayTraceStep* step, case Arch::pkey_mprotect: case Arch::sigreturn: case Arch::rt_sigreturn: + case Arch::prctl: break; default: return; @@ -1209,11 +1210,13 @@ static void rep_process_syscall_arch(ReplayTask* t, ReplayTraceStep* step, RR_FALLTHROUGH; case Arch::prctl: { auto arg1 = t->regs().arg1(); - if (sys == Arch::prctl && (Arch::arch() != aarch64 || - arg1 != PR_SET_SPECULATION_CTRL)) { + if (sys == Arch::prctl && + (Arch::arch() != aarch64 || arg1 != PR_SET_SPECULATION_CTRL) && + (unsigned long)t->regs().arg1() != PR_SET_VMA) { // On aarch64 PR_SET_SPECULATION_CTRL affects the pstate // register during the system call, so we need to replay // it, otherwise we'll get a mismatch there. + // We want to replay PR_SET_VMA as well. return; } } diff --git a/src/test/prctl_anon_vma_name.c b/src/test/prctl_anon_vma_name.c new file mode 100644 index 00000000000..98abbfa3c12 --- /dev/null +++ b/src/test/prctl_anon_vma_name.c @@ -0,0 +1,44 @@ +/* -*- Mode: C; tab-width: 8; c-basic-offset: 2; indent-tabs-mode: nil; -*- */ + +#include "util.h" + +int main(void) { + size_t page_size = sysconf(_SC_PAGESIZE); + + for (int i = 0; i < 2; ++i) { + char* p = (char*)mmap(NULL, 5*page_size, PROT_READ | PROT_WRITE, + MAP_ANONYMOUS | (i ? 
MAP_SHARED : MAP_PRIVATE), -1, 0); + test_assert(p != MAP_FAILED); + munmap(p + 3*page_size, page_size); + + int ret = prctl(PR_SET_VMA, PR_SET_VMA_ANON_NAME, p + page_size, page_size, "abc"); + if (ret < 0 && errno == EINVAL) { + atomic_puts("PR_SET_VMA_ANON_NAME not supported, skipping test"); + atomic_puts("EXIT-SUCCESS"); + return 0; + } + test_assert(ret == 0); + + ret = prctl(PR_SET_VMA, PR_SET_VMA_ANON_NAME, p + page_size, page_size*3, "def"); + test_assert(ret == -1 && errno == ENOMEM); + + ret = prctl(PR_SET_VMA, PR_SET_VMA_ANON_NAME, p + page_size, page_size*3, "$$$"); + test_assert(ret == -1 && errno == EINVAL); + + ret = prctl(PR_SET_VMA, PR_SET_VMA_ANON_NAME, p + page_size, page_size*3, ""); + test_assert(ret == -1 && errno == ENOMEM); + + ret = prctl(PR_SET_VMA, PR_SET_VMA_ANON_NAME, p + page_size, page_size*2, NULL); + test_assert(ret == 0); + + int fd = open("/proc/self/exe", O_RDONLY); + test_assert(fd >= 0); + char* p2 = mmap(p + 3*page_size, page_size, PROT_READ, MAP_FIXED | MAP_PRIVATE, fd, 0); + test_assert(p2 != MAP_FAILED); + ret = prctl(PR_SET_VMA, PR_SET_VMA_ANON_NAME, p, page_size*4, "ghi"); + test_assert(ret == -1 && errno == EBADF); + } + + atomic_puts("EXIT-SUCCESS"); + return 0; +}