diff --git a/src/AutoRemoteSyscalls.cc b/src/AutoRemoteSyscalls.cc index 7c5ad33b7f4..0fb3c927350 100644 --- a/src/AutoRemoteSyscalls.cc +++ b/src/AutoRemoteSyscalls.cc @@ -270,7 +270,11 @@ void AutoRemoteSyscalls::restore_state_to(Task* t) { regs.set_arg1(regs.orig_arg1()); t->set_regs(regs); if (t->enter_syscall(true)) { - t->resume_execution(RESUME_SYSCALL, RESUME_WAIT, RESUME_NO_TICKS); + if (!t->resume_execution(RESUME_SYSCALL, RESUME_WAIT_NO_EXIT, RESUME_NO_TICKS)) { + // Tracee died unexpectedly, there is nothing more we can do. + // Do not restore the status, we want callers to see that the task died. + return; + } } regs.set_ip(initial_ip); regs.set_syscallno(regs.original_syscallno()); @@ -286,7 +290,11 @@ void AutoRemoteSyscalls::restore_state_to(Task* t) { t->set_regs(regs); RecordTask* rt = static_cast(t); while (true) { - rt->resume_execution(RESUME_CONT, RESUME_WAIT, RESUME_NO_TICKS); + if (!rt->resume_execution(RESUME_CONT, RESUME_WAIT_NO_EXIT, RESUME_NO_TICKS)) { + // Tracee died unexpectedly, there is nothing more we can do. + // Do not restore the status, we want callers to see that the task died. + return; + } if (rt->ptrace_event()) break; rt->stash_sig(); @@ -365,17 +373,18 @@ long AutoRemoteSyscalls::syscall_base(int syscallno, Registers& callregs) { bool from_seccomp = initial_at_seccomp && t->ptrace_event() == PTRACE_EVENT_SECCOMP; if (use_singlestep_path && !from_seccomp) { while (true) { - t->resume_execution(RESUME_SINGLESTEP, RESUME_WAIT, RESUME_NO_TICKS); + if (!t->resume_execution(RESUME_SINGLESTEP, RESUME_WAIT_NO_EXIT, RESUME_NO_TICKS)) { + // Tracee was killed, there is nothing more we can do. + // Ensure callers see the task death status. + restore_wait_status = t->status(); + return -ESRCH; + } LOG(debug) << "Used singlestep path; status=" << t->status(); // When a PTRACE_EVENT_EXIT is returned we don't update registers if (t->ip() != callregs.ip()) { // We entered the syscall, so stop now break; } - if (t->ptrace_event() == PTRACE_EVENT_EXIT) { - // We died, just let it be - break; - } if (t->stop_sig() == SIGTRAP && t->get_siginfo().si_code == TRAP_TRACE) { // On aarch64, if we were previously in a syscall-exit stop, continuing // with PTRACE_SINGLESTEP will result in incurring a trap upon execution @@ -389,26 +398,26 @@ long AutoRemoteSyscalls::syscall_base(int syscallno, Registers& callregs) { ASSERT(t, false) << "Unexpected status " << t->status(); } } else { - bool exited = false; if (from_seccomp) { LOG(debug) << "Skipping enter_syscall - already at seccomp stop"; } else { - exited = !t->enter_syscall(true); + if (!t->enter_syscall(true)) { + // Tracee was killed, there is nothing more we can do. + // Ensure callers see the task death status. + restore_wait_status = t->status(); + return -ESRCH; + } LOG(debug) << "Used enter_syscall; status=" << t->status(); } - // proceed to syscall exit - if (!exited) { - t->resume_execution(RESUME_SYSCALL, RESUME_WAIT, RESUME_NO_TICKS); - LOG(debug) << "syscall exit status=" << t->status(); - } - } - while (true) { - // If the syscall caused the task to exit, just stop now with that status. - if (t->seen_ptrace_exit_event() || t->status().reaped()) { + if (!t->resume_execution(RESUME_SYSCALL, RESUME_WAIT_NO_EXIT, RESUME_NO_TICKS)) { + // Tracee was killed, there is nothing more we can do. + // Ensure callers see the task death status. restore_wait_status = t->status(); - LOG(debug) << "Task is dying, no status result"; return -ESRCH; } + LOG(debug) << "syscall exit status=" << t->status(); + } + while (true) { if (t->status().is_syscall() || (t->stop_sig() == SIGTRAP && is_kernel_trap(t->get_siginfo().si_code))) { @@ -418,19 +427,31 @@ long AutoRemoteSyscalls::syscall_base(int syscallno, Registers& callregs) { } if (is_clone_syscall(syscallno, t->arch()) && t->clone_syscall_is_complete(&new_tid_, t->arch())) { - t->resume_execution(RESUME_SYSCALL, RESUME_WAIT, RESUME_NO_TICKS); + if (!t->resume_execution(RESUME_SYSCALL, RESUME_WAIT_NO_EXIT, RESUME_NO_TICKS)) { + // Tracee was killed, there is nothing more we can do. + // Ensure callers see the task death status. + restore_wait_status = t->status(); + return -ESRCH; + } LOG(debug) << "got clone event; new status=" << t->status(); continue; } if (ignore_signal(t)) { if (t->regs().syscall_may_restart()) { if (!t->enter_syscall(true)) { - // We died, just let it be - break; + // Tracee was killed, there is nothing more we can do. + // Ensure callers see the task death status. + restore_wait_status = t->status(); + return -ESRCH; } LOG(debug) << "signal ignored; restarting syscall, status=" << t->status(); - t->resume_execution(RESUME_SYSCALL, RESUME_WAIT, RESUME_NO_TICKS); + if (!t->resume_execution(RESUME_SYSCALL, RESUME_WAIT_NO_EXIT, RESUME_NO_TICKS)) { + // Tracee was killed, there is nothing more we can do. + // Ensure callers see the task death status. + restore_wait_status = t->status(); + return -ESRCH; + } LOG(debug) << "syscall exit status=" << t->status(); continue; } diff --git a/src/DiversionSession.cc b/src/DiversionSession.cc index 53eb5d295a0..93cad2d5e24 100644 --- a/src/DiversionSession.cc +++ b/src/DiversionSession.cc @@ -198,16 +198,20 @@ DiversionSession::DiversionResult DiversionSession::diversion_step( while (true) { switch (command) { - case RUN_CONTINUE: + case RUN_CONTINUE: { LOG(debug) << "Continuing to next syscall"; - t->resume_execution(RESUME_SYSEMU, RESUME_WAIT, RESUME_UNLIMITED_TICKS, - signal_to_deliver); + bool ok = t->resume_execution(RESUME_SYSEMU, RESUME_WAIT, + RESUME_UNLIMITED_TICKS, signal_to_deliver); + ASSERT(t, ok) << "Tracee was killed unexpectedly"; break; - case RUN_SINGLESTEP: + } + case RUN_SINGLESTEP: { LOG(debug) << "Stepping to next insn/syscall"; - t->resume_execution(RESUME_SYSEMU_SINGLESTEP, RESUME_WAIT, - RESUME_UNLIMITED_TICKS, signal_to_deliver); + bool ok = t->resume_execution(RESUME_SYSEMU_SINGLESTEP, RESUME_WAIT, + RESUME_UNLIMITED_TICKS, signal_to_deliver); + ASSERT(t, ok) << "Tracee was killed unexpectedly"; break; + } default: FATAL() << "Illegal run command " << command; } diff --git a/src/Monkeypatcher.cc b/src/Monkeypatcher.cc index 7f7f29ae717..c34293b1327 100644 --- a/src/Monkeypatcher.cc +++ b/src/Monkeypatcher.cc @@ -1284,6 +1284,11 @@ bool Monkeypatcher::try_patch_syscall(RecordTask* t, bool entering_syscall) { // We want our mmap records to be associated with the next (PATCH_SYSCALL) // event, not a FLUSH_SYSCALLBUF event. t->maybe_flush_syscallbuf(); + if (!t->is_stopped()) { + // Tracee was unexpectedly kicked out of a ptrace-stop by SIGKILL or + // equivalent. Abort trying to patch. + return false; + } if (arch == aarch64) { return try_patch_syscall_aarch64(t, entering_syscall); diff --git a/src/RecordSession.cc b/src/RecordSession.cc index 5c823925ddd..e24756899c0 100644 --- a/src/RecordSession.cc +++ b/src/RecordSession.cc @@ -381,8 +381,8 @@ void RecordSession::handle_seccomp_traced_syscall(RecordTask* t, t->set_regs(regs); t->vm()->add_breakpoint(ret_addr, BKPT_INTERNAL); while (true) { - t->resume_execution(RESUME_SYSCALL, RESUME_WAIT, RESUME_NO_TICKS); - if (t->ptrace_event() == PTRACE_EVENT_EXIT) { + if (!t->resume_execution(RESUME_SYSCALL, RESUME_WAIT_NO_EXIT, RESUME_NO_TICKS)) { + // Tracee exited unexpectedly return; } ASSERT(t, !t->ptrace_event()); @@ -439,7 +439,12 @@ void RecordSession::handle_seccomp_traced_syscall(RecordTask* t, Registers r = orig_regs; r.set_original_syscallno(syscall_number_for_gettid(t->arch())); t->set_regs(r); - t->resume_execution(RESUME_SYSCALL, RESUME_WAIT, RESUME_NO_TICKS); + if (!t->resume_execution(RESUME_SYSCALL, RESUME_WAIT_NO_EXIT, RESUME_NO_TICKS)) { + // Tracee died unexpectedly. We did not enter a syscall. + // We shouldn't try to resume it now. + step_state->continue_type = RecordSession::DONT_CONTINUE; + return; + } t->set_regs(orig_regs); } @@ -602,7 +607,8 @@ static void handle_seccomp_trap(RecordTask* t, // The tracee is currently in the seccomp ptrace-stop. Advance it to the // syscall-exit stop so that when we try to deliver the SIGSYS via // PTRACE_SINGLESTEP, that doesn't trigger a SIGTRAP stop. - t->resume_execution(RESUME_SYSCALL, RESUME_WAIT, RESUME_NO_TICKS); + // If this fails, that's fine, we're not going to deliver the SIGSYS. + t->resume_execution(RESUME_SYSCALL, RESUME_WAIT_NO_EXIT, RESUME_NO_TICKS); } // Don't continue yet. At the next iteration of record_step, if we @@ -719,6 +725,7 @@ bool RecordSession::handle_ptrace_event(RecordTask** t_ptr, // to the PTRACE_EVENT_EXIT. This avoids the race where our // PTRACE_CONT might kick us out of the PTRACE_EVENT_EXIT before // we can process it. + // If this fails because of *another* SIGKILL that's fine. t->wait(); break; default: @@ -761,7 +768,13 @@ bool RecordSession::handle_ptrace_event(RecordTask** t_ptr, *t_ptr = t; // Tell t that it is actually stopped, because the stop we got is really // for this task, not the old dead task. - t->did_waitpid(status); + if (!t->did_waitpid(status)) { + // This is totally untested and almost certainly broken, but if the + // task was SIGKILLed out of the EXEC stop then we should probably + // just pretend the exec never happened. + step_state->continue_type = CONTINUE_SYSCALL; + break; + } } t->post_exec(); t->session().scheduler().did_exit_execve(t); @@ -929,8 +942,7 @@ static void advance_to_disarm_desched_syscall(RecordTask* t) { /* TODO: send this through main loop. */ /* TODO: mask off signals and avoid this loop. */ do { - t->resume_execution(RESUME_SYSCALL, RESUME_WAIT, RESUME_UNLIMITED_TICKS); - if (t->seen_ptrace_exit_event()) { + if (!t->resume_execution(RESUME_SYSCALL, RESUME_WAIT_NO_EXIT, RESUME_UNLIMITED_TICKS)) { return; } if (t->status().is_syscall()) { @@ -968,8 +980,8 @@ static void advance_to_disarm_desched_syscall(RecordTask* t) { } } while (!t->is_disarm_desched_event_syscall()); - // Exit the syscall. - t->resume_execution(RESUME_SYSCALL, RESUME_WAIT, RESUME_NO_TICKS); + // Exit the syscall. If this fails, that's fine, we can ignore it. + t->resume_execution(RESUME_SYSCALL, RESUME_WAIT_NO_EXIT, RESUME_NO_TICKS); } /** @@ -1127,10 +1139,13 @@ void RecordSession::syscall_state_changed(RecordTask* t, Registers orig_regs = r; r.set_original_syscallno(-1); t->set_regs(r); - t->resume_execution(RESUME_SYSCALL, RESUME_WAIT, RESUME_NO_TICKS); - ASSERT(t, t->ip() == r.ip()); - t->set_regs(orig_regs); - maybe_trigger_emulated_ptrace_syscall_exit_stop(t); + // If this fails because of premature exit, don't mess with the + // task anymore. + if (t->resume_execution(RESUME_SYSCALL, RESUME_WAIT_NO_EXIT, RESUME_NO_TICKS)) { + ASSERT(t, t->ip() == r.ip()); + t->set_regs(orig_regs); + maybe_trigger_emulated_ptrace_syscall_exit_stop(t); + } return; } last_task_switchable = PREVENT_SWITCH; @@ -1465,9 +1480,7 @@ static bool preinject_signal(RecordTask* t) { */ LOG(debug) << " maybe not in signal-stop (status " << t->status() << "); doing tgkill(SYSCALLBUF_DESCHED_SIGNAL)"; - t->move_to_signal_stop(); - - if (t->status().ptrace_event() == PTRACE_EVENT_EXIT) { + if (!t->move_to_signal_stop()) { /* We raced with an exit (e.g. due to a pending SIGKILL). */ return false; } @@ -1488,7 +1501,7 @@ static bool preinject_signal(RecordTask* t) { /** * Returns true if the signal should be delivered. * Returns false if this signal should not be delivered because another signal - * occurred during delivery. + * occurred during delivery or there was a premature exit. * Must call t->stashed_signal_processed() once we're ready to unmask signals. */ static bool inject_handled_signal(RecordTask* t) { @@ -1506,7 +1519,9 @@ static bool inject_handled_signal(RecordTask* t) { // XXX we assume the kernel won't respond by notifying us of a different // signal. We don't want to do this with signals blocked because that will // save a bogus signal mask in the signal frame. - t->resume_execution(RESUME_SINGLESTEP, RESUME_WAIT, RESUME_NO_TICKS, sig); + if (!t->resume_execution(RESUME_SINGLESTEP, RESUME_WAIT_NO_EXIT, RESUME_NO_TICKS, sig)) { + return false; + } // Signal injection can change the sigmask due to sa_mask effects, lack of // SA_NODEFER, and signal frame construction triggering a synchronous // SIGSEGV. @@ -2549,6 +2564,8 @@ RecordSession::RecordResult RecordSession::record_step() { StepState step_state(CONTINUE); + ASSERT(t, t->is_stopped()) << "Somehow we're not stopped here; status=" + << t->status(); bool did_enter_syscall; if (rescheduled.by_waitpid && handle_ptrace_event(&t, &step_state, &result, &did_enter_syscall)) { @@ -2561,37 +2578,41 @@ RecordSession::RecordResult RecordSession::record_step() { if (did_enter_syscall && t->ev().type() == EV_SYSCALL) { syscall_state_changed(t, &step_state); } - } else if (rescheduled.by_waitpid && handle_signal_event(t, &step_state)) { - // Tracee may have exited while processing descheds; handle that. - if (handle_ptrace_exit_event(t)) { - // t may have been deleted. - last_task_switchable = ALLOW_SWITCH; - return result; - } } else { - runnable_state_changed(t, &step_state, &result, rescheduled.by_waitpid); + ASSERT(t, t->is_stopped()) << "handle_ptrace_event left us in a not-stopped state"; + if (rescheduled.by_waitpid && handle_signal_event(t, &step_state)) { + // Tracee may have exited while processing descheds; handle that. + if (handle_ptrace_exit_event(t)) { + // t may have been deleted. + last_task_switchable = ALLOW_SWITCH; + return result; + } + } else { + ASSERT(t, t->is_stopped()) << "handle_signal_event left us in a not-stopped state"; + runnable_state_changed(t, &step_state, &result, rescheduled.by_waitpid); - if (result.status != STEP_CONTINUE || - step_state.continue_type == DONT_CONTINUE) { - return result; - } + if (result.status != STEP_CONTINUE || + step_state.continue_type == DONT_CONTINUE) { + return result; + } - switch (t->ev().type()) { - case EV_DESCHED: - desched_state_changed(t); - break; - case EV_SYSCALL: - syscall_state_changed(t, &step_state); - break; - case EV_SIGNAL: - case EV_SIGNAL_DELIVERY: - if (signal_state_changed(t, &step_state)) { - // t may have been deleted - return result; - } - break; - default: - break; + switch (t->ev().type()) { + case EV_DESCHED: + desched_state_changed(t); + break; + case EV_SYSCALL: + syscall_state_changed(t, &step_state); + break; + case EV_SIGNAL: + case EV_SIGNAL_DELIVERY: + if (signal_state_changed(t, &step_state)) { + // t may have been deleted + return result; + } + break; + default: + break; + } } } diff --git a/src/ReplaySession.cc b/src/ReplaySession.cc index d82616721f4..209d2151de1 100644 --- a/src/ReplaySession.cc +++ b/src/ReplaySession.cc @@ -492,7 +492,8 @@ Completion ReplaySession::cont_syscall_boundary( } else { ResumeRequest resume_how = constraints.is_singlestep() ? RESUME_SYSEMU_SINGLESTEP : RESUME_SYSEMU; - t->resume_execution(resume_how, RESUME_WAIT, ticks_request); + bool ok = t->resume_execution(resume_how, RESUME_WAIT_NO_EXIT, ticks_request); + ASSERT(t, ok) << "Tracee died unexpectedly"; } switch (t->stop_sig()) { @@ -525,7 +526,8 @@ Completion ReplaySession::cont_syscall_boundary( syscall_seccomp_ordering_ = PTRACE_SYSCALL_BEFORE_SECCOMP; } // Eat the following event, either a seccomp or syscall notification - t->resume_execution(RESUME_SYSEMU, RESUME_WAIT, ticks_request); + bool ok = t->resume_execution(RESUME_SYSEMU, RESUME_WAIT_NO_EXIT, ticks_request); + ASSERT(t, ok) << "Tracee died unexpectedly"; } t->apply_syscall_entry_regs(); @@ -726,14 +728,16 @@ Completion ReplaySession::continue_or_step(ReplayTask* t, TicksRequest tick_request, ResumeRequest resume_how) { if (constraints.command == RUN_SINGLESTEP) { - t->resume_execution(RESUME_SINGLESTEP, RESUME_WAIT, tick_request); + bool ok = t->resume_execution(RESUME_SINGLESTEP, RESUME_WAIT_NO_EXIT, tick_request); + ASSERT(t, ok) << "Tracee died unexpectedly"; handle_unrecorded_cpuid_fault(t, constraints); } else if (constraints.command == RUN_SINGLESTEP_FAST_FORWARD) { fast_forward_status |= fast_forward_through_instruction( t, RESUME_SINGLESTEP, constraints.stop_before_states); handle_unrecorded_cpuid_fault(t, constraints); } else { - t->resume_execution(resume_how, RESUME_WAIT, tick_request); + bool ok = t->resume_execution(resume_how, RESUME_WAIT_NO_EXIT, tick_request); + ASSERT(t, ok) << "Tracee died unexpectedly"; if (t->stop_sig() == 0) { auto type = AddressSpace::rr_page_syscall_from_exit_point(t->arch(), t->ip()); if (type && type->traced == AddressSpace::UNTRACED) { @@ -1670,7 +1674,8 @@ static void end_task(ReplayTask* t) { r.set_syscallno(syscall_number_for_exit(t->arch())); t->set_regs(r); // Enter the syscall. - t->resume_execution(RESUME_CONT, RESUME_WAIT, RESUME_NO_TICKS); + bool ok = t->resume_execution(RESUME_CONT, RESUME_WAIT, RESUME_NO_TICKS); + ASSERT(t, ok) << "Tracee died unexpectedly"; if (t->session().done_initial_exec()) { ASSERT(t, t->ptrace_event() == PTRACE_EVENT_EXIT); t->did_handle_ptrace_exit_event(); @@ -2116,7 +2121,9 @@ void ReplaySession::reattach_tasks(ScopedFd new_tracee_socket, ScopedFd new_trac // Get stop events for all tasks for (auto& entry : task_map) { Task* t = entry.second; - t->wait(); + if (!t->wait()) { + FATAL() << "Task " << t->tid << " killed unexpectedly"; + } if (SIGSTOP != t->status().group_stop()) { WaitStatus failed_status = t->status(); FATAL() << "Unexpected stop " << failed_status << " for " << t->tid; diff --git a/src/Scheduler.cc b/src/Scheduler.cc index 8c6754b1b0c..42c9f42ceaa 100644 --- a/src/Scheduler.cc +++ b/src/Scheduler.cc @@ -229,8 +229,10 @@ bool WaitAggregator::try_wait(RecordTask* t) { return false; } LOGM(debug) << "wait on " << t->tid << " returns " << result.status; - t->did_waitpid(result.status); - return true; + // If did_waitpid fails then the task left the stop prematurely + // due to SIGKILL or equivalent, and we should report that we did not get + // a stop. + return t->did_waitpid(result.status); } bool WaitAggregator::try_wait_exit(RecordTask* t) { @@ -245,9 +247,11 @@ bool WaitAggregator::try_wait_exit(RecordTask* t) { options.consume = false; WaitResult result = WaitManager::wait_exit(options); switch (result.code) { - case WAIT_OK: - t->did_waitpid(result.status); + case WAIT_OK: { + bool ok = t->did_waitpid(result.status); + ASSERT(t, ok) << "did_waitpid shouldn't fail for exit statuses"; return true; + } case WAIT_NO_STATUS: // This can happen when the task is in zap_pid_ns_processes waiting for all tasks // in the pid-namespace to exit. It's not in a signal stop, but it's also not @@ -286,7 +290,7 @@ bool Scheduler::is_task_runnable(RecordTask* t, WaitAggregator& wait_aggregator, } LOGM(debug) << "Task event is " << t->ev(); - if (!t->may_be_blocked()) { + if (!t->may_be_blocked() && (t->is_stopped() || t->was_reaped())) { LOGM(debug) << " " << t->tid << " isn't blocked"; if (t->schedule_frozen) { LOGM(debug) << " " << t->tid << " but is frozen"; @@ -303,12 +307,17 @@ bool Scheduler::is_task_runnable(RecordTask* t, WaitAggregator& wait_aggregator, t->emulate_SIGCONT(); // We shouldn't run any user code since there is at least one signal // pending. - t->resume_execution(RESUME_SYSCALL, RESUME_WAIT, RESUME_NO_TICKS); - *by_waitpid = true; - must_run_task = t; - LOGM(debug) << " Got " << t->tid - << " out of emulated stop due to pending SIGCONT"; - return true; + if (t->resume_execution(RESUME_SYSCALL, RESUME_WAIT_NO_EXIT, RESUME_NO_TICKS)) { + *by_waitpid = true; + must_run_task = t; + LOGM(debug) << " Got " << t->tid + << " out of emulated stop due to pending SIGCONT"; + return true; + } + // Tracee exited unexpectedly. Reexamine it now in case it has a new + // status we can use. Note that we cleared `t->emulated_stop_type` + // so we won't end up here again. + return is_task_runnable(t, wait_aggregator, by_waitpid); } else { LOGM(debug) << " " << t->tid << " is stopped by ptrace or signal"; // We have no way to detect a SIGCONT coming from outside the tracees. @@ -346,7 +355,11 @@ bool Scheduler::is_task_runnable(RecordTask* t, WaitAggregator& wait_aggregator, // These syscalls never really block but the kernel may report that // the task is not stopped yet if we pass WNOHANG. To make them // behave predictably, do a blocking wait. - t->wait(); + if (!t->wait()) { + // Task got SIGKILL or equivalent while trying to process the stop. + // Ignore this event and we'll process the new status later. + return false; + } *by_waitpid = true; must_run_task = t; LOGM(debug) << " " << syscall_name(t->ev().Syscall().number, t->arch()) @@ -602,7 +615,11 @@ static RecordTask* find_waited_task(RecordSession& session, pid_t tid, WaitStatu } if (waited->detached_proxy) { - waited->did_waitpid(status); + if (!waited->did_waitpid(status)) { + // Proxy died unexpectedly during the waitpid, just ignore + // the stop. + return nullptr; + } pid_t parent_rec_tid = waited->get_parent_pid(); LOGM(debug) << " ... but it's a detached process."; RecordTask *parent = session.find_task(parent_rec_tid); @@ -696,7 +713,13 @@ Scheduler::Rescheduled Scheduler::reschedule(Switchable switchable) { if (!waited) { continue; } - waited->did_waitpid(status); + if (!waited->did_waitpid(status)) { + // Tracee exited stop prematurely due to SIGKILL or equivalent. + // Pretend the stop didn't happen. + continue; + } + result.by_waitpid = true; + LOGM(debug) << " new status is " << current_->status(); // Another task just became runnable, we're no longer in unlimited // ticks mode unlimited_ticks_mode = false; @@ -714,7 +737,15 @@ Scheduler::Rescheduled Scheduler::reschedule(Switchable switchable) { timeout = elapsed > 0.05 ? 0.0 : 0.05 - elapsed; LOGM(debug) << " But that's not our current task..."; } else { - current_->wait(timeout); + if (current_->wait(timeout)) { + result.by_waitpid = true; + LOGM(debug) << " new status is " << current_->status(); + } else { + // A SIGKILL or equivalent kicked the task out of the stop. + // We are now running towards PTRACE_EVENT_EXIT or zombie status. + // Even though we're PREVENT_SWITCH, we still have to switch. + // The task won't be stopped so this is handled below. + } break; } } @@ -725,11 +756,11 @@ Scheduler::Rescheduled Scheduler::reschedule(Switchable switchable) { strevent(current_->event), 1000.0 * wait_duration); } #endif - result.by_waitpid = true; - LOGM(debug) << " new status is " << current_->status(); } - validate_scheduled_task(); - return result; + if (current_->is_stopped() || current_->was_reaped()) { + validate_scheduled_task(); + return result; + } } unlimited_ticks_mode = false; @@ -891,8 +922,9 @@ Scheduler::Rescheduled Scheduler::reschedule(Switchable switchable) { status.ptrace_event() == PTRACE_EVENT_EXIT || status.reaped()) << "Scheduled task should have been blocked"; - next->did_waitpid(status); - if (in_exec_tgid && next->tgid() != in_exec_tgid) { + if (!next->did_waitpid(status)) { + next = nullptr; + } else if (in_exec_tgid && next->tgid() != in_exec_tgid) { // Some threadgroup is doing execve and this task isn't in // that threadgroup. Don't schedule this task until the execve // is complete. diff --git a/src/Scheduler.h b/src/Scheduler.h index 7534f8540c2..94e8b16e0b7 100644 --- a/src/Scheduler.h +++ b/src/Scheduler.h @@ -106,6 +106,10 @@ class Scheduler { * The new current() task is guaranteed to either have already been * runnable, or have been made runnable by a waitpid status change (in * which case, result.by_waitpid will be true. + * + * After this, if Rescheduled::interrupted_by_signal is false, + * and there is a new current task, its is_stopped() must + * be true. */ struct Rescheduled { bool interrupted_by_signal; diff --git a/src/Task.cc b/src/Task.cc index 8746d940f38..bb962e84098 100644 --- a/src/Task.cc +++ b/src/Task.cc @@ -277,7 +277,8 @@ void Task::finish_emulated_syscall() { // Passing RESUME_NO_TICKS here is not only a small performance optimization, // but also avoids counting an event if the instruction immediately following // a syscall instruction is a conditional branch. - resume_execution(RESUME_SYSCALL, RESUME_WAIT, RESUME_NO_TICKS); + bool ok = resume_execution(RESUME_SYSCALL, RESUME_WAIT_NO_EXIT, RESUME_NO_TICKS); + ASSERT(this, ok) << "Tracee exited unexpectedly"; set_regs(r); wait_status = WaitStatus(); @@ -888,8 +889,10 @@ bool Task::enter_syscall(bool allow_exit) { Session::SECCOMP_BEFORE_PTRACE_SYSCALL; bool need_seccomp_event = seccomp_bpf_enabled; while (need_ptrace_syscall_event || need_seccomp_event) { - resume_execution(need_ptrace_syscall_event ? RESUME_SYSCALL : RESUME_CONT, - RESUME_WAIT, RESUME_NO_TICKS); + if (!resume_execution(need_ptrace_syscall_event ? RESUME_SYSCALL : RESUME_CONT, + RESUME_WAIT_NO_EXIT, RESUME_NO_TICKS)) { + return false; + } if (is_ptrace_seccomp_event()) { ASSERT(this, need_seccomp_event); need_seccomp_event = false; @@ -934,7 +937,9 @@ bool Task::exit_syscall() { Session::PTRACE_SYSCALL_BEFORE_SECCOMP) && !is_ptrace_seccomp_event(); while (true) { - resume_execution(RESUME_SYSCALL, RESUME_WAIT, RESUME_NO_TICKS); + if (!resume_execution(RESUME_SYSCALL, RESUME_WAIT_NO_EXIT, RESUME_NO_TICKS)) { + return false; + } if (will_see_seccomp && is_ptrace_seccomp_event()) { will_see_seccomp = false; continue; @@ -1437,7 +1442,7 @@ void Task::work_around_KNL_string_singlestep_bug() { } } -void Task::resume_execution(ResumeRequest how, WaitRequest wait_how, +bool Task::resume_execution(ResumeRequest how, WaitRequest wait_how, TicksRequest tick_period, int sig) { // Ensure our HW debug registers are up to date before we execute any code. // If this fails because the task died, the code below will detect it. @@ -1497,7 +1502,6 @@ void Task::resume_execution(ResumeRequest how, WaitRequest wait_how, flush_regs(); } - bool detected_exit = false; if (session().is_recording() && !seen_ptrace_exit_event()) { /* There's a nasty race where a stopped task gets woken up by a SIGKILL * and advances to the PTRACE_EXIT_EVENT ptrace-stop just before we @@ -1524,24 +1528,42 @@ void Task::resume_execution(ResumeRequest how, WaitRequest wait_how, result.status.ptrace_event() == PTRACE_EVENT_EXIT || result.status.reaped()) << "got " << result.status; - did_waitpid(result.status); - detected_exit = true; + LOG(debug) << "Task " << tid << " exited unexpectedly with status " + << result.status; + if (did_waitpid(result.status)) { + // We reached a new stop (or actually reaped the task). + // Consider this "resume execution" to be done. + return wait_how != RESUME_WAIT_NO_EXIT; + } + ASSERT(this, result.status.ptrace_event() == PTRACE_EVENT_EXIT) + << "did_waitpid should always succeed for reaped() statuses"; + // The tracee must have been kicked out of PTRACE_EVENT_EXIT + // by a SIGKILL (only possible on older kernels). + // If we were supposed to wait, we've failed. + // We can't wait now because on old kernels tasks can block + // indefinitely even after PTRACE_EVENT_EXIT (e.g. due to coredumping). + // We don't know what state it's in exactly, but registers haven't changed + // since nothing has really happened since the last stop. + set_stopped(false); + return RESUME_NONBLOCKING == wait_how; } } - if (detected_exit) { - LOG(debug) << "Task " << tid << " exited unexpectedly"; - } else { - ASSERT(this, setup_succeeded); - ptrace_if_stopped(how, nullptr, (void*)(uintptr_t)sig); - // If ptrace_if_stopped failed, it means we're running along the - // exit path due to a SIGKILL or equivalent, so just like if it - // succeeded, we are stopped and will receive a wait notification. - set_stopped(false); - extra_registers_known = false; - if (RESUME_WAIT == wait_how) { - wait(); + ASSERT(this, setup_succeeded); + ptrace_if_stopped(how, nullptr, (void*)(uintptr_t)sig); + // If ptrace_if_stopped failed, it means we're running along the + // exit path due to a SIGKILL or equivalent, so just like if it + // succeeded, we will eventually receive a wait notification. + set_stopped(false); + extra_registers_known = false; + if (RESUME_NONBLOCKING != wait_how) { + if (!wait()) { + return false; + } + if (wait_how == RESUME_WAIT_NO_EXIT) { + return ptrace_event() != PTRACE_EVENT_EXIT && !was_reaped(); } } + return true; } void Task::set_regs(const Registers& regs) { @@ -1962,7 +1984,7 @@ bool Task::account_for_potential_ptrace_interrupt_stop(WaitStatus status) { return false; } -void Task::wait(double interrupt_after_elapsed) { +bool Task::wait(double interrupt_after_elapsed) { LOG(debug) << "going into blocking wait for " << tid << " ..."; ASSERT(this, session().is_recording() || interrupt_after_elapsed == -1); @@ -1994,12 +2016,8 @@ void Task::wait(double interrupt_after_elapsed) { /* The process died without us getting a PTRACE_EXIT_EVENT notification. * This is possible if the process receives a SIGKILL while in the exit * event stop, but before we were able to read the event notification. - * We handle this situation by synthesizing an exit event, but otherwise - * going about our regular business. */ - LOG(debug) << "Synthesizing PTRACE_EVENT_EXIT for zombie process " << tid; - result.status = WaitStatus::for_ptrace_event(PTRACE_EVENT_EXIT); - break; + return false; } ASSERT(this, result.code == WAIT_NO_STATUS); } @@ -2010,7 +2028,7 @@ void Task::wait(double interrupt_after_elapsed) { LOG(warn) << " PTRACE_INTERRUPT raced with another event " << result.status; } } - did_waitpid(result.status); + return did_waitpid(result.status); } void Task::canonicalize_regs(SupportedArch syscall_arch) { @@ -2087,11 +2105,11 @@ void Task::set_aarch64_tls_register(uintptr_t val) { we tried to set. */ } -void Task::did_waitpid(WaitStatus status) { +bool Task::did_waitpid(WaitStatus status) { if (is_detached_proxy() && (status.stop_sig() == SIGSTOP || status.stop_sig() == SIGCONT)) { LOG(debug) << "Task " << tid << " is a detached proxy, ignoring status " << status; - return; + return true; } LOG(debug) << " Task " << tid << " changed status to " << status; @@ -2110,7 +2128,7 @@ void Task::did_waitpid(WaitStatus status) { // here is get out quickly and the higher-level function should go ahead // and delete us. wait_status = status; - return; + return true; } LOG(debug) << "Unexpected process reap for " << tid; /* Mark buffers as having been destroyed. We missed our chance @@ -2146,11 +2164,13 @@ void Task::did_waitpid(WaitStatus status) { } else if (status.stop_sig()) { if (!ptrace_if_stopped(PTRACE_GETSIGINFO, nullptr, &pending_siginfo)) { LOG(debug) << "Unexpected process death getting siginfo for " << tid; - status = WaitStatus::for_ptrace_event(PTRACE_EVENT_EXIT); + // Let's pretend this stop never happened. + set_stopped(false); + return false; } } - // An unstable exit can cause a task to exit without us having run it, in + // A SIGKILL or equivalent can cause a task to exit without us having run it, in // which case we might have pending register changes for it that are now // irrelevant. In that case we just throw away our register changes and use // whatever the kernel now has. @@ -2201,7 +2221,12 @@ void Task::did_waitpid(WaitStatus status) { #endif else { LOG(debug) << "Unexpected process death for " << tid; - status = WaitStatus::for_ptrace_event(PTRACE_EVENT_EXIT); + // Let's pretend this stop never happened. + // Note that pending_siginfo may have been overwritten above, + // but in that case we're going to ignore this signal-stop + // so it doesn't matter. + set_stopped(false); + return false; } } } @@ -2297,6 +2322,7 @@ void Task::did_waitpid(WaitStatus status) { } did_wait(); + return true; } template @@ -2398,7 +2424,8 @@ Task* Task::clone(CloneReason reason, int flags, remote_ptr stack, // wait() before trying to do anything that might need to // use ptrace to access memory - t->wait(); + bool ok = t->wait(); + ASSERT(t, ok) << "Task " << t->tid << " killed unexpectedly; not sure how to handle this"; t->post_wait_clone(this, flags); if (CLONE_SHARE_THREAD_GROUP & flags) { @@ -3615,7 +3642,9 @@ long Task::ptrace_seize(pid_t tid, Session& session) { sa.sa_flags = 0; // No SA_RESTART, so waitpid() will be interrupted sigaction(SIGALRM, &sa, nullptr); - t->wait(); + if (!t->wait()) { + FATAL() << "Tracee was killed via SIGKILL before reaching SIGSTOP"; + } if (t->ptrace_event() == PTRACE_EVENT_EXIT) { t->proceed_to_exit(); FATAL() << "Tracee died before reaching SIGSTOP\n" @@ -4017,8 +4046,10 @@ void Task::dup_from(Task *other) { /** * Proceeds until the next system call, which is being executed. + * Returns false if did_waitpid failed because the task got SIGKILL + * or equivalent. */ -static void __ptrace_cont(Task* t, ResumeRequest resume_how, +static bool __ptrace_cont(Task* t, ResumeRequest resume_how, SupportedArch syscall_arch, int expect_syscallno, int expect_syscallno2 = -1, pid_t new_tid = -1) { t->resume_execution(resume_how, RESUME_NONBLOCKING, RESUME_NO_TICKS); @@ -4041,7 +4072,9 @@ static void __ptrace_cont(Task* t, ResumeRequest resume_how, t->hpc.set_tid(new_tid); t->tid = new_tid; } - t->did_waitpid(result.status); + if (!t->did_waitpid(result.status)) { + return false; + } if (ReplaySession::is_ignored_signal(t->status().stop_sig())) { t->resume_execution(resume_how, RESUME_NONBLOCKING, RESUME_NO_TICKS); @@ -4060,6 +4093,7 @@ static void __ptrace_cont(Task* t, ResumeRequest resume_how, current_syscall == expect_syscallno2) << "Should be at " << syscall_name(expect_syscallno, syscall_arch) << ", but instead at " << syscall_name(current_syscall, syscall_arch); + return true; } void Task::did_handle_ptrace_exit_event() { @@ -4123,9 +4157,10 @@ void Task::os_exec(SupportedArch exec_arch, std::string filename) * tid, no matter what tid it was before. */ pid_t tgid = real_tgid(); - __ptrace_cont(this, RESUME_SYSCALL, arch(), expect_syscallno, - syscall_number_for_execve(exec_arch), - tgid == tid ? -1 : tgid); + bool ok = __ptrace_cont(this, RESUME_SYSCALL, arch(), expect_syscallno, + syscall_number_for_execve(exec_arch), + tgid == tid ? -1 : tgid); + ASSERT(this, ok) << "Task " << tid << " got killed while trying to exec"; LOG(debug) << this->status() << " " << this->regs(); if (this->regs().syscall_result()) { errno = -this->regs().syscall_result(); @@ -4166,7 +4201,7 @@ void Task::tgkill(int sig) { ASSERT(this, 0 == syscall(SYS_tgkill, real_tgid(), tid, sig)); } -void Task::move_to_signal_stop() +bool Task::move_to_signal_stop() { LOG(debug) << " maybe not in signal-stop (status " << status() << "); doing tgkill(SYSCALLBUF_DESCHED_SIGNAL)"; @@ -4194,7 +4229,9 @@ void Task::move_to_signal_stop() old_ip = old_ip.decrement_by_syscall_insn_length(arch()); } do { - resume_execution(RESUME_SINGLESTEP, RESUME_WAIT, RESUME_NO_TICKS); + if (!resume_execution(RESUME_SINGLESTEP, RESUME_WAIT_NO_EXIT, RESUME_NO_TICKS)) { + return false; + } ASSERT(this, old_ip == ip()) << "Singlestep actually advanced when we " << "just expected a signal; was at " << old_ip << " now at " @@ -4202,6 +4239,7 @@ void Task::move_to_signal_stop() // Ignore any pending TIME_SLICE_SIGNALs and continue until we get our // SYSCALLBUF_DESCHED_SIGNAL. } while (stop_sig() == PerfCounters::TIME_SLICE_SIGNAL); + return true; } bool Task::should_apply_rseq_abort(EventType event_type, remote_code_ptr* new_ip, diff --git a/src/Task.h b/src/Task.h index 8a20bfb071b..62f3a53758d 100644 --- a/src/Task.h +++ b/src/Task.h @@ -78,11 +78,14 @@ enum ResumeRequest { RESUME_SYSEMU_SINGLESTEP = NativeArch::PTRACE_SYSEMU_SINGLESTEP, }; enum WaitRequest { + // Don't wait after resuming. + RESUME_NONBLOCKING, // After resuming, blocking-waitpid() until tracee status // changes. RESUME_WAIT, - // Don't wait after resuming. - RESUME_NONBLOCKING + // Like RESUME_WAIT, but we're not expecting a PTRACE_EVENT_EXIT + // or reap, so return false also in that case. + RESUME_WAIT_NO_EXIT }; enum TicksRequest { // We don't expect to see any ticks (though we seem to on the odd buggy @@ -254,11 +257,17 @@ class Task { pid_t pid_of_pidfd(int fd); /** - * Force the wait status of this to |status|, as if - * |wait()/try_wait()| had returned it. Call this whenever a waitpid - * returned activity for this past. + * Records the wait status of this task as |status|, e.g. if + * |wait()/try_wait()| has returned it. Call this whenever a waitpid + * returned activity for this task. + * If this returns false, then the task was kicked out of a ptrace-stop + * by SIGKILL or equivalent before we could read registers etc. + * We will treat this stop as if it never happened; the caller must + * act as if there was no stop. + * If `status.reaped()` (i.e. fatal signal or normal exit), this always + * returns true. */ - void did_waitpid(WaitStatus status); + bool did_waitpid(WaitStatus status); /** * Syscalls have side effects on registers (e.g. setting the flags register). @@ -575,8 +584,12 @@ class Task { * after that many seconds have elapsed. * * All tracee execution goes through here. + * + * If `wait_how` == RESUME_WAIT and we don't complete a + * did_waitpid() (e.g. because the tracee was SIGKILLed or + * equivalent), this returns false. */ - void resume_execution(ResumeRequest how, WaitRequest wait_how, + bool resume_execution(ResumeRequest how, WaitRequest wait_how, TicksRequest tick_period, int sig = 0); /** Return the session this is part of. */ @@ -712,8 +725,12 @@ class Task { * with the process in a stopped() state. If interrupt_after_elapsed >= 0, * interrupt the task after that many seconds have elapsed. If * interrupt_after_elapsed == 0.0, the interrupt will happen immediately. + * Returns false if the wait failed because we reached a stop but we got + * SIGKILLed (or equivalent) out of it, in which case it is not safe to wait + * because that might block indefinitely waiting for us to acknowledge the + * PTRACE_EVENT_EXIT of other tasks. */ - void wait(double interrupt_after_elapsed = -1); + bool wait(double interrupt_after_elapsed = -1); /** * Currently we don't allow recording across uid changes, so we can @@ -1079,8 +1096,9 @@ class Task { /** * Try to move this task to a signal stop by signaling it with the * syscallbuf desched signal (which is guaranteed not to be blocked). + * Returns false if the task exited unexpectedly. */ - void move_to_signal_stop(); + bool move_to_signal_stop(); // A map from original table to (potentially detached) clone, to preserve // FdTable sharing relationships during a session fork. diff --git a/src/fast_forward.cc b/src/fast_forward.cc index 9e77ccdf8f9..1eeeb86dc1e 100644 --- a/src/fast_forward.cc +++ b/src/fast_forward.cc @@ -168,7 +168,8 @@ FastForwardStatus fast_forward_through_instruction(Task* t, ResumeRequest how, remote_code_ptr ip = t->ip(); - t->resume_execution(how, RESUME_WAIT, RESUME_UNLIMITED_TICKS); + bool ok = t->resume_execution(how, RESUME_WAIT_NO_EXIT, RESUME_UNLIMITED_TICKS); + ASSERT(t, ok) << "Tracee was killed"; if (t->stop_sig() != SIGTRAP) { // we might have stepped into a system call... return result; @@ -310,7 +311,8 @@ FastForwardStatus fast_forward_through_instruction(Task* t, ResumeRequest how, // So, disable watchpoints temporarily. t->vm()->save_watchpoints(); t->vm()->remove_all_watchpoints(); - t->resume_execution(RESUME_CONT, RESUME_WAIT, RESUME_UNLIMITED_TICKS); + ok = t->resume_execution(RESUME_CONT, RESUME_WAIT_NO_EXIT, RESUME_UNLIMITED_TICKS); + ASSERT(t, ok) << "Tracee was killed"; t->vm()->restore_watchpoints(); t->vm()->remove_breakpoint(limit_ip, BKPT_INTERNAL); result.did_fast_forward = true; diff --git a/src/record_signal.cc b/src/record_signal.cc index 75d9197b5cf..4178c064811 100644 --- a/src/record_signal.cc +++ b/src/record_signal.cc @@ -518,7 +518,11 @@ static void handle_desched_event(RecordTask* t) { // syscall may have re-armed the event. disarm_desched_event(t); - t->resume_execution(RESUME_SYSCALL, RESUME_WAIT, RESUME_UNLIMITED_TICKS); + if (!t->resume_execution(RESUME_SYSCALL, RESUME_WAIT_NO_EXIT, RESUME_UNLIMITED_TICKS)) { + LOG(debug) << " (got exit, bailing out)"; + t->push_event(Event::noop()); + return; + } if (t->status().is_syscall()) { t->apply_syscall_entry_regs(); @@ -537,12 +541,6 @@ static void handle_desched_event(RecordTask* t) { // seccomp filter. break; } - if (t->ptrace_event() == PTRACE_EVENT_EXIT) { - LOG(debug) - << " (got exit, bailing out)"; - t->push_event(Event::noop()); - return; - } // Completely ignore spurious desched signals and // signals that aren't going to be delivered to the diff --git a/src/record_syscall.cc b/src/record_syscall.cc index 11e02d6778d..442ed9fb7f2 100644 --- a/src/record_syscall.cc +++ b/src/record_syscall.cc @@ -3355,7 +3355,10 @@ static Switchable prepare_clone(RecordTask* t, TaskSyscallState& syscall_state) } while (true) { - t->resume_execution(RESUME_SYSCALL, RESUME_WAIT, RESUME_NO_TICKS); + if (!t->resume_execution(RESUME_SYSCALL, RESUME_WAIT_NO_EXIT, RESUME_NO_TICKS)) { + // Tracee died unexpectedly during clone. + return ALLOW_SWITCH; + } // XXX handle stray signals? if (t->ptrace_event()) { break; @@ -3366,7 +3369,7 @@ static Switchable prepare_clone(RecordTask* t, TaskSyscallState& syscall_state) LOG(debug) << "clone failed, returning " << errno_name(-t->regs().syscall_result_signed()); syscall_state.emulate_result(t->regs().syscall_result()); - // clone failed and we're existing the syscall with an error. Reenter + // clone failed and we're exiting the syscall with an error. Reenter // the syscall so that we're in the same state as the normal execution // path. t->ev().Syscall().failed_during_preparation = true; diff --git a/src/replay_syscall.cc b/src/replay_syscall.cc index 473e5d71331..fdf4e534ecd 100644 --- a/src/replay_syscall.cc +++ b/src/replay_syscall.cc @@ -218,7 +218,8 @@ template static void prepare_clone(ReplayTask* t) { Registers entry_regs = r; // Run; we will be interrupted by PTRACE_EVENT_CLONE/FORK/VFORK. - t->resume_execution(RESUME_CONT, RESUME_WAIT, RESUME_NO_TICKS); + bool ok = t->resume_execution(RESUME_CONT, RESUME_WAIT_NO_EXIT, RESUME_NO_TICKS); + ASSERT(t, ok) << "Tracee was killed"; pid_t new_tid; while (!t->clone_syscall_is_complete(&new_tid, Arch::arch())) { @@ -227,7 +228,8 @@ template static void prepare_clone(ReplayTask* t) { // state to try the syscall again. ASSERT(t, t->regs().syscall_result_signed() == -EAGAIN); t->set_regs(entry_regs); - t->resume_execution(RESUME_CONT, RESUME_WAIT, RESUME_NO_TICKS); + bool ok = t->resume_execution(RESUME_CONT, RESUME_WAIT_NO_EXIT, RESUME_NO_TICKS); + ASSERT(t, ok) << "Tracee was killed"; } // Get out of the syscall