From 5970a8801ce0175f57867db9c4872843b58c1a1b Mon Sep 17 00:00:00 2001
From: Robert O'Callahan <robert@ocallahan.org>
Date: Wed, 23 Aug 2023 07:34:49 +1200
Subject: [PATCH] When `Task::did_waitpid` can't get registers for a stop
 because we prematurely exited the stop, just ignore the stop.

This makes our state tracking more accurate. Before, when a task is booted out of a ptrace-stop via SIGKILL or equivalent while we're reading its state in `Task::did_waitpid`, we would treat the task as as if it were still stopped, even though now it is either running in the kernel towards PTRACE_EVENT_EXIT or reap, or else stopped in a new PTRACE_EVENT_EXIT stop which we will see in the next wait(). We would cover up the discrepancy by changing the stop to a fake PTRACE_EVENT_EXIT. Instead we now "tell the truth" that the task is not known to be stopped and instead we should wait for its next stop.

The downside is that this requires us to propagate the fact that the task is not actually stopped to various callers, including the callers of `Task::resume_execution`. This is annoying, but it's also helpful to consider how these callers should behave when a task is booted out of a ptrace-stop by SIGKILL and is no longer stopped. It's complex and hard to test, but the underlying kernel behavior is the problem.
---
 src/AutoRemoteSyscalls.cc |  67 ++++++++++++-------
 src/DiversionSession.cc   |  16 +++--
 src/Monkeypatcher.cc      |   9 ++-
 src/RecordSession.cc      | 126 ++++++++++++++++++++---------------
 src/Registers.cc          |  15 +++++
 src/Registers.h           |   2 +
 src/ReplaySession.cc      |  32 ++++-----
 src/Scheduler.cc          |  74 +++++++++++++++------
 src/Scheduler.h           |   4 ++
 src/Task.cc               | 135 +++++++++++++++++++++++++-------------
 src/Task.h                |  45 ++++++++++---
 src/fast_forward.cc       |   6 +-
 src/record_signal.cc      |  12 ++--
 src/record_syscall.cc     |   7 +-
 src/replay_syscall.cc     |   6 +-
 15 files changed, 365 insertions(+), 191 deletions(-)

diff --git a/src/AutoRemoteSyscalls.cc b/src/AutoRemoteSyscalls.cc
index 7c5ad33b7f4..0fb3c927350 100644
--- a/src/AutoRemoteSyscalls.cc
+++ b/src/AutoRemoteSyscalls.cc
@@ -270,7 +270,11 @@ void AutoRemoteSyscalls::restore_state_to(Task* t) {
     regs.set_arg1(regs.orig_arg1());
     t->set_regs(regs);
     if (t->enter_syscall(true)) {
-      t->resume_execution(RESUME_SYSCALL, RESUME_WAIT, RESUME_NO_TICKS);
+      if (!t->resume_execution(RESUME_SYSCALL, RESUME_WAIT_NO_EXIT, RESUME_NO_TICKS)) {
+        // Tracee died unexpectedly, there is nothing more we can do.
+        // Do not restore the status, we want callers to see that the task died.
+        return;
+      }
     }
     regs.set_ip(initial_ip);
     regs.set_syscallno(regs.original_syscallno());
@@ -286,7 +290,11 @@ void AutoRemoteSyscalls::restore_state_to(Task* t) {
     t->set_regs(regs);
     RecordTask* rt = static_cast<RecordTask*>(t);
     while (true) {
-      rt->resume_execution(RESUME_CONT, RESUME_WAIT, RESUME_NO_TICKS);
+      if (!rt->resume_execution(RESUME_CONT, RESUME_WAIT_NO_EXIT, RESUME_NO_TICKS)) {
+        // Tracee died unexpectedly, there is nothing more we can do.
+        // Do not restore the status, we want callers to see that the task died.
+        return;
+      }
       if (rt->ptrace_event())
         break;
       rt->stash_sig();
@@ -365,17 +373,18 @@ long AutoRemoteSyscalls::syscall_base(int syscallno, Registers& callregs) {
   bool from_seccomp = initial_at_seccomp && t->ptrace_event() == PTRACE_EVENT_SECCOMP;
   if (use_singlestep_path && !from_seccomp) {
     while (true) {
-      t->resume_execution(RESUME_SINGLESTEP, RESUME_WAIT, RESUME_NO_TICKS);
+      if (!t->resume_execution(RESUME_SINGLESTEP, RESUME_WAIT_NO_EXIT, RESUME_NO_TICKS)) {
+        // Tracee was killed, there is nothing more we can do.
+        // Ensure callers see the task death status.
+        restore_wait_status = t->status();
+        return -ESRCH;
+      }
       LOG(debug) << "Used singlestep path; status=" << t->status();
       // When a PTRACE_EVENT_EXIT is returned we don't update registers
       if (t->ip() != callregs.ip()) {
         // We entered the syscall, so stop now
         break;
       }
-      if (t->ptrace_event() == PTRACE_EVENT_EXIT) {
-        // We died, just let it be
-        break;
-      }
       if (t->stop_sig() == SIGTRAP && t->get_siginfo().si_code == TRAP_TRACE) {
         // On aarch64, if we were previously in a syscall-exit stop, continuing
         // with PTRACE_SINGLESTEP will result in incurring a trap upon execution
@@ -389,26 +398,26 @@ long AutoRemoteSyscalls::syscall_base(int syscallno, Registers& callregs) {
       ASSERT(t, false) << "Unexpected status " << t->status();
     }
   } else {
-    bool exited = false;
     if (from_seccomp) {
       LOG(debug) << "Skipping enter_syscall - already at seccomp stop";
     } else {
-      exited = !t->enter_syscall(true);
+      if (!t->enter_syscall(true)) {
+        // Tracee was killed, there is nothing more we can do.
+        // Ensure callers see the task death status.
+        restore_wait_status = t->status();
+        return -ESRCH;
+      }
       LOG(debug) << "Used enter_syscall; status=" << t->status();
     }
-    // proceed to syscall exit
-    if (!exited) {
-      t->resume_execution(RESUME_SYSCALL, RESUME_WAIT, RESUME_NO_TICKS);
-      LOG(debug) << "syscall exit status=" << t->status();
-    }
-  }
-  while (true) {
-    // If the syscall caused the task to exit, just stop now with that status.
-    if (t->seen_ptrace_exit_event() || t->status().reaped()) {
+    if (!t->resume_execution(RESUME_SYSCALL, RESUME_WAIT_NO_EXIT, RESUME_NO_TICKS)) {
+      // Tracee was killed, there is nothing more we can do.
+      // Ensure callers see the task death status.
       restore_wait_status = t->status();
-      LOG(debug) << "Task is dying, no status result";
       return -ESRCH;
     }
+    LOG(debug) << "syscall exit status=" << t->status();
+  }
+  while (true) {
     if (t->status().is_syscall() ||
         (t->stop_sig() == SIGTRAP &&
          is_kernel_trap(t->get_siginfo().si_code))) {
@@ -418,19 +427,31 @@ long AutoRemoteSyscalls::syscall_base(int syscallno, Registers& callregs) {
     }
     if (is_clone_syscall(syscallno, t->arch()) &&
         t->clone_syscall_is_complete(&new_tid_, t->arch())) {
-      t->resume_execution(RESUME_SYSCALL, RESUME_WAIT, RESUME_NO_TICKS);
+      if (!t->resume_execution(RESUME_SYSCALL, RESUME_WAIT_NO_EXIT, RESUME_NO_TICKS)) {
+        // Tracee was killed, there is nothing more we can do.
+        // Ensure callers see the task death status.
+        restore_wait_status = t->status();
+        return -ESRCH;
+      }
       LOG(debug) << "got clone event; new status=" << t->status();
       continue;
     }
     if (ignore_signal(t)) {
       if (t->regs().syscall_may_restart()) {
         if (!t->enter_syscall(true)) {
-          // We died, just let it be
-          break;
+          // Tracee was killed, there is nothing more we can do.
+          // Ensure callers see the task death status.
+          restore_wait_status = t->status();
+          return -ESRCH;
         }
         LOG(debug) << "signal ignored; restarting syscall, status="
                    << t->status();
-        t->resume_execution(RESUME_SYSCALL, RESUME_WAIT, RESUME_NO_TICKS);
+        if (!t->resume_execution(RESUME_SYSCALL, RESUME_WAIT_NO_EXIT, RESUME_NO_TICKS)) {
+          // Tracee was killed, there is nothing more we can do.
+          // Ensure callers see the task death status.
+          restore_wait_status = t->status();
+          return -ESRCH;
+        }
         LOG(debug) << "syscall exit status=" << t->status();
         continue;
       }
diff --git a/src/DiversionSession.cc b/src/DiversionSession.cc
index 53eb5d295a0..93cad2d5e24 100644
--- a/src/DiversionSession.cc
+++ b/src/DiversionSession.cc
@@ -198,16 +198,20 @@ DiversionSession::DiversionResult DiversionSession::diversion_step(
 
   while (true) {
     switch (command) {
-      case RUN_CONTINUE:
+      case RUN_CONTINUE: {
         LOG(debug) << "Continuing to next syscall";
-        t->resume_execution(RESUME_SYSEMU, RESUME_WAIT, RESUME_UNLIMITED_TICKS,
-                            signal_to_deliver);
+        bool ok = t->resume_execution(RESUME_SYSEMU, RESUME_WAIT,
+                                      RESUME_UNLIMITED_TICKS, signal_to_deliver);
+        ASSERT(t, ok) << "Tracee was killed unexpectedly";
         break;
-      case RUN_SINGLESTEP:
+      }
+      case RUN_SINGLESTEP: {
         LOG(debug) << "Stepping to next insn/syscall";
-        t->resume_execution(RESUME_SYSEMU_SINGLESTEP, RESUME_WAIT,
-                           RESUME_UNLIMITED_TICKS, signal_to_deliver);
+        bool ok = t->resume_execution(RESUME_SYSEMU_SINGLESTEP, RESUME_WAIT,
+                                      RESUME_UNLIMITED_TICKS, signal_to_deliver);
+        ASSERT(t, ok) << "Tracee was killed unexpectedly";
         break;
+      }
       default:
         FATAL() << "Illegal run command " << command;
     }
diff --git a/src/Monkeypatcher.cc b/src/Monkeypatcher.cc
index 7f7f29ae717..b12cd2eefaa 100644
--- a/src/Monkeypatcher.cc
+++ b/src/Monkeypatcher.cc
@@ -1218,7 +1218,9 @@ bool Monkeypatcher::try_patch_syscall_aarch64(RecordTask* t, bool entering_sysca
   auto success = patch_syscall_with_hook(*this, t, syscall_hooks[0], ip, 4, 0);
   if (!success && entering_syscall) {
     // Need to reenter the syscall to undo exit_syscall_and_prepare_restart
-    t->enter_syscall();
+    if (!t->enter_syscall()) {
+      return false;
+    }
   }
 
   if (!success) {
@@ -1284,6 +1286,11 @@ bool Monkeypatcher::try_patch_syscall(RecordTask* t, bool entering_syscall) {
   // We want our mmap records to be associated with the next (PATCH_SYSCALL)
   // event, not a FLUSH_SYSCALLBUF event.
   t->maybe_flush_syscallbuf();
+  if (!t->is_stopped()) {
+    // Tracee was unexpectedly kicked out of a ptrace-stop by SIGKILL or
+    // equivalent. Abort trying to patch.
+    return false;
+  }
 
   if (arch == aarch64) {
     return try_patch_syscall_aarch64(t, entering_syscall);
diff --git a/src/RecordSession.cc b/src/RecordSession.cc
index 5c823925ddd..c419c185d81 100644
--- a/src/RecordSession.cc
+++ b/src/RecordSession.cc
@@ -381,8 +381,8 @@ void RecordSession::handle_seccomp_traced_syscall(RecordTask* t,
     t->set_regs(regs);
     t->vm()->add_breakpoint(ret_addr, BKPT_INTERNAL);
     while (true) {
-      t->resume_execution(RESUME_SYSCALL, RESUME_WAIT, RESUME_NO_TICKS);
-      if (t->ptrace_event() == PTRACE_EVENT_EXIT) {
+      if (!t->resume_execution(RESUME_SYSCALL, RESUME_WAIT_NO_EXIT, RESUME_NO_TICKS)) {
+        // Tracee exited unexpectedly
         return;
       }
       ASSERT(t, !t->ptrace_event());
@@ -439,7 +439,12 @@ void RecordSession::handle_seccomp_traced_syscall(RecordTask* t,
       Registers r = orig_regs;
       r.set_original_syscallno(syscall_number_for_gettid(t->arch()));
       t->set_regs(r);
-      t->resume_execution(RESUME_SYSCALL, RESUME_WAIT, RESUME_NO_TICKS);
+      if (!t->resume_execution(RESUME_SYSCALL, RESUME_WAIT_NO_EXIT, RESUME_NO_TICKS)) {
+        // Tracee died unexpectedly. We did not enter a syscall.
+        // We shouldn't try to resume it now.
+        step_state->continue_type = RecordSession::DONT_CONTINUE;
+        return;
+      }
       t->set_regs(orig_regs);
     }
 
@@ -602,7 +607,8 @@ static void handle_seccomp_trap(RecordTask* t,
     // The tracee is currently in the seccomp ptrace-stop. Advance it to the
     // syscall-exit stop so that when we try to deliver the SIGSYS via
     // PTRACE_SINGLESTEP, that doesn't trigger a SIGTRAP stop.
-    t->resume_execution(RESUME_SYSCALL, RESUME_WAIT, RESUME_NO_TICKS);
+    // If this fails, that's fine, we're not going to deliver the SIGSYS.
+    t->resume_execution(RESUME_SYSCALL, RESUME_WAIT_NO_EXIT, RESUME_NO_TICKS);
   }
 
   // Don't continue yet. At the next iteration of record_step, if we
@@ -719,6 +725,7 @@ bool RecordSession::handle_ptrace_event(RecordTask** t_ptr,
             // to the PTRACE_EVENT_EXIT. This avoids the race where our
             // PTRACE_CONT might kick us out of the PTRACE_EVENT_EXIT before
             // we can process it.
+            // If this fails because of *another* SIGKILL that's fine.
             t->wait();
             break;
           default:
@@ -761,7 +768,13 @@ bool RecordSession::handle_ptrace_event(RecordTask** t_ptr,
         *t_ptr = t;
         // Tell t that it is actually stopped, because the stop we got is really
         // for this task, not the old dead task.
-        t->did_waitpid(status);
+        if (!t->did_waitpid(status)) {
+          // This is totally untested and almost certainly broken, but if the
+          // task was SIGKILLed out of the EXEC stop then we should probably
+          // just pretend the exec never happened.
+          step_state->continue_type = CONTINUE_SYSCALL;
+          break;
+        }
       }
       t->post_exec();
       t->session().scheduler().did_exit_execve(t);
@@ -929,8 +942,7 @@ static void advance_to_disarm_desched_syscall(RecordTask* t) {
   /* TODO: send this through main loop. */
   /* TODO: mask off signals and avoid this loop. */
   do {
-    t->resume_execution(RESUME_SYSCALL, RESUME_WAIT, RESUME_UNLIMITED_TICKS);
-    if (t->seen_ptrace_exit_event()) {
+    if (!t->resume_execution(RESUME_SYSCALL, RESUME_WAIT_NO_EXIT, RESUME_UNLIMITED_TICKS)) {
       return;
     }
     if (t->status().is_syscall()) {
@@ -968,8 +980,8 @@ static void advance_to_disarm_desched_syscall(RecordTask* t) {
     }
   } while (!t->is_disarm_desched_event_syscall());
 
-  // Exit the syscall.
-  t->resume_execution(RESUME_SYSCALL, RESUME_WAIT, RESUME_NO_TICKS);
+  // Exit the syscall. If this fails, that's fine, we can ignore it.
+  t->resume_execution(RESUME_SYSCALL, RESUME_WAIT_NO_EXIT, RESUME_NO_TICKS);
 }
 
 /**
@@ -1127,10 +1139,13 @@ void RecordSession::syscall_state_changed(RecordTask* t,
         Registers orig_regs = r;
         r.set_original_syscallno(-1);
         t->set_regs(r);
-        t->resume_execution(RESUME_SYSCALL, RESUME_WAIT, RESUME_NO_TICKS);
-        ASSERT(t, t->ip() == r.ip());
-        t->set_regs(orig_regs);
-        maybe_trigger_emulated_ptrace_syscall_exit_stop(t);
+        // If this fails because of premature exit, don't mess with the
+        // task anymore.
+        if (t->resume_execution(RESUME_SYSCALL, RESUME_WAIT_NO_EXIT, RESUME_NO_TICKS)) {
+          ASSERT(t, t->ip() == r.ip());
+          t->set_regs(orig_regs);
+          maybe_trigger_emulated_ptrace_syscall_exit_stop(t);
+        }
         return;
       }
       last_task_switchable = PREVENT_SWITCH;
@@ -1465,9 +1480,7 @@ static bool preinject_signal(RecordTask* t) {
      */
     LOG(debug) << "    maybe not in signal-stop (status " << t->status()
                << "); doing tgkill(SYSCALLBUF_DESCHED_SIGNAL)";
-    t->move_to_signal_stop();
-
-    if (t->status().ptrace_event() == PTRACE_EVENT_EXIT) {
+    if (!t->move_to_signal_stop()) {
       /* We raced with an exit (e.g. due to a pending SIGKILL). */
       return false;
     }
@@ -1488,7 +1501,7 @@ static bool preinject_signal(RecordTask* t) {
 /**
  * Returns true if the signal should be delivered.
  * Returns false if this signal should not be delivered because another signal
- * occurred during delivery.
+ * occurred during delivery or there was a premature exit.
  * Must call t->stashed_signal_processed() once we're ready to unmask signals.
  */
 static bool inject_handled_signal(RecordTask* t) {
@@ -1506,7 +1519,9 @@ static bool inject_handled_signal(RecordTask* t) {
     // XXX we assume the kernel won't respond by notifying us of a different
     // signal. We don't want to do this with signals blocked because that will
     // save a bogus signal mask in the signal frame.
-    t->resume_execution(RESUME_SINGLESTEP, RESUME_WAIT, RESUME_NO_TICKS, sig);
+    if (!t->resume_execution(RESUME_SINGLESTEP, RESUME_WAIT_NO_EXIT, RESUME_NO_TICKS, sig)) {
+      return false;
+    }
     // Signal injection can change the sigmask due to sa_mask effects, lack of
     // SA_NODEFER, and signal frame construction triggering a synchronous
     // SIGSEGV.
@@ -1957,13 +1972,12 @@ bool RecordSession::process_syscall_entry(RecordTask* t, StepState* step_state,
         t->record_event(Event::patch_syscall());
         return true;
       }
-    }
-
-    if (t->ptrace_event() == PTRACE_EVENT_EXIT) {
-      // task exited while we were trying to patch it.
-      // Make sure that this exit event gets processed
-      step_state->continue_type = DONT_CONTINUE;
-      return false;
+      if (!t->is_stopped()) {
+        // task exited while we were trying to patch it.
+        // Make sure that this exit event gets processed
+        step_state->continue_type = DONT_CONTINUE;
+        return false;
+      }
     }
 
     t->push_event(SyscallEvent(t->regs().original_syscallno(), syscall_arch));
@@ -2549,6 +2563,8 @@ RecordSession::RecordResult RecordSession::record_step() {
 
   StepState step_state(CONTINUE);
 
+  ASSERT(t, t->is_stopped()) << "Somehow we're not stopped here; status="
+    << t->status();
   bool did_enter_syscall;
   if (rescheduled.by_waitpid &&
       handle_ptrace_event(&t, &step_state, &result, &did_enter_syscall)) {
@@ -2561,37 +2577,41 @@ RecordSession::RecordResult RecordSession::record_step() {
     if (did_enter_syscall && t->ev().type() == EV_SYSCALL) {
       syscall_state_changed(t, &step_state);
     }
-  } else if (rescheduled.by_waitpid && handle_signal_event(t, &step_state)) {
-    // Tracee may have exited while processing descheds; handle that.
-    if (handle_ptrace_exit_event(t)) {
-      // t may have been deleted.
-      last_task_switchable = ALLOW_SWITCH;
-      return result;
-    }
   } else {
-    runnable_state_changed(t, &step_state, &result, rescheduled.by_waitpid);
+    ASSERT(t, t->is_stopped()) << "handle_ptrace_event left us in a not-stopped state";
+    if (rescheduled.by_waitpid && handle_signal_event(t, &step_state)) {
+      // Tracee may have exited while processing descheds; handle that.
+      if (handle_ptrace_exit_event(t)) {
+        // t may have been deleted.
+        last_task_switchable = ALLOW_SWITCH;
+        return result;
+      }
+    } else {
+      ASSERT(t, t->is_stopped()) << "handle_signal_event left us in a not-stopped state";
+      runnable_state_changed(t, &step_state, &result, rescheduled.by_waitpid);
 
-    if (result.status != STEP_CONTINUE ||
-        step_state.continue_type == DONT_CONTINUE) {
-      return result;
-    }
+      if (result.status != STEP_CONTINUE ||
+          step_state.continue_type == DONT_CONTINUE) {
+        return result;
+      }
 
-    switch (t->ev().type()) {
-      case EV_DESCHED:
-        desched_state_changed(t);
-        break;
-      case EV_SYSCALL:
-        syscall_state_changed(t, &step_state);
-        break;
-      case EV_SIGNAL:
-      case EV_SIGNAL_DELIVERY:
-        if (signal_state_changed(t, &step_state)) {
-          // t may have been deleted
-          return result;
-        }
-        break;
-      default:
-        break;
+      switch (t->ev().type()) {
+        case EV_DESCHED:
+          desched_state_changed(t);
+          break;
+        case EV_SYSCALL:
+          syscall_state_changed(t, &step_state);
+          break;
+        case EV_SIGNAL:
+        case EV_SIGNAL_DELIVERY:
+          if (signal_state_changed(t, &step_state)) {
+            // t may have been deleted
+            return result;
+          }
+          break;
+        default:
+          break;
+      }
     }
   }
 
diff --git a/src/Registers.cc b/src/Registers.cc
index 10a493b02d4..4038b192c34 100644
--- a/src/Registers.cc
+++ b/src/Registers.cc
@@ -708,4 +708,19 @@ ostream& operator<<(ostream& stream, const Registers& r) {
   return stream;
 }
 
+void Registers::emulate_syscall_entry() {
+  set_original_syscallno(syscallno());
+  set_orig_arg1(arg1());
+  /**
+   * The aarch64 kernel has a quirk where if the syscallno is -1 (and only -1),
+   * it will apply the -ENOSYS result before any ptrace entry stop.
+   * On x86, this happens unconditionally for every syscall, but there the
+   * result isn't shared with arg1, and we usually don't care because we have
+   * access to original_syscallno.
+   */
+  if (is_x86ish(arch()) || (arch() == aarch64 && syscallno() == -1)) {
+    set_syscall_result(-ENOSYS);
+  }
+}
+
 } // namespace rr
diff --git a/src/Registers.h b/src/Registers.h
index fec656dfefb..d4da8151536 100644
--- a/src/Registers.h
+++ b/src/Registers.h
@@ -548,6 +548,8 @@ class Registers {
     return !(*this == other);
   }
 
+  void emulate_syscall_entry();
+
 private:
   template <typename Arch>
   void print_register_file_arch(FILE* f, const char* formats[]) const;
diff --git a/src/ReplaySession.cc b/src/ReplaySession.cc
index dc97a35866e..263a245fec0 100644
--- a/src/ReplaySession.cc
+++ b/src/ReplaySession.cc
@@ -492,7 +492,8 @@ Completion ReplaySession::cont_syscall_boundary(
   } else {
     ResumeRequest resume_how =
         constraints.is_singlestep() ? RESUME_SYSEMU_SINGLESTEP : RESUME_SYSEMU;
-    t->resume_execution(resume_how, RESUME_WAIT, ticks_request);
+    bool ok = t->resume_execution(resume_how, RESUME_WAIT_NO_EXIT, ticks_request);
+    ASSERT(t, ok) << "Tracee died unexpectedly";
   }
 
   switch (t->stop_sig()) {
@@ -525,7 +526,8 @@ Completion ReplaySession::cont_syscall_boundary(
       syscall_seccomp_ordering_ = PTRACE_SYSCALL_BEFORE_SECCOMP;
     }
     // Eat the following event, either a seccomp or syscall notification
-    t->resume_execution(RESUME_SYSEMU, RESUME_WAIT, ticks_request);
+    bool ok = t->resume_execution(RESUME_SYSEMU, RESUME_WAIT_NO_EXIT, ticks_request);
+    ASSERT(t, ok) << "Tracee died unexpectedly";
   }
 
   t->apply_syscall_entry_regs();
@@ -557,18 +559,7 @@ static void emulate_syscall_entry(ReplayTask* t, const TraceFrame& frame,
                                   remote_code_ptr syscall_instruction) {
   Registers r = t->regs();
   r.set_ip(syscall_instruction.increment_by_syscall_insn_length(t->arch()));
-  r.set_original_syscallno(r.syscallno());
-  r.set_orig_arg1(r.arg1());
-  /**
-   * The aarch64 kernel has a quirk where if the syscallno is -1 (and only -1),
-   * it will apply the -ENOSYS result before any ptrace entry stop.
-   * On x86, this happens unconditionally for every syscall, but there the
-   * result isn't shared with arg1, and we usually don't care because we have
-   * access to original_syscallno.
-   */
-  if (is_x86ish(t->arch()) || (t->arch() == aarch64 && r.syscallno() == -1)) {
-    r.set_syscall_result(-ENOSYS);
-  }
+  r.emulate_syscall_entry();
   t->set_regs(r);
   t->canonicalize_regs(frame.event().Syscall().arch());
   t->validate_regs();
@@ -726,14 +717,16 @@ Completion ReplaySession::continue_or_step(ReplayTask* t,
                                            TicksRequest tick_request,
                                            ResumeRequest resume_how) {
   if (constraints.command == RUN_SINGLESTEP) {
-    t->resume_execution(RESUME_SINGLESTEP, RESUME_WAIT, tick_request);
+    bool ok = t->resume_execution(RESUME_SINGLESTEP, RESUME_WAIT_NO_EXIT, tick_request);
+    ASSERT(t, ok) << "Tracee died unexpectedly";
     handle_unrecorded_cpuid_fault(t, constraints);
   } else if (constraints.command == RUN_SINGLESTEP_FAST_FORWARD) {
     fast_forward_status |= fast_forward_through_instruction(
         t, RESUME_SINGLESTEP, constraints.stop_before_states);
     handle_unrecorded_cpuid_fault(t, constraints);
   } else {
-    t->resume_execution(resume_how, RESUME_WAIT, tick_request);
+    bool ok = t->resume_execution(resume_how, RESUME_WAIT_NO_EXIT, tick_request);
+    ASSERT(t, ok) << "Tracee died unexpectedly";
     if (t->stop_sig() == 0) {
       auto type = AddressSpace::rr_page_syscall_from_exit_point(t->arch(), t->ip());
       if (type && type->traced == AddressSpace::UNTRACED) {
@@ -1665,7 +1658,8 @@ static void end_task(ReplayTask* t) {
   r.set_syscallno(syscall_number_for_exit(t->arch()));
   t->set_regs(r);
   // Enter the syscall.
-  t->resume_execution(RESUME_CONT, RESUME_WAIT, RESUME_NO_TICKS);
+  bool ok = t->resume_execution(RESUME_CONT, RESUME_WAIT, RESUME_NO_TICKS);
+  ASSERT(t, ok) << "Tracee died unexpectedly";
   if (t->session().done_initial_exec()) {
     ASSERT(t, t->ptrace_event() == PTRACE_EVENT_EXIT);
     t->did_handle_ptrace_exit_event();
@@ -2111,7 +2105,9 @@ void ReplaySession::reattach_tasks(ScopedFd new_tracee_socket, ScopedFd new_trac
   // Get stop events for all tasks
   for (auto& entry : task_map) {
     Task* t = entry.second;
-    t->wait();
+    if (!t->wait()) {
+      FATAL() << "Task " << t->tid << " killed unexpectedly";
+    }
     if (SIGSTOP != t->status().group_stop()) {
       WaitStatus failed_status = t->status();
       FATAL() << "Unexpected stop " << failed_status << " for " << t->tid;
diff --git a/src/Scheduler.cc b/src/Scheduler.cc
index 8c6754b1b0c..42c9f42ceaa 100644
--- a/src/Scheduler.cc
+++ b/src/Scheduler.cc
@@ -229,8 +229,10 @@ bool WaitAggregator::try_wait(RecordTask* t) {
     return false;
   }
   LOGM(debug) << "wait on " << t->tid << " returns " << result.status;
-  t->did_waitpid(result.status);
-  return true;
+  // If did_waitpid fails then the task left the stop prematurely
+  // due to SIGKILL or equivalent, and we should report that we did not get
+  // a stop.
+  return t->did_waitpid(result.status);
 }
 
 bool WaitAggregator::try_wait_exit(RecordTask* t) {
@@ -245,9 +247,11 @@ bool WaitAggregator::try_wait_exit(RecordTask* t) {
   options.consume = false;
   WaitResult result = WaitManager::wait_exit(options);
   switch (result.code) {
-    case WAIT_OK:
-      t->did_waitpid(result.status);
+    case WAIT_OK: {
+      bool ok = t->did_waitpid(result.status);
+      ASSERT(t, ok) << "did_waitpid shouldn't fail for exit statuses";
       return true;
+    }
     case WAIT_NO_STATUS:
       // This can happen when the task is in zap_pid_ns_processes waiting for all tasks
       // in the pid-namespace to exit. It's not in a signal stop, but it's also not
@@ -286,7 +290,7 @@ bool Scheduler::is_task_runnable(RecordTask* t, WaitAggregator& wait_aggregator,
   }
 
   LOGM(debug) << "Task event is " << t->ev();
-  if (!t->may_be_blocked()) {
+  if (!t->may_be_blocked() && (t->is_stopped() || t->was_reaped())) {
     LOGM(debug) << "  " << t->tid << " isn't blocked";
     if (t->schedule_frozen) {
       LOGM(debug) << "  " << t->tid << "  but is frozen";
@@ -303,12 +307,17 @@ bool Scheduler::is_task_runnable(RecordTask* t, WaitAggregator& wait_aggregator,
       t->emulate_SIGCONT();
       // We shouldn't run any user code since there is at least one signal
       // pending.
-      t->resume_execution(RESUME_SYSCALL, RESUME_WAIT, RESUME_NO_TICKS);
-      *by_waitpid = true;
-      must_run_task = t;
-      LOGM(debug) << "  Got " << t->tid
-                 << " out of emulated stop due to pending SIGCONT";
-      return true;
+      if (t->resume_execution(RESUME_SYSCALL, RESUME_WAIT_NO_EXIT, RESUME_NO_TICKS)) {
+        *by_waitpid = true;
+        must_run_task = t;
+        LOGM(debug) << "  Got " << t->tid
+                   << " out of emulated stop due to pending SIGCONT";
+        return true;
+      }
+      // Tracee exited unexpectedly. Reexamine it now in case it has a new
+      // status we can use. Note that we cleared `t->emulated_stop_type`
+      // so we won't end up here again.
+      return is_task_runnable(t, wait_aggregator, by_waitpid);
     } else {
       LOGM(debug) << "  " << t->tid << " is stopped by ptrace or signal";
       // We have no way to detect a SIGCONT coming from outside the tracees.
@@ -346,7 +355,11 @@ bool Scheduler::is_task_runnable(RecordTask* t, WaitAggregator& wait_aggregator,
     // These syscalls never really block but the kernel may report that
     // the task is not stopped yet if we pass WNOHANG. To make them
     // behave predictably, do a blocking wait.
-    t->wait();
+    if (!t->wait()) {
+      // Task got SIGKILL or equivalent while trying to process the stop.
+      // Ignore this event and we'll process the new status later.
+      return false;
+    }
     *by_waitpid = true;
     must_run_task = t;
     LOGM(debug) << "  " << syscall_name(t->ev().Syscall().number, t->arch())
@@ -602,7 +615,11 @@ static RecordTask* find_waited_task(RecordSession& session, pid_t tid, WaitStatu
   }
 
   if (waited->detached_proxy) {
-    waited->did_waitpid(status);
+    if (!waited->did_waitpid(status)) {
+      // Proxy died unexpectedly during the waitpid, just ignore
+      // the stop.
+      return nullptr;
+    }
     pid_t parent_rec_tid = waited->get_parent_pid();
     LOGM(debug) << "    ... but it's a detached process.";
     RecordTask *parent = session.find_task(parent_rec_tid);
@@ -696,7 +713,13 @@ Scheduler::Rescheduled Scheduler::reschedule(Switchable switchable) {
           if (!waited) {
             continue;
           }
-          waited->did_waitpid(status);
+          if (!waited->did_waitpid(status)) {
+            // Tracee exited stop prematurely due to SIGKILL or equivalent.
+            // Pretend the stop didn't happen.
+            continue;
+          }
+          result.by_waitpid = true;
+          LOGM(debug) << "  new status is " << current_->status();
           // Another task just became runnable, we're no longer in unlimited
           // ticks mode
           unlimited_ticks_mode = false;
@@ -714,7 +737,15 @@ Scheduler::Rescheduled Scheduler::reschedule(Switchable switchable) {
           timeout = elapsed > 0.05 ? 0.0 : 0.05 - elapsed;
           LOGM(debug) << "  But that's not our current task...";
         } else {
-          current_->wait(timeout);
+          if (current_->wait(timeout)) {
+            result.by_waitpid = true;
+            LOGM(debug) << "  new status is " << current_->status();
+          } else {
+            // A SIGKILL or equivalent kicked the task out of the stop.
+            // We are now running towards PTRACE_EVENT_EXIT or zombie status.
+            // Even though we're PREVENT_SWITCH, we still have to switch.
+            // The task won't be stopped so this is handled below.
+          }
           break;
         }
       }
@@ -725,11 +756,11 @@ Scheduler::Rescheduled Scheduler::reschedule(Switchable switchable) {
                  strevent(current_->event), 1000.0 * wait_duration);
       }
 #endif
-      result.by_waitpid = true;
-      LOGM(debug) << "  new status is " << current_->status();
     }
-    validate_scheduled_task();
-    return result;
+    if (current_->is_stopped() || current_->was_reaped()) {
+      validate_scheduled_task();
+      return result;
+    }
   }
 
   unlimited_ticks_mode = false;
@@ -891,8 +922,9 @@ Scheduler::Rescheduled Scheduler::reschedule(Switchable switchable) {
                    status.ptrace_event() == PTRACE_EVENT_EXIT ||
                    status.reaped())
             << "Scheduled task should have been blocked";
-        next->did_waitpid(status);
-        if (in_exec_tgid && next->tgid() != in_exec_tgid) {
+        if (!next->did_waitpid(status)) {
+          next = nullptr;
+        } else if (in_exec_tgid && next->tgid() != in_exec_tgid) {
           // Some threadgroup is doing execve and this task isn't in
           // that threadgroup. Don't schedule this task until the execve
           // is complete.
diff --git a/src/Scheduler.h b/src/Scheduler.h
index 7534f8540c2..94e8b16e0b7 100644
--- a/src/Scheduler.h
+++ b/src/Scheduler.h
@@ -106,6 +106,10 @@ class Scheduler {
    * The new current() task is guaranteed to either have already been
    * runnable, or have been made runnable by a waitpid status change (in
    * which case, result.by_waitpid will be true.
+   * 
+   * After this, if Rescheduled::interrupted_by_signal is false,
+   * and there is a new current task, its is_stopped() must
+   * be true.
    */
   struct Rescheduled {
     bool interrupted_by_signal;
diff --git a/src/Task.cc b/src/Task.cc
index 21714dc1240..a27333ef7fb 100644
--- a/src/Task.cc
+++ b/src/Task.cc
@@ -277,7 +277,8 @@ void Task::finish_emulated_syscall() {
   // Passing RESUME_NO_TICKS here is not only a small performance optimization,
   // but also avoids counting an event if the instruction immediately following
   // a syscall instruction is a conditional branch.
-  resume_execution(RESUME_SYSCALL, RESUME_WAIT, RESUME_NO_TICKS);
+  bool ok = resume_execution(RESUME_SYSCALL, RESUME_WAIT_NO_EXIT, RESUME_NO_TICKS);
+  ASSERT(this, ok) << "Tracee exited unexpectedly";
 
   set_regs(r);
   wait_status = WaitStatus();
@@ -888,8 +889,10 @@ bool Task::enter_syscall(bool allow_exit) {
                                        Session::SECCOMP_BEFORE_PTRACE_SYSCALL;
   bool need_seccomp_event = seccomp_bpf_enabled;
   while (need_ptrace_syscall_event || need_seccomp_event) {
-    resume_execution(need_ptrace_syscall_event ? RESUME_SYSCALL : RESUME_CONT,
-                     RESUME_WAIT, RESUME_NO_TICKS);
+    if (!resume_execution(need_ptrace_syscall_event ? RESUME_SYSCALL : RESUME_CONT,
+                          RESUME_WAIT_NO_EXIT, RESUME_NO_TICKS)) {
+      return false;
+    }
     if (is_ptrace_seccomp_event()) {
       ASSERT(this, need_seccomp_event);
       need_seccomp_event = false;
@@ -934,7 +937,9 @@ bool Task::exit_syscall() {
                            Session::PTRACE_SYSCALL_BEFORE_SECCOMP) &&
                           !is_ptrace_seccomp_event();
   while (true) {
-    resume_execution(RESUME_SYSCALL, RESUME_WAIT, RESUME_NO_TICKS);
+    if (!resume_execution(RESUME_SYSCALL, RESUME_WAIT_NO_EXIT, RESUME_NO_TICKS)) {
+      return false;
+    }
     if (will_see_seccomp && is_ptrace_seccomp_event()) {
       will_see_seccomp = false;
       continue;
@@ -967,12 +972,11 @@ bool Task::exit_syscall_and_prepare_restart() {
   // This exits the hijacked SYS_gettid.  Now the tracee is
   // ready to do our bidding.
   if (!exit_syscall()) {
-    // The tracee suddenly exited. To get this to replay correctly, we need to
+    // The tracee unexpectedly exited. To get this to replay correctly, we need to
     // make it look like we really entered the syscall. Then
     // handle_ptrace_exit_event will record something appropriate.
-    r.set_original_syscallno(syscallno);
-    r.set_syscall_result(-ENOSYS);
-    set_regs(r);
+    r.emulate_syscall_entry();
+    override_regs_during_exit(r);
     return false;
   }
   LOG(debug) << "exit_syscall_and_prepare_restart done";
@@ -1437,7 +1441,7 @@ void Task::work_around_KNL_string_singlestep_bug() {
   }
 }
 
-void Task::resume_execution(ResumeRequest how, WaitRequest wait_how,
+bool Task::resume_execution(ResumeRequest how, WaitRequest wait_how,
                             TicksRequest tick_period, int sig) {
   // Ensure our HW debug registers are up to date before we execute any code.
   // If this fails because the task died, the code below will detect it.
@@ -1497,7 +1501,6 @@ void Task::resume_execution(ResumeRequest how, WaitRequest wait_how,
     flush_regs();
   }
 
-  bool detected_exit = false;
   if (session().is_recording() && !seen_ptrace_exit_event()) {
     /* There's a nasty race where a stopped task gets woken up by a SIGKILL
      * and advances to the PTRACE_EXIT_EVENT ptrace-stop just before we
@@ -1524,24 +1527,42 @@ void Task::resume_execution(ResumeRequest how, WaitRequest wait_how,
              result.status.ptrace_event() == PTRACE_EVENT_EXIT ||
                  result.status.reaped())
           << "got " << result.status;
-      did_waitpid(result.status);
-      detected_exit = true;
+      LOG(debug) << "Task " << tid << " exited unexpectedly with status "
+          << result.status;
+      if (did_waitpid(result.status)) {
+        // We reached a new stop (or actually reaped the task).
+        // Consider this "resume execution" to be done.
+        return wait_how != RESUME_WAIT_NO_EXIT;
+      }
+      ASSERT(this, result.status.ptrace_event() == PTRACE_EVENT_EXIT)
+        << "did_waitpid should always succeed for reaped() statuses";
+      // The tracee must have been kicked out of PTRACE_EVENT_EXIT
+      // by a SIGKILL (only possible on older kernels).
+      // If we were supposed to wait, we've failed.
+      // We can't wait now because on old kernels tasks can block
+      // indefinitely even after PTRACE_EVENT_EXIT (e.g. due to coredumping).
+      // We don't know what state it's in exactly, but registers haven't changed
+      // since nothing has really happened since the last stop.
+      set_stopped(false);
+      return RESUME_NONBLOCKING == wait_how;
     }
   }
-  if (detected_exit) {
-    LOG(debug) << "Task " << tid << " exited unexpectedly";
-  } else {
-    ASSERT(this, setup_succeeded);
-    ptrace_if_stopped(how, nullptr, (void*)(uintptr_t)sig);
-    // If ptrace_if_stopped failed, it means we're running along the
-    // exit path due to a SIGKILL or equivalent, so just like if it
-    // succeeded, we are stopped and will receive a wait notification.
-    set_stopped(false);
-    extra_registers_known = false;
-    if (RESUME_WAIT == wait_how) {
-      wait();
+  ASSERT(this, setup_succeeded);
+  ptrace_if_stopped(how, nullptr, (void*)(uintptr_t)sig);
+  // If ptrace_if_stopped failed, it means we're running along the
+  // exit path due to a SIGKILL or equivalent, so just like if it
+  // succeeded, we will eventually receive a wait notification.
+  set_stopped(false);
+  extra_registers_known = false;
+  if (RESUME_NONBLOCKING != wait_how) {
+    if (!wait()) {
+      return false;
+    }
+    if (wait_how == RESUME_WAIT_NO_EXIT) {
+      return ptrace_event() != PTRACE_EVENT_EXIT && !was_reaped();
     }
   }
+  return true;
 }
 
 void Task::set_regs(const Registers& regs) {
@@ -1556,6 +1577,12 @@ void Task::set_regs(const Registers& regs) {
   }
 }
 
+void Task::override_regs_during_exit(const Registers& regs) {
+  orig_syscallno_dirty = true;
+  registers_dirty = true;
+  registers = regs;
+}
+
 void Task::flush_regs() {
   if (registers_dirty) {
     LOG(debug) << "Flushing registers for tid " << tid << " " << registers;
@@ -1962,7 +1989,7 @@ bool Task::account_for_potential_ptrace_interrupt_stop(WaitStatus status) {
   return false;
 }
 
-void Task::wait(double interrupt_after_elapsed) {
+bool Task::wait(double interrupt_after_elapsed) {
   LOG(debug) << "going into blocking wait for " << tid << " ...";
   ASSERT(this, session().is_recording() || interrupt_after_elapsed == -1);
 
@@ -1994,12 +2021,8 @@ void Task::wait(double interrupt_after_elapsed) {
       /* The process died without us getting a PTRACE_EXIT_EVENT notification.
        * This is possible if the process receives a SIGKILL while in the exit
        * event stop, but before we were able to read the event notification.
-       * We handle this situation by synthesizing an exit event, but otherwise
-       * going about our regular business.
        */
-      LOG(debug) << "Synthesizing PTRACE_EVENT_EXIT for zombie process " << tid;
-      result.status = WaitStatus::for_ptrace_event(PTRACE_EVENT_EXIT);
-      break;
+      return false;
     }
     ASSERT(this, result.code == WAIT_NO_STATUS);
   }
@@ -2010,7 +2033,7 @@ void Task::wait(double interrupt_after_elapsed) {
       LOG(warn) << "  PTRACE_INTERRUPT raced with another event " << result.status;
     }
   }
-  did_waitpid(result.status);
+  return did_waitpid(result.status);
 }
 
 void Task::canonicalize_regs(SupportedArch syscall_arch) {
@@ -2087,11 +2110,11 @@ void Task::set_aarch64_tls_register(uintptr_t val) {
      we tried to set. */
 }
 
-void Task::did_waitpid(WaitStatus status) {
+bool Task::did_waitpid(WaitStatus status) {
   if (is_detached_proxy() &&
       (status.stop_sig() == SIGSTOP || status.stop_sig() == SIGCONT)) {
     LOG(debug) << "Task " << tid << " is a detached proxy, ignoring status " << status;
-    return;
+    return true;
   }
 
   LOG(debug) << "  Task " << tid << " changed status to " << status;
@@ -2110,7 +2133,7 @@ void Task::did_waitpid(WaitStatus status) {
       // here is get out quickly and the higher-level function should go ahead
       // and delete us.
       wait_status = status;
-      return;
+      return true;
     }
     LOG(debug) << "Unexpected process reap for " << tid;
     /* Mark buffers as having been destroyed. We missed our chance
@@ -2146,11 +2169,13 @@ void Task::did_waitpid(WaitStatus status) {
     } else if (status.stop_sig()) {
       if (!ptrace_if_stopped(PTRACE_GETSIGINFO, nullptr, &pending_siginfo)) {
         LOG(debug) << "Unexpected process death getting siginfo for " << tid;
-        status = WaitStatus::for_ptrace_event(PTRACE_EVENT_EXIT);
+        // Let's pretend this stop never happened.
+        set_stopped(false);
+        return false;
       }
     }
 
-    // An unstable exit can cause a task to exit without us having run it, in
+    // A SIGKILL or equivalent can cause a task to exit without us having run it, in
     // which case we might have pending register changes for it that are now
     // irrelevant. In that case we just throw away our register changes and use
     // whatever the kernel now has.
@@ -2201,7 +2226,12 @@ void Task::did_waitpid(WaitStatus status) {
 #endif
       else {
         LOG(debug) << "Unexpected process death for " << tid;
-        status = WaitStatus::for_ptrace_event(PTRACE_EVENT_EXIT);
+        // Let's pretend this stop never happened.
+        // Note that pending_siginfo may have been overwritten above,
+        // but in that case we're going to ignore this signal-stop
+        // so it doesn't matter.
+        set_stopped(false);
+        return false;
       }
     }
   }
@@ -2297,6 +2327,7 @@ void Task::did_waitpid(WaitStatus status) {
   }
 
   did_wait();
+  return true;
 }
 
 template <typename Arch>
@@ -2398,7 +2429,8 @@ Task* Task::clone(CloneReason reason, int flags, remote_ptr<void> stack,
 
   // wait() before trying to do anything that might need to
   // use ptrace to access memory
-  t->wait();
+  bool ok = t->wait();
+  ASSERT(t, ok) << "Task " << t->tid << " killed unexpectedly; not sure how to handle this";
 
   t->post_wait_clone(this, flags);
   if (CLONE_SHARE_THREAD_GROUP & flags) {
@@ -3618,7 +3650,9 @@ long Task::ptrace_seize(pid_t tid, Session& session) {
   sa.sa_flags = 0; // No SA_RESTART, so waitpid() will be interrupted
   sigaction(SIGALRM, &sa, nullptr);
 
-  t->wait();
+  if (!t->wait()) {
+    FATAL() << "Tracee died before reaching SIGSTOP";
+  }
   if (t->ptrace_event() == PTRACE_EVENT_EXIT) {
     t->proceed_to_exit();
     FATAL() << "Tracee died before reaching SIGSTOP\n"
@@ -4020,8 +4054,10 @@ void Task::dup_from(Task *other) {
 
 /**
  * Proceeds until the next system call, which is being executed.
+ * Returns false if did_waitpid failed because the task got SIGKILL
+ * or equivalent.
  */
-static void __ptrace_cont(Task* t, ResumeRequest resume_how,
+static bool __ptrace_cont(Task* t, ResumeRequest resume_how,
                           SupportedArch syscall_arch, int expect_syscallno,
                           int expect_syscallno2 = -1, pid_t new_tid = -1) {
   t->resume_execution(resume_how, RESUME_NONBLOCKING, RESUME_NO_TICKS);
@@ -4044,7 +4080,9 @@ static void __ptrace_cont(Task* t, ResumeRequest resume_how,
       t->hpc.set_tid(new_tid);
       t->tid = new_tid;
     }
-    t->did_waitpid(result.status);
+    if (!t->did_waitpid(result.status)) {
+      return false;
+    }
 
     if (ReplaySession::is_ignored_signal(t->status().stop_sig())) {
       t->resume_execution(resume_how, RESUME_NONBLOCKING, RESUME_NO_TICKS);
@@ -4063,6 +4101,7 @@ static void __ptrace_cont(Task* t, ResumeRequest resume_how,
              current_syscall == expect_syscallno2)
       << "Should be at " << syscall_name(expect_syscallno, syscall_arch)
       << ", but instead at " << syscall_name(current_syscall, syscall_arch);
+  return true;
 }
 
 void Task::did_handle_ptrace_exit_event() {
@@ -4126,9 +4165,10 @@ void Task::os_exec(SupportedArch exec_arch, std::string filename)
    * tid, no matter what tid it was before.
    */
   pid_t tgid = real_tgid();
-  __ptrace_cont(this, RESUME_SYSCALL, arch(), expect_syscallno,
-                syscall_number_for_execve(exec_arch),
-                tgid == tid ? -1 : tgid);
+  bool ok = __ptrace_cont(this, RESUME_SYSCALL, arch(), expect_syscallno,
+                          syscall_number_for_execve(exec_arch),
+                          tgid == tid ? -1 : tgid);
+  ASSERT(this, ok) << "Task " << tid << " got killed while trying to exec";
   LOG(debug) << this->status() << " " << this->regs();
   if (this->regs().syscall_result()) {
     errno = -this->regs().syscall_result();
@@ -4169,7 +4209,7 @@ void Task::tgkill(int sig) {
   ASSERT(this, 0 == syscall(SYS_tgkill, real_tgid(), tid, sig));
 }
 
-void Task::move_to_signal_stop()
+bool Task::move_to_signal_stop()
 {
   LOG(debug) << "    maybe not in signal-stop (status " << status()
              << "); doing tgkill(SYSCALLBUF_DESCHED_SIGNAL)";
@@ -4197,7 +4237,9 @@ void Task::move_to_signal_stop()
     old_ip = old_ip.decrement_by_syscall_insn_length(arch());
   }
   do {
-    resume_execution(RESUME_SINGLESTEP, RESUME_WAIT, RESUME_NO_TICKS);
+    if (!resume_execution(RESUME_SINGLESTEP, RESUME_WAIT_NO_EXIT, RESUME_NO_TICKS)) {
+      return false;
+    }
     ASSERT(this, old_ip == ip())
         << "Singlestep actually advanced when we "
         << "just expected a signal; was at " << old_ip << " now at "
@@ -4205,6 +4247,7 @@ void Task::move_to_signal_stop()
     // Ignore any pending TIME_SLICE_SIGNALs and continue until we get our
     // SYSCALLBUF_DESCHED_SIGNAL.
   } while (stop_sig() == PerfCounters::TIME_SLICE_SIGNAL);
+  return true;
 }
 
 bool Task::should_apply_rseq_abort(EventType event_type, remote_code_ptr* new_ip,
diff --git a/src/Task.h b/src/Task.h
index 8a20bfb071b..81b5ac1dc4b 100644
--- a/src/Task.h
+++ b/src/Task.h
@@ -78,11 +78,14 @@ enum ResumeRequest {
   RESUME_SYSEMU_SINGLESTEP = NativeArch::PTRACE_SYSEMU_SINGLESTEP,
 };
 enum WaitRequest {
+  // Don't wait after resuming.
+  RESUME_NONBLOCKING,
   // After resuming, blocking-waitpid() until tracee status
   // changes.
   RESUME_WAIT,
-  // Don't wait after resuming.
-  RESUME_NONBLOCKING
+  // Like RESUME_WAIT, but we're not expecting a PTRACE_EVENT_EXIT
+  // or reap, so return false also in that case.
+  RESUME_WAIT_NO_EXIT
 };
 enum TicksRequest {
   // We don't expect to see any ticks (though we seem to on the odd buggy
@@ -254,11 +257,17 @@ class Task {
   pid_t pid_of_pidfd(int fd);
 
   /**
-   * Force the wait status of this to |status|, as if
-   * |wait()/try_wait()| had returned it. Call this whenever a waitpid
-   * returned activity for this past.
+   * Records the wait status of this task as |status|, e.g. if
+   * |wait()/try_wait()| has returned it. Call this whenever a waitpid
+   * returned activity for this task.
+   * If this returns false, then the task was kicked out of a ptrace-stop
+   * by SIGKILL or equivalent before we could read registers etc.
+   * We will treat this stop as if it never happened; the caller must
+   * act as if there was no stop.
+   * If `status.reaped()` (i.e. fatal signal or normal exit), this always
+   * returns true.
    */
-  void did_waitpid(WaitStatus status);
+  bool did_waitpid(WaitStatus status);
 
   /**
    * Syscalls have side effects on registers (e.g. setting the flags register).
@@ -432,7 +441,7 @@ class Task {
    * a syscall, depending on the value of Session::syscall_seccomp_ordering()).
    * Continue into the kernel to perform the syscall and stop at the
    * PTRACE_SYSCALL syscall-exit trap. Returns false if we see the process exit
-   * before that.
+   * before that; we may or may not be stopped in that case.
    */
   bool exit_syscall();
 
@@ -575,8 +584,12 @@ class Task {
    * after that many seconds have elapsed.
    *
    * All tracee execution goes through here.
+   *
+   * If `wait_how` == RESUME_WAIT and we don't complete a
+   * did_waitpid() (e.g. because the tracee was SIGKILLed or
+   * equivalent), this returns false.
    */
-  void resume_execution(ResumeRequest how, WaitRequest wait_how,
+  bool resume_execution(ResumeRequest how, WaitRequest wait_how,
                         TicksRequest tick_period, int sig = 0);
 
   /** Return the session this is part of. */
@@ -585,6 +598,13 @@ class Task {
   /** Set the tracee's registers to |regs|. Lazy. */
   void set_regs(const Registers& regs);
 
+  /** Set the tracee's registers to |regs|. The task
+   * is known to be exiting (running towards, or actually in, a
+   * not-yet-reported PTRACE_EVENT_EXIT or reap) so its real registers
+   * won't change underneath us, but we need to override those
+   * registers with our values for recording purposes. */
+  void override_regs_during_exit(const Registers& regs);
+
   /** Ensure registers are flushed back to the underlying task. */
   void flush_regs();
 
@@ -712,8 +732,12 @@ class Task {
    * with the process in a stopped() state. If interrupt_after_elapsed >= 0,
    * interrupt the task after that many seconds have elapsed. If
    * interrupt_after_elapsed == 0.0, the interrupt will happen immediately.
+   * Returns false if the wait failed because we reached a stop but we got
+   * SIGKILLed (or equivalent) out of it, in which case it is not safe to wait
+   * because that might block indefinitely waiting for us to acknowledge the
+   * PTRACE_EVENT_EXIT of other tasks.
    */
-  void wait(double interrupt_after_elapsed = -1);
+  bool wait(double interrupt_after_elapsed = -1);
 
   /**
    * Currently we don't allow recording across uid changes, so we can
@@ -1079,8 +1103,9 @@ class Task {
   /**
    * Try to move this task to a signal stop by signaling it with the
    * syscallbuf desched signal (which is guaranteed not to be blocked).
+   * Returns false if the task exited unexpectedly.
    */
-  void move_to_signal_stop();
+  bool move_to_signal_stop();
 
   // A map from original table to (potentially detached) clone, to preserve
   // FdTable sharing relationships during a session fork.
diff --git a/src/fast_forward.cc b/src/fast_forward.cc
index 9e77ccdf8f9..1eeeb86dc1e 100644
--- a/src/fast_forward.cc
+++ b/src/fast_forward.cc
@@ -168,7 +168,8 @@ FastForwardStatus fast_forward_through_instruction(Task* t, ResumeRequest how,
 
   remote_code_ptr ip = t->ip();
 
-  t->resume_execution(how, RESUME_WAIT, RESUME_UNLIMITED_TICKS);
+  bool ok = t->resume_execution(how, RESUME_WAIT_NO_EXIT, RESUME_UNLIMITED_TICKS);
+  ASSERT(t, ok) << "Tracee was killed";
   if (t->stop_sig() != SIGTRAP) {
     // we might have stepped into a system call...
     return result;
@@ -310,7 +311,8 @@ FastForwardStatus fast_forward_through_instruction(Task* t, ResumeRequest how,
     // So, disable watchpoints temporarily.
     t->vm()->save_watchpoints();
     t->vm()->remove_all_watchpoints();
-    t->resume_execution(RESUME_CONT, RESUME_WAIT, RESUME_UNLIMITED_TICKS);
+    ok = t->resume_execution(RESUME_CONT, RESUME_WAIT_NO_EXIT, RESUME_UNLIMITED_TICKS);
+    ASSERT(t, ok) << "Tracee was killed";
     t->vm()->restore_watchpoints();
     t->vm()->remove_breakpoint(limit_ip, BKPT_INTERNAL);
     result.did_fast_forward = true;
diff --git a/src/record_signal.cc b/src/record_signal.cc
index 75d9197b5cf..4178c064811 100644
--- a/src/record_signal.cc
+++ b/src/record_signal.cc
@@ -518,7 +518,11 @@ static void handle_desched_event(RecordTask* t) {
     // syscall may have re-armed the event.
     disarm_desched_event(t);
 
-    t->resume_execution(RESUME_SYSCALL, RESUME_WAIT, RESUME_UNLIMITED_TICKS);
+    if (!t->resume_execution(RESUME_SYSCALL, RESUME_WAIT_NO_EXIT, RESUME_UNLIMITED_TICKS)) {
+      LOG(debug) << "  (got exit, bailing out)";
+      t->push_event(Event::noop());
+      return;
+    }
 
     if (t->status().is_syscall()) {
       t->apply_syscall_entry_regs();
@@ -537,12 +541,6 @@ static void handle_desched_event(RecordTask* t) {
       // seccomp filter.
       break;
     }
-    if (t->ptrace_event() == PTRACE_EVENT_EXIT) {
-      LOG(debug)
-          << "  (got exit, bailing out)";
-      t->push_event(Event::noop());
-      return;
-    }
 
     // Completely ignore spurious desched signals and
     // signals that aren't going to be delivered to the
diff --git a/src/record_syscall.cc b/src/record_syscall.cc
index 7c1570279dd..968e8f90b2f 100644
--- a/src/record_syscall.cc
+++ b/src/record_syscall.cc
@@ -3355,7 +3355,10 @@ static Switchable prepare_clone(RecordTask* t, TaskSyscallState& syscall_state)
   }
 
   while (true) {
-    t->resume_execution(RESUME_SYSCALL, RESUME_WAIT, RESUME_NO_TICKS);
+    if (!t->resume_execution(RESUME_SYSCALL, RESUME_WAIT_NO_EXIT, RESUME_NO_TICKS)) {
+      // Tracee died unexpectedly during clone.
+      return ALLOW_SWITCH;
+    }
     // XXX handle stray signals?
     if (t->ptrace_event()) {
       break;
@@ -3366,7 +3369,7 @@ static Switchable prepare_clone(RecordTask* t, TaskSyscallState& syscall_state)
       LOG(debug) << "clone failed, returning "
                  << errno_name(-t->regs().syscall_result_signed());
       syscall_state.emulate_result(t->regs().syscall_result());
-      // clone failed and we're existing the syscall with an error. Reenter
+      // clone failed and we're exiting the syscall with an error. Reenter
       // the syscall so that we're in the same state as the normal execution
       // path.
       t->ev().Syscall().failed_during_preparation = true;
diff --git a/src/replay_syscall.cc b/src/replay_syscall.cc
index 473e5d71331..fdf4e534ecd 100644
--- a/src/replay_syscall.cc
+++ b/src/replay_syscall.cc
@@ -218,7 +218,8 @@ template <typename Arch> static void prepare_clone(ReplayTask* t) {
   Registers entry_regs = r;
 
   // Run; we will be interrupted by PTRACE_EVENT_CLONE/FORK/VFORK.
-  t->resume_execution(RESUME_CONT, RESUME_WAIT, RESUME_NO_TICKS);
+  bool ok = t->resume_execution(RESUME_CONT, RESUME_WAIT_NO_EXIT, RESUME_NO_TICKS);
+  ASSERT(t, ok) << "Tracee was killed";
 
   pid_t new_tid;
   while (!t->clone_syscall_is_complete(&new_tid, Arch::arch())) {
@@ -227,7 +228,8 @@ template <typename Arch> static void prepare_clone(ReplayTask* t) {
     // state to try the syscall again.
     ASSERT(t, t->regs().syscall_result_signed() == -EAGAIN);
     t->set_regs(entry_regs);
-    t->resume_execution(RESUME_CONT, RESUME_WAIT, RESUME_NO_TICKS);
+    bool ok = t->resume_execution(RESUME_CONT, RESUME_WAIT_NO_EXIT, RESUME_NO_TICKS);
+    ASSERT(t, ok) << "Tracee was killed";
   }
 
   // Get out of the syscall