From 9ac2ef0c188816e461869f54eb7c15abbaff6efa Mon Sep 17 00:00:00 2001
From: Brian Warner <warner@lothar.com>
Date: Fri, 12 Apr 2024 00:42:21 -0500
Subject: [PATCH] feat(swingset): allow slow termination/deletion of vats

This introduces new `runPolicy()` controls which enable "slow
termination" of vats. When configured, terminated vats are immediately
dead (all promises are rejected, all new messages go splat, they never
run again), however the vat's state is deleted slowly, one piece at a
time. This makes it safe to terminate large vats, with a long history,
lots of c-list imports/exports, or large vatstore tables, without fear
of causing an overload (by e.g. dropping 100k references all in a
single crank).

See docs/run-policy.md for details and configuration instructions.

The kernelKeeper is upgraded from v1 to v2, to add a new
'vats.terminated' key, which tracks the vats that have been terminated
but not yet completely deleted. NOTE: deployed applications must use
`upgradeSwingset()` when using this kernel version for the first time.

Also refactor vatKeeper.deleteSnapshotsAndTranscripts() into two
separate methods, to fix a bug that hid in the combination: if the
snapshot deletion phase exhausted our budget, we'd call
deleteVatTranscripts() with a budget of 0, which was interpreted as
"unlimited", and deleted all the transcript spans in a single burst.

refs #8928

Co-authored-by: Richard Gibson <richard.gibson@gmail.com>
---
 packages/SwingSet/docs/run-policy.md          | 142 ++++++++-
 .../src/controller/upgradeSwingset.js         |   8 +
 packages/SwingSet/src/kernel/kernel.js        |  87 ++++--
 .../SwingSet/src/kernel/state/kernelKeeper.js | 196 +++++++++++-
 .../SwingSet/src/kernel/state/vatKeeper.js    |  44 ++-
 packages/SwingSet/src/kernel/vat-warehouse.js |   4 +
 packages/SwingSet/src/lib/runPolicies.js      |  52 +++-
 packages/SwingSet/src/types-external.js       |   9 +-
 packages/SwingSet/src/types-internal.js       |   4 +-
 .../SwingSet/test/snapshots/state.test.js.md  |   4 +-
 .../test/snapshots/state.test.js.snap         | Bin 279 -> 277 bytes
 packages/SwingSet/test/state.test.js          |  46 ++-
 packages/SwingSet/test/stripPrefix.test.js    |   8 +
 .../SwingSet/test/transcript-light.test.js    |   2 +-
 .../SwingSet/test/upgrade-swingset.test.js    |  33 +-
 .../bootstrap-slow-terminate.js               |  55 ++++
 .../slow-termination/slow-termination.test.js | 292 ++++++++++++++++++
 .../slow-termination/vat-slow-terminate.js    |  16 +
 .../vat-admin/terminate/terminate.test.js     |   6 +-
 packages/internal/src/index.js                |   1 +
 20 files changed, 945 insertions(+), 64 deletions(-)
 create mode 100644 packages/SwingSet/test/stripPrefix.test.js
 create mode 100644 packages/SwingSet/test/vat-admin/slow-termination/bootstrap-slow-terminate.js
 create mode 100644 packages/SwingSet/test/vat-admin/slow-termination/slow-termination.test.js
 create mode 100644 packages/SwingSet/test/vat-admin/slow-termination/vat-slow-terminate.js

diff --git a/packages/SwingSet/docs/run-policy.md b/packages/SwingSet/docs/run-policy.md
index 7ad658fa125..6befcf2e54b 100644
--- a/packages/SwingSet/docs/run-policy.md
+++ b/packages/SwingSet/docs/run-policy.md
@@ -39,9 +39,14 @@ The kernel will invoke the following methods on the policy object (so all must e
 * `policy.crankFailed()`
 * `policy.emptyCrank()`
 
-All methods should return `true` if the kernel should keep running, or `false` if it should stop.
+All those methods should return `true` if the kernel should keep running, or `false` if it should stop.
 
-The `computrons` argument may be `undefined` (e.g. if the crank was delivered to a non-`xs worker`-based vat, such as the comms vat). The policy should probably treat this as equivalent to some "typical" number of computrons.
+The following methods are optional (for backwards compatibility with policy objects created for older kernels):
+
+* `policy.allowCleanup()` : may return budget, see "Terminated-Vat Cleanup" below
+* `policy.didCleanup({ cleanups })` (if missing, kernel pretends it returned `true` to keep running)
+
+The `computrons` value may be `undefined` (e.g. if the crank was delivered to a non-`xs worker`-based vat, such as the comms vat). The policy should probably treat this as equivalent to some "typical" number of computrons.
 
 `crankFailed` indicates that the vat suffered an error during crank delivery, such as a metering fault, memory allocation fault, or fatal syscall. We do not currently have a way to measure the computron usage of failed cranks (many of the error cases are signaled by the worker process exiting with a distinctive status code, which does not give it an opportunity to report back detailed metering data). The run policy should assume the worst.
 
@@ -57,6 +62,27 @@ More arguments may be added in the future, such as:
 
 The run policy should be provided as the first argument to `controller.run()`. If omitted, the kernel defaults to `forever`, a policy that runs until the queue is empty.
 
+## Terminated-Vat Cleanup
+
+Some vats may grow very large (i.e. large c-lists with lots of imported/exported objects, or lots of vatstore entries). If/when these are terminated, the burst of cleanup work might overwhelm the kernel, especially when processing all the dropped imports (which trigger GC messages to other vats).
+
+To protect the system against these bursts, the run policy can be configured to terminate vats slowly. Instead of doing all the cleanup work immediately, the policy allows the kernel to do a little bit of work each time `controller.run()` is called (e.g. once per block, for kernels hosted inside a blockchain).
+
+There are two RunPolicy methods which control this. The first is `runPolicy.allowCleanup()`. This will be invoked many times during `controller.run()`, each time the kernel tries to decide what to do next (once per step). The return value will enable (or not) a fixed amount of cleanup work. The second is `runPolicy.didCleanup({ cleanups })`, which is called later, to inform the policy of how much cleanup work was actually done. The policy can count the cleanups and switch `allowCleanup()` to return `false` when it reaches a threshold. (We need the pre-check `allowCleanup` method because the simple act of looking for cleanup work is itself a cost that we might not be willing to pay).
+
+If `allowCleanup()` exists, it must either return a falsy value or a `{ budget?: number }` object.
+
+A falsy return value (eg `allowCleanup: () => false`) prohibits cleanup work. This can be useful in a "only clean up during idle blocks" approach (see below), but should not be the only policy used, otherwise vat cleanup would never happen.
+
+A numeric `budget` limits how many cleanups are allowed to happen (if any are needed). One "cleanup" will delete one vatstore row, or one c-list entry (note that c-list deletion may trigger GC work), or one heap snapshot record, or one transcript span (including its populated transcript items). Using `{ budget: 5 }` seems to be a reasonable limit on each call, balancing overhead against doing sufficiently small units of work that we can limit the total work performed.
+
+If `budget` is missing or `undefined`, the kernel will perform unlimited cleanup work. This also happens if `allowCleanup()` is missing entirely, which maintains the old behavior for host applications that haven't been updated to make new policy objects. Note that cleanup is higher priority than any delivery, and is second only to acceptance queue routing.
+
+`didCleanup({ cleanups })` is called when the kernel actually performed some vat-termination cleanup, and the `cleanups` property is a number with the count of cleanups that took place. Each query to `allowCleanup()` might (or might not) be followed by a call to `didCleanup`, with a `cleanups` value that does not exceed the specified budget. Like other policy methods, `didCleanup` should return `true` if the kernel should keep running or `false` if it should stop.
+
+To limit the work done per block (for blockchain-based applications) the host's RunPolicy objects must keep track of how many cleanups were reported, and change the behavior of `allowCleanup()` when it reaches a per-block threshold. See below for examples.
+
+
 ## Typical Run Policies
 
 A basic policy might simply limit the block to 100 cranks with deliveries and two vat creations:
@@ -82,6 +108,7 @@ function make100CrankPolicy() {
       return true;
     },
   });
+  return policy;
 }
 ```
 
@@ -99,15 +126,15 @@ while(1) {
 
 Note that a new instance of this kind of policy object should be provided in each call to `controller.run()`.
 
-A more sophisticated one would count computrons. Suppose that experiments suggest that one million computrons take about 5 seconds to execute. The policy would look like:
+A more sophisticated policy would count computrons, for example based on experimental observations that a 5-second budget is filled by about sixty-five million computrons. The policy would look like:
 
 
 ```js
 function makeComputronCounterPolicy(limit) {
-  let total = 0;
+  let total = 0n;
   const policy = harden({
     vatCreated() {
-      total += 100000; // pretend vat creation takes 100k computrons
+      total += 100_000n; // pretend vat creation takes 100k computrons
       return (total < limit);
     },
     crankComplete(details) {
@@ -116,17 +143,118 @@ function makeComputronCounterPolicy(limit) {
       return (total < limit);
     },
     crankFailed() {
-      total += 1000000; // who knows, 1M is as good as anything
+      total += 1_000_000n; // who knows, 1M is as good as anything
       return (total < limit);
     },
     emptyCrank() {
       return true;
     }
   });
+  return policy;
 }
 ```
 
-See `src/runPolicies.js` for examples.
+See [runPolicies.js](../src/lib/runPolicies.js) for examples.
+
+To slowly terminate vats, limiting each block to 5 cleanups, the policy should start with a budget of 5, return the remaining `{ budget }` from `allowCleanup()`, and decrement it as `didCleanup` reports that budget being consumed:
+
+```js
+function makeSlowTerminationPolicy() {
+  let cranks = 0;
+  let vats = 0;
+  let cleanups = 5;
+  const policy = harden({
+    vatCreated() {
+      vats += 1;
+      return (vats < 2);
+    },
+    crankComplete(details) {
+      cranks += 1;
+      return (cranks < 100);
+    },
+    crankFailed() {
+      cranks += 1;
+      return (cranks < 100);
+    },
+    emptyCrank() {
+      return true;
+    },
+    allowCleanup() {
+      if (cleanups > 0) {
+        return { budget: cleanups };
+      } else {
+        return false;
+      }
+    },
+    didCleanup(spent) {
+      cleanups -= spent.cleanups;
+    },
+  });
+  return policy;
+}
+```
+
+A more conservative approach might only allow cleanup in otherwise-empty blocks. To accompish this, use two separate policy objects, and two separate "runs". The first run only performs deliveries, and prohibits all cleanups:
+
+```js
+function makeDeliveryOnlyPolicy() {
+  let empty = true;
+  const didWork = () => { empty = false; return true; };
+  const policy = harden({
+    vatCreated: didWork,
+    crankComplete: didWork,
+    crankFailed: didWork,
+    emptyCrank: didWork,
+    allowCleanup: () => false,
+  });
+  const wasEmpty = () => empty;
+  return [ policy, wasEmpty ];
+}
+```
+
+The second only performs cleanup, with a limited budget, stopping the run after any deliveries occur (such as GC actions):
+
+```js
+function makeCleanupOnlyPolicy() {
+  let cleanups = 5;
+  const stop: () => false;
+  const policy = harden({
+    vatCreated: stop,
+    crankComplete: stop,
+    crankFailed: stop,
+    emptyCrank: stop,
+    allowCleanup() {
+      if (cleanups > 0) {
+        return { budget: cleanups };
+      } else {
+        return false;
+      }
+    },
+    didCleanup(spent) {
+      cleanups -= spent.cleanups;
+    },
+  });
+  return policy;
+}
+```
+
+On each block, the host should only perform the second (cleanup) run if the first policy reports that the block was empty:
+
+```js
+async function doBlock() {
+  const [ firstPolicy, wasEmpty ] = makeDeliveryOnlyPolicy();
+  await controller.run(firstPolicy);
+  if (wasEmpty()) {
+    const secondPolicy = makeCleanupOnlyPolicy();
+    await controller.run(secondPolicy);
+  }
+}
+```
+
+Note that regardless of whatever computron/delivery budget is imposed by the first policy, the second policy will allow one additional delivery to be made (we do not yet have an `allowDelivery()` pre-check method that might inhibit this). The cleanup work, which may or may not happen, will sometimes trigger a GC delivery like `dispatch.dropExports`, but at most one such delivery will be made before the second policy returns `false` and stops `controller.run()`. If cleanup does not trigger such a delivery, or if no cleanup work needs to be done, then one normal run-queue delivery will be performed before the policy has a chance to say "stop". All other cleanup-triggered GC work will be deferred until the first run of the next block.
+
+Also note that `budget` and `cleanups` are plain `Number`s, whereas `comptrons` is a `BigInt`.
+
 
 ## Non-Consensus Wallclock Limits
 
diff --git a/packages/SwingSet/src/controller/upgradeSwingset.js b/packages/SwingSet/src/controller/upgradeSwingset.js
index 5de665342b7..87195b8cfae 100644
--- a/packages/SwingSet/src/controller/upgradeSwingset.js
+++ b/packages/SwingSet/src/controller/upgradeSwingset.js
@@ -196,6 +196,14 @@ export const upgradeSwingset = kernelStorage => {
     version = 1;
   }
 
+  if (version < 2) {
+    // schema v2: add vats.terminated = []
+    assert(!kvStore.has('vats.terminated'));
+    kvStore.set('vats.terminated', JSON.stringify([]));
+    modified = true;
+    version = 2;
+  }
+
   if (modified) {
     kvStore.set('version', `${version}`);
   }
diff --git a/packages/SwingSet/src/kernel/kernel.js b/packages/SwingSet/src/kernel/kernel.js
index 032847ada77..060e6675b03 100644
--- a/packages/SwingSet/src/kernel/kernel.js
+++ b/packages/SwingSet/src/kernel/kernel.js
@@ -2,6 +2,7 @@
 
 import { assert, Fail } from '@endo/errors';
 import { isNat } from '@endo/nat';
+import { mustMatch, M } from '@endo/patterns';
 import { importBundle } from '@endo/import-bundle';
 import { makeUpgradeDisconnection } from '@agoric/internal/src/upgrade-api.js';
 import { kser, kslot, makeError } from '@agoric/kmarshal';
@@ -275,12 +276,16 @@ export default function buildKernel(
       // (#9157). The fix will add .critical to CrankResults, populated by a
       // getOptions query in deliveryCrankResults() or copied from
       // dynamicOptions in processCreateVat.
-      critical = kernelKeeper.provideVatKeeper(vatID).getOptions().critical;
+      const vatKeeper = kernelKeeper.provideVatKeeper(vatID);
+      critical = vatKeeper.getOptions().critical;
 
       // Reject all promises decided by the vat, making sure to capture the list
       // of kpids before that data is deleted.
       const deadPromises = [...kernelKeeper.enumeratePromisesByDecider(vatID)];
-      kernelKeeper.cleanupAfterTerminatedVat(vatID);
+      // remove vatID from the list of live vats, and mark for deletion
+      kernelKeeper.deleteVatID(vatID);
+      kernelKeeper.markVatAsTerminated(vatID);
+      kernelKeeper.removeVatFromSwingStoreExports(vatID);
       for (const kpid of deadPromises) {
         resolveToError(kpid, makeError('vat terminated'), vatID);
       }
@@ -387,7 +392,8 @@ export default function buildKernel(
    *    abort?: boolean, // changes should be discarded, not committed
    *    consumeMessage?: boolean, // discard the aborted delivery
    *    didDelivery?: VatID, // we made a delivery to a vat, for run policy and save-snapshot
-   *    computrons?: BigInt, // computron count for run policy
+   *    computrons?: bigint, // computron count for run policy
+   *    cleanups?: number, // cleanup budget spent
    *    meterID?: string, // deduct those computrons from a meter
    *    measureDirt?: { vatID: VatID, dirt: Dirt }, // dirt counters should increment
    *    terminate?: { vatID: VatID, reject: boolean, info: SwingSetCapData }, // terminate vat, notify vat-admin
@@ -651,16 +657,40 @@ export default function buildKernel(
     if (!vatWarehouse.lookup(vatID)) {
       return NO_DELIVERY_CRANK_RESULTS; // can't collect from the dead
     }
-    const vatKeeper = kernelKeeper.provideVatKeeper(vatID);
     /** @type { KernelDeliveryBringOutYourDead } */
     const kd = harden([type]);
     const vd = vatWarehouse.kernelDeliveryToVatDelivery(vatID, kd);
     const status = await deliverAndLogToVat(vatID, kd, vd);
-    vatKeeper.clearReapDirt(); // BOYD zeros out the when-to-BOYD counters
     // no gcKrefs, BOYD clears them anyways
     return deliveryCrankResults(vatID, status, false); // no meter, BOYD clears dirt
   }
 
+  /**
+   * Perform a small (budget-limited) amount of dead-vat cleanup work.
+   *
+   * @param {RunQueueEventCleanupTerminatedVat} message
+   *     'message' is the run-queue cleanup action, which includes a vatID and budget.
+   *     A budget of 'undefined' allows unlimited work. Otherwise, the budget is a Number,
+   *     and cleanup should not touch more than maybe 5*budget DB rows.
+   * @returns {Promise<CrankResults>}
+   */
+  async function processCleanupTerminatedVat(message) {
+    const { vatID, budget } = message;
+    const { done, cleanups } = kernelKeeper.cleanupAfterTerminatedVat(
+      vatID,
+      budget,
+    );
+    if (done) {
+      kernelKeeper.forgetTerminatedVat(vatID);
+      kernelSlog.write({ type: 'vat-cleanup-complete', vatID });
+    }
+    // We don't perform any deliveries here, so there are no computrons to
+    // report, but we do tell the runPolicy know how much kernel-side DB
+    // work we did, so it can decide how much was too much.
+    const computrons = 0n;
+    return harden({ computrons, cleanups });
+  }
+
   /**
    * The 'startVat' event is queued by `initializeKernel` for all static vats,
    * so that we execute their bundle imports and call their `buildRootObject`
@@ -916,7 +946,6 @@ export default function buildKernel(
     const boydVD = vatWarehouse.kernelDeliveryToVatDelivery(vatID, boydKD);
     const boydStatus = await deliverAndLogToVat(vatID, boydKD, boydVD);
     const boydResults = deliveryCrankResults(vatID, boydStatus, false);
-    vatKeeper.clearReapDirt();
 
     // we don't meter bringOutYourDead since no user code is running, but we
     // still report computrons to the runPolicy
@@ -1172,6 +1201,7 @@ export default function buildKernel(
    * @typedef { import('../types-internal.js').RunQueueEventRetireImports } RunQueueEventRetireImports
    * @typedef { import('../types-internal.js').RunQueueEventNegatedGCAction } RunQueueEventNegatedGCAction
    * @typedef { import('../types-internal.js').RunQueueEventBringOutYourDead } RunQueueEventBringOutYourDead
+   * @typedef { import('../types-internal.js').RunQueueEventCleanupTerminatedVat } RunQueueEventCleanupTerminatedVat
    * @typedef { import('../types-internal.js').RunQueueEvent } RunQueueEvent
    */
 
@@ -1239,6 +1269,8 @@ export default function buildKernel(
     } else if (message.type === 'negated-gc-action') {
       // processGCActionSet pruned some negated actions, but had no GC
       // action to perform. Record the DB changes in their own crank.
+    } else if (message.type === 'cleanup-terminated-vat') {
+      deliverP = processCleanupTerminatedVat(message);
     } else if (gcMessages.includes(message.type)) {
       deliverP = processGCMessage(message);
     } else {
@@ -1289,18 +1321,13 @@ export default function buildKernel(
     const crankResults = await deliverRunQueueEvent(message);
     // { abort/commit, deduct, terminate+notify, consumeMessage }
 
-    if (crankResults.didDelivery) {
-      if (message.type === 'create-vat') {
-        // TODO: create-vat now gets metering, at least for the
-        // dispatch.startVat . We should probably tell the policy about
-        // the creation too since there's extra overhead (we're
-        // launching a new child process, at least, although that
-        // sometimes happens randomly because of vat eviction policy
-        // which should not affect the in-consensus policyInput)
-        policyInput = ['create-vat', {}];
-      } else {
-        policyInput = ['crank', {}];
-      }
+    if (message.type === 'cleanup-terminated-vat') {
+      const { cleanups } = crankResults;
+      assert(cleanups !== undefined);
+      policyInput = ['cleanup', { cleanups }];
+    } else if (crankResults.didDelivery) {
+      const tag = message.type === 'create-vat' ? 'create-vat' : 'crank';
+      policyInput = [tag, {}];
     }
 
     // Deliveries cause syscalls, syscalls might cause errors
@@ -1758,16 +1785,24 @@ export default function buildKernel(
     }
   }
 
+  const allowCleanupShape = M.or(
+    // 'false' will prohibit cleanup
+    false,
+    // otherwise allow cleanup, optionally with a limiting budget
+    M.splitRecord({}, { budget: M.number() }, M.record()),
+  );
+
   /**
    * Pulls the next message from the highest-priority queue and returns it
    * along with a corresponding processor.
    *
+   * @param {RunPolicy} [policy] - a RunPolicy to limit the work being done
    * @returns {{
    *   message: RunQueueEvent | undefined,
    *   processor: (message: RunQueueEvent) => Promise<PolicyInput>,
    * }}
    */
-  function getNextMessageAndProcessor() {
+  function getNextMessageAndProcessor(policy) {
     const acceptanceMessage = kernelKeeper.getNextAcceptanceQueueMsg();
     if (acceptanceMessage) {
       return {
@@ -1775,7 +1810,12 @@ export default function buildKernel(
         processor: processAcceptanceMessage,
       };
     }
+    // Absent specific configuration, allow unlimited cleanup.
+    const allowCleanup = policy?.allowCleanup?.() ?? {};
+    mustMatch(harden(allowCleanup), allowCleanupShape);
+
     const message =
+      kernelKeeper.nextCleanupTerminatedVatAction(allowCleanup) ||
       processGCActionSet(kernelKeeper) ||
       kernelKeeper.nextReapAction() ||
       kernelKeeper.getNextRunQueueMsg();
@@ -1885,7 +1925,7 @@ export default function buildKernel(
       kernelKeeper.startCrank();
       try {
         kernelKeeper.establishCrankSavepoint('start');
-        const { processor, message } = getNextMessageAndProcessor();
+        const { processor, message } = getNextMessageAndProcessor(policy);
         if (!message) {
           break;
         }
@@ -1907,6 +1947,13 @@ export default function buildKernel(
           case 'crank-failed':
             policyOutput = policy.crankFailed(policyInput[1]);
             break;
+          case 'cleanup': {
+            // Give the policy a chance to interrupt kernel execution,
+            // but default to continuing.
+            const { didCleanup } = policy;
+            policyOutput = didCleanup ? didCleanup(policyInput[1]) : true;
+            break;
+          }
           case 'none':
             policyOutput = policy.emptyCrank();
             break;
diff --git a/packages/SwingSet/src/kernel/state/kernelKeeper.js b/packages/SwingSet/src/kernel/state/kernelKeeper.js
index d1e149c44be..c6d84d97d27 100644
--- a/packages/SwingSet/src/kernel/state/kernelKeeper.js
+++ b/packages/SwingSet/src/kernel/state/kernelKeeper.js
@@ -1,6 +1,7 @@
 /* eslint-disable no-use-before-define */
 import { Nat, isNat } from '@endo/nat';
 import { assert, Fail } from '@endo/errors';
+import { objectMetaMap } from '@agoric/internal';
 import {
   initializeVatState,
   makeVatKeeper,
@@ -50,7 +51,7 @@ const enableKernelGC = true;
 export { DEFAULT_REAP_DIRT_THRESHOLD_KEY };
 
 // most recent DB schema version
-export const CURRENT_SCHEMA_VERSION = 1;
+export const CURRENT_SCHEMA_VERSION = 2;
 
 // Kernel state lives in a key-value store supporting key retrieval by
 // lexicographic range. All keys and values are strings.
@@ -69,14 +70,15 @@ export const CURRENT_SCHEMA_VERSION = 1;
 // only modified by a call to upgradeSwingset(). See below for
 // deltas/upgrades from one version to the next.
 //
-// The current ("v1") schema keys/values are:
+// The current ("v2") schema keys/values are:
 //
-// version = '1'
+// version = '2'
 // vat.names = JSON([names..])
 // vat.dynamicIDs = JSON([vatIDs..])
 // vat.name.$NAME = $vatID = v$NN
 // vat.nextID = $NN
 // vat.nextUpgradeID = $NN
+// vats.terminated = JSON([vatIDs..])
 // device.names = JSON([names..])
 // device.name.$NAME = $deviceID = d$NN
 // device.nextID = $NN
@@ -172,6 +174,9 @@ export const CURRENT_SCHEMA_VERSION = 1;
 //   * replace `kernel.defaultReapInterval` with `kernel.defaultReapDirtThreshold`
 //   * replace vat's `vNN.reapInterval`/`vNN.reapCountdown` with `vNN.reapDirt`
 //             and a `vNN.reapDirtThreshold` in `vNN.options`
+// v2:
+//   * change `version` to `'2'`
+//   * add `vats.terminated` with `[]` as initial value
 
 export function commaSplit(s) {
   if (s === '') {
@@ -180,6 +185,11 @@ export function commaSplit(s) {
   return s.split(',');
 }
 
+export function stripPrefix(prefix, str) {
+  assert(str.startsWith(prefix), str);
+  return str.slice(prefix.length);
+}
+
 function insistMeterID(m) {
   assert.typeof(m, 'string');
   assert.equal(m[0], 'm');
@@ -248,6 +258,12 @@ export default function makeKernelKeeper(
 
   insistStorageAPI(kvStore);
 
+  // the terminated-vats cache is normally populated from
+  // 'vats.terminated', but for initialization purposes we need give
+  // it a value here, and then populate it for real if we're dealing
+  // with an up-to-date DB
+  let terminatedVats = [];
+
   const versionString = kvStore.get('version');
   const version = Number(versionString || '0');
   if (expectedVersion === 'uninitialized') {
@@ -261,6 +277,9 @@ export default function makeKernelKeeper(
     throw Error(
       `kernel DB is too old: has version v${version}, but expected v${expectedVersion}`,
     );
+  } else {
+    // DB is up-to-date, so populate any caches we use
+    terminatedVats = JSON.parse(getRequired('vats.terminated'));
   }
 
   /**
@@ -399,6 +418,7 @@ export default function makeKernelKeeper(
     kvStore.set('vat.dynamicIDs', '[]');
     kvStore.set('vat.nextID', `${FIRST_VAT_ID}`);
     kvStore.set('vat.nextUpgradeID', `1`);
+    kvStore.set('vats.terminated', '[]');
     kvStore.set('device.names', '[]');
     kvStore.set('device.nextID', `${FIRST_DEVICE_ID}`);
     kvStore.set('ko.nextID', `${FIRST_OBJECT_ID}`);
@@ -679,8 +699,12 @@ export default function makeKernelKeeper(
   function ownerOfKernelObject(kernelSlot) {
     insistKernelType('object', kernelSlot);
     const owner = kvStore.get(`${kernelSlot}.owner`);
-    if (owner) {
-      insistVatID(owner);
+    if (!owner) {
+      return undefined;
+    }
+    insistVatID(owner);
+    if (terminatedVats.includes(owner)) {
+      return undefined;
     }
     return owner;
   }
@@ -885,13 +909,70 @@ export default function makeKernelKeeper(
     kvStore.set(`${kernelSlot}.data.slots`, capdata.slots.join(','));
   }
 
-  function cleanupAfterTerminatedVat(vatID) {
+  function removeVatFromSwingStoreExports(vatID) {
+    // Delete primary swingstore records for this vat, in preparation
+    // for (slow) deletion. After this, swingstore exports will omit
+    // this vat. This is called from the kernel's terminateVat, which
+    // initiates (but does not complete) deletion.
+    snapStore.stopUsingLastSnapshot(vatID);
+    transcriptStore.stopUsingTranscript(vatID);
+  }
+
+  /**
+   * Perform some cleanup work for a specific (terminated but not
+   * fully-deleted) vat, possibly limited by a budget. Returns 'done'
+   * (where false means "please call me again", and true means "you
+   * can delete the vatID now"), and a count of how much work was done
+   * (so the runPolicy can decide when to stop).
+   *
+   * @param {string} vatID
+   * @param {number} [budget]
+   * @returns {{ done: boolean, cleanups: number }}
+   *
+   */
+  function cleanupAfterTerminatedVat(vatID, budget = undefined) {
     insistVatID(vatID);
-    const vatKeeper = provideVatKeeper(vatID);
-    const exportPrefix = `${vatID}.c.o+`;
-    const importPrefix = `${vatID}.c.o-`;
+    let cleanups = 0;
+    const work = {
+      exports: 0,
+      imports: 0,
+      kv: 0,
+      snapshots: 0,
+      transcripts: 0,
+    };
+    let spend = _did => false; // returns "stop now"
+    if (budget !== undefined) {
+      assert.typeof(budget, 'number');
+      spend = (did = 1) => {
+        assert(budget !== undefined); // hush TSC
+        cleanups += did;
+        budget -= did;
+        return budget <= 0;
+      };
+    }
+    const logWork = () => {
+      const w = objectMetaMap(work, desc => (desc.value ? desc : undefined));
+      kernelSlog?.write({ type: 'vat-cleanup', vatID, work: w });
+    };
+
+    // TODO: it would be slightly cheaper to walk all kvStore keys in
+    // order, and act on each one according to its category (c-list
+    // export, c-list import, vatstore, other), so we use a single
+    // enumeratePrefixedKeys() call each time. Until we do that, the
+    // last phase of the cleanup (where we've deleted all the exports
+    // and imports, and are working on the remaining keys) will waste
+    // two DB queries on each call. OTOH, those queries will probably
+    // hit the same SQLite index page as the successful one, so it
+    // probably won't cause any extra disk IO. So we can defer this
+    // optimization for a while. Note: when we implement it, be
+    // prepared to encounter the clist entries in eiher order (kref
+    // first or vref first), and delete the other one in the same
+    // call, so we don't wind up with half an entry.
 
-    vatKeeper.deleteSnapshotsAndTranscript();
+    const vatKeeper = provideVatKeeper(vatID);
+    const clistPrefix = `${vatID}.c.`;
+    const exportPrefix = `${clistPrefix}o+`;
+    const importPrefix = `${clistPrefix}o-`;
 
     // Note: ASCII order is "+,-./", and we rely upon this to split the
     // keyspace into the various o+NN/o-NN/etc spaces. If we were using a
@@ -915,8 +996,27 @@ export default function makeKernelKeeper(
       // begin with `vMM.c.o+`.  In addition to deleting the c-list entry, we
       // must also delete the corresponding kernel owner entry for the object,
       // since the object will no longer be accessible.
+      const vref = stripPrefix(clistPrefix, k);
+      assert(vref.startsWith('o+'), vref);
       const kref = kvStore.get(k);
-      orphanKernelObject(kref, vatID);
+      // we must delete the c-list entry, but we don't need to
+      // manipulate refcounts like the way vatKeeper/deleteCListEntry
+      // does, so just delete the keys directly
+      // vatKeeper.deleteCListEntry(kref, vref);
+      kvStore.delete(`${vatID}.c.${kref}`);
+      kvStore.delete(`${vatID}.c.${vref}`);
+      // if this object became unreferenced, processRefcounts will
+      // delete it, so don't be surprised. TODO: this results in an
+      // extra get() for each delete(), see if there's a way to avoid
+      // this. TODO: think carefully about other such races.
+      if (kvStore.has(`${kref}.owner`)) {
+        orphanKernelObject(kref, vatID);
+      }
+      work.exports += 1;
+      if (spend()) {
+        logWork();
+        return { done: false, cleanups };
+      }
     }
 
     // then scan for imported objects, which must be decrefed
@@ -924,9 +1024,14 @@ export default function makeKernelKeeper(
       // abandoned imports: delete the clist entry as if the vat did a
       // drop+retire
       const kref = kvStore.get(k) || Fail`getNextKey ensures get`;
-      const vref = k.slice(`${vatID}.c.`.length);
+      const vref = stripPrefix(clistPrefix, k);
       vatKeeper.deleteCListEntry(kref, vref);
       // that will also delete both db keys
+      work.imports += 1;
+      if (spend()) {
+        logWork();
+        return { done: false, cleanups };
+      }
     }
 
     // the caller used enumeratePromisesByDecider() before calling us,
@@ -935,8 +1040,31 @@ export default function makeKernelKeeper(
     // now loop back through everything and delete it all
     for (const k of enumeratePrefixedKeys(kvStore, `${vatID}.`)) {
       kvStore.delete(k);
+      work.kv += 1;
+      if (spend()) {
+        logWork();
+        return { done: false, cleanups };
+      }
+    }
+
+    // this will internally loop through 'budget' deletions
+    const dsc = vatKeeper.deleteSnapshots(budget);
+    work.snapshots += dsc.cleanups;
+    if (spend(dsc.cleanups)) {
+      logWork();
+      return { done: false, cleanups };
     }
 
+    // same
+    const dts = vatKeeper.deleteTranscripts(budget);
+    work.transcripts += dts.cleanups;
+    // last task, so increment cleanups, but dc.done is authoritative
+    spend(dts.cleanups);
+    logWork();
+    return { done: dts.done, cleanups };
+  }
+
+  function deleteVatID(vatID) {
     // TODO: deleting entries from the dynamic vat IDs list requires a linear
     // scan of the list; arguably this collection ought to be represented in a
     // different way that makes it efficient to remove an entry from it, though
@@ -1221,6 +1349,41 @@ export default function makeKernelKeeper(
     return makeUpgradeID(nextID);
   }
 
+  function markVatAsTerminated(vatID) {
+    if (!terminatedVats.includes(vatID)) {
+      terminatedVats.push(vatID);
+      kvStore.set(`vats.terminated`, JSON.stringify(terminatedVats));
+    }
+  }
+
+  function getFirstTerminatedVat() {
+    if (terminatedVats.length) {
+      return terminatedVats[0];
+    }
+    return undefined;
+  }
+
+  function forgetTerminatedVat(vatID) {
+    terminatedVats = terminatedVats.filter(id => id !== vatID);
+    kvStore.set(`vats.terminated`, JSON.stringify(terminatedVats));
+  }
+
+  function nextCleanupTerminatedVatAction(allowCleanup) {
+    if (!allowCleanup) {
+      return undefined;
+    }
+    // budget === undefined means "unlimited"
+    const { budget } = allowCleanup;
+    // if (getGCActions().size) {
+    //  return undefined;
+    // }
+    const vatID = getFirstTerminatedVat();
+    if (vatID) {
+      return { type: 'cleanup-terminated-vat', vatID, budget };
+    }
+    return undefined;
+  }
+
   // As refcounts are decremented, we accumulate a set of krefs for which
   // action might need to be taken:
   //   * promises which are now resolved and unreferenced can be deleted
@@ -1413,7 +1576,7 @@ export default function makeKernelKeeper(
 
   function vatIsAlive(vatID) {
     insistVatID(vatID);
-    return kvStore.has(`${vatID}.o.nextID`);
+    return kvStore.has(`${vatID}.o.nextID`) && !terminatedVats.includes(vatID);
   }
 
   /**
@@ -1705,11 +1868,18 @@ export default function makeKernelKeeper(
     provideVatKeeper,
     vatIsAlive,
     evictVatKeeper,
+    removeVatFromSwingStoreExports,
     cleanupAfterTerminatedVat,
     addDynamicVatID,
     getDynamicVats,
     getStaticVats,
     getDevices,
+    deleteVatID,
+
+    markVatAsTerminated,
+    getFirstTerminatedVat,
+    forgetTerminatedVat,
+    nextCleanupTerminatedVatAction,
 
     allocateUpgradeID,
 
diff --git a/packages/SwingSet/src/kernel/state/vatKeeper.js b/packages/SwingSet/src/kernel/state/vatKeeper.js
index 2ebe31dc384..f4ce7b8a7ce 100644
--- a/packages/SwingSet/src/kernel/state/vatKeeper.js
+++ b/packages/SwingSet/src/kernel/state/vatKeeper.js
@@ -675,11 +675,44 @@ export function makeVatKeeper(
     });
   }
 
-  function deleteSnapshotsAndTranscript() {
-    if (snapStore) {
-      snapStore.deleteVatSnapshots(vatID);
+  /**
+   * Perform some (possibly-limited) cleanup work for a vat. Returns
+   * 'done' (where false means "please call me again", and true means
+   * "you can delete the vatID now"), and a count of how much work was
+   * done (so the runPolicy can decide when to stop).
+   *
+   * @param {number} [budget]
+   * @returns {{ done: boolean, cleanups: number }}
+   *
+   */
+  function deleteSnapshots(budget = undefined) {
+    // Each budget=1 allows us to delete one snapshot entry.
+    if (!snapStore) {
+      return { done: true, cleanups: 0 };
     }
-    transcriptStore.deleteVatTranscripts(vatID);
+    // initially uses 2+2*budget DB statements, then just 1 when done
+    return snapStore.deleteVatSnapshots(vatID, budget);
+  }
+
+  /**
+   * Perform some (possibly-limited) cleanup work for a vat. Returns
+   * 'done' (where false means "please call me again", and true means
+   * "you can delete the vatID now"), and a count of how much work was
+   * done (so the runPolicy can decide when to stop).
+   *
+   * @param {number} [budget]
+   * @returns {{ done: boolean, cleanups: number }}
+   *
+   */
+  function deleteTranscripts(budget = undefined) {
+    // Each budget=1 allows us to delete one transcript span and any
+    // transcript items associated with that span. Some nodes will
+    // have historical transcript items, some will not. Using budget=5
+    // and snapshotInterval=200 means we delete 5 span records and
+    // maybe 1000 span items.
+
+    // initially uses 2+3*budget DB statements, then just 1 when done
+    return transcriptStore.deleteVatTranscripts(vatID, budget);
   }
 
   function beginNewIncarnation() {
@@ -761,7 +794,8 @@ export function makeVatKeeper(
     dumpState,
     saveSnapshot,
     getSnapshotInfo,
-    deleteSnapshotsAndTranscript,
+    deleteSnapshots,
+    deleteTranscripts,
     beginNewIncarnation,
   });
 }
diff --git a/packages/SwingSet/src/kernel/vat-warehouse.js b/packages/SwingSet/src/kernel/vat-warehouse.js
index 9dc8903c713..2d4a167c398 100644
--- a/packages/SwingSet/src/kernel/vat-warehouse.js
+++ b/packages/SwingSet/src/kernel/vat-warehouse.js
@@ -544,6 +544,10 @@ export function makeVatWarehouse({
       vatKeeper.addToTranscript(getTranscriptEntry(vd, deliveryResult));
     }
 
+    if (kd[0] === 'bringOutYourDead') {
+      vatKeeper.clearReapDirt(); // BOYD zeros out the when-to-BOYD counters
+    }
+
     // TODO: if per-vat policy decides it wants a BOYD or heap snapshot,
     // now is the time to do it, or to ask the kernel to schedule it
 
diff --git a/packages/SwingSet/src/lib/runPolicies.js b/packages/SwingSet/src/lib/runPolicies.js
index 2b54eace7af..8adf8ef9c18 100644
--- a/packages/SwingSet/src/lib/runPolicies.js
+++ b/packages/SwingSet/src/lib/runPolicies.js
@@ -3,6 +3,9 @@ import { assert } from '@endo/errors';
 export function foreverPolicy() {
   /** @type { RunPolicy } */
   return harden({
+    allowCleanup() {
+      return {}; // unlimited budget
+    },
     vatCreated(_details) {
       return true;
     },
@@ -27,6 +30,9 @@ export function crankCounter(
   let vats = 0;
   /** @type { RunPolicy } */
   const policy = harden({
+    allowCleanup() {
+      return { budget: 100 }; // limited budget
+    },
     vatCreated() {
       vats += 1;
       return vats < maxCreateVats;
@@ -47,13 +53,21 @@ export function crankCounter(
   return policy;
 }
 
-export function computronCounter(limit) {
+export function computronCounter(limit, options = {}) {
   assert.typeof(limit, 'bigint');
+  const {
+    cleanupBudget = 100,
+    vatCreatedComputrons = 100_000n, // pretend that's the cost
+    crankFailedComputrons = 1_000_000n,
+  } = options;
   let total = 0n;
   /** @type { RunPolicy } */
   const policy = harden({
+    allowCleanup() {
+      return { budget: cleanupBudget }; // limited budget
+    },
     vatCreated() {
-      total += 100000n; // pretend vat creation takes 100k computrons
+      total += vatCreatedComputrons;
       return total < limit;
     },
     crankComplete(details = {}) {
@@ -65,7 +79,7 @@ export function computronCounter(limit) {
       return total < limit;
     },
     crankFailed() {
-      total += 1000000n; // who knows, 1M is as good as anything
+      total += crankFailedComputrons;
       return total < limit;
     },
     emptyCrank() {
@@ -79,6 +93,7 @@ export function wallClockWaiter(seconds) {
   const timeout = Date.now() + 1000 * seconds;
   /** @type { RunPolicy } */
   const policy = harden({
+    allowCleanup: () => ({}), // unlimited budget
     vatCreated: () => Date.now() < timeout,
     crankComplete: () => Date.now() < timeout,
     crankFailed: () => Date.now() < timeout,
@@ -86,3 +101,34 @@ export function wallClockWaiter(seconds) {
   });
   return policy;
 }
+
+export function noCleanup() {
+  /** @type { RunPolicy } */
+  const policy = harden({
+    allowCleanup: () => false,
+    vatCreated: () => true,
+    crankComplete: () => true,
+    crankFailed: () => true,
+    emptyCrank: () => true,
+  });
+  return policy;
+}
+
+export function someCleanup(budget) {
+  let once = true;
+  /** @type { RunPolicy } */
+  const policy = harden({
+    allowCleanup: () => {
+      if (once) {
+        once = false;
+        return { budget };
+      }
+      return false;
+    },
+    vatCreated: () => true,
+    crankComplete: () => true,
+    crankFailed: () => true,
+    emptyCrank: () => true,
+  });
+  return policy;
+}
diff --git a/packages/SwingSet/src/types-external.js b/packages/SwingSet/src/types-external.js
index 1f1805b2884..364a1d6041f 100644
--- a/packages/SwingSet/src/types-external.js
+++ b/packages/SwingSet/src/types-external.js
@@ -224,12 +224,17 @@ export {};
  * @typedef { [tag: 'create-vat', details: PolicyInputDetails  ]} PolicyInputCreateVat
  * @typedef { [tag: 'crank', details: PolicyInputDetails ] } PolicyInputCrankComplete
  * @typedef { [tag: 'crank-failed', details: PolicyInputDetails ]} PolicyInputCrankFailed
- * @typedef { PolicyInputNone | PolicyInputCreateVat | PolicyInputCrankComplete | PolicyInputCrankFailed } PolicyInput
+ * @typedef { [tag: 'cleanup', details: { cleanups: number, computrons?: number }] } PolicyInputCleanup
+ * @typedef { PolicyInputNone | PolicyInputCreateVat | PolicyInputCrankComplete |
+ *            PolicyInputCrankFailed | PolicyInputCleanup } PolicyInput
  * @typedef { boolean } PolicyOutput
- * @typedef { { vatCreated: (details: {}) => PolicyOutput,
+ * @typedef { {
+ *              allowCleanup?: () => false | { budget?: number },
+ *              vatCreated: (details: {}) => PolicyOutput,
  *              crankComplete: (details: { computrons?: bigint }) => PolicyOutput,
  *              crankFailed: (details: {}) => PolicyOutput,
  *              emptyCrank: () => PolicyOutput,
+ *              didCleanup?: (details: { cleanups: number }) => PolicyOutput,
  *             } } RunPolicy
  *
  * @typedef {object} VatWarehousePolicy
diff --git a/packages/SwingSet/src/types-internal.js b/packages/SwingSet/src/types-internal.js
index 102262e7db7..7665a8b2c7c 100644
--- a/packages/SwingSet/src/types-internal.js
+++ b/packages/SwingSet/src/types-internal.js
@@ -143,10 +143,12 @@ export {};
  * @typedef { { type: 'retireImports', vatID: VatID, krefs: string[] } } RunQueueEventRetireImports
  * @typedef { { type: 'negated-gc-action', vatID?: VatID } } RunQueueEventNegatedGCAction
  * @typedef { { type: 'bringOutYourDead', vatID: VatID } } RunQueueEventBringOutYourDead
+ * @typedef { { type: 'cleanup-terminated-vat', vatID: VatID,
+ *              budget: number | undefined } } RunQueueEventCleanupTerminatedVat
  * @typedef { RunQueueEventNotify | RunQueueEventSend | RunQueueEventCreateVat |
  *            RunQueueEventUpgradeVat | RunQueueEventChangeVatOptions | RunQueueEventStartVat |
  *            RunQueueEventDropExports | RunQueueEventRetireExports | RunQueueEventRetireImports |
- *            RunQueueEventNegatedGCAction | RunQueueEventBringOutYourDead
+ *            RunQueueEventNegatedGCAction | RunQueueEventBringOutYourDead | RunQueueEventCleanupTerminatedVat
  *          } RunQueueEvent
  */
 
diff --git a/packages/SwingSet/test/snapshots/state.test.js.md b/packages/SwingSet/test/snapshots/state.test.js.md
index cdaf5c03bdc..4c02a66077d 100644
--- a/packages/SwingSet/test/snapshots/state.test.js.md
+++ b/packages/SwingSet/test/snapshots/state.test.js.md
@@ -8,8 +8,8 @@ Generated by [AVA](https://avajs.dev).
 
 > initial state
 
-    '99068f7796b6004cfad57c5079365643b0306f525e529ef83f3b80b424517bff'
+    '7b16bffd29f6a2d11bae7b536ef4c230af8cadc29284928b6cc2f7338507a987'
 
 > expected activityhash
 
-    '62b78f65e37e063b3ac5982eff14077e178970080a8338a5bf5476570f168a4d'
+    '7dbf5a49d4e2b999c431730fcd4927c01c713eaa54fe273626e4201853e38d3b'
diff --git a/packages/SwingSet/test/snapshots/state.test.js.snap b/packages/SwingSet/test/snapshots/state.test.js.snap
index 55ffcea9b2da4965aa9010d7dba4d08f10f0a6ca..0efcf0fac96c6f997cd9489988b1d3ad22ebcee5 100644
GIT binary patch
literal 277
zcmV+w0qXuiRzV<BZeeh9Xm4~Nb~6eC00+yO^Y+P)Jl>rG!LLP|a@GQV5KUg!e>9Iu
z@u^B-_#cZ100000000AZP`ge8F%Y~6Mb!MkbySgTub&-XKux>eUF)n|?jmeZDB(}Y
zmvRvm1tU!}#k{^to>w}y&#Uy>XSqDmD=x<g+cX}=y3}5A9$UvQBQ885^CqoJt8Kw^
zo7UdGuFTl+<@s*7A8y#~hlkx$$lB*p*rejou+|wRWCvb~B?AgeBxEwlM7uG2B2xlD
zbShwqL1xS{pZ-+%Cy8@$XcJp8Ii*AvbO2R|H{(D`Q_vuY&K5BNyzyd<($RrHWXQis
bZ*n;cb=fvjukW>g+)nrba;NjiAOQdXeZ+k^

literal 279
zcmV+y0qFigRzV<BZeeh9Xm4~Nb~6eC0BWqM@`7<j^C9>IBLdcu>|1v0D|odPLI`>P
zaMD(pPalg200000000ARkTFgJF%U&JLWr6hY)6GOyY_hOjuTMRj_nz(6=yfX289xC
zLQYB|DkOgSoA>_z)>yZ+Z=YxBrB^vUq*q?|3%6x_9LrjI;X1a?T}EE>$T?rCZEdBk
zd~D0s+t)c~?)>t6Gu#c=>~4qq-BSqQyb=W$9e_>B87U!9Fb<tH5ezuR83pGc>W!M{
zA==sEL{U{LN2!ND3jdSb9LcLgF(lwjG@LN_SyU$rL^6>N1mJ<aF`h9hT5?FBlk;rz
dFX>HAM@d~S8&faurGH#c_yI@Wlo22S000#?cz^%^

diff --git a/packages/SwingSet/test/state.test.js b/packages/SwingSet/test/state.test.js
index b9e8e21901a..c60a01aa79c 100644
--- a/packages/SwingSet/test/state.test.js
+++ b/packages/SwingSet/test/state.test.js
@@ -183,7 +183,7 @@ test('kernel state', async t => {
   k.emitCrankHashes();
 
   checkState(t, store.dump, [
-    ['version', '1'],
+    ['version', '2'],
     ['crankNumber', '0'],
     ['gcActions', '[]'],
     ['runQueue', '[1,1]'],
@@ -206,6 +206,7 @@ test('kernel state', async t => {
     ['kernel.snapshotInitial', '3'],
     ['kernel.snapshotInterval', '200'],
     ['meter.nextID', '1'],
+    ['vats.terminated', '[]'],
   ]);
 });
 
@@ -222,7 +223,7 @@ test('kernelKeeper vat names', async t => {
 
   k.emitCrankHashes();
   checkState(t, store.dump, [
-    ['version', '1'],
+    ['version', '2'],
     ['crankNumber', '0'],
     ['gcActions', '[]'],
     ['runQueue', '[1,1]'],
@@ -247,6 +248,7 @@ test('kernelKeeper vat names', async t => {
     ['kernel.snapshotInitial', '3'],
     ['kernel.snapshotInterval', '200'],
     ['meter.nextID', '1'],
+    ['vats.terminated', '[]'],
   ]);
   t.deepEqual(k.getStaticVats(), [
     ['Frank', 'v2'],
@@ -277,7 +279,7 @@ test('kernelKeeper device names', async t => {
 
   k.emitCrankHashes();
   checkState(t, store.dump, [
-    ['version', '1'],
+    ['version', '2'],
     ['crankNumber', '0'],
     ['gcActions', '[]'],
     ['runQueue', '[1,1]'],
@@ -302,6 +304,7 @@ test('kernelKeeper device names', async t => {
     ['kernel.snapshotInitial', '3'],
     ['kernel.snapshotInterval', '200'],
     ['meter.nextID', '1'],
+    ['vats.terminated', '[]'],
   ]);
   t.deepEqual(k.getDevices(), [
     ['Frank', 'd8'],
@@ -459,7 +462,7 @@ test('kernelKeeper promises', async t => {
   k.emitCrankHashes();
 
   checkState(t, store.dump, [
-    ['version', '1'],
+    ['version', '2'],
     ['crankNumber', '0'],
     ['device.nextID', '7'],
     ['vat.nextID', '1'],
@@ -490,6 +493,7 @@ test('kernelKeeper promises', async t => {
     ['kernel.snapshotInitial', '3'],
     ['kernel.snapshotInterval', '200'],
     ['meter.nextID', '1'],
+    ['vats.terminated', '[]'],
   ]);
 
   k.deleteKernelObject(ko);
@@ -1074,7 +1078,7 @@ test('dirt upgrade', async t => {
   // * v3.reapCountdown: 'never'
   // * v3.reapInterval: 'never'
 
-  t.is(k.kvStore.get('version'), '1');
+  t.is(k.kvStore.get('version'), '2');
   k.kvStore.delete(`kernel.defaultReapDirtThreshold`);
   k.kvStore.set(`kernel.defaultReapInterval`, '1000');
 
@@ -1098,6 +1102,7 @@ test('dirt upgrade', async t => {
 
   k.kvStore.delete(`version`);
   k.kvStore.set('initialized', 'true');
+  k.kvStore.delete(`vats.terminated`);
 
   // kernelKeeper refuses to work with an old state
   t.throws(() => duplicateKeeper(store.serialize));
@@ -1153,3 +1158,34 @@ test('dirt upgrade', async t => {
     never: true,
   });
 });
+
+test('v2 upgrade', async t => {
+  // this should add vats.terminated
+  const store = buildKeeperStorageInMemory();
+  const k = makeKernelKeeper(store, 'uninitialized');
+  k.createStartingKernelState({ defaultManagerType: 'local' });
+  k.setInitialized();
+  k.saveStats();
+
+  // roll back to v1
+  t.is(k.kvStore.get('version'), '2');
+  k.kvStore.delete(`vats.terminated`);
+  k.kvStore.set('version', '1');
+
+  // kernelKeeper refuses to work with an old state
+  t.throws(() => duplicateKeeper(store.serialize));
+
+  // it requires a manual upgrade
+  let k2;
+  {
+    const serialized = store.serialize();
+    const { kernelStorage } = initSwingStore(null, { serialized });
+    upgradeSwingset(kernelStorage);
+    k2 = makeKernelKeeper(kernelStorage, CURRENT_SCHEMA_VERSION); // works this time
+    k2.loadStats();
+  }
+
+  t.true(k2.kvStore.has(`vats.terminated`));
+  t.deepEqual(JSON.parse(k2.kvStore.get(`vats.terminated`)), []);
+  t.is(k2.kvStore.get(`version`), '2');
+});
diff --git a/packages/SwingSet/test/stripPrefix.test.js b/packages/SwingSet/test/stripPrefix.test.js
new file mode 100644
index 00000000000..a34e8ac3cd6
--- /dev/null
+++ b/packages/SwingSet/test/stripPrefix.test.js
@@ -0,0 +1,8 @@
+import { test } from '../tools/prepare-test-env-ava.js';
+import { stripPrefix } from '../src/kernel/state/kernelKeeper.js';
+
+test('stripPrefix', t => {
+  t.is(stripPrefix('prefix', 'prefixed'), 'ed');
+  t.is(stripPrefix('', 'prefixed'), 'prefixed');
+  t.throws(() => stripPrefix('not', 'prefixed'), { message: /prefixed/ });
+});
diff --git a/packages/SwingSet/test/transcript-light.test.js b/packages/SwingSet/test/transcript-light.test.js
index fa56e6a0a58..9049eb57845 100644
--- a/packages/SwingSet/test/transcript-light.test.js
+++ b/packages/SwingSet/test/transcript-light.test.js
@@ -17,7 +17,7 @@ test('transcript-light load', async t => {
   t.teardown(c.shutdown);
   const serialized0 = debug.serialize();
   const kvstate0 = debug.dump().kvEntries;
-  t.is(kvstate0.version, '1');
+  t.is(kvstate0.version, '2');
   t.is(kvstate0.runQueue, '[1,1]');
   t.not(kvstate0.acceptanceQueue, '[]');
 
diff --git a/packages/SwingSet/test/upgrade-swingset.test.js b/packages/SwingSet/test/upgrade-swingset.test.js
index 11558bc7d47..a18b2699dd8 100644
--- a/packages/SwingSet/test/upgrade-swingset.test.js
+++ b/packages/SwingSet/test/upgrade-swingset.test.js
@@ -16,7 +16,7 @@ test.before(async t => {
   t.context.data = { kernelBundles };
 });
 
-test('kernel refuses to run with out-of-date DB', async t => {
+test('kernel refuses to run with out-of-date DB - v0', async t => {
   const { hostStorage, kernelStorage } = initSwingStore();
   const { commit } = hostStorage;
   const { kvStore } = kernelStorage;
@@ -28,7 +28,7 @@ test('kernel refuses to run with out-of-date DB', async t => {
   // kernelkeeper v0 schema, just deleting the version key and adding
   // 'initialized'
 
-  t.is(kvStore.get('version'), '1');
+  t.is(kvStore.get('version'), '2');
   kvStore.delete(`version`);
   kvStore.set('initialized', 'true');
   await commit();
@@ -39,6 +39,29 @@ test('kernel refuses to run with out-of-date DB', async t => {
   });
 });
 
+test('kernel refuses to run with out-of-date DB - v1', async t => {
+  const { hostStorage, kernelStorage } = initSwingStore();
+  const { commit } = hostStorage;
+  const { kvStore } = kernelStorage;
+  const config = {};
+  await initializeSwingset(config, [], kernelStorage, t.context.data);
+  await commit();
+
+  // now doctor the initial state to make it look like the
+  // kernelkeeper v1 schema, by reducing the version key and removing
+  // vats.terminated
+
+  t.is(kvStore.get('version'), '2');
+  kvStore.set(`version`, '1');
+  kvStore.delete('vats.terminated');
+  await commit();
+
+  // Now build a controller around this modified state, which should fail.
+  await t.throwsAsync(() => makeSwingsetController(kernelStorage), {
+    message: /kernel DB is too old/,
+  });
+});
+
 test('upgrade kernel state', async t => {
   const { hostStorage, kernelStorage } = initSwingStore();
   const { commit } = hostStorage;
@@ -73,9 +96,10 @@ test('upgrade kernel state', async t => {
 
   t.true(kvStore.has('kernel.defaultReapDirtThreshold'));
 
-  t.is(kvStore.get('version'), '1');
+  t.is(kvStore.get('version'), '2');
   kvStore.delete('version'); // i.e. revert to v0
   kvStore.set('initialized', 'true');
+  kvStore.delete('vats.terminated');
   kvStore.delete(`kernel.defaultReapDirtThreshold`);
   kvStore.set(`kernel.defaultReapInterval`, '300');
 
@@ -162,9 +186,10 @@ test('upgrade non-reaping kernel state', async t => {
 
   t.true(kvStore.has('kernel.defaultReapDirtThreshold'));
 
-  t.is(kvStore.get('version'), '1');
+  t.is(kvStore.get('version'), '2');
   kvStore.delete('version'); // i.e. revert to v0
   kvStore.set('initialized', 'true');
+  kvStore.delete('vats.terminated');
   kvStore.delete(`kernel.defaultReapDirtThreshold`);
   kvStore.set(`kernel.defaultReapInterval`, 'never');
 
diff --git a/packages/SwingSet/test/vat-admin/slow-termination/bootstrap-slow-terminate.js b/packages/SwingSet/test/vat-admin/slow-termination/bootstrap-slow-terminate.js
new file mode 100644
index 00000000000..135124209ef
--- /dev/null
+++ b/packages/SwingSet/test/vat-admin/slow-termination/bootstrap-slow-terminate.js
@@ -0,0 +1,55 @@
+import { Far, E } from '@endo/far';
+
+export function buildRootObject(_vatPowers) {
+  let root;
+  let adminNode;
+  const myImports = [];
+
+  const self = Far('root', {
+    async bootstrap(vats, devices) {
+      const svc = E(vats.vatAdmin).createVatAdminService(devices.vatAdmin);
+      // create a dynamic vat, send it a message and let it respond, to make
+      // sure everything is working
+      const dude = await E(svc).createVatByName('dude');
+      root = dude.root;
+      adminNode = dude.adminNode;
+      await E(root).alive();
+
+      // set up 20 "bootstrap exports, dude imports" c-list entries
+      for (let i = 0; i < 20; i += 1) {
+        await E(root).acceptImports(Far('bootstrap export', {}));
+      }
+
+      // set up 20 "dude exports, bootstrap imports" c-list entries
+
+      for (let i = 0; i < 20; i += 1) {
+        myImports.push(await E(root).sendExport());
+      }
+
+      // ask dude to creates 20 vatstore entries (in addition to the
+      // built-in liveslots stuff)
+      await E(root).makeVatstore(20);
+
+      return 'bootstrap done';
+    },
+
+    async kill(mode) {
+      switch (mode) {
+        case 'kill':
+          await E(adminNode).terminateWithFailure(mode);
+          break;
+        case 'dieHappy':
+          await E(root).dieHappy(mode);
+          break;
+        default:
+          console.log('something terrible has happened');
+          break;
+      }
+      // confirm the vat is dead, even though cleanup happens later
+      return E(root)
+        .ping()
+        .catch(err => `kill done, ${err}`);
+    },
+  });
+  return self;
+}
diff --git a/packages/SwingSet/test/vat-admin/slow-termination/slow-termination.test.js b/packages/SwingSet/test/vat-admin/slow-termination/slow-termination.test.js
new file mode 100644
index 00000000000..4092aae7ac0
--- /dev/null
+++ b/packages/SwingSet/test/vat-admin/slow-termination/slow-termination.test.js
@@ -0,0 +1,292 @@
+// @ts-nocheck
+// eslint-disable-next-line import/order
+import { test } from '../../../tools/prepare-test-env-ava.js';
+
+import tmp from 'tmp';
+import sqlite3 from 'better-sqlite3';
+import path from 'path';
+
+import { kser } from '@agoric/kmarshal';
+import { initSwingStore } from '@agoric/swing-store';
+
+import { buildVatController, buildKernelBundles } from '../../../src/index.js';
+import { enumeratePrefixedKeys } from '../../../src/kernel/state/storageHelper.js';
+
+/**
+ * @param {string} [prefix]
+ * @returns {Promise<[string, () => void]>}
+ */
+export const tmpDir = prefix =>
+  new Promise((resolve, reject) => {
+    tmp.dir({ unsafeCleanup: true, prefix }, (err, name, removeCallback) => {
+      if (err) {
+        reject(err);
+      } else {
+        resolve([name, removeCallback]);
+      }
+    });
+  });
+
+test.before(async t => {
+  const kernelBundles = await buildKernelBundles();
+  t.context.data = { kernelBundles };
+});
+
+const makeCleanupPolicy = budget => {
+  let cleanups = 0;
+  const stop = () => false;
+  const policy = harden({
+    vatCreated: stop,
+    crankComplete: stop,
+    crankFailed: stop,
+    emptyCrank: stop,
+    allowCleanup() {
+      if (budget > 0) {
+        return { budget };
+      } else {
+        return false;
+      }
+    },
+    didCleanup(spent) {
+      budget -= spent.cleanups;
+      cleanups += spent.cleanups;
+    },
+  });
+  const getCleanups = () => cleanups;
+  return [policy, getCleanups];
+};
+
+const bfile = relpath => new URL(relpath, import.meta.url).pathname;
+
+async function doSlowTerminate(t, mode) {
+  const config = {
+    defaultManagerType: 'xsnap',
+    defaultReapInterval: 'never',
+    snapshotInitial: 2, // same as the default
+    snapshotInterval: 10, // ensure multiple spans+snapshots
+    bootstrap: 'bootstrap',
+    bundles: {
+      dude: {
+        sourceSpec: bfile('vat-slow-terminate.js'),
+      },
+    },
+    vats: {
+      bootstrap: {
+        sourceSpec: bfile('bootstrap-slow-terminate.js'),
+      },
+    },
+  };
+
+  let countCleanups = 0;
+  const noCleanupPolicy = {
+    allowCleanup: () => false,
+    vatCreated: () => true,
+    crankComplete: () => true,
+    crankFailed: () => true,
+    emptyCrank: () => true,
+    didCleanup: ({ cleanups }) => {
+      countCleanups += cleanups;
+      return true;
+    },
+  };
+
+  const [dbDir, cleanup] = await tmpDir('testdb');
+  t.teardown(cleanup);
+
+  const ss = initSwingStore(dbDir);
+  const { kernelStorage } = ss;
+  const { commit } = ss.hostStorage;
+  const { kvStore } = kernelStorage;
+  // look directly at DB to confirm changes
+  const db = sqlite3(path.join(dbDir, 'swingstore.sqlite'));
+
+  const controller = await buildVatController(config, [], {
+    ...t.context.data,
+    kernelStorage,
+  });
+  t.teardown(controller.shutdown);
+  t.is(controller.kpStatus(controller.bootstrapResult), 'unresolved');
+  controller.pinVatRoot('bootstrap');
+  await controller.run(noCleanupPolicy);
+  await commit();
+  t.is(controller.kpStatus(controller.bootstrapResult), 'fulfilled');
+  t.deepEqual(
+    controller.kpResolution(controller.bootstrapResult),
+    kser('bootstrap done'),
+  );
+  t.is(countCleanups, 0);
+
+  // bootstrap adds a fair amount of vat-dude state:
+  // * we have c-list entries for 20 imports and 20 exports, each of
+  //   which need two kvStore entries, so 80 kvStore total
+  // * the vat has created 20 baggage entries, all of which go into
+  //   the vatstore, adding 20 kvStore
+  // * an empty vat has about 29 kvStore entries just to track
+  //   counters, the built-in collection types, baggage itself, etc
+  // * by sending 40-plus deliveries into an xsnap vat with
+  //   snapInterval=5, we get 8-ish transcript spans (7 old, 1
+  //   current), and each old span generates a heap snapshot record
+  // Slow vat termination means deleting these entries slowly.
+
+  const vatID = JSON.parse(kvStore.get('vat.dynamicIDs'))[0];
+  t.is(vatID, 'v6'); // change if necessary
+  const remainingKV = () =>
+    Array.from(enumeratePrefixedKeys(kvStore, `${vatID}.`));
+  const remainingSnapshots = () =>
+    db
+      .prepare('SELECT COUNT(*) FROM snapshots WHERE vatID=?')
+      .pluck()
+      .get(vatID);
+  const remainingTranscriptItems = () =>
+    db
+      .prepare('SELECT COUNT(*) FROM transcriptItems WHERE vatID=?')
+      .pluck()
+      .get(vatID);
+  const remainingTranscriptSpans = () =>
+    db
+      .prepare('SELECT COUNT(*) FROM transcriptSpans WHERE vatID=?')
+      .pluck()
+      .get(vatID);
+
+  // 20*2 for imports, 21*2 for exports, 20*1 for vatstore = 102
+  // plus 27 for usual liveslots stuff
+  t.is(remainingKV().length, 129);
+  t.false(JSON.parse(kvStore.get('vats.terminated')).includes(vatID));
+  // we get one span for snapshotInitial (=2), then a span every
+  // snapshotInterval (=10). Each non-current span creates a
+  // snapshot.
+  t.is(remainingTranscriptSpans(), 6);
+  t.is(remainingTranscriptItems(), 59);
+  t.is(remainingSnapshots(), 5);
+  const remaining = () =>
+    remainingKV().length +
+    remainingSnapshots() +
+    remainingTranscriptItems() +
+    remainingTranscriptSpans();
+
+  // note: mode=dieHappy means we send one extra message to the vat,
+  // which adds a single transcript item (but this doesn't happen to trigger an extra span)
+
+  const kpid = controller.queueToVatRoot('bootstrap', 'kill', [mode]);
+  await controller.run(noCleanupPolicy);
+  await commit();
+  t.is(controller.kpStatus(kpid), 'fulfilled');
+  t.deepEqual(
+    controller.kpResolution(kpid),
+    kser('kill done, Error: vat terminated'),
+  );
+  t.is(countCleanups, 0);
+
+  t.true(JSON.parse(kvStore.get('vats.terminated')).includes(vatID));
+  // no cleanups were allowed, so nothing should be removed yet
+  t.truthy(kernelStorage.kvStore.get(`${vatID}.options`, undefined));
+  t.is(remainingKV().length, 129);
+
+  // now do a series of cleanup runs, each with budget=5
+  const clean = async (budget = 5) => {
+    const [policy, getCleanups] = makeCleanupPolicy(budget);
+    await controller.run(policy);
+    await commit();
+    return getCleanups();
+  };
+
+  // cleanup currently deletes c-list exports, then c-list imports,
+  // then all other kvStore entries, then snapshots, then transcripts
+
+  let leftKV = remainingKV().length;
+  const cleanKV = async (expectedKV, expectedCleanups) => {
+    const cleanups = await clean();
+    const newLeftKV = remainingKV().length;
+    t.is(leftKV - newLeftKV, expectedKV);
+    leftKV = newLeftKV;
+    t.is(cleanups, expectedCleanups);
+  };
+
+  // we have 21 c-list exports (1 vat root, plus 20 we added), we
+  // delete them 5 at a time (2 kv each, so 10kv per clean)
+  await cleanKV(10, 5); // 5 c-list exports
+  await cleanKV(10, 5); // 5 c-list exports
+  await cleanKV(10, 5); // 5 c-list exports
+  await cleanKV(10, 5); // 5 c-list exports
+
+  // we have one export left, so this clean(budget=5) will delete the
+  // two kv for the export, then the first four of our 20 c-list
+  // imports, each of which also has 2 kv)
+
+  await cleanKV(10, 5); // 1 c-list exports, 4 c-list imports
+  await cleanKV(10, 5); // 5 c-list imports
+  await cleanKV(10, 5); // 5 c-list imports
+  await cleanKV(10, 5); // 5 c-list imports
+
+  // we have one import left, so this clean(budget=5) will delete its
+  // two kv, then the first four of our 47 other kv entries (20
+  // vatstore plus 27 liveslots overhead
+  await cleanKV(6, 5); // 1 c-list import, 4 other kv
+  // now there are 45 other kv entries left
+  t.is(remainingKV().length, 43);
+
+  await cleanKV(5, 5); // 5 other kv
+  await cleanKV(5, 5); // 5 other kv
+  await cleanKV(5, 5); // 5 other kv
+  await cleanKV(5, 5); // 5 other kv
+  await cleanKV(5, 5); // 5 other kv
+  t.is(remainingSnapshots(), 5);
+  await cleanKV(5, 5); // 5 other kv
+  await cleanKV(5, 5); // 5 other kv
+  await cleanKV(5, 5); // 5 other kv
+
+  // we have 3 kv entries left, so budget=5 will delete those three,
+  // then two snapshots
+  t.is(remainingSnapshots(), 5);
+  let cleanups = await clean();
+  t.deepEqual(remainingKV(), []);
+  t.is(kernelStorage.kvStore.get(`${vatID}.options`, undefined));
+  t.is(remainingSnapshots(), 3);
+  t.is(remainingTranscriptSpans(), 6);
+  let ts = 59;
+  if (mode === 'dieHappy') {
+    ts = 60;
+  }
+  t.is(remainingTranscriptItems(), ts);
+  t.is(cleanups, 5);
+
+  // there are three snapshots remaining. do a clean with budget=3, to
+  // exercise the bugfix where we'd call deleteVatTranscripts() with
+  // budget=0 by mistake
+
+  cleanups = await clean(3);
+  t.is(cleanups, 3);
+  t.is(remainingSnapshots(), 0);
+  t.is(remainingTranscriptSpans(), 6);
+  t.is(remainingTranscriptItems(), ts);
+
+  // the next clean (with the default budget of 5) will delete the
+  // five most recent transcript spans, starting with the isCurrent=1
+  // one (which had 9 or 10 items), leaving just the earliest (which
+  // had 4, due to `snapshotInitial`)
+
+  cleanups = await clean();
+  t.is(cleanups, 5);
+  t.is(remainingTranscriptSpans(), 1);
+  t.is(remainingTranscriptItems(), 4);
+  t.true(JSON.parse(kvStore.get('vats.terminated')).includes(vatID));
+
+  // the final clean deletes the remaining span, and finishes by
+  // removing the "still being deleted" bookkeeping, and the .options
+
+  cleanups = await clean();
+  t.is(remainingTranscriptSpans(), 0);
+  t.is(remainingTranscriptItems(), 0);
+  t.is(remaining(), 0);
+  t.is(cleanups, 1);
+
+  t.false(JSON.parse(kvStore.get('vats.terminated')).includes(vatID));
+}
+
+test.serial('slow terminate (kill)', async t => {
+  await doSlowTerminate(t, 'kill');
+});
+
+test.serial('slow terminate (die happy)', async t => {
+  await doSlowTerminate(t, 'dieHappy');
+});
diff --git a/packages/SwingSet/test/vat-admin/slow-termination/vat-slow-terminate.js b/packages/SwingSet/test/vat-admin/slow-termination/vat-slow-terminate.js
new file mode 100644
index 00000000000..78ec9a74e47
--- /dev/null
+++ b/packages/SwingSet/test/vat-admin/slow-termination/vat-slow-terminate.js
@@ -0,0 +1,16 @@
+import { Far } from '@endo/far';
+
+export function buildRootObject(vatPowers, _vatParameters, baggage) {
+  const hold = [];
+  return Far('root', {
+    alive: () => true,
+    dieHappy: completion => vatPowers.exitVat(completion),
+    sendExport: () => Far('dude export', {}),
+    acceptImports: imports => hold.push(imports),
+    makeVatstore: count => {
+      for (let i = 0; i < count; i += 1) {
+        baggage.init(`key-${i}`, i);
+      }
+    },
+  });
+}
diff --git a/packages/SwingSet/test/vat-admin/terminate/terminate.test.js b/packages/SwingSet/test/vat-admin/terminate/terminate.test.js
index 750862e084b..a4759943e47 100644
--- a/packages/SwingSet/test/vat-admin/terminate/terminate.test.js
+++ b/packages/SwingSet/test/vat-admin/terminate/terminate.test.js
@@ -150,8 +150,12 @@ async function doTerminateCritical(
     t.is(thrown.message, mode);
   }
   t.is(controller.kpStatus(kpid), dynamic ? 'unresolved' : 'fulfilled');
+  // the kernel is supposed to crash before deleting vNN.options,
+  // although strictly speaking it doesn't matter because the host is
+  // supposed to crash too, abandoning the uncommitted swingstore
+  // changes
   const postProbe = kernelStorage.kvStore.get(`${deadVatID}.options`);
-  t.is(postProbe, undefined);
+  t.truthy(postProbe);
 }
 
 test.serial('terminate (dynamic, non-critical)', async t => {
diff --git a/packages/internal/src/index.js b/packages/internal/src/index.js
index 28e9139d3f3..536d8ffa125 100644
--- a/packages/internal/src/index.js
+++ b/packages/internal/src/index.js
@@ -14,4 +14,5 @@ export * from './typeGuards.js';
 export * from './types.js';
 
 export { objectMap } from '@endo/common/object-map.js';
+export { objectMetaMap } from '@endo/common/object-meta-map.js';
 export { fromUniqueEntries } from '@endo/common/from-unique-entries.js';