From e726ff1bd22a66fd2f9d33da178c95181f8b1af2 Mon Sep 17 00:00:00 2001
From: Brian Warner <warner@lothar.com>
Date: Fri, 23 Aug 2024 20:54:04 -0700
Subject: [PATCH 1/4] refactor: split out scanForDeadObjects to a new file

no behavior changes
---
 packages/swingset-liveslots/src/boyd-gc.js   | 167 +++++++++++++++++
 packages/swingset-liveslots/src/liveslots.js | 184 +++----------------
 2 files changed, 188 insertions(+), 163 deletions(-)
 create mode 100644 packages/swingset-liveslots/src/boyd-gc.js

diff --git a/packages/swingset-liveslots/src/boyd-gc.js b/packages/swingset-liveslots/src/boyd-gc.js
new file mode 100644
index 00000000000..aedca2b8fe8
--- /dev/null
+++ b/packages/swingset-liveslots/src/boyd-gc.js
@@ -0,0 +1,167 @@
+import { Fail } from '@endo/errors';
+import { parseVatSlot } from './parseVatSlots.js';
+
+/*
+    Imports are in one of 5 states: UNKNOWN, REACHABLE, UNREACHABLE,
+    COLLECTED, FINALIZED. Note that there's no actual state machine with those
+    values, and we can't observe all of the transitions from JavaScript, but
+    we can describe what operations could cause a transition, and what our
+    observations allow us to deduce about the state:
+
+    * UNKNOWN moves to REACHABLE when a crank introduces a new import
+    * userspace holds a reference only in REACHABLE
+    * REACHABLE moves to UNREACHABLE only during a userspace crank
+    * UNREACHABLE moves to COLLECTED when GC runs, which queues the finalizer
+    * COLLECTED moves to FINALIZED when a new turn runs the finalizer
+    * liveslots moves from FINALIZED to UNKNOWN by syscalling dropImports
+
+    convertSlotToVal either imports a vref for the first time, or
+    re-introduces a previously-seen vref. It transitions from:
+
+    * UNKNOWN to REACHABLE by creating a new Presence
+    * UNREACHABLE to REACHABLE by re-using the old Presence that userspace
+      forgot about
+    * COLLECTED/FINALIZED to REACHABLE by creating a new Presence
+
+    Our tracking tables hold data that depends on the current state:
+
+    * slotToVal holds a WeakRef in [REACHABLE, UNREACHABLE, COLLECTED]
+    * that WeakRef .deref()s into something in [REACHABLE, UNREACHABLE]
+    * deadSet holds the vref only in FINALIZED
+    * re-introduction must ensure the vref is not in the deadSet
+
+    Each state thus has a set of perhaps-measurable properties:
+
+    * UNKNOWN: slotToVal[baseRef] is missing, baseRef not in deadSet
+    * REACHABLE: slotToVal has live weakref, userspace can reach
+    * UNREACHABLE: slotToVal has live weakref, userspace cannot reach
+    * COLLECTED: slotToVal[baseRef] has dead weakref
+    * FINALIZED: slotToVal[baseRef] is missing, baseRef is in deadSet
+
+    Our finalizer callback is queued by the engine's transition from
+    UNREACHABLE to COLLECTED, but the baseRef might be re-introduced before the
+    callback has a chance to run. There might even be multiple copies of the
+    finalizer callback queued. So the callback must deduce the current state
+    and only perform cleanup (i.e. delete the slotToVal entry and add the
+    baseRef to the deadSet) in the COLLECTED state.
+
+*/
+
+export const makeBOYDKit = ({
+  gcTools,
+  getValForSlot,
+  slotToVal,
+  vrm,
+  kernelRecognizableRemotables,
+  syscall,
+  possiblyDeadSet,
+  possiblyRetiredSet,
+}) => {
+  const scanForDeadObjects = async () => {
+    // `possiblyDeadSet` accumulates vrefs which have lost a supporting
+    // pillar (in-memory, export, or virtualized data refcount) since the
+    // last call to scanForDeadObjects. The vref might still be supported
+    // by a remaining pillar, or the pillar which was dropped might be back
+    // (e.g., given a new in-memory manifestation).
+
+    const importsToDrop = new Set();
+    const importsToRetire = new Set();
+    const exportsToRetire = new Set();
+    let doMore;
+    await null;
+    do {
+      doMore = false;
+
+      await gcTools.gcAndFinalize();
+
+      // possiblyDeadSet contains a baseref for everything (Presences,
+      // Remotables, Representatives) that might have lost a
+      // pillar. The object might still be supported by other pillars,
+      // and the lost pillar might have been reinstantiated by the
+      // time we get here. The first step is to filter this down to a
+      // list of definitely dead baserefs.
+
+      const deadSet = new Set();
+
+      for (const baseRef of possiblyDeadSet) {
+        if (slotToVal.has(baseRef)) {
+          continue; // RAM pillar remains
+        }
+        const { virtual, durable, type } = parseVatSlot(baseRef);
+        assert(type === 'object', `unprepared to track ${type}`);
+        if (virtual || durable) {
+          if (vrm.isVirtualObjectReachable(baseRef)) {
+            continue; // vdata or export pillar remains
+          }
+        }
+        deadSet.add(baseRef);
+      }
+      possiblyDeadSet.clear();
+
+      // deadSet now contains objects which are certainly dead
+
+      // possiblyRetiredSet holds (a subset of??) baserefs which have
+      // lost a recognizer recently. TODO recheck this
+
+      for (const vref of possiblyRetiredSet) {
+        if (!getValForSlot(vref) && !deadSet.has(vref)) {
+          // Don't retire things that haven't yet made the transition to dead,
+          // i.e., always drop before retiring
+
+          if (!vrm.isVrefRecognizable(vref)) {
+            importsToRetire.add(vref);
+          }
+        }
+      }
+      possiblyRetiredSet.clear();
+
+      const deadBaseRefs = Array.from(deadSet);
+      deadBaseRefs.sort();
+      for (const baseRef of deadBaseRefs) {
+        const { virtual, durable, allocatedByVat, type } =
+          parseVatSlot(baseRef);
+        type === 'object' || Fail`unprepared to track ${type}`;
+        if (virtual || durable) {
+          // Representative: send nothing, but perform refcount checking
+
+          const [gcAgain, retirees] = vrm.deleteVirtualObject(baseRef);
+          if (retirees) {
+            retirees.map(retiree => exportsToRetire.add(retiree));
+          }
+          doMore = doMore || gcAgain;
+        } else if (allocatedByVat) {
+          // Remotable: send retireExport
+          // for remotables, vref === baseRef
+          if (kernelRecognizableRemotables.has(baseRef)) {
+            kernelRecognizableRemotables.delete(baseRef);
+            exportsToRetire.add(baseRef);
+          }
+        } else {
+          // Presence: send dropImport unless reachable by VOM
+          // eslint-disable-next-line no-lonely-if
+          if (!vrm.isPresenceReachable(baseRef)) {
+            importsToDrop.add(baseRef);
+
+            if (!vrm.isVrefRecognizable(baseRef)) {
+              // for presences, baseRef === vref
+              importsToRetire.add(baseRef);
+            }
+          }
+        }
+      }
+    } while (possiblyDeadSet.size > 0 || possiblyRetiredSet.size > 0 || doMore);
+
+    if (importsToDrop.size) {
+      syscall.dropImports(Array.from(importsToDrop).sort());
+    }
+    if (importsToRetire.size) {
+      syscall.retireImports(Array.from(importsToRetire).sort());
+    }
+    if (exportsToRetire.size) {
+      syscall.retireExports(Array.from(exportsToRetire).sort());
+    }
+  };
+
+  return { scanForDeadObjects };
+};
+harden(makeBOYDKit);
diff --git a/packages/swingset-liveslots/src/liveslots.js b/packages/swingset-liveslots/src/liveslots.js
index 4490679c4d5..a9db3c100e2 100644
--- a/packages/swingset-liveslots/src/liveslots.js
+++ b/packages/swingset-liveslots/src/liveslots.js
@@ -12,6 +12,7 @@ import { makeVirtualReferenceManager } from './virtualReferences.js';
 import { makeVirtualObjectManager } from './virtualObjectManager.js';
 import { makeCollectionManager } from './collectionManager.js';
 import { makeWatchedPromiseManager } from './watchedPromises.js';
+import { makeBOYDKit } from './boyd-gc.js';
 
 const SYSCALL_CAPDATA_BODY_SIZE_LIMIT = 10_000_000;
 const SYSCALL_CAPDATA_SLOTS_LENGTH_LIMIT = 10_000;
@@ -165,52 +166,6 @@ function build(
     }
   }
 
-  /*
-    Imports are in one of 5 states: UNKNOWN, REACHABLE, UNREACHABLE,
-    COLLECTED, FINALIZED. Note that there's no actual state machine with those
-    values, and we can't observe all of the transitions from JavaScript, but
-    we can describe what operations could cause a transition, and what our
-    observations allow us to deduce about the state:
-
-    * UNKNOWN moves to REACHABLE when a crank introduces a new import
-    * userspace holds a reference only in REACHABLE
-    * REACHABLE moves to UNREACHABLE only during a userspace crank
-    * UNREACHABLE moves to COLLECTED when GC runs, which queues the finalizer
-    * COLLECTED moves to FINALIZED when a new turn runs the finalizer
-    * liveslots moves from FINALIZED to UNKNOWN by syscalling dropImports
-
-    convertSlotToVal either imports a vref for the first time, or
-    re-introduces a previously-seen vref. It transitions from:
-
-    * UNKNOWN to REACHABLE by creating a new Presence
-    * UNREACHABLE to REACHABLE by re-using the old Presence that userspace
-      forgot about
-    * COLLECTED/FINALIZED to REACHABLE by creating a new Presence
-
-    Our tracking tables hold data that depends on the current state:
-
-    * slotToVal holds a WeakRef in [REACHABLE, UNREACHABLE, COLLECTED]
-    * that WeakRef .deref()s into something in [REACHABLE, UNREACHABLE]
-    * deadSet holds the vref only in FINALIZED
-    * re-introduction must ensure the vref is not in the deadSet
-
-    Each state thus has a set of perhaps-measurable properties:
-
-    * UNKNOWN: slotToVal[baseRef] is missing, baseRef not in deadSet
-    * REACHABLE: slotToVal has live weakref, userspace can reach
-    * UNREACHABLE: slotToVal has live weakref, userspace cannot reach
-    * COLLECTED: slotToVal[baseRef] has dead weakref
-    * FINALIZED: slotToVal[baseRef] is missing, baseRef is in deadSet
-
-    Our finalizer callback is queued by the engine's transition from
-    UNREACHABLE to COLLECTED, but the baseRef might be re-introduced before the
-    callback has a chance to run. There might even be multiple copies of the
-    finalizer callback queued. So the callback must deduce the current state
-    and only perform cleanup (i.e. delete the slotToVal entry and add the
-    baseRef to the deadSet) in the COLLECTED state.
-
-  */
-
   function finalizeDroppedObject(baseRef) {
     // TODO: Ideally this function should assert that it is not metered.  This
     // appears to be fine in practice, but it breaks a number of unit tests in
@@ -235,113 +190,6 @@ function build(
   }
   const vreffedObjectRegistry = new FinalizationRegistry(finalizeDroppedObject);
 
-  async function scanForDeadObjects() {
-    // `possiblyDeadSet` accumulates vrefs which have lost a supporting
-    // pillar (in-memory, export, or virtualized data refcount) since the
-    // last call to scanForDeadObjects. The vref might still be supported
-    // by a remaining pillar, or the pillar which was dropped might be back
-    // (e.g., given a new in-memory manifestation).
-
-    const importsToDrop = new Set();
-    const importsToRetire = new Set();
-    const exportsToRetire = new Set();
-    let doMore;
-    await null;
-    do {
-      doMore = false;
-
-      await gcTools.gcAndFinalize();
-
-      // possiblyDeadSet contains a baseref for everything (Presences,
-      // Remotables, Representatives) that might have lost a
-      // pillar. The object might still be supported by other pillars,
-      // and the lost pillar might have been reinstantiated by the
-      // time we get here. The first step is to filter this down to a
-      // list of definitely dead baserefs.
-
-      const deadSet = new Set();
-
-      for (const baseRef of possiblyDeadSet) {
-        if (slotToVal.has(baseRef)) {
-          continue; // RAM pillar remains
-        }
-        const { virtual, durable, type } = parseVatSlot(baseRef);
-        assert(type === 'object', `unprepared to track ${type}`);
-        if (virtual || durable) {
-          // eslint-disable-next-line no-use-before-define
-          if (vrm.isVirtualObjectReachable(baseRef)) {
-            continue; // vdata or export pillar remains
-          }
-        }
-        deadSet.add(baseRef);
-      }
-      possiblyDeadSet.clear();
-
-      // deadSet now contains objects which are certainly dead
-
-      // possiblyRetiredSet holds (a subset of??) baserefs which have
-      // lost a recognizer recently. TODO recheck this
-
-      for (const vref of possiblyRetiredSet) {
-        // eslint-disable-next-line no-use-before-define
-        if (!getValForSlot(vref) && !deadSet.has(vref)) {
-          // Don't retire things that haven't yet made the transition to dead,
-          // i.e., always drop before retiring
-          // eslint-disable-next-line no-use-before-define
-          if (!vrm.isVrefRecognizable(vref)) {
-            importsToRetire.add(vref);
-          }
-        }
-      }
-      possiblyRetiredSet.clear();
-
-      const deadBaseRefs = Array.from(deadSet);
-      deadBaseRefs.sort();
-      for (const baseRef of deadBaseRefs) {
-        const { virtual, durable, allocatedByVat, type } =
-          parseVatSlot(baseRef);
-        type === 'object' || Fail`unprepared to track ${type}`;
-        if (virtual || durable) {
-          // Representative: send nothing, but perform refcount checking
-          // eslint-disable-next-line no-use-before-define
-          const [gcAgain, retirees] = vrm.deleteVirtualObject(baseRef);
-          if (retirees) {
-            retirees.map(retiree => exportsToRetire.add(retiree));
-          }
-          doMore = doMore || gcAgain;
-        } else if (allocatedByVat) {
-          // Remotable: send retireExport
-          // for remotables, vref === baseRef
-          if (kernelRecognizableRemotables.has(baseRef)) {
-            kernelRecognizableRemotables.delete(baseRef);
-            exportsToRetire.add(baseRef);
-          }
-        } else {
-          // Presence: send dropImport unless reachable by VOM
-          // eslint-disable-next-line no-lonely-if, no-use-before-define
-          if (!vrm.isPresenceReachable(baseRef)) {
-            importsToDrop.add(baseRef);
-            // eslint-disable-next-line no-use-before-define
-            if (!vrm.isVrefRecognizable(baseRef)) {
-              // for presences, baseRef === vref
-              importsToRetire.add(baseRef);
-            }
-          }
-        }
-      }
-    } while (possiblyDeadSet.size > 0 || possiblyRetiredSet.size > 0 || doMore);
-
-    if (importsToDrop.size) {
-      syscall.dropImports(Array.from(importsToDrop).sort());
-    }
-    if (importsToRetire.size) {
-      syscall.retireImports(Array.from(importsToRetire).sort());
-    }
-    if (exportsToRetire.size) {
-      syscall.retireExports(Array.from(exportsToRetire).sort());
-    }
-  }
-
   /**
    * Remember disavowed Presences which will kill the vat if you try to talk
    * to them
@@ -1545,16 +1393,16 @@ function build(
   // metered
   const unmeteredDispatch = meterControl.unmetered(dispatchToUserspace);
 
-  async function bringOutYourDead() {
-    await scanForDeadObjects();
-    // Now flush all the vatstore changes (deletions and refcounts) we
-    // made. dispatch() calls afterDispatchActions() automatically for
-    // most methods, but not bringOutYourDead().
-    // eslint-disable-next-line no-use-before-define
-    afterDispatchActions();
-    // XXX TODO: make this conditional on a config setting
-    return getRetentionStats();
-  }
+  const { scanForDeadObjects } = makeBOYDKit({
+    gcTools,
+    getValForSlot,
+    slotToVal,
+    vrm,
+    kernelRecognizableRemotables,
+    syscall,
+    possiblyDeadSet,
+    possiblyRetiredSet,
+  });
 
   /**
    * @param { import('./types.js').SwingSetCapData } _disconnectObjectCapData
@@ -1574,6 +1422,16 @@ function build(
     vom.flushStateCache();
   }
 
+  const bringOutYourDead = async () => {
+    await scanForDeadObjects();
+    // Now flush all the vatstore changes (deletions and refcounts) we
+    // made. dispatch() calls afterDispatchActions() automatically for
+    // most methods, but not bringOutYourDead().
+    afterDispatchActions();
+    // XXX TODO: make this conditional on a config setting
+    return getRetentionStats();
+  };
+
   /**
    * This 'dispatch' function is the entry point for the vat as a whole: the
    * vat-worker supervisor gives us VatDeliveryObjects (like

From 4490a7315378f3b21ba0ac3c16e9e657b05f9a3e Mon Sep 17 00:00:00 2001
From: Brian Warner <warner@lothar.com>
Date: Mon, 26 Aug 2024 01:34:52 -0700
Subject: [PATCH 2/4] add tests for #9939 to both SwingSet and liveslots

These tests will fail without the fixes in the next commit

One new test is disabled because of additional pending bugs that
interfere with the test setup (a combo of #9956 and #9959), which will
be re-enabled in PR #9961 (to address #8756 and others)
---
 packages/SwingSet/test/gc/bootstrap.js        |   5 +
 packages/SwingSet/test/gc/gc-vat.test.js      |  81 ++++++
 packages/SwingSet/test/gc/vat-target.js       |  14 +-
 .../test/dropped-weakset-9939.test.js         |  80 ++++++
 .../test/gc-before-finalizer.test.js          | 234 ++++++++++++++++++
 .../test/liveslots-real-gc.test.js            |  24 +-
 6 files changed, 433 insertions(+), 5 deletions(-)
 create mode 100644 packages/swingset-liveslots/test/dropped-weakset-9939.test.js
 create mode 100644 packages/swingset-liveslots/test/gc-before-finalizer.test.js

diff --git a/packages/SwingSet/test/gc/bootstrap.js b/packages/SwingSet/test/gc/bootstrap.js
index 3c08bb01b4e..8cdf7ce4dfe 100644
--- a/packages/SwingSet/test/gc/bootstrap.js
+++ b/packages/SwingSet/test/gc/bootstrap.js
@@ -23,5 +23,10 @@ export function buildRootObject() {
     async makeInvitation0() {
       await E(target).makeInvitationTarget(zoe);
     },
+
+    // for #9939
+    async storePresenceInWeakSet() {
+      await E(target).store(A);
+    },
   });
 }
diff --git a/packages/SwingSet/test/gc/gc-vat.test.js b/packages/SwingSet/test/gc/gc-vat.test.js
index a593ff6cec7..8c57f797a75 100644
--- a/packages/SwingSet/test/gc/gc-vat.test.js
+++ b/packages/SwingSet/test/gc/gc-vat.test.js
@@ -158,3 +158,84 @@ test('forward to fake zoe', async t => {
   // 'makeInvitationTarget' result promise with it, then dropped it
   t.is(findClist(c, targetID, invitation), undefined);
 });
+
+// see #9939
+test.failing('drop without retire', async t => {
+  let targetID;
+  let didBOYD = false;
+  // const msgs = ['deliver', 'deliver-result', 'syscall', 'syscall-result'];
+
+  function slogSender(slogObj) {
+    const {
+      time: _1,
+      replay: _2,
+      crankNum: _3,
+      deliveryNum: _4,
+      monotime: _5,
+      ...o
+    } = slogObj;
+    if (o.vatID !== targetID) return;
+    if (o.type === 'deliver' && o.kd[0] === 'bringOutYourDead') {
+      didBOYD = true;
+    }
+    // if (msgs.includes(o.type)) console.log(JSON.stringify(o));
+  }
+  const config = {
+    bootstrap: 'bootstrap', // v6
+    vats: {
+      bootstrap: {
+        // v6
+        sourceSpec: new URL('bootstrap.js', import.meta.url).pathname,
+      },
+      target: {
+        // v1
+        sourceSpec: new URL('vat-target.js', import.meta.url).pathname,
+        // avoid V8's GC nondeterminism, only needed on the target vat
+        creationOptions: { managerType: 'xs-worker' },
+      },
+    },
+  };
+  const kernelStorage = initSwingStore().kernelStorage;
+  await initializeSwingset(config, [], kernelStorage);
+  const c = await makeSwingsetController(kernelStorage, {}, { slogSender });
+  t.teardown(c.shutdown);
+
+  c.pinVatRoot('bootstrap');
+  targetID = c.vatNameToID('target');
+  c.pinVatRoot('target');
+
+  await c.run();
+
+  c.queueToVatRoot('bootstrap', 'storePresenceInWeakSet', []);
+  await c.run();
+
+  // now do enough dummy messages to trigger a new BOYD
+  didBOYD = false;
+  while (!didBOYD) {
+    c.queueToVatRoot('target', 'dummy', []);
+    await c.run();
+  }
+
+  // now tell vat-target to drop its WeakSet
+  c.queueToVatRoot('target', 'drop', []);
+  await c.run();
+
+  // and trigger a second BOYD
+  didBOYD = false;
+  while (!didBOYD) {
+    // this will fail once the vat is terminated
+    try {
+      c.queueToVatRoot('target', 'dummy', []);
+    } catch (e) {
+      if (/vat name "target" must exist/.test(e.message)) {
+        t.fail('vat terminated, bug is present');
+        break;
+      }
+    }
+    await c.run();
+  }
+  t.true(didBOYD);
+
+  // the test passes if the vat survived
+  return t.pass();
+});
diff --git a/packages/SwingSet/test/gc/vat-target.js b/packages/SwingSet/test/gc/vat-target.js
index dd4c1d5d0a6..53900b66055 100644
--- a/packages/SwingSet/test/gc/vat-target.js
+++ b/packages/SwingSet/test/gc/vat-target.js
@@ -1,6 +1,9 @@
 import { Far, E } from '@endo/far';
 
-export function buildRootObject() {
+export function buildRootObject(_vatPowers, _vatParameters, baggage) {
+  /** @type { WeakSet | undefined } */
+  let ws = new WeakSet();
+
   return Far('root', {
     async two(A, B) {
       // A=ko26 B=ko27
@@ -10,5 +13,14 @@ export function buildRootObject() {
     makeInvitationTarget(zoe) {
       return E(zoe).makeInvitationZoe();
     },
+
+    // these three are for testing #9939
+    store: x => {
+      baggage.init('x', x);
+      assert(ws);
+      ws.add(x);
+    },
+    dummy: () => 0,
+    drop: () => (ws = undefined),
   });
 }
diff --git a/packages/swingset-liveslots/test/dropped-weakset-9939.test.js b/packages/swingset-liveslots/test/dropped-weakset-9939.test.js
new file mode 100644
index 00000000000..fe8fe20cd1f
--- /dev/null
+++ b/packages/swingset-liveslots/test/dropped-weakset-9939.test.js
@@ -0,0 +1,80 @@
+import test from 'ava';
+import { Far } from '@endo/marshal';
+import { kser, kslot } from '@agoric/kmarshal';
+import { makeLiveSlots } from '../src/liveslots.js';
+import { buildSyscall } from './liveslots-helpers.js';
+import { makeStartVat, makeMessage, makeBringOutYourDead } from './util.js';
+import { makeMockGC } from './mock-gc.js';
+
+// Test for https://github.com/Agoric/agoric-sdk/issues/9939
+
+test.failing('weakset deletion vs retire', async t => {
+  const { syscall, log } = buildSyscall();
+  const gcTools = makeMockGC();
+
+  // #9939 was a bug in liveslots that caused a vat to emit
+  // syscall.retireImports despite not having done dropImports
+  // first. The setup is:
+  //
+  // * import a Presence (raising the RAM pillar)
+  // * store it in a virtual object (raising the vdata pillar)
+  // * use it as a key of a voAwareWeakMap or voAwareWeakSet
+  // * drop the Presence (dropping the RAM pillar)
+  // * do a BOYD
+  // * delete the voAwareWeakSet
+  // * do a BOYD
+  //
+  // When the voAwareWeakSet is collected, a finalizer callback named
+  // finalizeDroppedCollection is called, which clears the entries,
+  // and adds all its vref keys to possiblyRetiredSet. Later, during
+  // BOYD, a loop examines possiblyRetiredSet and adds qualified vrefs
+  // to importsToRetire, for the syscall.retireImports at the end.
+  //
+  // That qualification check was sufficient to prevent the retirement
+  // of vrefs that still have a RAM pillar, and also vrefs that were
+  // being dropped in this particular BOYD, but it was not sufficient
+  // to protect vrefs that still have a vdata pillar.
+
+  let myVOAwareWeakSet;
+  let myPresence;
+  function buildRootObject(vatPowers, _vatParameters, baggage) {
+    const { WeakSet } = vatPowers;
+    myVOAwareWeakSet = new WeakSet();
+    return Far('root', {
+      store: p => {
+        myPresence = p;
+        myVOAwareWeakSet.add(p);
+        baggage.init('presence', p);
+      },
+    });
+  }
+
+  const makeNS = () => ({ buildRootObject });
+  const ls = makeLiveSlots(syscall, 'vatA', {}, {}, gcTools, undefined, makeNS);
+  const { dispatch } = ls;
+  await dispatch(makeStartVat(kser()));
+  t.truthy(myVOAwareWeakSet);
+
+  await dispatch(makeMessage('o+0', 'store', [kslot('o-1')]));
+  t.truthy(myPresence);
+
+  log.length = 0;
+  gcTools.kill(myPresence);
+  gcTools.flushAllFRs();
+  await dispatch(makeBringOutYourDead());
+
+  log.length = 0;
+  gcTools.kill(myVOAwareWeakSet);
+  gcTools.flushAllFRs();
+  await dispatch(makeBringOutYourDead());
+
+  // The imported vref is still reachable by the 'baggage' durable
+  // store, so it must not be dropped or retired yet. The bug caused
+  // the vref to be retired without first doing a drop, which is a
+  // vat-fatal syscall error
+  const gcCalls = log.filter(
+    l => l.type === 'dropImports' || l.type === 'retireImports',
+  );
+  t.deepEqual(gcCalls, []);
+  log.length = 0;
+});
diff --git a/packages/swingset-liveslots/test/gc-before-finalizer.test.js b/packages/swingset-liveslots/test/gc-before-finalizer.test.js
new file mode 100644
index 00000000000..393ed248841
--- /dev/null
+++ b/packages/swingset-liveslots/test/gc-before-finalizer.test.js
@@ -0,0 +1,234 @@
+import test from 'ava';
+import { Far } from '@endo/marshal';
+import { kser, kslot } from '@agoric/kmarshal';
+import { makeLiveSlots } from '../src/liveslots.js';
+import { buildSyscall } from './liveslots-helpers.js';
+import { makeStartVat, makeMessage, makeBringOutYourDead } from './util.js';
+import { makeMockGC } from './mock-gc.js';
+
+const justGC = log =>
+  log.filter(
+    l =>
+      l.type === 'dropImports' ||
+      l.type === 'retireImports' ||
+      l.type === 'retireExports',
+  );
+
+test.failing('presence in COLLECTED state is not dropped yet', async t => {
+  const { syscall, log } = buildSyscall();
+  const gcTools = makeMockGC();
+
+  // our GC terminology (see boyd-gc.js for notes):
+  // * REACHABLE means userspace can reach a Presence
+  // * UNREACHABLE means it cannot
+  // * COLLECTED means the engine GC has noticed, so the
+  //             slotToVal.get(vref) WeakRef is empty, even though
+  //             slotToVal.has(vref) is still true. A finalizer
+  //             callback has been queued.
+  // * FINALIZED means the callback has been run. Our callback does
+  //             slotToVal.delete(vref) before adding the vref to
+  //             possiblyDeadSet
+
+  // slotToVal.has() returns true for REACHABLE / UNREACHABLE /
+  // COLLECTED, and false for FINALIZED. getValForSlot() is another
+  // way to probe for reachability, but it also looks to see if the
+  // WeakRef is full, so it returns false for both COLLECTED and
+  // FINALIZED.
+
+  // We use slotToVal.has(vref) as the "is it still reachable" probe
+  // in scanForDeadObjects(), because if we have a Presence in the
+  // COLLECTED state, we must not dropImports it during this BOYD. The
+  // finalizer is still enqueued, so some future turn will run it, and
+  // the vref will be added to possiblyDeadSet again. We want the drop
+  // to happen exactly once, and we don't remove the finalizer from
+  // the queue, which means the drop must happen in the future BOYD.
+
+  // We can get into this situation with the following sequence:
+  // 1: vref is held by both Presence and vdata
+  // 2: Presence is dropped by userspace (->UNREACHABLE)
+  // 3: userspace store.delete(), drops vdata refcount, addToPossiblyDeadSet
+  // 4: organic GC happens (->COLLECTED, WeakRef is empty)
+  // 5: BOYD is called (finalizer has not run yet)
+
+  // The order of steps 3 and 4 is not important. What matters is that
+  // the WeakRef is empty, but the finalizer has not yet run, at the
+  // time that BOYD happens. And that the vref is in possiblyDeadSet
+  // (because of the vdata drop, not the finalizer), so this BOYD will
+  // examine the vref.
+
+  // This test simulates this case with our mockGC tools. It would
+  // fail if boyd-gc.js used getValForSlot instead of slotToVal.has.
+
+  // The GC code used to call getValForSlot() instead of
+  // slotToVal.has(), in the possiblyRetiredSet. The second test
+  // exercises this case, and would fail if it still used
+  // getValForSlot
+
+  let myPresence;
+  function buildRootObject(vatPowers) {
+    const { VatData } = vatPowers;
+    const { makeScalarBigMapStore } = VatData;
+    const store = makeScalarBigMapStore();
+    return Far('root', {
+      store: p => {
+        myPresence = p;
+        store.init('presence', p);
+      },
+      drop: () => store.delete('presence'),
+    });
+  }
+
+  const makeNS = () => ({ buildRootObject });
+  const ls = makeLiveSlots(syscall, 'vatA', {}, {}, gcTools, undefined, makeNS);
+  const { dispatch } = ls;
+  await dispatch(makeStartVat(kser()));
+
+  await dispatch(makeMessage('o+0', 'store', [kslot('o-1')]));
+  t.truthy(myPresence);
+
+  // clear out everything before our check
+  await dispatch(makeBringOutYourDead());
+  log.length = 0;
+
+  // drop the vdata reference
+  await dispatch(makeMessage('o+0', 'drop', []));
+  log.length = 0;
+
+  // and, before BOYD can happen, collect (but do not finalize) the Presence
+  gcTools.kill(myPresence);
+
+  // now BOYD
+  await dispatch(makeBringOutYourDead());
+
+  // the BOYD must not have done dropImports or retireImports
+  t.deepEqual(log, []);
+
+  // eventually, the finalizer runs
+  gcTools.flushAllFRs();
+
+  // *now* a BOYD will drop+retire
+  await dispatch(makeBringOutYourDead());
+  t.deepEqual(justGC(log), [
+    { type: 'dropImports', slots: ['o-1'] },
+    { type: 'retireImports', slots: ['o-1'] },
+  ]);
+});
+
+// disabled until #9956 and #9959 are fixed, which interfere with this
+// test
+
+/*
+
+test.failing('presence in COLLECTED state is not retired early', async t => {
+  const { syscall, log } = buildSyscall();
+  const gcTools = makeMockGC();
+
+  // The GC code used to call getValForSlot() instead of
+  // slotToVal.has(), in the possiblyRetiredSet. This test would fail
+  // if it still used getValForSlot.
+
+  // The setup is a Presence in the COLLECTED state as the only
+  // pillar, but not in possiblyDeadSet, whose vref also appears in
+  // possiblyRetiredSet (because a recognizer was just deleted). On
+  // the BOYD that handles the possiblyRetiredSet, the "reachability
+  // inhibits retirement" check should treat the COLLECTED state as
+  // reachable, so the retirement is deferred for a later BOYD (which
+  // would drop the vref first)
+  //
+  // To build this, we have an anchored (virtual) MapStore msA holding
+  // the only reference (vdata) to a (virtual) WeakSetStore wssB. wssB
+  // has one (weak) key, o-1, for which there is a Presence P.
+  //
+  // 1: Construct everything, kill the wssB Representative, BOYD. That
+  //    will process wssB, but leave it alive because of the vdata in
+  //    msA.
+
+  // 2: Use msA.delete(key) to drop its vdata ref to wssB (adding wssB
+  //    to possiblyDeadSet), and use gcTools.kill(P) to mock-mark it
+  //    as COLLECTED (but not finalized)
+
+  // 3: Do a BOYD. The first pass will see wssB is unreachable, and
+  //    delete it. The collection deleter will put o-1 in
+  //    possiblyRetiredSet. There might be a second pass (doMore=1),
+  //    but it won't have anything in possiblyDeadSet and will do
+  //    nothing. Then the post-deadSet loop will process
+  //    possiblyRetiredSet, which will contain our o-1. That
+  //    processing step contains the valToSlot.has (or the
+  //    would-be-incorrect getValForSlot) that we want to exercise.
+
+  let myPresence;
+  let myWeakStore;
+
+  function buildRootObject(vatPowers) {
+    const { VatData, WeakSet } = vatPowers;
+    const { makeScalarBigMapStore, makeScalarBigWeakSetStore } = VatData;
+    const store = makeScalarBigMapStore();
+    myWeakStore = makeScalarBigWeakSetStore();
+    return Far('root', {
+      store: p => {
+        myPresence = p;
+        myWeakStore.add(p);
+        t.truthy(myWeakStore.has(p));
+        store.init('weakstore', myWeakStore);
+      },
+      dropWeakStore: () => store.delete('weakstore'),
+    });
+  }
+
+  const makeNS = () => ({ buildRootObject });
+  const ls = makeLiveSlots(syscall, 'vatA', {}, {}, gcTools, undefined, makeNS);
+  const { dispatch, testHooks } = ls;
+  const { possiblyDeadSet, possiblyRetiredSet, slotToVal } = testHooks;
+  await dispatch(makeStartVat(kser()));
+
+  // step 1 (setup): store, kill WeakSetStore representative, BOYD
+  await dispatch(makeMessage('o+0', 'store', [kslot('o-1')]));
+  t.truthy(myPresence);
+  gcTools.kill(myWeakStore);
+  gcTools.flushAllFRs();
+  await dispatch(makeBringOutYourDead());
+  log.length = 0;
+
+  // myPresence vref is held by the Presence, and recognized by myWeakStore
+  t.is(possiblyDeadSet.size, 0);
+  t.is(possiblyRetiredSet.size, 0);
+
+  console.log(`-- starting`);
+  // step 2: delete vdata ref to weakstore, make myPresence COLLECTED
+  await dispatch(makeMessage('o+0', 'dropWeakStore', []));
+  gcTools.kill(myPresence)
+  log.length = 0;
+  // weakstore is possiblyDead (NARRATORS VOICE: it's dead). Presence
+  // is not, because the finalizer hasn't run.
+  t.is(possiblyDeadSet.size, 1);
+  t.is(possiblyRetiredSet.size, 0);
+  t.not([...possiblyDeadSet][0], 'o-1');
+  // the empty weakref is still there
+  t.true(slotToVal.has('o-1'));
+
+
+  // step 3: BOYD. It will collect myWeakStore on the first pass,
+  // whose deleter should clear all entries, which will add its
+  // recognized vrefs to possiblyRetiredSet. The post-pass will check
+  // o-1 for reachability with slotToVal.has, and because that says it
+  // is reachable, it will not be retired (even though it has no
+  // recognizer by now)
+  console.log(`-- doing first BOYD`);
+  await dispatch(makeBringOutYourDead());
+  t.deepEqual(justGC(log), []);
+  log.length = 0;
+
+  // eventually, the finalizer runs
+  gcTools.flushAllFRs();
+
+  console.log(`-- doing second BOYD`);
+  // *now* a BOYD will drop+retire
+  await dispatch(makeBringOutYourDead());
+  console.log(log);
+  t.deepEqual(justGC(log), [
+    { type: 'dropImports', slots: ['o-1'] },
+    { type: 'retireImports', slots: ['o-1'] },
+  ]);
+});
+
+*/
diff --git a/packages/swingset-liveslots/test/liveslots-real-gc.test.js b/packages/swingset-liveslots/test/liveslots-real-gc.test.js
index 8832eb04546..9f29bb57970 100644
--- a/packages/swingset-liveslots/test/liveslots-real-gc.test.js
+++ b/packages/swingset-liveslots/test/liveslots-real-gc.test.js
@@ -253,7 +253,7 @@ test.serial('GC dispatch.retireExports', async t => {
   t.deepEqual(log, []);
 });
 
-test.serial('GC dispatch.dropExports', async t => {
+test.serial.failing('GC dispatch.dropExports', async t => {
   const { log, syscall } = buildSyscall();
   let collected;
   function build(_vatPowers) {
@@ -308,6 +308,14 @@ test.serial('GC dispatch.dropExports', async t => {
   // that should allow ex1 to be collected
   t.true(collected.result);
 
+  // upon collection, the vat should scan for local recognizers (weak
+  // collection keys) in case any need to be dropped, and find none
+  t.deepEqual(log.shift(), {
+    type: 'vatstoreGetNextKey',
+    priorKey: `vom.ir.${ex1}|`,
+    result: 'vom.rc.o+d6/1',
+  });
+
   // and once it's collected, the vat should emit `syscall.retireExport`
   // because nobody else will be able to recognize it again
   const l2 = log.shift();
@@ -318,7 +326,7 @@ test.serial('GC dispatch.dropExports', async t => {
   t.deepEqual(log, []);
 });
 
-test.serial(
+test.serial.failing(
   'GC dispatch.retireExports inhibits syscall.retireExports',
   async t => {
     const { log, syscall } = buildSyscall();
@@ -394,8 +402,16 @@ test.serial(
     // which should let the export be collected
     t.true(collected.result);
 
-    // the vat should *not* emit `syscall.retireExport`, because it already
-    // received a dispatch.retireExport
+    // the vat should scan for local recognizers (weak collection
+    // keys) in case any need to be dropped, and find none
+    t.deepEqual(log.shift(), {
+      type: 'vatstoreGetNextKey',
+      priorKey: 'vom.ir.o+10|',
+      result: 'vom.rc.o+d6/1',
+    });
+
+    // the vat should *not* emit `syscall.retireExport`, because it
+    // already received a dispatch.retireExport
     t.deepEqual(log, []);
   },
 );

From 7a0917583c1d87de3b4619b633fb6d641bde656c Mon Sep 17 00:00:00 2001
From: Brian Warner <warner@lothar.com>
Date: Thu, 29 Aug 2024 13:44:12 -0700
Subject: [PATCH 3/4] fix(liveslots): rewrite scanForDeadObjects to avoid
 retire-without-drop

This rewrites scanForDeadObjects(), which is called during
dispatch.bringOutYourDead to process possiblyDeadSet and
possiblyRetiredSet. The new flow should be easier to review and
understand.

The main behavioral difference is to fix a bug (#9939) in which a vref
that appears in possiblyRetiredSet (because e.g. a weak collection was
deleted, which was using that vref as a key), but which 1: lacks a RAM
pillar (Presence object) and 2: was not dropped in this BOYD (e.g. it
has a vdata pillar), used to be sent to the kernel in a bogus
`syscall.retireImports()` call. Because this vref was not previously
dropped by the vat (syscall.dropImports()), this was a vat-fatal
error.

The new code will only retire such a Presence vref if it was not
reachable by the vat.

The new tests are marked as expected to pass again.

thanks @mhofman and @gibson042 for recommendations

fixes #9939
---
 packages/SwingSet/test/gc/gc-vat.test.js      |   2 +-
 packages/swingset-liveslots/src/boyd-gc.js    | 473 ++++++++++++++----
 packages/swingset-liveslots/src/liveslots.js  |   1 -
 .../test/dropped-weakset-9939.test.js         |   2 +-
 .../test/gc-before-finalizer.test.js          |   2 +-
 .../test/liveslots-real-gc.test.js            |   4 +-
 6 files changed, 393 insertions(+), 91 deletions(-)

diff --git a/packages/SwingSet/test/gc/gc-vat.test.js b/packages/SwingSet/test/gc/gc-vat.test.js
index 8c57f797a75..617e4b07a7e 100644
--- a/packages/SwingSet/test/gc/gc-vat.test.js
+++ b/packages/SwingSet/test/gc/gc-vat.test.js
@@ -160,7 +160,7 @@ test('forward to fake zoe', async t => {
 });
 
 // see #9939
-test.failing('drop without retire', async t => {
+test('drop without retire', async t => {
   let targetID;
   let didBOYD = false;
   // const msgs = ['deliver', 'deliver-result', 'syscall', 'syscall-result'];
diff --git a/packages/swingset-liveslots/src/boyd-gc.js b/packages/swingset-liveslots/src/boyd-gc.js
index aedca2b8fe8..8f6c341039c 100644
--- a/packages/swingset-liveslots/src/boyd-gc.js
+++ b/packages/swingset-liveslots/src/boyd-gc.js
@@ -1,7 +1,8 @@
-import { Fail } from '@endo/errors';
 import { parseVatSlot } from './parseVatSlots.js';
 
 /*
+  Theory of Operation (imports/Presences)
+
     Imports are in one of 5 states: UNKNOWN, REACHABLE, UNREACHABLE,
     COLLECTED, FINALIZED. Note that there's no actual state machine with those
     values, and we can't observe all of the transitions from JavaScript, but
@@ -27,29 +28,247 @@ import { parseVatSlot } from './parseVatSlots.js';
 
     * slotToVal holds a WeakRef in [REACHABLE, UNREACHABLE, COLLECTED]
     * that WeakRef .deref()s into something in [REACHABLE, UNREACHABLE]
-    * deadSet holds the vref only in FINALIZED
-    * re-introduction must ensure the vref is not in the deadSet
+    * possiblyDeadSet holds the vref only in FINALIZED
+    * (TODO) re-introduction could remove the vref from possiblyDeadSet
 
     Each state thus has a set of perhaps-measurable properties:
 
-    * UNKNOWN: slotToVal[baseRef] is missing, baseRef not in deadSet
+    * UNKNOWN: slotToVal[baseRef] is missing, baseRef not in possiblyDeadSet
     * REACHABLE: slotToVal has live weakref, userspace can reach
     * UNREACHABLE: slotToVal has live weakref, userspace cannot reach
     * COLLECTED: slotToVal[baseRef] has dead weakref
-    * FINALIZED: slotToVal[baseRef] is missing, baseRef is in deadSet
+    * FINALIZED: slotToVal[baseRef] is missing, baseRef is in possiblyDeadSet
 
     Our finalizer callback is queued by the engine's transition from
     UNREACHABLE to COLLECTED, but the baseRef might be re-introduced before the
     callback has a chance to run. There might even be multiple copies of the
     finalizer callback queued. So the callback must deduce the current state
     and only perform cleanup (i.e. delete the slotToVal entry and add the
-    baseRef to the deadSet) in the COLLECTED state.
+    baseRef to possiblyDeadSet) in the COLLECTED state.
+
+    Our general rule is "trust the finalizer". The GC code below
+    considers a Presence to be reachable (the vref's "RAM pillar"
+    remains "up") until it moves to the FINALIZED state. We do this to
+    avoid race conditions between some other pillar dropping (and a
+    BOYD sampling the WeakRef) while it is in the COLLECTED state. If
+    we treated COLLECTED as unreachable,, then the subsequent
+    finalizer callback would examine the vref a second time,
+    potentially causing a vat-fatal double syscall.dropImports. This
+    rule may change if/when we use FinalizationRegistry better, by
+    explicitly de-registering the vref when we drop it, which JS
+    guarantees will remove and pending callback from the queue. This
+    may help us avoid probing the WeakRef during BOYD (instead relying
+    upon the fired/not-fired state of the FR), since that probing can
+    cause engines to retain objects longer than necessary.
+*/
+
+/*
+  Additional notes:
+
+  There are three categories of vrefs:
+  * Presence-style (o-NN, imports, durable)
+  * Remotable-style (o+NN, exportable, ephemeral)
+  * Representative-style (o+vKK/II or o+dKK/II, exportable, virtual/durable)
+
+  We don't necessarily have a Presence for every o-NN vref that the
+  vat can reach, because the vref might be held in virtual/durable
+  data ("vdata") while the in-RAM `Presence` object was
+  dropped. Likewise the in-RAM `Representative` can be dropped while
+  the o+dKK/II vref is kept REACHABLE by either vdata or an export to
+  the kernel. We *do* always have a Remotable for every o+NN vref that
+  the vat knows about, because Remotables are ephemeral.
+
+  We aren't recording any information about the kernel-facing import
+  status (c-list state) for Presence-style vrefs (o-NN), so we rely
+  upon the invariant that you only add a vref to possiblyDeadSet if it
+  was REACHABLE first. That way, possiblyDeadSet.has(vref) means that
+  the import status was REACHABLE. Likewise, code should only add to
+  possiblyRetiredSet if the vref was at least RECOGNIZABLE
+  beforehand. This helps us avoid a vat-fatal duplicate dropImports or
+  retireImports syscall.
+
+  For imports, the lifetime is controlled by the upstream vat: we
+  might drop REACHABLE today and maintain RECOGNIZABLE for days until
+  the object is finally retired. For exports *we* control the
+  lifetime, so when we determine an export is no longer REACHABLE, we
+  delete it and retire the vref immediately, and it does not
+  observably linger in the RECOGNIZABLE state. This simplifies our
+  tracking, and allows the deletion of Remotables and
+  Representative-type vrefs to be idempotent.
+
+  Each vref's reachability status is determined by a set of
+  "pillars". For Presence-style vrefs, there are two: the RAM pillar
+  (the `Presence` object), and the vdata pillar. The vdata pillar is
+  tracked in a vatstore key named `vom.rc.${vref}`, which stores the
+  reachable/recognizable refcounts.
+
+  For Representative-style vrefs, we add the export-status pillar,
+  because anything that we've exported to the kernel must be kept
+  alive until the kernel issues a dispatch.dropExports. That gives us
+  three pillars:
+  * the RAM pillar is the `Representative` object
+  * the vdata pillar is stored in `vom.rc.${vref}`
+  * the export-status pillar is stored in `vom.es.${vref}`
+
+  Remotables have only the RAM pillar. When a Remotable-style vref is
+  exported to the kernel, the Remotable is added to the
+  `exportedRemotables` set. And when it is stored in vdata, it appears
+  as a key of the `remotableRefCounts` map. That keeps the Remotable
+  itself alive until the other reachability pathways have gone
+  away. We don't do this for Representatives because it would violate
+  our "don't use RAM for inactive virtual objects" rule.
+
+  When an imported Presence becomes UNREACHABLE, it might still be
+  RECOGNIZABLE, by virtue of being the key of one or more weak
+  collections. If not, it might transit from REACHABLE directly to
+  NONE. The code that reacts to the UNREACHABLE transition must check
+  for recognizers, and do a retireImports right away if there are
+  none. Otherwise, recognizer will remain until either the kernel
+  retires the object (dispatch.retireImports), or the weak collection
+  is deleted, in which case possiblyRetiredSet will be updated with
+  the vref that might no longer be recognized. There will be a race
+  between us ceasing to recognize the vref (which should trigger a
+  syscall.retireImports), and the kernel telling us the object has
+  been deleted (via dispatch.retireImports). Either one must inhibit
+  the other.
+
+  possiblyRetiredSet only cares about Presence-style vrefs, because
+  they represent imports, whose lifetime is not under our control. The
+  collection-deletion code will add Remotable- and Representative-
+  style vrefs in possiblyRetiredSet, but we can remove and ignore
+  them.
+
+  We use slotToVal.has(vref) everywhere for our "is it still
+  reachable" check, which returns true for the REACHABLE / UNREACHABLE
+  / COLLECTED states, and false for the FINALIZED state. In contrast,
+  getValForSlot(vref) returns false for both COLLECTED and
+  FINALIZED. We want COLLECTED to qualify as "still reachable" because
+  it means there's still a finalizer callback queued, which will be
+  run eventually, and we need that callback to not trigger a duplicate
+  drop. We use slotToVal.has() in the possiblyRetiredSet loop (to
+  inhibit retirement of imported vrefs whose Presences are in the
+  COLLECTED state, and which just lost a recognizer), because
+  getValForSlot would allow such COLLECTED vrefs to be retired, even
+  before the finalizer had fired and could do a dropImports.
 
+  When we decide to delete a virtual object, we will delete its
+  `state`, decrementing the refcounts of any objects therein, which
+  might shake loose more data. So we keep looping until we've stopped
+  adding things to possiblyDeadSet. The same can happen when we use
+  vrm.ceaseRecognition() to delete any weak collection values keyed by
+  it. We also call ceaseRecognition when we realize that a Remotable
+  has been deleted. But the possiblyDeadSet processing loop cannot
+  make the decision to retire a Presence-style vref: those are deleted
+  outside our vat, and the kernel notifies us of the vref's retirement
+  with dispatch.retireImports (which also calls ceaseRecognition). The
+  only thing possiblyDeadSet can tell us about Presences is that our
+  vat can no longer *reach* the vref, which means we need to do a
+  syscall.dropImports, which cannot immediately release more data.
+
+  When the kernel sends us a dispatch.bringOutYourDead (or "BOYD" for
+  short), this scanForDeadObjects() will be called. This is the only
+  appropriate time for the syscall behavior to depend upon engine GC
+  behavior: during all other deliveries, we want the syscalls to be a
+  deterministic function of delivery contents, userspace behavior, and
+  vatstore data.
+
+  During BOYD, we still try to minimize the variation of behavior as
+  much as possible. The first step is to ask the engine to perform a
+  full GC sweep, to collect any remaining UNREACHABLE objects, and
+  allow the finalizer callbacks to run before looking at the
+  results. We also sort the vrefs before processing them, to remove
+  sensitivity to the exact order of finalizer invocation.
+
+  That makes BOYD a safe time to look inside WeakRefs and make
+  syscalls based on the contents, or to read the sets that are
+  modified during FinalizationRegistry callbacks and make syscalls to
+  query their state further. This this is the only time we examine and
+  clear possiblyDeadSet and possiblyRetiredSet, or probe
+  slotToVal. Outside of BOYD, in convertSlotToVal, we must probe the
+  WeakRefs to see whether we must build a new Presence or
+  Representative, or not, but we have carefully designed that code to
+  avoid making syscalls during the unmarshalling process, so the only
+  consequence of GC differences should be differences in metering and
+  memory allocation patterns.
+
+  Our general strategy is to look at the baseRefs/vrefs whose state
+  might have changed, determine their new reachability /
+  recognizability status, and then resolve any discrepancies between
+  that status and that of other parties who need to match.
+
+  The kernel is one such party. If the kernel thinks we can reach an
+  imported o-NN vref, but we've now determined that we cannot, we must
+  send a syscall.dropImports to resolve the difference. Once sent, the
+  kernel will update our c-list entry to reflect the unreachable (but
+  still recognizable) status. Likewise, if the kernel thinks *it* can
+  recognize an exported o+NN vref, but we've just retired it, we need
+  to update the kernel with a syscall.retireExports, so it can notify
+  downstream vats that have weak collections with our vref as a key.
+
+  The DB-backed `state` of a virtual object is another such party. If
+  the object is unreachable, but still has state data, we must delete
+  that state, and decrement refcounts it might have held.
+
+  Our plan is summarized as:
+   * outer loop
+     * gcAndFinalize
+     * sort possiblyDeadSet, examine each by type
+       * presence (vref):
+         * if unreachable:
+           * dropImports
+           * add to possiblyRetiredSet
+       * remotable (vref):
+         * if unreachable:
+           * retireExports if kernelRecognizableRemotables
+           * vrm.ceaseRecognition
+       * VOM (baseref)
+         * if unreachable:
+           * deleteVirtualObject (and retireExports retirees)
+       * all: remove from possiblyDeadSet
+     * repeat loop if gcAgain or possiblyDeadSet.size > 0
+   * now sort and process possiblyRetiredSet. for each:
+     * ignore unless presence
+     * if unreachable and unrecognizable: retireImport
+       (that's a duplicate reachability check, but note the answer might
+       be different now)
+*/
+
+/*
+  BaseRef vs vref
+
+  For multi-faceted virtual/durable objects (eg `defineKind()` with
+  faceted `behavior` argument), each time userspace create a new
+  instance, we create a full "cohort" of facets, passing a record of
+  Representative objects (one per facet) back to the caller. Each
+  facet gets is own vref, but they all share a common prefix, known as
+  the "baseRef". For example, `o+d44/2` is a BaseRef for a cohort, the
+  second instance created of the `o+d44` Kind, whose individual facets
+  would have vrefs of `o+d44/2:0` and `o+d44/2:1`.
+
+  We use a WeakMap to ensure that holding any facet will keep all the
+  others alive, so the cohort lives and dies as a group. The GC
+  tracking code needs to track the whole cohort at once, not the
+  individual facets, and any data structure which refers to cohorts
+  will use the BaseRef instead of a single vref. So `slotToVal` is
+  keyed by a BaseRef, and its values are a cohort of facets. But
+  `valToSlot` is keyed by the facets, and its values are the
+  individual facet's vref.
+
+  Most of the GC-related APIs that appear here take vrefs, but the
+  exceptions are:
+  * slotToVal is keyed by baseRef
+  * possiblyDeadSet holds baseRefs, that's what our WeakRefs track
+  * vrm.isVirtualObjectReachable takes baseRef
+  * vrm.deleteVirtualObject takes baseRef, returns [bool, retireees=vrefs]
+  * vrm.ceaseRecognition takes either baseRef or vref
+    (if given a baseRef, it will process all the facets)
+
+  For Presence- and Remotable- style objects, the vref and baseref are
+  the same.
 */
 
 export const makeBOYDKit = ({
   gcTools,
-  getValForSlot,
   slotToVal,
   vrm,
   kernelRecognizableRemotables,
@@ -57,108 +276,192 @@ export const makeBOYDKit = ({
   possiblyDeadSet,
   possiblyRetiredSet,
 }) => {
+  // Presence (o-NN) lifetimes are controlled by the upstream vat, or
+  // the kernel. If the vref was in possiblyDeadSet, then it *was*
+  // reachable before, so we can safely presume the kernel to think we
+  // can reach it.
+
+  const checkImportPresence = vref => {
+    // RAM pillar || vdata pillar
+    const isReachable = slotToVal.has(vref) || vrm.isPresenceReachable(vref);
+    // use slotToVal.has, not getSlotForVal(), to avoid duplicate drop
+    let dropImport;
+    if (!isReachable) {
+      dropImport = vref;
+    }
+    return { dropImport };
+  };
+
+  // Remotable (o+NN) lifetimes are controlled by us: we delete/retire
+  // the object as soon as it becomes unreachable. We only track the
+  // Remotable/Far object (the RAM pillar) directly: exports retain
+  // the Remotable in the exportedRemotables set, and vdata retains it
+  // as keys of the remotableRefCounts map. So when we decide the vref
+  // is unreachable, the Remotable is already gone, and it had no
+  // other data we need to delete, so our task is to remove any local
+  // recognition records, and to inform the kernel with a
+  // retireExports if kernelRecognizableRemotables says that the
+  // kernel still cares.
+
+  // note: we track export status for remotables in the
+  // kernelRecognizableRemotables set, not vom.es.VREF records. We
+  // don't currently track recognition records with
+  // vom.ir.VREF|COLLECTION, but we should, see #9956
+
+  const checkExportRemotable = vref => {
+    let gcAgain = false;
+    let exportsToRetire = [];
+
+    // Remotables have only the RAM pillar
+    const isReachable = slotToVal.has(vref);
+    if (!isReachable) {
+      // We own the object, so retire it immediately. If the kernel
+      // was recognizing it, we tell them it is now retired
+      if (kernelRecognizableRemotables.has(vref)) {
+        kernelRecognizableRemotables.delete(vref);
+        exportsToRetire = [vref];
+        // the kernel must not have been able to reach the object,
+        // else it would still be pinned by exportedRemotables
+      }
+      // and remove it from any local weak collections
+      gcAgain = vrm.ceaseRecognition(vref);
+    }
+    return { gcAgain, exportsToRetire };
+  };
+
+  // Representative (o+dNN/II or o+vNN/II) lifetimes are also
+  // controlled by us. We allow the Representative object to go away
+  // without deleting the vref, so we must track all three pillars:
+  // Representative (RAM), export, and vdata. When we decide the vref
+  // is unreachable, we must delete the virtual object's state, as
+  // well as retiring the object (by telling the kernel it has been
+  // retired, if the kernel cares, and removing any local recognition
+  // records).
+
+  const checkExportRepresentative = baseRef => {
+    // RAM pillar || (vdata pillar || export pillar)
+    const isReachable =
+      slotToVal.has(baseRef) || vrm.isVirtualObjectReachable(baseRef);
+    let gcAgain = false;
+    let exportsToRetire = [];
+    if (!isReachable) {
+      // again, we own the object, so we retire it immediately
+      [gcAgain, exportsToRetire] = vrm.deleteVirtualObject(baseRef);
+    }
+    return { gcAgain, exportsToRetire };
+  };
+
   const scanForDeadObjects = async () => {
-    // `possiblyDeadSet` accumulates vrefs which have lost a supporting
-    // pillar (in-memory, export, or virtualized data refcount) since the
-    // last call to scanForDeadObjects. The vref might still be supported
-    // by a remaining pillar, or the pillar which was dropped might be back
-    // (e.g., given a new in-memory manifestation).
+    await null;
+
+    // `possiblyDeadSet` holds vrefs which have lost a supporting
+    // pillar (in-memory, export, or virtualized data refcount) since
+    // the last call to scanForDeadObjects. The vref might still be
+    // supported by a remaining pillar, or the pillar which was
+    // dropped might have been restored (e.g., re-exported after a
+    // drop, or given a new in-memory manifestation).
 
     const importsToDrop = new Set();
     const importsToRetire = new Set();
     const exportsToRetire = new Set();
-    let doMore;
-    await null;
-    do {
-      doMore = false;
+    let gcAgain = false;
 
+    do {
+      gcAgain = false;
       await gcTools.gcAndFinalize();
 
-      // possiblyDeadSet contains a baseref for everything (Presences,
-      // Remotables, Representatives) that might have lost a
-      // pillar. The object might still be supported by other pillars,
-      // and the lost pillar might have been reinstantiated by the
-      // time we get here. The first step is to filter this down to a
-      // list of definitely dead baserefs.
+      // process a sorted list of vref/baseRefs we need to check for
+      // reachability, one at a time
 
-      const deadSet = new Set();
+      for (const vrefOrBaseRef of [...possiblyDeadSet].sort()) {
+        // remove the vref now, but some deleteVirtualObject might
+        // shake it loose again for a future pass to investigate
+        possiblyDeadSet.delete(vrefOrBaseRef);
+
+        const parsed = parseVatSlot(vrefOrBaseRef);
+        assert.equal(parsed.type, 'object', vrefOrBaseRef);
+
+        let res = {};
+        if (parsed.virtual || parsed.durable) {
+          const baseRef = vrefOrBaseRef;
+          res = checkExportRepresentative(baseRef);
+        } else if (parsed.allocatedByVat) {
+          const vref = vrefOrBaseRef;
+          res = checkExportRemotable(vref);
+        } else {
+          const vref = vrefOrBaseRef;
+          res = checkImportPresence(vref);
+        }
 
-      for (const baseRef of possiblyDeadSet) {
-        if (slotToVal.has(baseRef)) {
-          continue; // RAM pillar remains
+        // prepare our end-of-crank syscalls
+        if (res.dropImport) {
+          importsToDrop.add(res.dropImport);
+          possiblyRetiredSet.add(res.dropImport);
         }
-        const { virtual, durable, type } = parseVatSlot(baseRef);
-        assert(type === 'object', `unprepared to track ${type}`);
-        if (virtual || durable) {
-          if (vrm.isVirtualObjectReachable(baseRef)) {
-            continue; // vdata or export pillar remains
-          }
+        for (const facetVref of res.exportsToRetire || []) {
+          exportsToRetire.add(facetVref);
         }
-        deadSet.add(baseRef);
+        gcAgain ||= !!res.gcAgain;
       }
-      possiblyDeadSet.clear();
-
-      // deadSet now contains objects which are certainly dead
 
-      // possiblyRetiredSet holds (a subset of??) baserefs which have
-      // lost a recognizer recently. TODO recheck this
+      // Deleting virtual object state, or freeing weak-keyed
+      // collection entries, might shake loose more
+      // objects. possiblyDeadSet and possiblyRetiredSet are added
+      // when vdata vrefs are decreffed to zero, and gcAgain means that
+      // something in RAM might now be free. In both cases we should
+      // do another pass, including gcAndFinalize(), until we've
+      // cleared everything we can.
+    } while (possiblyDeadSet.size > 0 || gcAgain);
 
-      for (const vref of possiblyRetiredSet) {
-        if (!getValForSlot(vref) && !deadSet.has(vref)) {
-          // Don't retire things that haven't yet made the transition to dead,
-          // i.e., always drop before retiring
+    // Now we process potential retirements, by which we really mean
+    // de-recognitions, where this vat has ceased to even recognize a
+    // previously unreachable-yet-recognizable
+    // vref. addToPossiblyRetiredSet() is called from
+    // ceaseRecognition() when a recognizer goes away, such when a
+    // weak collection being deleted and it no longer recognizes all
+    // its former keys. ceaseRecognition() can be called from the loop
+    // above (when a Remotable-style object is deleted, or from within
+    // deleteVirtualObject), or in response to a retireImport()
+    // delivery. We assume possiblyRetiredSet is given vrefs of all
+    // sorts, but we only care about Presence-type, because we must do
+    // retireImports for them: the kernel doesn't care if/when we stop
+    // recognizing our own (maybe-exported) Remotable- and
+    // Representative- type vrefs.
 
-          if (!vrm.isVrefRecognizable(vref)) {
-            importsToRetire.add(vref);
-          }
+    for (const vref of [...possiblyRetiredSet].sort()) {
+      possiblyRetiredSet.delete(vref);
+      const parsed = parseVatSlot(vref);
+      assert.equal(parsed.type, 'object', vref);
+      // not allocated by us? that makes it a Presence-type
+      if (!parsed.allocatedByVat) {
+        // if we're dropping the vref, checkImportPresence() already
+        // did our isReachable check, so we can safely skip it (and
+        // save a vatstore syscall)
+        const isReachable =
+          !importsToDrop.has(vref) &&
+          (slotToVal.has(vref) || vrm.isPresenceReachable(vref));
+        // note: the old version used getValForSlot, which treats
+        // COLLECTED as unreachable, which would allow a retirement at
+        // the wrong time (before the finalizer fires and does the
+        // dropImport)
+        const isRecognizable = isReachable || vrm.isVrefRecognizable(vref);
+        if (!isRecognizable) {
+          importsToRetire.add(vref);
         }
       }
-      possiblyRetiredSet.clear();
-
-      const deadBaseRefs = Array.from(deadSet);
-      deadBaseRefs.sort();
-      for (const baseRef of deadBaseRefs) {
-        const { virtual, durable, allocatedByVat, type } =
-          parseVatSlot(baseRef);
-        type === 'object' || Fail`unprepared to track ${type}`;
-        if (virtual || durable) {
-          // Representative: send nothing, but perform refcount checking
-
-          const [gcAgain, retirees] = vrm.deleteVirtualObject(baseRef);
-          if (retirees) {
-            retirees.map(retiree => exportsToRetire.add(retiree));
-          }
-          doMore = doMore || gcAgain;
-        } else if (allocatedByVat) {
-          // Remotable: send retireExport
-          // for remotables, vref === baseRef
-          if (kernelRecognizableRemotables.has(baseRef)) {
-            kernelRecognizableRemotables.delete(baseRef);
-            exportsToRetire.add(baseRef);
-          }
-        } else {
-          // Presence: send dropImport unless reachable by VOM
-          // eslint-disable-next-line no-lonely-if
-          if (!vrm.isPresenceReachable(baseRef)) {
-            importsToDrop.add(baseRef);
-
-            if (!vrm.isVrefRecognizable(baseRef)) {
-              // for presences, baseRef === vref
-              importsToRetire.add(baseRef);
-            }
-          }
-        }
-      }
-    } while (possiblyDeadSet.size > 0 || possiblyRetiredSet.size > 0 || doMore);
+    }
+
+    // note that retiring Presence-type vrefs cannot shake loose any
+    // local data, so we don't need to loop back around
 
     if (importsToDrop.size) {
-      syscall.dropImports(Array.from(importsToDrop).sort());
+      syscall.dropImports([...importsToDrop].sort());
     }
     if (importsToRetire.size) {
-      syscall.retireImports(Array.from(importsToRetire).sort());
+      syscall.retireImports([...importsToRetire].sort());
     }
     if (exportsToRetire.size) {
-      syscall.retireExports(Array.from(exportsToRetire).sort());
+      syscall.retireExports([...exportsToRetire].sort());
     }
   };
 
diff --git a/packages/swingset-liveslots/src/liveslots.js b/packages/swingset-liveslots/src/liveslots.js
index a9db3c100e2..10747156c2d 100644
--- a/packages/swingset-liveslots/src/liveslots.js
+++ b/packages/swingset-liveslots/src/liveslots.js
@@ -1395,7 +1395,6 @@ function build(
 
   const { scanForDeadObjects } = makeBOYDKit({
     gcTools,
-    getValForSlot,
     slotToVal,
     vrm,
     kernelRecognizableRemotables,
diff --git a/packages/swingset-liveslots/test/dropped-weakset-9939.test.js b/packages/swingset-liveslots/test/dropped-weakset-9939.test.js
index fe8fe20cd1f..251bfcaff4e 100644
--- a/packages/swingset-liveslots/test/dropped-weakset-9939.test.js
+++ b/packages/swingset-liveslots/test/dropped-weakset-9939.test.js
@@ -8,7 +8,7 @@ import { makeMockGC } from './mock-gc.js';
 
 // Test for https://github.com/Agoric/agoric-sdk/issues/9939
 
-test.failing('weakset deletion vs retire', async t => {
+test('weakset deletion vs retire', async t => {
   const { syscall, log } = buildSyscall();
   const gcTools = makeMockGC();
 
diff --git a/packages/swingset-liveslots/test/gc-before-finalizer.test.js b/packages/swingset-liveslots/test/gc-before-finalizer.test.js
index 393ed248841..b0b5f7c7c19 100644
--- a/packages/swingset-liveslots/test/gc-before-finalizer.test.js
+++ b/packages/swingset-liveslots/test/gc-before-finalizer.test.js
@@ -14,7 +14,7 @@ const justGC = log =>
       l.type === 'retireExports',
   );
 
-test.failing('presence in COLLECTED state is not dropped yet', async t => {
+test('presence in COLLECTED state is not dropped yet', async t => {
   const { syscall, log } = buildSyscall();
   const gcTools = makeMockGC();
 
diff --git a/packages/swingset-liveslots/test/liveslots-real-gc.test.js b/packages/swingset-liveslots/test/liveslots-real-gc.test.js
index 9f29bb57970..80641d82a44 100644
--- a/packages/swingset-liveslots/test/liveslots-real-gc.test.js
+++ b/packages/swingset-liveslots/test/liveslots-real-gc.test.js
@@ -253,7 +253,7 @@ test.serial('GC dispatch.retireExports', async t => {
   t.deepEqual(log, []);
 });
 
-test.serial.failing('GC dispatch.dropExports', async t => {
+test.serial('GC dispatch.dropExports', async t => {
   const { log, syscall } = buildSyscall();
   let collected;
   function build(_vatPowers) {
@@ -326,7 +326,7 @@ test.serial.failing('GC dispatch.dropExports', async t => {
   t.deepEqual(log, []);
 });
 
-test.serial.failing(
+test.serial(
   'GC dispatch.retireExports inhibits syscall.retireExports',
   async t => {
     const { log, syscall } = buildSyscall();

From c04516352292d263ead3641720606a8141de17d5 Mon Sep 17 00:00:00 2001
From: Brian Warner <warner@lothar.com>
Date: Thu, 29 Aug 2024 13:20:24 -0700
Subject: [PATCH 4/4] Revert "test(boot): skip `basicFlows` test until #9939 is
 fixed"

This reverts commit 064ff1ad395856111b4d82bb68d8ab92f8d83f12.

Now that the underlying issue is fixed, we can re-enable this
formerly-flaky test. Thanks @michaelfig for your patience.
---
 packages/boot/test/orchestration/restart-contracts.test.ts | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/packages/boot/test/orchestration/restart-contracts.test.ts b/packages/boot/test/orchestration/restart-contracts.test.ts
index fe6104b6b12..dcf79a6db83 100644
--- a/packages/boot/test/orchestration/restart-contracts.test.ts
+++ b/packages/boot/test/orchestration/restart-contracts.test.ts
@@ -183,9 +183,7 @@ test.serial('stakeAtom', async t => {
 // restarting that one. For them to share bootstrap they'll each need a unique
 // instance name, which will require paramatizing the the two builders scripts
 // and the two core-eval functions.
-//
-// TODO(#9939): Flaky under Node.js until liveslots problem exposed by vows is fixed.
-test.serial.skip('basicFlows', async t => {
+test.serial('basicFlows', async t => {
   const {
     walletFactoryDriver,
     buildProposal,