From 9dcdabbd15b5f8a44ba0093578277094ab66ad4f Mon Sep 17 00:00:00 2001 From: Brian Warner Date: Thu, 27 Jun 2024 12:12:51 -0700 Subject: [PATCH] fix(swingset): retire unreachable orphans If a kernel object ("koid", the object subset of krefs) is unreachable, and then becomes orphaned (either because the owning vat was terminated, or called `syscall.abandonExports`, or was upgraded and the koid was ephemeral), then retire it immediately. The argument is that the previously-owning vat can never again talk about the object, so it can never become reachable again, which is normally the point at which the owning vat would retire it. But because the owning vat is dead, it can't retire the koid by itself, the kernel needs to do the retirement on the vat's behalf. We now consolidate retirement responsibilities into processRefcounts(): when terminateVat or syscall.abandonExports use abandonKernelObjects() to mark a kref as orphaned, it also adds the kref to maybeFreeKrefs, and then processRefcounts() is responsible for noticing the kref is both orphaned and unreachable, and then notifying any importers of its retirement. I double-checked that cleanupAfterTerminatedVat will always be followed by a processRefcounts(), by virtue of either being called from processDeliveryMessage (in the crankResults.terminate clause), or from within a device invocation syscall (which only happens during a delivery, so follows the same path). We need this to ensure that any maybeFreeKrefs created by the cleanup's abandonKernelObjects() will get processed promptly. This also changes getObjectRefCount() to tolerate deleted krefs (i.e. missing `koNN.refCount`) by just returning 0,0. This fixes a potential kernel panic in the new approach, when a kref is recognizable by one vat but only reachable by a send-message on the run-queue, then becomes unreachable as that message is delivered (the run-queue held the last strong reference), and if the target vat does syscall.exit during the delivery. The decref pushes the kref onto maybeFreeKrefs, the terminateVat retires the merely-recognizable now-orphaned kref, then processRefcounts used getObjectRefCount() to grab the refcount for the now-retired (and deleted) kref, which asserted that the koNN.refCount key still existed, which didn't. This occured in "zoe - secondPriceAuction -- valid input" unit test , where the contract did syscall.exit in response to a timer wake() message sent to a single-use wakeObj. Also rename abandonKernelObject back to orphanKernelObject, the name fits better now. closes #7212 --- packages/SwingSet/src/kernel/kernel.js | 2 +- packages/SwingSet/src/kernel/kernelSyscall.js | 4 +- .../SwingSet/src/kernel/state/kernelKeeper.js | 122 ++++++++++++++--- .../SwingSet/test/gc-kernel-orphan.test.js | 123 +++++++++++++++++- 4 files changed, 226 insertions(+), 25 deletions(-) diff --git a/packages/SwingSet/src/kernel/kernel.js b/packages/SwingSet/src/kernel/kernel.js index 634a04f8da6..eac3ba99d02 100644 --- a/packages/SwingSet/src/kernel/kernel.js +++ b/packages/SwingSet/src/kernel/kernel.js @@ -981,7 +981,7 @@ export default function buildKernel( ...kernelKeeper.enumerateNonDurableObjectExports(vatID), ]; for (const { kref } of abandonedObjects) { - kernelKeeper.abandonKernelObject(kref, vatID); + kernelKeeper.orphanKernelObject(kref, vatID); } // cleanup done, now we reset the worker to a clean state with no diff --git a/packages/SwingSet/src/kernel/kernelSyscall.js b/packages/SwingSet/src/kernel/kernelSyscall.js index 94b38eaebfc..573b2b351b7 100644 --- a/packages/SwingSet/src/kernel/kernelSyscall.js +++ b/packages/SwingSet/src/kernel/kernelSyscall.js @@ -185,9 +185,7 @@ export function makeKernelSyscallHandler(tools) { function abandonExports(vatID, koids) { Array.isArray(koids) || Fail`abandonExports given non-Array ${koids}`; for (const koid of koids) { - // note that this is effectful and also performed outside of a syscall - // by processUpgradeVat in {@link ./kernel.js} - kernelKeeper.abandonKernelObject(koid, vatID); + kernelKeeper.orphanKernelObject(koid, vatID); } return OKNULL; } diff --git a/packages/SwingSet/src/kernel/state/kernelKeeper.js b/packages/SwingSet/src/kernel/state/kernelKeeper.js index 86e50048878..b4204833e05 100644 --- a/packages/SwingSet/src/kernel/state/kernelKeeper.js +++ b/packages/SwingSet/src/kernel/state/kernelKeeper.js @@ -620,7 +620,9 @@ export default function makeKernelKeeper( function getObjectRefCount(kernelSlot) { const data = kvStore.get(`${kernelSlot}.refCount`); - data || Fail`getObjectRefCount(${kernelSlot}) was missing`; + if (!data) { + return { reachable: 0, recognizable: 0 }; + } const [reachable, recognizable] = commaSplit(data).map(Number); reachable <= recognizable || Fail`refmismatch(get) ${kernelSlot} ${reachable},${recognizable}`; @@ -721,7 +723,7 @@ export default function makeKernelKeeper( addGCActions(newActions); } - function abandonKernelObject(kref, oldVat) { + function orphanKernelObject(kref, oldVat) { // termination orphans all exports, upgrade orphans non-durable // exports, and syscall.abandonExports orphans specific ones const ownerKey = `${kref}.owner`; @@ -949,6 +951,14 @@ export default function makeKernelKeeper( * */ function cleanupAfterTerminatedVat(vatID, budget = undefined) { + // this is called from terminateVat, which is called from either: + // * end of processDeliveryMessage, if crankResults.terminate + // * device-vat-admin (when vat-v-a does adminNode.terminateVat) + // (which always happens inside processDeliveryMessage) + // so we're always followed by a call to processRefcounts, at + // end-of-delivery in processDeliveryMessage, after checking + // crankResults.terminate + insistVatID(vatID); let cleanups = 0; const work = { @@ -1018,7 +1028,7 @@ export default function makeKernelKeeper( assert(vref.startsWith('o+'), vref); const kref = kvStore.get(k); // note: adds to maybeFreeKrefs, deletes c-list and .owner - abandonKernelObject(kref, vatID); + orphanKernelObject(kref, vatID); work.exports += 1; if (spend()) { logWork(); @@ -1491,14 +1501,43 @@ export default function makeKernelKeeper( return false; } + // TODO (#9888): change the processRefcounts maybeFreeKrefs + // iteration to handle krefs that get added multiple times (while + // we're iterating), because we might do different work the second + // time around. The concerning scenario is: + // + // * kp2 has resolution data which references ko1 + // * somehow both kp2 and ko1 end up on maybeFreeKrefs, but ko1.reachable=1 + // (via kp2) + // * our iterator visits ko1 first, sees it is still reachable, ignores it + // * then the iterator visits kp2, decrefs its kp.data.slots + // * this pushes ko1 back onto maybeFreeKrefs + // * we need to examine ko1 again, so it can be released + // + // It could also happen if/when we implement #2069 auxdata: + // + // * ko2 has auxdata that references ko1 + // * both ko1 and ko2 end up on maybeFreeKrefs, but ko1.reachable = 1 + // (via the ko2 auxdata) + // * our iterator visits ko1 first, sees it is still reachable, ignores it + // * then the iterator visits ko2, does deleteKernelObject + // * this frees the auxdata, which pushes ko1 back onto maybeFreeKrefs + // * we need to examine ko1 again, so it can be released + // + // We should use something like an ordered Set, and a loop that does: + // * pop the first kref off + // * processes it (maybe adding more krefs) + // * repeats until the thing is empty + // Or maybe make a copy of maybeFreeKrefs at the start, clear the + // original, and wrap this in a loop that keeps going until + // maybeFreeKrefs is still empty at the end. Be sure to imagine a + // very deep linked list: don't just process it twice, keep + // processing until there's nothing left to do, otherwise we'll be + // leaving work for the next delivery. + function processRefcounts() { if (enableKernelGC) { const actions = new Set(); - // TODO (else buggy): change the iteration to handle krefs that get - // added multiple times (while we're iterating), because we might do - // different work the second time around. Something like an ordered - // Set, and a loop that: pops the first kref off, processes it (maybe - // adding more krefs), repeats until the thing is empty. for (const kref of maybeFreeKrefs.values()) { const { type } = parseKernelSlot(kref); if (type === 'promise') { @@ -1506,6 +1545,7 @@ export default function makeKernelKeeper( const kp = getKernelPromise(kpid); if (kp.refCount === 0) { let idx = 0; + // TODO (#9889) don't assume promise is settled for (const slot of kp.data.slots) { // Note: the following decrement can result in an addition to the // maybeFreeKrefs set, which we are in the midst of iterating. @@ -1516,27 +1556,69 @@ export default function makeKernelKeeper( deleteKernelPromise(kpid); } } + if (type === 'object') { const { reachable, recognizable } = getObjectRefCount(kref); if (reachable === 0) { - const ownerVatID = ownerOfKernelObject(kref); - if (ownerVatID) { + // We avoid ownerOfKernelObject(), which will report + // 'undefined' if the owner is dead (and being slowly + // deleted). Message delivery should use that, but not us. + const ownerKey = `${kref}.owner`; + let ownerVatID = kvStore.get(ownerKey); + const terminated = terminatedVats.includes(ownerVatID); + + // Some objects that are still owned, but the owning vat + // might still alive, or might be terminated and in the + // process of being deleted. These two clauses are + // mutually exclusive. + + if (ownerVatID && !terminated) { const vatKeeper = provideVatKeeper(ownerVatID); - const isReachable = vatKeeper.getReachableFlag(kref); - if (isReachable) { + const vatConsidersReachable = vatKeeper.getReachableFlag(kref); + if (vatConsidersReachable) { // the reachable count is zero, but the vat doesn't realize it actions.add(`${ownerVatID} dropExport ${kref}`); } if (recognizable === 0) { - // TODO: rethink this - // assert.equal(isReachable, false, `${kref} is reachable but not recognizable`); + // TODO: rethink this assert + // assert.equal(vatConsidersReachable, false, `${kref} is reachable but not recognizable`); actions.add(`${ownerVatID} retireExport ${kref}`); } - } else if (recognizable === 0) { - // unreachable, unrecognizable, orphaned: delete the - // empty refcount here, since we can't send a GC - // action without an ownerVatID - deleteKernelObject(kref); + } else if (ownerVatID && terminated) { + // When we're slowly deleting a vat, and one of its + // exports becomes unreferenced, we obviously must not + // send dropExports or retireExports into the dead vat. + // We fast-forward the abandonment that slow-deletion + // would have done, then treat the object as orphaned. + + const { vatSlot } = getReachableAndVatSlot(ownerVatID, kref); + // delete directly, not orphanKernelObject(), which + // would re-submit to maybeFreeKrefs + kvStore.delete(ownerKey); + kvStore.delete(`${ownerVatID}.c.${kref}`); + kvStore.delete(`${ownerVatID}.c.${vatSlot}`); + // now fall through to the orphaned case + ownerVatID = undefined; + } + + // Now handle objects which were orphaned. NOTE: this + // includes objects which were owned by a terminated (but + // not fully deleted) vat, where `ownerVatID` was cleared + // in the last line of that previous clause (the + // fall-through case). Don't try to change this `if + // (!ownerVatID)` into an `else if`: the two clauses are + // *not* mutually-exclusive. + + if (!ownerVatID) { + // orphaned and unreachable, so retire it. If the kref + // is recognizable, then we need retireKernelObjects() + // to scan for importers and send retireImports (and + // delete), else we can call deleteKernelObject directly + if (recognizable) { + retireKernelObjects([kref]); + } else { + deleteKernelObject(kref); + } } } } @@ -1830,7 +1912,7 @@ export default function makeKernelKeeper( ownerOfKernelDevice, kernelObjectExists, getImporters, - abandonKernelObject, + orphanKernelObject, retireKernelObjects, deleteKernelObject, pinObject, diff --git a/packages/SwingSet/test/gc-kernel-orphan.test.js b/packages/SwingSet/test/gc-kernel-orphan.test.js index 394a546f283..d6b30126493 100644 --- a/packages/SwingSet/test/gc-kernel-orphan.test.js +++ b/packages/SwingSet/test/gc-kernel-orphan.test.js @@ -257,7 +257,7 @@ for (const cause of ['abandon', 'terminate']) { // the fix was to change kernelKeeper.getRefCounts to handle a missing // koNN.refCounts by just returning 0,0 -test('termination plus maybeFreeKrefs', async t => { +test('termination plus maybeFreeKrefs - dropped', async t => { const { kernel, kvStore } = await makeKernel(); await kernel.start(); @@ -267,6 +267,23 @@ test('termination plus maybeFreeKrefs', async t => { // it (but doesn't retire it), vatB will self-terminate upon // receiving that message, creating two places that try to retire it + // The order of events will be: + // * 'terminate' translated, drops reachable to zero, adds to maybeFreeKrefs + // * 'terminate' delivered, delivery marks vat for termination + // * post-delivery crankResults.terminate check marks vat as terminated + // (but slow-deletion means nothing is deleted on that crank) + // * post-delivery does processRefCounts() + // * that processes ko22/billy, sees 0,1, owner=v2, v2 is terminated + // so it orphans ko22 (removes from vatB c-list and clears .owner) + // and falls through to (owner=undefined) case + // which sees recognizable=1 and retires the object + // (i.e. pushes retireImport gcAction and deletes .owner and .refCounts) + // * next crank starts cleanup, walks c-list, orphans ko21/bob + // which adds ko21 to maybeFreeKrefs + // * post-cleanup processRefCounts() does ko21, sees 1,1, owner=undefined + // does nothing, since vatA still holds an (orphaned) reference + // * cleanup finishes + // Our two root objects (alice and bob) are pinned so they don't // disappear while the test is talking to them, so vatB exports // "billy". @@ -339,4 +356,108 @@ test('termination plus maybeFreeKrefs', async t => { t.is(kvStore.get(`${billyKref}.owner`), undefined); t.is(kvStore.get(`${billyKref}.refCounts`), undefined); + t.is(kvStore.get(`${vatA}.c.${billyKref}`), undefined); + t.is(kvStore.get(`${vatA}.c.${vrefs.billyForA}`), undefined); + t.is(kvStore.get(`${vatB}.c.${billyKref}`), undefined); + t.is(kvStore.get(`${vatB}.c.${vrefs.billyForB}`), undefined); +}); + +// like above, but the object doesn't remain recognizable +test('termination plus maybeFreeKrefs - retired', async t => { + const { kernel, kvStore } = await makeKernel(); + await kernel.start(); + + const vrefs = {}; // track vrefs within vats + + // vatB exports an object to vatA, vatA sends it a message and drops + // and retires it, vatB will self-terminate upon receiving that + // message. The order of events will be: + // * 'terminate' translated, drops refcount to zero, adds to maybeFreeKrefs + // * 'terminate' delivered, delivery marks vat for termination + // * post-delivery crankResults.terminate check marks vat as terminated + // (but slow-deletion means nothing is deleted on that crank) + // * post-delivery does processRefCounts() + // * that processes ko22/billy, sees 0,0, owner=v2, v2 is terminated + // so it orphans ko22 (removes from vatB c-list and clears .owner) + // and falls through to (owner=undefined) case + // which sees recognizable=0 and deletes the object (just .refCount now) + // * next crank starts cleanup, walks c-list, orphans ko21/bob + // which adds ko21 to maybeFreeKrefs + // * post-cleanup processRefCounts() does ko21, sees 1,1, owner=undefined + // does nothing, since vatA still holds an (orphaned) reference + // * cleanup finishes + + vrefs.aliceForA = 'o+100'; + vrefs.bobForB = 'o+200'; + vrefs.billyForB = 'o+201'; + let billyKref; + + let vatA; + function setupA(syscall, _state, _helpers, _vatPowers) { + function dispatch(vd) { + if (vd[0] === 'startVat') { + return; // skip startVat + } + console.log(`deliverA:`, JSON.stringify(vd)); + if (vd[0] === 'message') { + const methargs = kunser(vd[2].methargs); + const [method] = methargs; + if (method === 'call-billy') { + t.is(vd[2].methargs.slots.length, 1); + vrefs.billyForA = vd[2].methargs.slots[0]; + t.is(vrefs.billyForA, 'o-50'); // probably + billyKref = kvStore.get(`${vatA}.c.${vrefs.billyForA}`); + syscall.send( + vrefs.billyForA, + kser(['terminate', [kslot(vrefs.billyForA, 'billy-A')]]), + ); + syscall.dropImports([vrefs.billyForA]); + syscall.retireImports([vrefs.billyForA]); + } + } + } + return dispatch; + } + await kernel.createTestVat('vatA', setupA); + vatA = kernel.vatNameToID('vatA'); + const alice = kernel.addExport(vatA, vrefs.aliceForA); + + function setupB(syscall, _state, _helpers, _vatPowers) { + function dispatch(vd) { + if (vd[0] === 'startVat') { + return; // skip startVat + } + console.log(`deliverB:`, JSON.stringify(vd)); + if (vd[0] === 'message') { + const [method] = kunser(vd[2].methargs); + if (method === 'init') { + vrefs.aliceForB = vd[2].methargs.slots[0]; + syscall.send( + vrefs.aliceForB, + kser(['call-billy', [kslot(vrefs.billyForB, 'billy-B')]]), + ); + } + if (method === 'terminate') { + t.is(vd[2].methargs.slots.length, 1); + assert.equal(vd[2].methargs.slots[0], vrefs.billyForB); + syscall.exit(false, kser('reason')); + } + } + } + return dispatch; + } + await kernel.createTestVat('vatB', setupB); + const vatB = kernel.vatNameToID('vatB'); + const bob = kernel.addExport(vatB, vrefs.bobForB); + + // this triggers everything, the bug was a kernel crash + kernel.queueToKref(bob, 'init', [kslot(alice, 'alice')], 'none'); + await kernel.run(); + + t.is(kvStore.get(`${billyKref}.owner`), undefined); + t.is(kvStore.get(`${billyKref}.refCounts`), undefined); + t.is(kvStore.get(`${vatA}.c.${billyKref}`), undefined); + t.is(kvStore.get(`${vatA}.c.${vrefs.billyForA}`), undefined); + t.is(kvStore.get(`${vatB}.c.${billyKref}`), undefined); + t.is(kvStore.get(`${vatB}.c.${vrefs.billyForB}`), undefined); });