From b3beede3a6fa9faf7a76a023c8222b01da6f4d3a Mon Sep 17 00:00:00 2001 From: Brian Warner Date: Fri, 12 Apr 2024 00:42:21 -0500 Subject: [PATCH] feat(swingset): allow slow termination of vats This introduces new `runPolicy()` controls which enable "slow termination" of vats. When configured, terminated vats are immediately dead (all promises are rejected, all new messages go splat, they never run again), however the vat's state is deleted slowly, one piece at a time. This makes it safe to terminate large vats, with a long history, lots of c-list imports/exports, or large vatstore tables, without fear of causing an overload (by e.g. dropping 100k references all in a single crank). See docs/run-policy.md for details and configuration instructions. refs #8928 --- packages/SwingSet/docs/run-policy.md | 138 ++++++++- packages/SwingSet/src/kernel/kernel.js | 64 ++++- .../SwingSet/src/kernel/state/kernelKeeper.js | 136 ++++++++- .../SwingSet/src/kernel/state/vatKeeper.js | 37 ++- packages/SwingSet/src/lib/runPolicies.js | 41 +++ packages/SwingSet/src/types-external.js | 9 +- packages/SwingSet/src/types-internal.js | 4 +- .../SwingSet/test/snapshots/test-state.js.md | 4 +- .../test/snapshots/test-state.js.snap | Bin 278 -> 278 bytes packages/SwingSet/test/test-state.js | 4 + .../bootstrap-slow-terminate.js | 55 ++++ .../slow-termination/test-slow-termination.js | 272 ++++++++++++++++++ .../slow-termination/vat-slow-terminate.js | 16 ++ .../vat-admin/terminate/test-terminate.js | 6 +- 14 files changed, 758 insertions(+), 28 deletions(-) create mode 100644 packages/SwingSet/test/vat-admin/slow-termination/bootstrap-slow-terminate.js create mode 100644 packages/SwingSet/test/vat-admin/slow-termination/test-slow-termination.js create mode 100644 packages/SwingSet/test/vat-admin/slow-termination/vat-slow-terminate.js diff --git a/packages/SwingSet/docs/run-policy.md b/packages/SwingSet/docs/run-policy.md index 0eba4d940d2d..058781eec68e 100644 --- a/packages/SwingSet/docs/run-policy.md +++ b/packages/SwingSet/docs/run-policy.md @@ -35,7 +35,12 @@ The kernel will invoke the following methods on the policy object (so all must e * `policy.crankFailed()` * `policy.emptyCrank()` -All methods should return `true` if the kernel should keep running, or `false` if it should stop. +All those methods should return `true` if the kernel should keep running, or `false` if it should stop. + +The following methods are optional (for backwards compatibility with policy objects created for older kernels): + +* `policy.allowCleanup()` : may return budget, see "Terminated-Vat Cleanup" below +* `policy.didCleanup({ cleanups })` (if missing, kernel pretends it returned `true` to keep running) The `computrons` argument may be `undefined` (e.g. if the crank was delivered to a non-`xs worker`-based vat, such as the comms vat). The policy should probably treat this as equivalent to some "typical" number of computrons. @@ -53,6 +58,27 @@ More arguments may be added in the future, such as: The run policy should be provided as the first argument to `controller.run()`. If omitted, the kernel defaults to `forever`, a policy that runs until the queue is empty. +## Terminated-Vat Cleanup + +Some vats may grow very large (i.e. large c-lists with lots of imported/exported objects, or lots of vatstore entries). If/when these are terminated, the burst of cleanup work might overwhelm the kernel, especially when processing all the dropped imports (which trigger GC messages to other vats). + +To protect the system against these bursts, the run policy can be configured to terminate vats slowly. Instead of doing all the cleanup work immediately, the policy allows the kernel to do a little bit of work each time `controller.run()` is called (e.g. once per block, for kernels hosted inside a blockchain). + +There are two RunPolicy methods which control this. The first is `runPolicy.allowCleanup()`. This will be invoked many times during `controller.run()`, each time the kernel tries to decide what to do next (once per step). The return value will enable (or not) a fixed amount of cleanup work. The second is `runPolicy.didCleanup({ cleanups })`, which is called later, to inform the policy of how much cleanup work was actually done. The policy can count the cleanups and switch `allowCleanup()` to return `false` when it reaches a threshold. (We need the pre-check `allowCleanup` method because the simple act of looking for cleanup work is itself a cost that we might be able to afford). + +If `allowCleanup()` exists, it must either return a falsy value, or an object. This object may have a `budget` property, which must be a number. + +A falsy return value (eg `allowCleanup: () => false`) prohibits cleanup work. This can be useful in a "only clean up during idle blocks" approach (see below), but should not be the only policy used, otherwise vat cleanup would never happen. + +A numeric `budget` limits how many cleanups are allowed to happen (if any are needed). One "cleanup" will delete one vatstore row, or one c-list entry (note that c-list deletion may trigger GC work), or one heap snapshot record, or one transcript span (and its populated transcript items). Using `{ budget: 5 }` seems to be a reasonable limit on each call, balancing overhead against doing sufficiently small units of work that we can limit the total work performed. + +If `budget` is missing or `undefined`, the kernel will perform unlimited cleanup work. This also happens if `allowCleanup()` is missing entirely, which maintains the old behavior for host applications that haven't been updated to make new policy objects. Note that cleanup is higher priority than anything else, followed by GC work, then BringOutYourDead, then message delivery. + +`didCleanup({ cleanups })` is called when the kernel actually performed some vat-termination cleanup, and the `cleanups` property is a number with the count of cleanups that took place. Each query to `allowCleanup()` might (or might not) be followed by a call to `didCleanup`, with a `cleanups` value that does not exceed the specified budget. + +To limit the work done per block (for blockchain-based applications) the host's RunPolicy objects must keep track of how many cleanups were reported, and change the behavior of `allowCleanup()` when it reaches a per-block threshold. See below for examples. + + ## Typical Run Policies A basic policy might simply limit the block to 100 cranks with deliveries and two vat creations: @@ -78,6 +104,7 @@ function make100CrankPolicy() { return true; }, }); + return policy; } ``` @@ -95,15 +122,15 @@ while(1) { Note that a new policy object should be provided for each call to `run()`. -A more sophisticated one would count computrons. Suppose that experiments suggest that one million computrons take about 5 seconds to execute. The policy would look like: +A more sophisticated one would count computrons. Suppose that experiments suggest that sixty-five million computrons take about 5 seconds to execute. The policy would look like: ```js function makeComputronCounterPolicy(limit) { - let total = 0; + let total = 0n; const policy = harden({ vatCreated() { - total += 100000; // pretend vat creation takes 100k computrons + total += 1_000_000n; // pretend vat creation takes 1M computrons return (total < limit); }, crankComplete(details) { @@ -112,18 +139,119 @@ function makeComputronCounterPolicy(limit) { return (total < limit); }, crankFailed() { - total += 1000000; // who knows, 1M is as good as anything + total += 65_000_000n; // who knows, 65M is as good as anything return (total < limit); }, emptyCrank() { return true; } }); + return policy; } ``` See `src/runPolicies.js` for examples. +To slowly terminate vats, limiting each block to 5 cleanups, the policy should start with a budget of 5, return the remaining `{ budget }` from `allowCleanup()`, and decrement it as `didCleanup` reports that budget being consumed: + +```js +function makeSlowTerminationPolicy() { + let cranks = 0; + let vats = 0; + let cleanups = 5; + const policy = harden({ + vatCreated() { + vats += 1; + return (vats < 2); + }, + crankComplete(details) { + cranks += 1; + return (cranks < 100); + }, + crankFailed() { + cranks += 1; + return (cranks < 100); + }, + emptyCrank() { + return true; + }, + allowCleanup() { + if (cleanups > 0) { + return { budget: cleanups }; + } else { + return false; + } + }, + didCleanup(spent) { + cleanups -= spent.cleanups; + }, + }); + return policy; +} +``` + +A more conservative approach might only allow cleanup in otherwise-empty blocks. To accompish this, use two separate policy objects, and two separate "runs". The first run only performs deliveries, and prohibits all cleanups: + +```js +function makeDeliveryOnlyPolicy() { + let empty = true; + const didWork = () => { empty = false; return true; }; + const policy = harden({ + vatCreated: didWork, + crankComplete: didWork, + crankFailed: didWork, + emptyCrank: didWork, + allowCleanup: () => false, + }); + const wasEmpty = () => empty; + return [ policy, wasEmpty ]; +} +``` + +The second only performs cleanup, with a limited budget, stopping the run after any deliveries occur (such as GC actions): + +```js +function makeCleanupOnlyPolicy() { + let cleanups = 5; + const stop: () => false; + const policy = harden({ + vatCreated: stop, + crankComplete: stop, + crankFailed: stop, + emptyCrank: stop, + allowCleanup() { + if (cleanups > 0) { + return { budget: cleanups }; + } else { + return false; + } + }, + didCleanup(spent) { + cleanups -= spent.cleanups; + }, + }); + return policy; +} +``` + +On each block, the host should only perform the second (cleanup) run if the first policy reports that the block was empty: + +```js +async function doBlock() { + const [ firstPolicy, wasEmpty ] = makeDeliveryOnlyPolicy(); + await controller.run(firstPolicy); + if (wasEmpty()) { + const secondPolicy = makeCleanupOnlyPolicy(); + await controller.run(secondPolicy); + } +} +``` + +Note that regardless of whatever computron/delivery budget is imposed by the first policy, the second policy will allow one additional delivery to be made (we do not yet have an `allowDelivery()` pre-check method that might inhibit this). The cleanup work, which may or may not happen, will sometimes trigger a GC delivery like `dispatch.dropExports`, but at most one such delivery will be made before the second policy returns `false` and stops `controller.run()`. If cleanup does not trigger such a delivery, or if no cleanup work needs to be done, then one normal run-queue delivery will be performed before the policy has a chance to say "stop". All other cleanup-triggered GC work will be deferred until the first run of the next block. + +Also note that `budget` and `cleanups` are plain `Number`s, whereas `comptrons` is a `BigInt`. + + ## Non-Consensus Wallclock Limits If the SwingSet kernel is not being operated in consensus mode, then it is safe to use wallclock time as a block limit: diff --git a/packages/SwingSet/src/kernel/kernel.js b/packages/SwingSet/src/kernel/kernel.js index 79b78a342e02..05e8e40791c1 100644 --- a/packages/SwingSet/src/kernel/kernel.js +++ b/packages/SwingSet/src/kernel/kernel.js @@ -271,7 +271,8 @@ export default function buildKernel( // Reject all promises decided by the vat, making sure to capture the list // of kpids before that data is deleted. const deadPromises = [...kernelKeeper.enumeratePromisesByDecider(vatID)]; - kernelKeeper.cleanupAfterTerminatedVat(vatID); + kernelKeeper.addTerminatedVat(vatID); + kernelKeeper.deleteVatID(vatID); for (const kpid of deadPromises) { resolveToError(kpid, makeError('vat terminated'), vatID); } @@ -378,7 +379,8 @@ export default function buildKernel( * abort?: boolean, // changes should be discarded, not committed * consumeMessage?: boolean, // discard the aborted delivery * didDelivery?: VatID, // we made a delivery to a vat, for run policy and save-snapshot - * computrons?: BigInt, // computron count for run policy + * computrons?: bigint, // computron count for run policy + * cleanups?: number, // cleanup budget spent * meterID?: string, // deduct those computrons from a meter * measureDirt?: [ VatID, Dirt ], // the dirt counter should increment * terminate?: { vatID: VatID, reject: boolean, info: SwingSetCapData }, // terminate vat, notify vat-admin @@ -645,6 +647,31 @@ export default function buildKernel( return deliveryCrankResults(vatID, status, false); // no meter, BOYD clears dirt } + /** + * Perform a small (budget-limited) amount of dead-vat cleanup work. + * + * @param {RunQueueEventCleanupTerminatedVat} message + * 'message' is the run-queue cleanup action, which includes a vatID and budget. + * A budget of 'undefined' allows unlimited work. Otherwise, the budget is a Number, + * and cleanup should not touch more than maybe 5*budget DB rows. + * @returns {Promise} + */ + async function processCleanupTerminatedVat(message) { + const { vatID, budget } = message; + const { done, cleanups } = kernelKeeper.cleanupAfterTerminatedVat( + vatID, + budget, + ); + if (done) { + kernelKeeper.deleteTerminatedVat(vatID); + } + // We don't perform any deliveries here, so tere are no computrons to + // report, but we do tell the runPolicy know how much kernel-side DB + // work we did, so it can decide how much was too much. + const computrons = 0n; + return harden({ computrons, cleanups }); + } + /** * The 'startVat' event is queued by `initializeKernel` for all static vats, * so that we execute their bundle imports and call their `buildRootObject` @@ -1144,6 +1171,7 @@ export default function buildKernel( * @typedef { import('../types-internal.js').RunQueueEventRetireImports } RunQueueEventRetireImports * @typedef { import('../types-internal.js').RunQueueEventNegatedGCAction } RunQueueEventNegatedGCAction * @typedef { import('../types-internal.js').RunQueueEventBringOutYourDead } RunQueueEventBringOutYourDead + * @typedef { import('../types-internal.js').RunQueueEventCleanupTerminatedVat } RunQueueEventCleanupTerminatedVat * @typedef { import('../types-internal.js').RunQueueEvent } RunQueueEvent */ @@ -1211,6 +1239,8 @@ export default function buildKernel( } else if (message.type === 'negated-gc-action') { // processGCActionSet pruned some negated actions, but had no GC // action to perform. Record the DB changes in their own crank. + } else if (message.type === 'cleanup-terminated-vat') { + deliverP = processCleanupTerminatedVat(message); } else if (gcMessages.includes(message.type)) { deliverP = processGCMessage(message); } else { @@ -1270,6 +1300,10 @@ export default function buildKernel( // sometimes happens randomly because of vat eviction policy // which should not affect the in-consensus policyInput) policyInput = ['create-vat', {}]; + } else if (message.type === 'cleanup-terminated-vat') { + const { cleanups } = crankResults; + assert(cleanups !== undefined); + policyInput = ['cleanup', { cleanups }]; } else { policyInput = ['crank', {}]; } @@ -1303,7 +1337,9 @@ export default function buildKernel( const { computrons, meterID } = crankResults; if (computrons) { assert.typeof(computrons, 'bigint'); - policyInput[1].computrons = BigInt(computrons); + if (policyInput[0] !== 'cleanup') { + policyInput[1].computrons = BigInt(computrons); + } if (meterID) { const notify = kernelKeeper.deductMeter(meterID, computrons); if (notify) { @@ -1727,12 +1763,13 @@ export default function buildKernel( * Pulls the next message from the highest-priority queue and returns it * along with a corresponding processor. * + * @param {RunPolicy} [policy] - a RunPolicy to limit the work being done * @returns {{ * message: RunQueueEvent | undefined, * processor: (message: RunQueueEvent) => Promise, * }} */ - function getNextMessageAndProcessor() { + function getNextMessageAndProcessor(policy) { const acceptanceMessage = kernelKeeper.getNextAcceptanceQueueMsg(); if (acceptanceMessage) { return { @@ -1740,7 +1777,16 @@ export default function buildKernel( processor: processAcceptanceMessage, }; } + const allowCleanup = policy?.allowCleanup ? policy.allowCleanup() : {}; + // false, or an object with optional .budget + if (allowCleanup) { + assert.typeof(allowCleanup, 'object'); + if (allowCleanup.budget) { + assert.typeof(allowCleanup.budget, 'number'); + } + } const message = + kernelKeeper.nextCleanupTerminatedVatAction(allowCleanup) || processGCActionSet(kernelKeeper) || kernelKeeper.nextReapAction() || kernelKeeper.getNextRunQueueMsg(); @@ -1817,7 +1863,8 @@ export default function buildKernel( await null; try { kernelKeeper.establishCrankSavepoint('start'); - const { processor, message } = getNextMessageAndProcessor(); + const { processor, message } = + getNextMessageAndProcessor(foreverPolicy()); // process a single message if (message) { await tryProcessMessage(processor, message); @@ -1854,7 +1901,7 @@ export default function buildKernel( kernelKeeper.startCrank(); try { kernelKeeper.establishCrankSavepoint('start'); - const { processor, message } = getNextMessageAndProcessor(); + const { processor, message } = getNextMessageAndProcessor(policy); if (!message) { break; } @@ -1876,6 +1923,11 @@ export default function buildKernel( case 'crank-failed': policyOutput = policy.crankFailed(policyInput[1]); break; + case 'cleanup': { + const { didCleanup = () => true } = policy; + policyOutput = didCleanup(policyInput[1]); + break; + } case 'none': policyOutput = policy.emptyCrank(); break; diff --git a/packages/SwingSet/src/kernel/state/kernelKeeper.js b/packages/SwingSet/src/kernel/state/kernelKeeper.js index fd6d79533f70..791b116a96f2 100644 --- a/packages/SwingSet/src/kernel/state/kernelKeeper.js +++ b/packages/SwingSet/src/kernel/state/kernelKeeper.js @@ -65,6 +65,7 @@ const enableKernelGC = true; // vat.name.$NAME = $vatID = v$NN // vat.nextID = $NN // vat.nextUpgradeID = $NN +// vats.terminated = JSON([vatIDs..]) // device.names = JSON([names..]) // device.name.$NAME = $deviceID = d$NN // device.nextID = $NN @@ -212,6 +213,15 @@ export default function makeKernelKeeper(kernelStorage, kernelSlog) { insistStorageAPI(kvStore); + // upgrade from pre-"vats.terminated" storage, or populate cache + let terminatedVats; + if (kvStore.has('vats.terminated')) { + terminatedVats = JSON.parse(getRequired('vats.terminated')); + } else { + terminatedVats = []; + kvStore.set('vats.terminated', JSON.stringify(terminatedVats)); + } + /** * @param {string} key * @returns {string} @@ -679,8 +689,12 @@ export default function makeKernelKeeper(kernelStorage, kernelSlog) { function ownerOfKernelObject(kernelSlot) { insistKernelType('object', kernelSlot); const owner = kvStore.get(`${kernelSlot}.owner`); - if (owner) { - insistVatID(owner); + if (!owner) { + return undefined; + } + insistVatID(owner); + if (terminatedVats.includes(owner)) { + return undefined; } return owner; } @@ -885,14 +899,50 @@ export default function makeKernelKeeper(kernelStorage, kernelSlog) { kvStore.set(`${kernelSlot}.data.slots`, capdata.slots.join(',')); } - function cleanupAfterTerminatedVat(vatID) { + /** + * Perform some cleanup work for a specific (terminated but not + * fully-deleted) vat, possibly limited by a budget. Returns 'done' + * (where false means "please call me again", and true means "you + * can delete the vatID now"), and a count of how much work was done + * (so the runPolicy can decide when to stop). + * + * @param {string} vatID + * @param {number} [budget] + * @returns {{ done: boolean, cleanups: number }} + * + */ + function cleanupAfterTerminatedVat(vatID, budget = undefined) { insistVatID(vatID); + let cleanups = 0; + let spend = _did => false; // returns "stop now" + if (budget !== undefined) { + assert.typeof(budget, 'number'); + spend = (did = 1) => { + assert(budget !== undefined); // hush TSC + cleanups += did; + budget -= did; + return budget <= 0; + }; + } + + // TODO: it would be slightly cheaper to walk all kvStore keys in + // order, and act on each one according to its category (c-list + // export, c-list import, vatstore, other), so we use a single + // enumeratePrefixedKeys() call each time. Until we do that, the + // last phase of the cleanup (where we've deleted all the exports + // and imports, and are working on the remaining keys) will waste + // two DB queries on each call. OTOH, those queries will probably + // hit the same SQLite index page as the successful one, so it + // probably won't cause any extra disk IO. So we can defer this + // optimization for a while. Note: when we implement it, be + // prepared to encounter the clist entries in eiher order (kref + // first or vref first), and delete the other one in the same + // call, so we don't wind up with half an entry. + const vatKeeper = provideVatKeeper(vatID); const exportPrefix = `${vatID}.c.o+`; const importPrefix = `${vatID}.c.o-`; - vatKeeper.deleteSnapshotsAndTranscript(); - // Note: ASCII order is "+,-./", and we rely upon this to split the // keyspace into the various o+NN/o-NN/etc spaces. If we were using a // more sophisticated database, we'd keep each section in a separate @@ -915,8 +965,25 @@ export default function makeKernelKeeper(kernelStorage, kernelSlog) { // begin with `vMM.c.o+`. In addition to deleting the c-list entry, we // must also delete the corresponding kernel owner entry for the object, // since the object will no longer be accessible. + const vref = k.slice(`${vatID}.c.`.length); + assert(vref.startsWith('o+'), vref); const kref = kvStore.get(k); - orphanKernelObject(kref, vatID); + // we must delete the c-list entry, but we don't need to + // manipulate refcounts like the way vatKeeper/deleteCListEntry + // does, so just delete the keys directly + // vatKeeper.deleteCListEntry(kref, vref); + kvStore.delete(`${vatID}.c.${kref}`); + kvStore.delete(`${vatID}.c.${vref}`); + // if this object became unreferenced, processRefcounts will + // delete it, so don't be surprised. TODO: this results in an + // extra get() for each delete(), see if there's a way to avoid + // this. TODO: think carefully about other such races. + if (kvStore.has(`${kref}.owner`)) { + orphanKernelObject(kref, vatID); + } + if (spend()) { + return { done: false, cleanups }; + } } // then scan for imported objects, which must be decrefed @@ -927,6 +994,9 @@ export default function makeKernelKeeper(kernelStorage, kernelSlog) { const vref = k.slice(`${vatID}.c.`.length); vatKeeper.deleteCListEntry(kref, vref); // that will also delete both db keys + if (spend()) { + return { done: false, cleanups }; + } } // the caller used enumeratePromisesByDecider() before calling us, @@ -935,8 +1005,19 @@ export default function makeKernelKeeper(kernelStorage, kernelSlog) { // now loop back through everything and delete it all for (const k of enumeratePrefixedKeys(kvStore, `${vatID}.`)) { kvStore.delete(k); + if (spend()) { + return { done: false, cleanups }; + } } + const dc = vatKeeper.deleteSnapshotsAndTranscripts(budget); + // last task, so increment cleanups, but dc.done is authoritative + spend(dc.cleanups); + + return { done: dc.done, cleanups }; + } + + function deleteVatID(vatID) { // TODO: deleting entries from the dynamic vat IDs list requires a linear // scan of the list; arguably this collection ought to be represented in a // different way that makes it efficient to remove an entry from it, though @@ -1231,6 +1312,41 @@ export default function makeKernelKeeper(kernelStorage, kernelSlog) { return makeUpgradeID(nextID); } + function addTerminatedVat(vatID) { + if (!terminatedVats.includes(vatID)) { + terminatedVats.push(vatID); + kvStore.set(`vats.terminated`, JSON.stringify(terminatedVats)); + } + } + + function getFirstTerminatedVat() { + if (terminatedVats.length) { + return terminatedVats[0]; + } + return undefined; + } + + function deleteTerminatedVat(vatID) { + terminatedVats = terminatedVats.filter(id => id !== vatID); + kvStore.set(`vats.terminated`, JSON.stringify(terminatedVats)); + } + + function nextCleanupTerminatedVatAction(allowCleanup) { + if (!allowCleanup) { + return undefined; + } + // budget === undefined means "unlimited" + const { budget } = allowCleanup; + // if (getGCActions().size) { + // return undefined; + // } + const vatID = getFirstTerminatedVat(); + if (vatID) { + return { type: 'cleanup-terminated-vat', vatID, budget }; + } + return undefined; + } + // As refcounts are decremented, we accumulate a set of krefs for which // action might need to be taken: // * promises which are now resolved and unreferenced can be deleted @@ -1423,7 +1539,7 @@ export default function makeKernelKeeper(kernelStorage, kernelSlog) { function vatIsAlive(vatID) { insistVatID(vatID); - return kvStore.has(`${vatID}.o.nextID`); + return kvStore.has(`${vatID}.o.nextID`) && !terminatedVats.includes(vatID); } /** @@ -1722,6 +1838,12 @@ export default function makeKernelKeeper(kernelStorage, kernelSlog) { getDynamicVats, getStaticVats, getDevices, + deleteVatID, + + addTerminatedVat, + getFirstTerminatedVat, + deleteTerminatedVat, + nextCleanupTerminatedVatAction, allocateUpgradeID, diff --git a/packages/SwingSet/src/kernel/state/vatKeeper.js b/packages/SwingSet/src/kernel/state/vatKeeper.js index 7cd7ff51ef58..04d05c065352 100644 --- a/packages/SwingSet/src/kernel/state/vatKeeper.js +++ b/packages/SwingSet/src/kernel/state/vatKeeper.js @@ -726,11 +726,40 @@ export function makeVatKeeper( }); } - function deleteSnapshotsAndTranscript() { + /** + * Perform some (possibly-limited) cleanup work for a vat. Returns + * 'done' (where false means "please call me again", and true means + * "you can delete the vatID now"), and a count of how much work was + * done (so the runPolicy can decide when to stop). + * + * @param {number} [budget] + * @returns {{ done: boolean, cleanups: number }} + * + */ + function deleteSnapshotsAndTranscripts(budget = undefined) { + // Each budget=1 allows us to delete one snapshot entry, or one + // transcript span and any transcript items associated with that + // span. Some nodes will have historical transcript items, some + // will not. Using budget=5 and snapshotInterval=200 means we + // delete 5 snapshot (probably only-hash) records, or 5 span + // records and maybe 1000 span items. + + let cleanups = 0; if (snapStore) { - snapStore.deleteVatSnapshots(vatID); + const dc = snapStore.deleteVatSnapshots(vatID, budget); + // initially uses 2+budget DB statements, then just 1 when done + cleanups += dc.cleanups; + if (budget !== undefined) { + budget -= dc.cleanups; + } + if (!dc.done) { + return { done: false, cleanups }; + } } - transcriptStore.deleteVatTranscripts(vatID); + const dc = transcriptStore.deleteVatTranscripts(vatID, budget); + // initially uses 2+2*budget DB statements, then just 1 when done + cleanups += dc.cleanups; + return { done: dc.done, cleanups }; } function beginNewIncarnation() { @@ -812,7 +841,7 @@ export function makeVatKeeper( dumpState, saveSnapshot, getSnapshotInfo, - deleteSnapshotsAndTranscript, + deleteSnapshotsAndTranscripts, beginNewIncarnation, }); } diff --git a/packages/SwingSet/src/lib/runPolicies.js b/packages/SwingSet/src/lib/runPolicies.js index e3528d53c631..954784c9c1ad 100644 --- a/packages/SwingSet/src/lib/runPolicies.js +++ b/packages/SwingSet/src/lib/runPolicies.js @@ -3,6 +3,9 @@ import { assert } from '@agoric/assert'; export function foreverPolicy() { /** @type { RunPolicy } */ return harden({ + allowCleanup() { + return {}; // unlimited budget + }, vatCreated(_details) { return true; }, @@ -27,6 +30,9 @@ export function crankCounter( let vats = 0; /** @type { RunPolicy } */ const policy = harden({ + allowCleanup() { + return { budget: 100 }; // limited budget + }, vatCreated() { vats += 1; return vats < maxCreateVats; @@ -52,6 +58,9 @@ export function computronCounter(limit) { let total = 0n; /** @type { RunPolicy } */ const policy = harden({ + allowCleanup() { + return { budget: 100 }; // limited budget + }, vatCreated() { total += 100000n; // pretend vat creation takes 100k computrons return total < limit; @@ -79,6 +88,7 @@ export function wallClockWaiter(seconds) { const timeout = Date.now() + 1000 * seconds; /** @type { RunPolicy } */ const policy = harden({ + allowCleanup: () => ({}), // unlimited budget vatCreated: () => Date.now() < timeout, crankComplete: () => Date.now() < timeout, crankFailed: () => Date.now() < timeout, @@ -86,3 +96,34 @@ export function wallClockWaiter(seconds) { }); return policy; } + +export function noCleanup() { + /** @type { RunPolicy } */ + const policy = harden({ + allowCleanup: () => false, + vatCreated: () => true, + crankComplete: () => true, + crankFailed: () => true, + emptyCrank: () => true, + }); + return policy; +} + +export function someCleanup(budget) { + let once = true; + /** @type { RunPolicy } */ + const policy = harden({ + allowCleanup: () => { + if (once) { + once = false; + return { budget }; + } + return false; + }, + vatCreated: () => true, + crankComplete: () => true, + crankFailed: () => true, + emptyCrank: () => true, + }); + return policy; +} diff --git a/packages/SwingSet/src/types-external.js b/packages/SwingSet/src/types-external.js index c8f2233fb83d..3075931aae6d 100644 --- a/packages/SwingSet/src/types-external.js +++ b/packages/SwingSet/src/types-external.js @@ -218,12 +218,17 @@ export {}; * @typedef { [tag: 'create-vat', details: PolicyInputDetails ]} PolicyInputCreateVat * @typedef { [tag: 'crank', details: PolicyInputDetails ] } PolicyInputCrankComplete * @typedef { [tag: 'crank-failed', details: PolicyInputDetails ]} PolicyInputCrankFailed - * @typedef { PolicyInputNone | PolicyInputCreateVat | PolicyInputCrankComplete | PolicyInputCrankFailed } PolicyInput + * @typedef { [tag: 'cleanup', details: { cleanups: number }] } PolicyInputCleanup + * @typedef { PolicyInputNone | PolicyInputCreateVat | PolicyInputCrankComplete | + * PolicyInputCrankFailed | PolicyInputCleanup } PolicyInput * @typedef { boolean } PolicyOutput - * @typedef { { vatCreated: (details: {}) => PolicyOutput, + * @typedef { { + * allowCleanup?: () => false | { budget?: number }, + * vatCreated: (details: {}) => PolicyOutput, * crankComplete: (details: { computrons?: bigint }) => PolicyOutput, * crankFailed: (details: {}) => PolicyOutput, * emptyCrank: () => PolicyOutput, + * didCleanup?: (details: { cleanups: number }) => PolicyOutput, * } } RunPolicy * * @typedef {object} VatWarehousePolicy diff --git a/packages/SwingSet/src/types-internal.js b/packages/SwingSet/src/types-internal.js index e135ff848106..6a66bc8ed8c5 100644 --- a/packages/SwingSet/src/types-internal.js +++ b/packages/SwingSet/src/types-internal.js @@ -138,10 +138,12 @@ export {}; * @typedef { { type: 'retireImports', vatID: VatID, krefs: string[] } } RunQueueEventRetireImports * @typedef { { type: 'negated-gc-action', vatID?: VatID } } RunQueueEventNegatedGCAction * @typedef { { type: 'bringOutYourDead', vatID: VatID } } RunQueueEventBringOutYourDead + * @typedef { { type: 'cleanup-terminated-vat', vatID: VatID, + * budget: number | undefined } } RunQueueEventCleanupTerminatedVat * @typedef { RunQueueEventNotify | RunQueueEventSend | RunQueueEventCreateVat | * RunQueueEventUpgradeVat | RunQueueEventChangeVatOptions | RunQueueEventStartVat | * RunQueueEventDropExports | RunQueueEventRetireExports | RunQueueEventRetireImports | - * RunQueueEventNegatedGCAction | RunQueueEventBringOutYourDead + * RunQueueEventNegatedGCAction | RunQueueEventBringOutYourDead | RunQueueEventCleanupTerminatedVat * } RunQueueEvent */ diff --git a/packages/SwingSet/test/snapshots/test-state.js.md b/packages/SwingSet/test/snapshots/test-state.js.md index 48daa769554d..f522dbe3eddd 100644 --- a/packages/SwingSet/test/snapshots/test-state.js.md +++ b/packages/SwingSet/test/snapshots/test-state.js.md @@ -8,8 +8,8 @@ Generated by [AVA](https://avajs.dev). > initial state - '09c3651da4f2f1fc6cd4e61170aaab3954ee1ace51e6028c69361f73b0ac7272' + '9715d38dfb0dc0ba922d5740d0442616d773ef41cb62a7dab666532cbd9c9f46' > expected activityhash - '00a0e15b521f7c32726b0b2d4da425eba66fab98678975ea5b03cd07bfe3109a' + '9e3f0f0d8560c41f3d617768ad1f9760ea93e781352fd836fcc9e16a84f5073e' diff --git a/packages/SwingSet/test/snapshots/test-state.js.snap b/packages/SwingSet/test/snapshots/test-state.js.snap index 223f3081f3568f64aae27207bcb07f96082f9d16..809c77821e414072b314abe05f6b011256650f3c 100644 GIT binary patch literal 278 zcmV+x0qOohRzVqaTY100000000AZkUdTVF%X3}LJ>7L*p4caf5&#m38-nuo*Au`&2EGZ3MJfx zoRp2ID0t;-zVE$n$@5x{?eogL_L(n_mSTsm(XORw*>f80*^0l6Jg9%1vkj$q)75*V9DAA6b+45|+tS3b#4IA4yA$Xrbar9w~!3i8G%Si>wVzr9+ cCB4bzEZk(<$gRAW_HjGm2Sn8@2_OLg022v { ['kernel.snapshotInitial', '3'], ['kernel.snapshotInterval', '200'], ['meter.nextID', '1'], + ['vats.terminated', '[]'], ]); }); @@ -243,6 +244,7 @@ test('kernelKeeper vat names', async t => { ['kernel.snapshotInitial', '3'], ['kernel.snapshotInterval', '200'], ['meter.nextID', '1'], + ['vats.terminated', '[]'], ]); t.deepEqual(k.getStaticVats(), [ ['Frank', 'v2'], @@ -296,6 +298,7 @@ test('kernelKeeper device names', async t => { ['kernel.snapshotInitial', '3'], ['kernel.snapshotInterval', '200'], ['meter.nextID', '1'], + ['vats.terminated', '[]'], ]); t.deepEqual(k.getDevices(), [ ['Frank', 'd8'], @@ -481,6 +484,7 @@ test('kernelKeeper promises', async t => { ['kernel.snapshotInitial', '3'], ['kernel.snapshotInterval', '200'], ['meter.nextID', '1'], + ['vats.terminated', '[]'], ]); k.deleteKernelObject(ko); diff --git a/packages/SwingSet/test/vat-admin/slow-termination/bootstrap-slow-terminate.js b/packages/SwingSet/test/vat-admin/slow-termination/bootstrap-slow-terminate.js new file mode 100644 index 000000000000..135124209efd --- /dev/null +++ b/packages/SwingSet/test/vat-admin/slow-termination/bootstrap-slow-terminate.js @@ -0,0 +1,55 @@ +import { Far, E } from '@endo/far'; + +export function buildRootObject(_vatPowers) { + let root; + let adminNode; + const myImports = []; + + const self = Far('root', { + async bootstrap(vats, devices) { + const svc = E(vats.vatAdmin).createVatAdminService(devices.vatAdmin); + // create a dynamic vat, send it a message and let it respond, to make + // sure everything is working + const dude = await E(svc).createVatByName('dude'); + root = dude.root; + adminNode = dude.adminNode; + await E(root).alive(); + + // set up 20 "bootstrap exports, dude imports" c-list entries + for (let i = 0; i < 20; i += 1) { + await E(root).acceptImports(Far('bootstrap export', {})); + } + + // set up 20 "dude exports, bootstrap imports" c-list entries + + for (let i = 0; i < 20; i += 1) { + myImports.push(await E(root).sendExport()); + } + + // ask dude to creates 20 vatstore entries (in addition to the + // built-in liveslots stuff) + await E(root).makeVatstore(20); + + return 'bootstrap done'; + }, + + async kill(mode) { + switch (mode) { + case 'kill': + await E(adminNode).terminateWithFailure(mode); + break; + case 'dieHappy': + await E(root).dieHappy(mode); + break; + default: + console.log('something terrible has happened'); + break; + } + // confirm the vat is dead, even though cleanup happens later + return E(root) + .ping() + .catch(err => `kill done, ${err}`); + }, + }); + return self; +} diff --git a/packages/SwingSet/test/vat-admin/slow-termination/test-slow-termination.js b/packages/SwingSet/test/vat-admin/slow-termination/test-slow-termination.js new file mode 100644 index 000000000000..6d83325c527d --- /dev/null +++ b/packages/SwingSet/test/vat-admin/slow-termination/test-slow-termination.js @@ -0,0 +1,272 @@ +// @ts-nocheck +// eslint-disable-next-line import/order +import { test } from '../../../tools/prepare-test-env-ava.js'; + +import tmp from 'tmp'; +import sqlite3 from 'better-sqlite3'; +import path from 'path'; + +import { kser } from '@agoric/kmarshal'; +import { initSwingStore } from '@agoric/swing-store'; + +import { buildVatController, buildKernelBundles } from '../../../src/index.js'; +import { enumeratePrefixedKeys } from '../../../src/kernel/state/storageHelper.js'; + +/** + * @param {string} [prefix] + * @returns {Promise<[string, () => void]>} + */ +export const tmpDir = prefix => + new Promise((resolve, reject) => { + tmp.dir({ unsafeCleanup: true, prefix }, (err, name, removeCallback) => { + if (err) { + reject(err); + } else { + resolve([name, removeCallback]); + } + }); + }); + +test.before(async t => { + const kernelBundles = await buildKernelBundles(); + t.context.data = { kernelBundles }; +}); + +const makeCleanupPolicy = () => { + let budget = 5; + let cleanups = 0; + const stop = () => false; + const policy = harden({ + vatCreated: stop, + crankComplete: stop, + crankFailed: stop, + emptyCrank: stop, + allowCleanup() { + if (budget > 0) { + return { budget }; + } else { + return false; + } + }, + didCleanup(spent) { + budget -= spent.cleanups; + cleanups += spent.cleanups; + }, + }); + const getCleanups = () => cleanups; + return [policy, getCleanups]; +}; + +const bfile = relpath => new URL(relpath, import.meta.url).pathname; + +async function doSlowTerminate(t, mode) { + const config = { + defaultManagerType: 'xsnap', + defaultReapInterval: 'never', + snapshotInitial: 2, // same as the default + snapshotInterval: 10, // ensure multiple spans+snapshots + bootstrap: 'bootstrap', + bundles: { + dude: { + sourceSpec: bfile('vat-slow-terminate.js'), + }, + }, + vats: { + bootstrap: { + sourceSpec: bfile('bootstrap-slow-terminate.js'), + }, + }, + }; + const noCleanupPolicy = { + allowCleanup: () => false, + vatCreated: () => true, + crankComplete: () => true, + crankFailed: () => true, + emptyCrank: () => true, + didCleanup: () => true, + }; + + const [dbDir, cleanup] = await tmpDir('testdb'); + t.teardown(cleanup); + + const ss = initSwingStore(dbDir); + const { kernelStorage } = ss; + const { commit } = ss.hostStorage; + const { kvStore } = kernelStorage; + // look directly at DB to confirm changes + const db = sqlite3(path.join(dbDir, 'swingstore.sqlite')); + + const controller = await buildVatController(config, [], { + ...t.context.data, + kernelStorage, + }); + t.teardown(controller.shutdown); + t.is(controller.kpStatus(controller.bootstrapResult), 'unresolved'); + controller.pinVatRoot('bootstrap'); + await controller.run(noCleanupPolicy); + await commit(); + t.is(controller.kpStatus(controller.bootstrapResult), 'fulfilled'); + t.deepEqual( + controller.kpResolution(controller.bootstrapResult), + kser('bootstrap done'), + ); + + // bootstrap adds a fair amount of vat-dude state: + // * we have c-list entries for 20 imports and 20 exports, each of + // which need two kvStore entries, so 80 kvStore total + // * the vat has created 20 baggage entries, all of which go into + // the vatstore, adding 20 kvStore + // * an empty vat has about 29 kvStore entries just to track + // counters, the built-in collection types, baggage itself, etc + // * by sending 40-plus deliveries into an xsnap vat with + // snapInterval=5, we get 8-ish transcript spans (7 old, 1 + // current), and each old span generates a heap snapshot record + // Slow vat termination means deleting these entries slowly. + + const vatID = JSON.parse(kvStore.get('vat.dynamicIDs'))[0]; + t.is(vatID, 'v6'); // change if necessary + const remainingKV = () => + Array.from(enumeratePrefixedKeys(kvStore, `${vatID}.`)); + const remainingSnapshots = () => + db + .prepare('SELECT COUNT(*) FROM snapshots WHERE vatID=?') + .pluck() + .get(vatID); + const remainingTranscriptItems = () => + db + .prepare('SELECT COUNT(*) FROM transcriptItems WHERE vatID=?') + .pluck() + .get(vatID); + const remainingTranscriptSpans = () => + db + .prepare('SELECT COUNT(*) FROM transcriptSpans WHERE vatID=?') + .pluck() + .get(vatID); + + // 20*2 for imports, 21*2 for exports, 20*1 for vatstore = 102 + // plus 27 for usual liveslots stuff + t.is(remainingKV().length, 129); + t.false(JSON.parse(kvStore.get('vats.terminated')).includes(vatID)); + // we get one span for snapshotInitial (=2), then a span every + // snapshotInterval (=10). Each non-current span creates a + // snapshot. + t.is(remainingTranscriptSpans(), 6); + t.is(remainingTranscriptItems(), 59); + t.is(remainingSnapshots(), 5); + const remaining = () => + remainingKV().length + + remainingSnapshots() + + remainingTranscriptItems() + + remainingTranscriptSpans(); + + // note: mode=dieHappy means we send one extra message to the vat, + // which adds a single transcript item (but this doesn't happen to trigger an extra span) + + const kpid = controller.queueToVatRoot('bootstrap', 'kill', [mode]); + await controller.run(noCleanupPolicy); + await commit(); + t.is(controller.kpStatus(kpid), 'fulfilled'); + t.deepEqual( + controller.kpResolution(kpid), + kser('kill done, Error: vat terminated'), + ); + + t.true(JSON.parse(kvStore.get('vats.terminated')).includes(vatID)); + // no cleanups were allowed, so nothing should be removed yet + t.truthy(kernelStorage.kvStore.get(`${vatID}.options`, undefined)); + t.is(remainingKV().length, 129); + + // now do a series of cleanup runs, each with budget=5 + const clean = async () => { + const [policy, _getCleanups] = makeCleanupPolicy(); + await controller.run(policy); + await commit(); + }; + + // cleanup currently deletes c-list exports, then c-list imports, + // then all other kvStore entries, then snapshots, then transcripts + + let leftKV = remainingKV().length; + const cleanKV = async expected => { + await clean(); + const newLeftKV = remainingKV().length; + t.is(leftKV - newLeftKV, expected); + leftKV = newLeftKV; + }; + + // we have 21 c-list exports (1 vat root, plus 20 we added), we + // delete them 5 at a time (2 kv each, so 10kv per clean) + await cleanKV(10); // 5 c-list exports + await cleanKV(10); // 5 c-list exports + await cleanKV(10); // 5 c-list exports + await cleanKV(10); // 5 c-list exports + + // we have one export left, so this clean(budget=5) will delete the + // two kv for the export, then the first four of our 20 c-list + // imports, each of which also has 2 kv) + + await cleanKV(10); // 1 c-list exports, 4 c-list imports + await cleanKV(10); // 5 c-list imports + await cleanKV(10); // 5 c-list imports + await cleanKV(10); // 5 c-list imports + + // we have one import left, so this clean(budget=5) will delete its + // two kv, then the first four of our 47 other kv entries (20 + // vatstore plus 27 liveslots overhead + await cleanKV(6); // 1 c-list import, 4 other kv + // now there are 45 other kv entries left + t.is(remainingKV().length, 43); + + await cleanKV(5); // 5 other kv + await cleanKV(5); // 5 other kv + await cleanKV(5); // 5 other kv + await cleanKV(5); // 5 other kv + await cleanKV(5); // 5 other kv + t.is(remainingSnapshots(), 5); + await cleanKV(5); // 5 other kv + await cleanKV(5); // 5 other kv + await cleanKV(5); // 5 other kv + + // we have 3 kv entries left, so budget=5 will delete those three, + // then two snapshots + t.is(remainingSnapshots(), 5); + await clean(); + t.deepEqual(remainingKV(), []); + t.is(kernelStorage.kvStore.get(`${vatID}.options`, undefined)); + t.is(remainingSnapshots(), 3); + t.is(remainingTranscriptSpans(), 6); + if (mode === 'dieHappy') { + t.is(remainingTranscriptItems(), 60); + } else { + t.is(remainingTranscriptItems(), 59); + } + + // the next clean will delete the remaining three snapshots, plus + // two transcript spans, starting with the isCurrent=1 one (which + // had 9 or 10 items), finishing with the last old span (which had + // 10) + + await clean(); + t.is(remainingSnapshots(), 0); + t.is(remainingTranscriptSpans(), 4); + t.is(remainingTranscriptItems(), 40); + t.true(JSON.parse(kvStore.get('vats.terminated')).includes(vatID)); + + // the final clean deletes the remaining spans, and finishes by + // removing the "still being deleted" bookkeeping, and the .options + + await clean(); + t.is(remainingTranscriptSpans(), 0); + t.is(remainingTranscriptItems(), 0); + t.is(remaining(), 0); + + t.false(JSON.parse(kvStore.get('vats.terminated')).includes(vatID)); +} + +test.serial('slow terminate (kill)', async t => { + await doSlowTerminate(t, 'kill'); +}); + +test.serial('slow terminate (die happy)', async t => { + await doSlowTerminate(t, 'dieHappy'); +}); diff --git a/packages/SwingSet/test/vat-admin/slow-termination/vat-slow-terminate.js b/packages/SwingSet/test/vat-admin/slow-termination/vat-slow-terminate.js new file mode 100644 index 000000000000..78ec9a74e47a --- /dev/null +++ b/packages/SwingSet/test/vat-admin/slow-termination/vat-slow-terminate.js @@ -0,0 +1,16 @@ +import { Far } from '@endo/far'; + +export function buildRootObject(vatPowers, _vatParameters, baggage) { + const hold = []; + return Far('root', { + alive: () => true, + dieHappy: completion => vatPowers.exitVat(completion), + sendExport: () => Far('dude export', {}), + acceptImports: imports => hold.push(imports), + makeVatstore: count => { + for (let i = 0; i < count; i += 1) { + baggage.init(`key-${i}`, i); + } + }, + }); +} diff --git a/packages/SwingSet/test/vat-admin/terminate/test-terminate.js b/packages/SwingSet/test/vat-admin/terminate/test-terminate.js index 750862e084b5..a4759943e477 100644 --- a/packages/SwingSet/test/vat-admin/terminate/test-terminate.js +++ b/packages/SwingSet/test/vat-admin/terminate/test-terminate.js @@ -150,8 +150,12 @@ async function doTerminateCritical( t.is(thrown.message, mode); } t.is(controller.kpStatus(kpid), dynamic ? 'unresolved' : 'fulfilled'); + // the kernel is supposed to crash before deleting vNN.options, + // although strictly speaking it doesn't matter because the host is + // supposed to crash too, abandoning the uncommitted swingstore + // changes const postProbe = kernelStorage.kvStore.get(`${deadVatID}.options`); - t.is(postProbe, undefined); + t.truthy(postProbe); } test.serial('terminate (dynamic, non-critical)', async t => {