From 3c840df77504a21e45141509e413d392495d423a Mon Sep 17 00:00:00 2001 From: Aaron van Meerten Date: Fri, 19 Apr 2024 15:38:03 -0500 Subject: [PATCH] feat: sidecar shutdown complete reporting (#155) * feat: shutdown confirmation support * further sidecar shutdown hook support * fix handler * handler updates * audit support for sidecar shutdown * include new scale status * better log output --- src/app.ts | 9 +++ src/audit.ts | 24 +++++++ src/cloud_manager.ts | 7 ++ src/group_report.ts | 24 ++++++- src/handlers.ts | 25 +++++++ src/instance_tracker.ts | 15 ++-- src/shutdown_manager.ts | 51 +++++++++++++ src/test/shutdown_manager.ts | 135 +++++++++++++++++++++++++++++++++++ 8 files changed, 282 insertions(+), 8 deletions(-) create mode 100644 src/test/shutdown_manager.ts diff --git a/src/app.ts b/src/app.ts index 6d02288..7e606d1 100644 --- a/src/app.ts +++ b/src/app.ts @@ -279,6 +279,7 @@ const h = new Handlers({ instanceTracker, instanceGroupManager, shutdownManager, + cloudManager, reconfigureManager, groupReportGenerator, lockManager, @@ -340,6 +341,14 @@ app.post('/sidecar/poll', async (req, res, next) => { } }); +app.post('/sidecar/shutdown', async (req, res, next) => { + try { + await h.sidecarShutdown(req, res); + } catch (err) { + next(err); + } +}); + app.post('/sidecar/stats', async (req, res, next) => { try { await h.sidecarStats(req, res); diff --git a/src/audit.ts b/src/audit.ts index 407aeb2..97df28a 100644 --- a/src/audit.ts +++ b/src/audit.ts @@ -50,6 +50,7 @@ export interface InstanceAuditResponse { requestToTerminate: string; requestToReconfigure: string; reconfigureComplete: string; + terminationConfirmation: string; latestStatusInfo?: InstanceState; } @@ -96,6 +97,7 @@ export default class Audit { pipeline.expire(`audit:${groupName}:${instanceId}:request-to-launch`, this.auditTTL); pipeline.expire(`audit:${groupName}:${instanceId}:request-to-terminate`, this.auditTTL); + pipeline.expire(`audit:${groupName}:${instanceId}:confirmation-of-termination`, this.auditTTL); pipeline.expire(`audit:${groupName}:${instanceId}:request-to-reconfigure`, this.auditTTL); pipeline.expire(`audit:${groupName}:${instanceId}:reconfigure-complete`, this.auditTTL); @@ -130,6 +132,24 @@ export default class Audit { await pipeline.exec(); } + async saveShutdownConfirmationEvents(instanceDetails: Array): Promise { + const pipeline = this.redisClient.pipeline(); + for (const instance of instanceDetails) { + const value: InstanceAudit = { + instanceId: instance.instanceId, + type: 'confirmation-of-termination', + timestamp: Date.now(), + }; + pipeline.set( + `audit:${instance.group}:${instance.instanceId}:confirmation-of-termination`, + JSON.stringify(value), + 'EX', + this.auditTTL, + ); + } + await pipeline.exec(); + } + async saveUnsetReconfigureEvents(instanceId: string, group: string): Promise { const value: InstanceAudit = { instanceId: instanceId, @@ -307,6 +327,7 @@ export default class Audit { requestToTerminate: 'unknown', requestToReconfigure: 'unknown', reconfigureComplete: 'unknown', + terminationConfirmation: 'unknown', }; instanceAuditResponseList.push(instanceAuditResponse); }); @@ -322,6 +343,9 @@ export default class Audit { case 'request-to-terminate': instanceAuditResponse.requestToTerminate = new Date(instanceAudit.timestamp).toISOString(); break; + case 'confirmation-of-termination': + instanceAuditResponse.terminationConfirmation = new Date(instanceAudit.timestamp).toISOString(); + break; case 'request-to-reconfigure': instanceAuditResponse.requestToReconfigure = new Date(instanceAudit.timestamp).toISOString(); break; diff --git a/src/cloud_manager.ts b/src/cloud_manager.ts index eb5862c..b3d2207 100644 --- a/src/cloud_manager.ts +++ b/src/cloud_manager.ts @@ -114,6 +114,13 @@ export default class CloudManager { return true; } + async shutdownInstance(ctx: Context, instance: InstanceDetails): Promise { + const groupName = instance.group; + ctx.logger.info(`[CloudManager] Shutting down instance ${instance.instanceId} from group ${groupName}`); + await this.shutdownManager.setShutdownConfirmation(ctx, [instance]); + return true; + } + async getInstances( ctx: Context, group: InstanceGroup, diff --git a/src/group_report.ts b/src/group_report.ts index 587616d..d296df8 100644 --- a/src/group_report.ts +++ b/src/group_report.ts @@ -12,6 +12,7 @@ export interface InstanceReport { instanceName?: string; scaleStatus?: string; cloudStatus?: string; + shutdownComplete?: string | false; isShuttingDown?: boolean; isScaleDownProtected?: boolean; reconfigureScheduled?: string; @@ -33,6 +34,7 @@ export interface GroupReport { expiredCount?: number; cloudCount?: number; unTrackedCount?: number; + shutdownCount?: number; shuttingDownCount?: number; shutdownErrorCount?: number; reconfigureErrorCount?: number; @@ -108,6 +110,7 @@ export default class GroupReportGenerator { }); await this.addShutdownStatus(ctx, groupReport.instances); + await this.addShutdownConfirmations(ctx, groupReport.instances); await this.addReconfigureDate(ctx, groupReport.instances); await this.addShutdownProtectedStatus(ctx, groupReport.instances); @@ -118,6 +121,9 @@ export default class GroupReportGenerator { if (instanceReport.isShuttingDown) { groupReport.shuttingDownCount++; } + if (instanceReport.shutdownComplete) { + groupReport.shutdownCount++; + } if (instanceReport.isScaleDownProtected) { groupReport.scaleDownProtectedCount++; } @@ -186,12 +192,15 @@ export default class GroupReportGenerator { cloudStatus: 'unknown', version: 'unknown', isShuttingDown: instanceState.shutdownStatus, + shutdownComplete: instanceState.shutdownComplete, lastReconfigured: instanceState.lastReconfigured, reconfigureError: instanceState.reconfigureError, shutdownError: instanceState.shutdownError, isScaleDownProtected: false, }; - if (instanceState.shutdownStatus) { + if (instanceState.shutdownComplete) { + instanceReport.scaleStatus = 'SHUTDOWN COMPLETE'; + } else if (instanceState.shutdownStatus) { instanceReport.scaleStatus = 'SHUTDOWN'; } else if (instanceState.status.provisioning) { instanceReport.scaleStatus = 'PROVISIONING'; @@ -306,6 +315,19 @@ export default class GroupReportGenerator { }); } + private async addShutdownConfirmations(ctx: Context, instanceReports: Array): Promise { + ( + await this.shutdownManager.getShutdownConfirmations( + ctx, + instanceReports.map((instanceReport) => { + return instanceReport.instanceId; + }), + ) + ).map((confirmation, index) => { + instanceReports[index].shutdownComplete = confirmation; + }); + } + private async addShutdownProtectedStatus(ctx: Context, instanceReports: Array): Promise { const instanceReportsProtectedStatus: boolean[] = await this.shutdownManager.areScaleDownProtected( ctx, diff --git a/src/handlers.ts b/src/handlers.ts index c38f70b..df63f1c 100644 --- a/src/handlers.ts +++ b/src/handlers.ts @@ -9,6 +9,7 @@ import GroupReportGenerator from './group_report'; import Audit from './audit'; import ScalingManager from './scaling_options_manager'; import * as promClient from 'prom-client'; +import CloudManager from './cloud_manager'; const statsErrors = new promClient.Counter({ name: 'autoscaler_stats_errors', @@ -92,6 +93,7 @@ interface InstanceConfigurationUpdateRequest { } interface HandlersOptions { + cloudManager: CloudManager; instanceTracker: InstanceTracker; audit: Audit; shutdownManager: ShutdownManager; @@ -103,6 +105,7 @@ interface HandlersOptions { } class Handlers { + private cloudManager: CloudManager; private instanceTracker: InstanceTracker; private shutdownManager: ShutdownManager; private reconfigureManager: ReconfigureManager; @@ -116,6 +119,7 @@ class Handlers { this.sidecarPoll = this.sidecarPoll.bind(this); this.lockManager = options.lockManager; + this.cloudManager = options.cloudManager; this.instanceTracker = options.instanceTracker; this.instanceGroupManager = options.instanceGroupManager; this.shutdownManager = options.shutdownManager; @@ -150,6 +154,27 @@ class Handlers { } } + async sidecarShutdown(req: Request, res: Response): Promise { + const details: InstanceDetails = req.body; + req.context.logger.info('Received shutdown confirmation', { details }); + statsCounter.inc(); + try { + await this.cloudManager.shutdownInstance(req.context, details); + + const sendResponse = { + save: 'OK', + }; + + res.status(200); + res.send(sendResponse); + } catch (err) { + req.context.logger.error('Shutdown handling error', { err }); + statsErrors.inc(); + + res.status(500); + res.send({ save: 'ERROR' }); + } + } async sidecarStats(req: Request, res: Response): Promise { const report: StatsReport = req.body; statsCounter.inc(); diff --git a/src/instance_tracker.ts b/src/instance_tracker.ts index 6e2d4a8..c6650c3 100644 --- a/src/instance_tracker.ts +++ b/src/instance_tracker.ts @@ -136,6 +136,7 @@ export interface InstanceState { timestamp?: number; metadata: InstanceMetadata; shutdownStatus?: boolean; + shutdownComplete?: string; reconfigureError?: boolean; shutdownError?: boolean; statsError?: boolean; @@ -672,18 +673,18 @@ export class InstanceTracker { } async filterOutInstancesShuttingDown(ctx: Context, states: Array): Promise> { - const shutdownStatuses = await this.shutdownManager.getShutdownStatuses( - ctx, - states.map((state) => { - return state.instanceId; - }), - ); + const instanceIds = states.map((state) => { + return state.instanceId; + }); + const shutdownStatuses = await this.shutdownManager.getShutdownStatuses(ctx, instanceIds); + + const shutdownConfirmations = await this.shutdownManager.getShutdownConfirmations(ctx, instanceIds); const statesShutdownStatus: boolean[] = []; for (let i = 0; i < states.length; i++) { statesShutdownStatus.push(this.shutdownStatusFromState(states[i]) || shutdownStatuses[i]); } - return states.filter((instanceState, index) => !statesShutdownStatus[index]); + return states.filter((instanceState, index) => !statesShutdownStatus[index] && !shutdownConfirmations[index]); } mapToInstanceDetails(states: Array): Array { diff --git a/src/shutdown_manager.ts b/src/shutdown_manager.ts index 7f3bd8c..56af07a 100644 --- a/src/shutdown_manager.ts +++ b/src/shutdown_manager.ts @@ -24,6 +24,10 @@ export default class ShutdownManager { return `instance:shutdown:${instanceId}`; } + shutDownConfirmedKey(instanceId: string): string { + return `instance:shutdownConfirmed:${instanceId}`; + } + protectedKey(instanceId: string): string { return `instance:scaleDownProtected:${instanceId}`; } @@ -61,6 +65,27 @@ export default class ShutdownManager { } } + async getShutdownConfirmations(ctx: Context, instanceIds: Array): Promise<(string | false)[]> { + const pipeline = this.redisClient.pipeline(); + instanceIds.forEach((instanceId) => { + const key = this.shutDownConfirmedKey(instanceId); + pipeline.get(key); + }); + const instances = await pipeline.exec(); + if (instances) { + return instances.map((instance: [error: Error | null, result: unknown]) => { + if (instance[1] == null) { + return false; + } else { + return instance[1]; + } + }); + } else { + ctx.logger.error('ShutdownConfirmations Failed in pipeline.exec()'); + return []; + } + } + async getShutdownStatus(ctx: Context, instanceId: string): Promise { const key = this.shutDownKey(instanceId); const res = await this.redisClient.get(key); @@ -68,6 +93,32 @@ export default class ShutdownManager { return res == 'shutdown'; } + async getShutdownConfirmation(ctx: Context, instanceId: string): Promise { + const key = this.shutDownConfirmedKey(instanceId); + const res = await this.redisClient.get(key); + ctx.logger.debug('Read shutdown confirmation', { key, res }); + if (res) { + return res; + } + return false; + } + + async setShutdownConfirmation( + ctx: Context, + instanceDetails: Array, + status = new Date().toISOString(), + ): Promise { + const pipeline = this.redisClient.pipeline(); + for (const instance of instanceDetails) { + const key = this.shutDownConfirmedKey(instance.instanceId); + ctx.logger.debug('Writing shutdown confirmation', { key, status }); + pipeline.set(key, status, 'EX', this.shutdownTTL); + } + await pipeline.exec(); + await this.audit.saveShutdownConfirmationEvents(instanceDetails); + return true; + } + async setScaleDownProtected( ctx: Context, instanceId: string, diff --git a/src/test/shutdown_manager.ts b/src/test/shutdown_manager.ts new file mode 100644 index 0000000..af84b0f --- /dev/null +++ b/src/test/shutdown_manager.ts @@ -0,0 +1,135 @@ +/* eslint-disable @typescript-eslint/ban-ts-comment */ +// @ts-nocheck + +import assert from 'node:assert'; +import test, { afterEach, describe, mock } from 'node:test'; + +import ShutdownManager from '../shutdown_manager'; + +describe('ShutdownManager', () => { + let context = { + logger: { + info: mock.fn(), + debug: mock.fn(), + error: mock.fn(), + warn: mock.fn(), + }, + }; + + let _keys = []; + + const mockPipeline = { + get: mock.fn((key) => _keys.push(key)), + set: mock.fn(), + exec: mock.fn(() => Promise.resolve(_keys.map(() => [null, null]))), + }; + + const redisClient = { + expire: mock.fn(), + zremrangebyscore: mock.fn(() => 0), + hgetall: mock.fn(), + hset: mock.fn(), + hdel: mock.fn(), + del: mock.fn(), + scan: mock.fn(), + zrange: mock.fn(), + get: mock.fn(), + pipeline: mock.fn(() => mockPipeline), + }; + + const audit = { + log: mock.fn(), + }; + + const shutdownManager = new ShutdownManager({ + redisClient, + audit, + shutdownTTL: 86400, + }); + + afterEach(() => { + _keys = []; + mockPipeline.exec.mock.resetCalls(); + mockPipeline.get.mock.resetCalls(); + context = { + logger: { + info: mock.fn(), + debug: mock.fn(), + error: mock.fn(), + warn: mock.fn(), + }, + }; + mock.restoreAll(); + }); + + // these tests are for the shutdown confirmation statuses + describe('shutdownConfirmationStatuses', () => { + test('read non-existent shutdown confirmation status', async () => { + redisClient.get.mock.mockImplementationOnce(() => null); + const result = await shutdownManager.getShutdownConfirmation(context, 'instanceId'); + assert.equal(result, false, 'expect no shutdown confirmation when no key exists'); + }); + + test('read existing shutdown confirmation status', async () => { + const shutdownConfirmation = new Date().toISOString(); + redisClient.get.mock.mockImplementationOnce(() => shutdownConfirmation); + const result = await shutdownManager.getShutdownConfirmation(context, 'instanceId'); + assert.ok(result, 'expect ok result'); + assert.equal(result, shutdownConfirmation, 'expect shutdown confirmation to match mock date'); + }); + + test('read multiple non-existent shutdown confirmation statuses', async () => { + const instances = ['instanceId', 'instanceId2']; + const result = await shutdownManager.getShutdownConfirmations(context, instances); + assert.ok(result, 'expect ok result'); + assert.equal(result.length, instances.length, 'expect confirmation length to match instances length'); + assert.equal(result[0], false, 'expect first confirmation to be false'); + assert.equal(result[1], false, 'expect second confirmation to be false'); + }); + + test('read multiple existing shutdown confirmation statuses', async () => { + const shutdownConfirmation = new Date().toISOString(); + + const instances = ['instanceId', 'instanceId2']; + mockPipeline.exec.mock.mockImplementationOnce(() => + Promise.resolve(instances.map(() => [null, shutdownConfirmation])), + ); + + const result = await shutdownManager.getShutdownConfirmations(context, instances); + assert.ok(result, 'expect ok result'); + assert.equal(mockPipeline.exec.mock.callCount(), 1, 'expect exec to be called once'); + assert.equal( + mockPipeline.get.mock.callCount(), + instances.length, + 'expect get to be called once per instance', + ); + assert.equal(result.length, instances.length, 'expect confirmation length to match instances length'); + assert.equal(result[0], shutdownConfirmation, 'expect first confirmation to match mock date'); + assert.equal(result[1], shutdownConfirmation, 'expect second confirmation to match mock date'); + }); + + test('read multiple mixed shutdown confirmation statuses', async () => { + const shutdownConfirmation = new Date().toISOString(); + + const instances = ['instanceId', 'instanceId2']; + mockPipeline.exec.mock.mockImplementationOnce(() => + Promise.resolve([ + [null, null], + [null, shutdownConfirmation], + ]), + ); + + const result = await shutdownManager.getShutdownConfirmations(context, instances); + assert.ok(result, 'expect ok result'); + assert.equal(mockPipeline.exec.mock.callCount(), 1, 'expect exec to be called once'); + assert.equal( + mockPipeline.get.mock.callCount(), + instances.length, + 'expect get to be called once per instance', + ); + assert.equal(result.length, instances.length, 'expect confirmation length to match instances length'); + assert.equal(result[0], false, 'expect first confirmation to be false'); + assert.equal(result[1], shutdownConfirmation, 'expect second confirmation to match mock date'); + }); + }); +});