diff --git a/docs/src/pages/guides/probes.md b/docs/src/pages/guides/probes.md index 38fee50ea..fb346ccb1 100644 --- a/docs/src/pages/guides/probes.md +++ b/docs/src/pages/guides/probes.md @@ -58,19 +58,20 @@ probes: Details of the field are given in the table below. -| Topic | Description | -| :--------------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| method (optional) | Http method such as GET, POST, PUT, DELETE. | -| url (required) | This is the url endpoint to dispatch the request to. | -| timeout (optional) | Request timeout in **milliseconds**, Default value is 10000 which corresponds to 10 seconds. If the request takes longer than `timeout`, the request will be aborted. | -| headers (optional) | Http headers you might need for your request. | -| body (optional) | Any http body if your method requires it. | -| interval (optional) | Number of probe's interval (in seconds). Default value is 10 seconds. | -| incidentThreshold (optional) | Number of times an alert should return true before Monika sends notifications. For example, when incidentThreshold is 3, Monika will only send notifications when the probed URL returns non-2xx status 3 times in a row. After sending the notifications, Monika will not send notifications anymore until the alert status changes. Default value is 5. | -| saveBody (optional) | When set to true, the response body of the request is stored in the internal database. The default is off when not defined. This is to keep the log file size small as some responses can be sizable. The setting is for each probe request. 
| -| alerts (optional) | The condition which will trigger an alert, and the subsequent notification method to send out the alert. See below for further details on alerts and notifications. See [alerts](./alerts) section for detailed information. | -| ping (optional) | (boolean), If set true then send a PING to the specified url instead. | -| allowUnauthorized (optional) | (boolean), If set to true, will make https agent to not check for ssl certificate validity | +| Topic | Description | +| :--------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| method (optional) | Http method such as GET, POST, PUT, DELETE. | +| url (required) | This is the url endpoint to dispatch the request to. | +| timeout (optional) | Request timeout in **milliseconds**, Default value is 10000 which corresponds to 10 seconds. If the request takes longer than `timeout`, the request will be aborted. | +| headers (optional) | Http headers you might need for your request. | +| body (optional) | Any http body if your method requires it. | +| interval (optional) | Number of probe's interval (in seconds). Default value is 10 seconds. | +| incidentThreshold (optional) | Number of times an alert should return true before Monika sends notifications. For example, when incidentThreshold is 3, Monika will only send incident notifications when the probed URL returns non-2xx status 3 times in a row. 
After sending the notifications, Monika will not send notifications anymore until the alert status changes. Default value is 5. However, the actual number of retries will be the greater of `incidentThreshold` and `recoveryThreshold`. So if you want to have 3 retries, you need to set both `incidentThreshold` and `recoveryThreshold`. | +| recoveryThreshold (optional) | Number of retries before Monika sends recovery notifications. For example, when recoveryThreshold is 3 and when previously a probe is marked as incident, Monika will only send recovery notification when the probing succeeds 3 times in a row. Default value is 5. However, the actual number of retries will be the greater of `incidentThreshold` and `recoveryThreshold`. So if you want to have 3 retries, you need to set both `incidentThreshold` and `recoveryThreshold`. | +| saveBody (optional) | When set to true, the response body of the request is stored in the internal database. The default is off when not defined. This is to keep the log file size small as some responses can be sizable. The setting is for each probe request. | +| alerts (optional) | The condition which will trigger an alert, and the subsequent notification method to send out the alert. See below for further details on alerts and notifications. See [alerts](./alerts) section for detailed information. | +| ping (optional) | (boolean), If set true then send a PING to the specified url instead. 
| +| allowUnauthorized (optional) | (boolean), If set to true, will make https agent to not check for ssl certificate validity | ## Request Body diff --git a/src/commands/monika.ts b/src/commands/monika.ts index 5e8f72124..c3444d9ff 100644 --- a/src/commands/monika.ts +++ b/src/commands/monika.ts @@ -284,36 +284,17 @@ export default class Monika extends Command { } deprecationHandler(config: Config): Config { - const showDeprecateMsg: Record< - 'query' | 'incidentThreshold' | 'recoveryThreshold', - boolean - > = { + const showDeprecateMsg: Record<'query', boolean> = { query: false, - incidentThreshold: false, - recoveryThreshold: false, } const checkedConfig = { ...config, - probes: config.probes?.map((probe) => { - if (probe?.recoveryThreshold) { - showDeprecateMsg.recoveryThreshold = true - } - - return { - ...probe, - requests: probe.requests?.map((request) => ({ - ...request, - alert: request.alerts?.map((alert) => { - if (alert.query) { - showDeprecateMsg.query = true - return { ...alert, assertion: alert.query } - } - - return alert - }), - })), - alerts: probe.alerts?.map((alert) => { + probes: config.probes?.map((probe) => ({ + ...probe, + requests: probe.requests?.map((request) => ({ + ...request, + alert: request.alerts?.map((alert) => { if (alert.query) { showDeprecateMsg.query = true return { ...alert, assertion: alert.query } @@ -321,14 +302,16 @@ export default class Monika extends Command { return alert }), - } - }), - } - - if (showDeprecateMsg.recoveryThreshold) { - log.warn( - 'recoveryThreshold is deprecated. It will be managed internally by Monika.' 
- ) + })), + alerts: probe.alerts?.map((alert) => { + if (alert.query) { + showDeprecateMsg.query = true + return { ...alert, assertion: alert.query } + } + + return alert + }), + })), } if (showDeprecateMsg.query) { diff --git a/src/components/config/validation/validator/default-values.ts b/src/components/config/validation/validator/default-values.ts new file mode 100644 index 000000000..16b08fead --- /dev/null +++ b/src/components/config/validation/validator/default-values.ts @@ -0,0 +1,27 @@ +/********************************************************************************** + * MIT License * + * * + * Copyright (c) 2023 Hyperjump Technology * + * * + * Permission is hereby granted, free of charge, to any person obtaining a copy * + * of this software and associated documentation files (the "Software"), to deal * + * in the Software without restriction, including without limitation the rights * + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell * + * copies of the Software, and to permit persons to whom the Software is * + * furnished to do so, subject to the following conditions: * + * * + * The above copyright notice and this permission notice shall be included in all * + * copies or substantial portions of the Software. * + * * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * + * SOFTWARE. 
* + **********************************************************************************/ + +export const DEFAULT_INCIDENT_THRESHOLD = 5 +export const DEFAULT_RECOVERY_THRESHOLD = 5 +export const DEFAULT_INTERVAL = 10 diff --git a/src/components/config/validation/validator/probe.ts b/src/components/config/validation/validator/probe.ts index e47a1b8aa..7bc09f443 100644 --- a/src/components/config/validation/validator/probe.ts +++ b/src/components/config/validation/validator/probe.ts @@ -30,6 +30,11 @@ import { getContext } from '../../../../context' import { FAILED_REQUEST_ASSERTION } from '../../../../looper' import { compileExpression } from '../../../../utils/expression-parser' import { isValidURL } from '../../../../utils/is-valid-url' +import { + DEFAULT_INCIDENT_THRESHOLD, + DEFAULT_INTERVAL, + DEFAULT_RECOVERY_THRESHOLD, +} from './default-values' export async function validateProbes(probes: Probe[]): Promise { const alertSchema = joi.alternatives().try( @@ -101,8 +106,15 @@ export async function validateProbes(probes: Probe[]): Promise { }), description: joi.string().allow(''), id: joi.string().required(), - incidentThreshold: joi.number().default(5).min(1), - interval: joi.number().default(10).min(1), + incidentThreshold: joi + .number() + .default(DEFAULT_INCIDENT_THRESHOLD) + .min(1), + recoveryThreshold: joi + .number() + .default(DEFAULT_RECOVERY_THRESHOLD) + .min(1), + interval: joi.number().default(DEFAULT_INTERVAL).min(1), lastEvent: joi.object({ createdAt: joi.string().allow(''), recoveredAt: joi.string().allow('', null), diff --git a/src/components/probe/index.test.ts b/src/components/probe/index.test.ts index 65ef0c984..d05e7e45b 100644 --- a/src/components/probe/index.test.ts +++ b/src/components/probe/index.test.ts @@ -23,7 +23,6 @@ **********************************************************************************/ import { expect } from '@oclif/test' -import { AxiosError } from 'axios' import { rest } from 'msw' import { setupServer } from 
'msw/node' import sinon from 'sinon' @@ -32,7 +31,6 @@ import { MongoClient, type Db } from 'mongodb' import net from 'net' import { Pool } from 'pg' import * as redis from 'redis' -import * as httpRequest from '../../utils/http' import { doProbe } from '.' import { initializeProbeStates } from '../../utils/probe-state' import type { Probe } from '../../interfaces/probe' @@ -41,11 +39,9 @@ import type { MonikaFlags } from '../../flag' import { FAILED_REQUEST_ASSERTION } from '../../looper' import { closeLog, openLogfile } from '../logger/history' -let urlRequestTotal = 0 let notificationAlert: Record> = {} const server = setupServer( rest.get('https://example.com', (_, res, ctx) => { - urlRequestTotal += 1 return res(ctx.status(200)) }), rest.post('https://example.com/webhook', async (req, res, ctx) => { @@ -57,23 +53,8 @@ const server = setupServer( return res(ctx.status(200)) }) ) -const probes: Probe[] = [ - { - id: '1', - name: 'Example', - interval: 1, - requests: [ - { - url: 'https://example.com', - body: '', - timeout: 30, - }, - ], - alerts: [], - }, -] - -describe('Probe processing', () => { + +describe('Base Probe processing', () => { before(async () => { server.listen() await openLogfile() @@ -83,7 +64,6 @@ describe('Probe processing', () => { }) afterEach(() => { resetContext() - urlRequestTotal = 0 notificationAlert = {} server.resetHandlers() sinon.restore() @@ -93,238 +73,6 @@ describe('Probe processing', () => { await closeLog() }) - describe('HTTP Probe', () => { - it('should not run probe if the probe is running', async () => { - // arrange - initializeProbeStates(probes) - // wait until the interval passed - const seconds = 1000 - await sleep(seconds) - - // act - doProbe({ probe: probes[0], notifications: [] }) - await doProbe({ probe: probes[0], notifications: [] }) - // wait for random timeout - await sleep(3 * seconds) - - // assert - expect(urlRequestTotal).eq(1) - }) - - it('should not run probe if it is not the time', () => { - // arrange 
- initializeProbeStates(probes) - - // act - doProbe({ notifications: [], probe: { ...probes[0], interval: 10 } }) - - // assert - expect(urlRequestTotal).eq(0) - }) - - it('should not run probe if the cycle is end', async () => { - // arrange - initializeProbeStates(probes) - setContext({ - ...getContext(), - flags: { ...getContext().flags, repeat: 1 }, - }) - // wait until the interval passed - const seconds = 1000 - await sleep(seconds) - - // act - await doProbe({ probe: probes[0], notifications: [] }) - await doProbe({ probe: probes[0], notifications: [] }) - await doProbe({ probe: probes[0], notifications: [] }) - // wait for random timeout - await sleep(3 * seconds) - - // assert - expect(urlRequestTotal).eq(1) - }) - - it('should run the probe', async () => { - // arrange - const uniqueProbes: Probe[] = Array.from({ length: 5 }).map( - (_, index) => ({ - ...probes[0], - id: `${index}`, - }) - ) - initializeProbeStates(uniqueProbes) - // wait until the interval passed - const seconds = 1000 - await sleep(seconds) - - // act - await Promise.all( - uniqueProbes.map((probe) => doProbe({ probe, notifications: [] })) - ) - // wait for random timeout - await sleep(3 * seconds) - - // assert - expect(urlRequestTotal).eq(5) - }) - - it('should send incident notification if the request is failed', async () => { - // arrange - sinon.stub(httpRequest, 'sendHttpRequest').callsFake(async () => { - throw new AxiosError('ECONNABORTED', undefined, undefined, {}) - }) - const probe = { - ...probes[0], - id: '2md9a', - requests: [ - { - url: 'https://example.com', - body: '', - timeout: 30, - }, - ], - alerts: [ - { - id: 'Cqkjh', - ...FAILED_REQUEST_ASSERTION, - }, - { - id: 'fKBzx', - assertion: 'response.status == 200', - message: 'The assertion failed.', - }, - ], - } - initializeProbeStates([probe]) - // wait until the interval passed - const seconds = 1000 - await sleep(seconds) - - // act - await doProbe({ - probe, - notifications: [ - { - id: 'jFQBd', - data: { url: 
'https://example.com/webhook' }, - type: 'webhook', - }, - ], - }) - // wait for random timeout - await sleep(3 * seconds) - // wait for send notification function to resolve - await sleep(2 * seconds) - - // assert - expect(notificationAlert?.[probe.requests[0].url]?.body?.url).eq( - 'https://example.com' - ) - expect(notificationAlert?.[probe.requests[0].url]?.body.alert).eq('') - }).timeout(10_000) - - it('should send incident notification', async () => { - // arrange - server.use( - rest.get('https://example.com', (_, res, ctx) => { - urlRequestTotal += 1 - return res(ctx.status(404)) - }) - ) - const probe = { - ...probes[0], - id: '2md9o', - alerts: [ - { - id: 'P7-fN', - assertion: 'response.status != 200', - message: 'The assertion failed.', - }, - ], - } - initializeProbeStates([probe]) - // wait until the interval passed - const seconds = 1000 - await sleep(seconds) - - // act - await doProbe({ - probe, - notifications: [ - { - id: 'jFQBd', - data: { url: 'https://example.com/webhook' }, - type: 'webhook', - }, - ], - }) - // wait for random timeout - await sleep(3 * seconds) - // wait for send notification function to resolve - await sleep(2 * seconds) - - // assert - expect(notificationAlert?.[probe?.requests?.[0]?.url || 0]?.body?.url).eq( - 'https://example.com' - ) - expect( - notificationAlert?.[probe?.requests?.[0]?.url || 0]?.body?.alert - ).eq('response.status != 200') - - // restore - server.resetHandlers() - }).timeout(10_000) - - it('should send recovery notification', async () => { - // arrange - server.use( - rest.get('https://example.com', (_, res, ctx) => { - urlRequestTotal += 1 - return res(ctx.status(404)) - }) - ) - const probe = { - ...probes[0], - id: 'fj43l', - incidentThreshold: 1, - requests: [{ url: 'https://example.com', body: '', timeout: 30 }], - alerts: [ - { id: 'jFQBd', assertion: 'response.status != 200', message: '' }, - ], - } - const notifications = [ - { - id: 'jFQBd', - data: { url: 'https://example.com/webhook' }, - 
type: 'webhook', - }, - ] - initializeProbeStates([probe]) - // wait until the interval passed - const seconds = 1000 - await sleep(seconds) - - // act - await doProbe({ - probe, - notifications, - }) - // wait for random timeout - await sleep(3 * seconds) - server.resetHandlers() - // wait for the send notification function to resolve - await sleep(3 * seconds) - - // assert - expect(notificationAlert?.[probe.requests[0].url]?.body?.url).eq( - 'https://example.com' - ) - expect(notificationAlert?.[probe.requests[0].url]?.body?.alert).eq( - 'response.status != 200' - ) - }).timeout(10_000) - }) - describe('Non HTTP Probe', () => { it('should probe MariaDB', async () => { // arrange @@ -425,6 +173,27 @@ describe('Probe processing', () => { }).timeout(10_000) it('should send recovery notification for MariaDB probe', async () => { + // simulate the incindent first by throwing on first call + // then simulate recovery on second call + const requestStub = sinon.stub(mariadb, 'createConnection') + requestStub.onFirstCall().throws() + requestStub.onSecondCall().callsFake( + async (_connectionUri) => + ({ + end: async () => { + Promise.resolve() + }, + } as mariadb.Connection) + ) + + // repeat needs to be 0 so that monika can probe twice + // where in the first time it will send incident notification + // then in the second time it will send recovery notification + setContext({ + ...getContext(), + flags: { ...getContext().flags, repeat: 0 }, + }) + // arrange const probe = { id: '3ngd4', @@ -462,25 +231,14 @@ describe('Probe processing', () => { }, ], }) - // wait for random timeout - await sleep(3 * seconds) - - const requestStub = sinon.stub(mariadb, 'createConnection').callsFake( - async (_connectionUri) => - ({ - end: async () => { - Promise.resolve() - }, - } as mariadb.Connection) - ) - // wait for send notification function to resolve - await sleep(3 * seconds) + // gonna need to wait for a while until monika does the probing twice + await sleep(7000) // assert 
sinon.assert.called(requestStub) expect(notificationAlert?.[probe.id]?.body?.url).eq('3ngd4') expect(notificationAlert?.[probe.id]?.body?.alert).eq('') - }).timeout(10_000) + }).timeout(15_000) it('should probe MongoDB', async () => { // arrange diff --git a/src/components/probe/index.ts b/src/components/probe/index.ts index 15e763225..8aa222fd1 100644 --- a/src/components/probe/index.ts +++ b/src/components/probe/index.ts @@ -34,6 +34,10 @@ import { setProbeRunning, } from '../../utils/probe-state' import { createProbers } from './prober/factory' +import { + DEFAULT_INCIDENT_THRESHOLD, + DEFAULT_RECOVERY_THRESHOLD, +} from '../config/validation/validator/default-values' type doProbeParams = { probe: Probe // probe contains all the probes @@ -48,14 +52,13 @@ export async function doProbe({ probe, notifications, }: doProbeParams): Promise { - if ( - !isTimeToProbe(probe) || - isCycleEnd(probe.id) || - !setProbeRunning(probe.id) - ) { + if (!isTimeToProbe(probe) || isCycleEnd(probe.id)) { return } + const randomTimeoutMilliseconds = getRandomTimeoutMilliseconds() + setProbeRunning(probe.id) + setTimeout(async () => { const probeCtx = getProbeContext(probe.id) if (!probeCtx) { @@ -68,7 +71,14 @@ export async function doProbe({ probeConfig: probe, }) + const maxAttempts = Math.max( + // since we will retry for both incident and recovery, let's just get the biggest threshold + probe.incidentThreshold || DEFAULT_INCIDENT_THRESHOLD, + probe.recoveryThreshold || DEFAULT_RECOVERY_THRESHOLD + ) + await retry(handleAll, { + maxAttempts, backoff: new ExponentialBackoff({ initialDelay: getContext().flags.retryInitialDelayMs, maxDelay: getContext().flags.retryMaxDelayMs, @@ -78,7 +88,7 @@ export async function doProbe({ ) setProbeFinish(probe.id) - }, getRandomTimeoutMilliseconds()) + }, randomTimeoutMilliseconds) } function isTimeToProbe({ id, interval }: Probe) { diff --git a/src/components/probe/prober/http/index.test.ts b/src/components/probe/prober/http/index.test.ts new 
file mode 100644 index 000000000..139bf9208 --- /dev/null +++ b/src/components/probe/prober/http/index.test.ts @@ -0,0 +1,324 @@ +/********************************************************************************** + * MIT License * + * * + * Copyright (c) 2021 Hyperjump Technology * + * * + * Permission is hereby granted, free of charge, to any person obtaining a copy * + * of this software and associated documentation files (the "Software"), to deal * + * in the Software without restriction, including without limitation the rights * + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell * + * copies of the Software, and to permit persons to whom the Software is * + * furnished to do so, subject to the following conditions: * + * * + * The above copyright notice and this permission notice shall be included in all * + * copies or substantial portions of the Software. * + * * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * + * SOFTWARE. * + **********************************************************************************/ + +import { expect } from '@oclif/test' +import { AxiosError } from 'axios' +import { rest } from 'msw' +import { setupServer } from 'msw/node' +import sinon from 'sinon' +import * as httpRequest from '../../../../utils/http' +import { doProbe } from '../..' 
+import { initializeProbeStates } from '../../../../utils/probe-state' +import type { Probe } from '../../../../interfaces/probe' +import { getContext, resetContext, setContext } from '../../../../context' +import type { MonikaFlags } from '../../../../flag' +import { FAILED_REQUEST_ASSERTION } from '../../../../looper' +import { closeLog, openLogfile } from '../../../logger/history' + +let urlRequestTotal = 0 +let notificationAlert: Record> = {} +const server = setupServer( + rest.get('https://example.com', (_, res, ctx) => { + urlRequestTotal += 1 + return res(ctx.status(200)) + }), + rest.post('https://example.com/webhook', async (req, res, ctx) => { + const requestBody = await req.json() + if (requestBody?.body?.url) { + notificationAlert[requestBody.body.url] = requestBody + } + + return res(ctx.status(200)) + }) +) +const probes: Probe[] = [ + { + id: '1', + name: 'Example', + interval: 1, + requests: [ + { + url: 'https://example.com', + body: '', + timeout: 30, + }, + ], + alerts: [], + }, +] + +describe('HTTP Probe processing', () => { + before(async () => { + server.listen() + await openLogfile() + }) + beforeEach(() => { + setContext({ flags: { repeat: 1 } as MonikaFlags }) + }) + afterEach(() => { + resetContext() + urlRequestTotal = 0 + notificationAlert = {} + server.resetHandlers() + sinon.restore() + }) + after(async () => { + server.close() + await closeLog() + }) + + it('should not run probe if the probe is running', async () => { + // arrange + initializeProbeStates(probes) + // wait until the interval passed + const seconds = 1000 + await sleep(seconds) + + // act + doProbe({ probe: probes[0], notifications: [] }) + await doProbe({ probe: probes[0], notifications: [] }) + // wait for random timeout + await sleep(3 * seconds) + + // assert + expect(urlRequestTotal).eq(1) + }) + + it('should not run probe if it is not the time', () => { + // arrange + initializeProbeStates(probes) + + // act + doProbe({ notifications: [], probe: { ...probes[0], 
interval: 10 } }) + + // assert + expect(urlRequestTotal).eq(0) + }) + + it('should not run probe if the cycle is end', async () => { + // arrange + initializeProbeStates(probes) + setContext({ + ...getContext(), + flags: { ...getContext().flags, repeat: 1 }, + }) + // wait until the interval passed + const seconds = 1000 + await sleep(seconds) + + // act + await doProbe({ probe: probes[0], notifications: [] }) + await doProbe({ probe: probes[0], notifications: [] }) + await doProbe({ probe: probes[0], notifications: [] }) + // wait for random timeout + await sleep(3 * seconds) + + // assert + expect(urlRequestTotal).eq(1) + }) + + it('should run the probe', async () => { + // arrange + const uniqueProbes: Probe[] = Array.from({ length: 5 }).map((_, index) => ({ + ...probes[0], + id: `${index}`, + })) + initializeProbeStates(uniqueProbes) + // wait until the interval passed + const seconds = 1000 + await sleep(seconds) + + // act + await Promise.all( + uniqueProbes.map((probe) => doProbe({ probe, notifications: [] })) + ) + // wait for random timeout + await sleep(3 * seconds) + + // assert + expect(urlRequestTotal).eq(5) + }) + + it('should send incident notification if the request is failed', async () => { + // arrange + sinon.stub(httpRequest, 'sendHttpRequest').callsFake(async () => { + throw new AxiosError('ECONNABORTED', undefined, undefined, {}) + }) + const probe = { + ...probes[0], + id: '2md9a', + requests: [ + { + url: 'https://example.com', + body: '', + timeout: 30, + }, + ], + alerts: [ + { + id: 'Cqkjh', + ...FAILED_REQUEST_ASSERTION, + }, + { + id: 'fKBzx', + assertion: 'response.status == 200', + message: 'The assertion failed.', + }, + ], + } + initializeProbeStates([probe]) + // wait until the interval passed + const seconds = 1000 + await sleep(seconds) + + // act + await doProbe({ + probe, + notifications: [ + { + id: 'jFQBd', + data: { url: 'https://example.com/webhook' }, + type: 'webhook', + }, + ], + }) + // wait for random timeout + await 
sleep(3 * seconds) + // wait for send notification function to resolve + await sleep(2 * seconds) + + // assert + expect(notificationAlert?.[probe.requests[0].url]?.body?.url).eq( + 'https://example.com' + ) + expect(notificationAlert?.[probe.requests[0].url]?.body.alert).eq('') + }).timeout(10_000) + + it('should send incident notification when assertion fails', async () => { + // arrange + server.use( + rest.get('https://example.com', (_, res, ctx) => { + urlRequestTotal += 1 + return res(ctx.status(404)) + }) + ) + const probe = { + ...probes[0], + id: '2md9o', + alerts: [ + { + id: 'P7-fN', + assertion: 'response.status != 200', + message: 'The assertion failed.', + }, + ], + } + initializeProbeStates([probe]) + // wait until the interval passed + const seconds = 1000 + await sleep(seconds) + + // act + await doProbe({ + probe, + notifications: [ + { + id: 'jFQBd', + data: { url: 'https://example.com/webhook' }, + type: 'webhook', + }, + ], + }) + // wait for random timeout + await sleep(3 * seconds) + // wait for send notification function to resolve + await sleep(2 * seconds) + + // assert + expect(notificationAlert?.[probe?.requests?.[0]?.url || 0]?.body?.url).eq( + 'https://example.com' + ) + expect(notificationAlert?.[probe?.requests?.[0]?.url || 0]?.body?.alert).eq( + 'response.status != 200' + ) + + // restore + server.resetHandlers() + }).timeout(10_000) + + it('should send recovery notification', async () => { + // arrange + server.use( + rest.get('https://example.com', (_, res, ctx) => { + urlRequestTotal += 1 + return res(ctx.status(404)) + }) + ) + const probe = { + ...probes[0], + id: 'fj43l', + incidentThreshold: 1, + requests: [{ url: 'https://example.com', body: '', timeout: 30 }], + alerts: [ + { id: 'jFQBd', assertion: 'response.status != 200', message: '' }, + ], + } + const notifications = [ + { + id: 'jFQBd', + data: { url: 'https://example.com/webhook' }, + type: 'webhook', + }, + ] + initializeProbeStates([probe]) + // wait until the 
interval passed + const seconds = 1000 + await sleep(seconds) + + // act + await doProbe({ + probe, + notifications, + }) + // wait for random timeout + await sleep(3 * seconds) + server.resetHandlers() + // wait for the send notification function to resolve + await sleep(3 * seconds) + + // assert + expect(notificationAlert?.[probe.requests[0].url]?.body?.url).eq( + 'https://example.com' + ) + expect(notificationAlert?.[probe.requests[0].url]?.body?.alert).eq( + 'response.status != 200' + ) + }).timeout(10_000) +}) + +function sleep(durationMs: number): Promise { + return new Promise((resolve) => { + setTimeout(resolve, durationMs) + }) +} diff --git a/src/components/probe/prober/http/index.ts b/src/components/probe/prober/http/index.ts index 6225dc25e..0ffa8551b 100644 --- a/src/components/probe/prober/http/index.ts +++ b/src/components/probe/prober/http/index.ts @@ -1,3 +1,4 @@ +/* eslint-disable complexity */ /********************************************************************************** * MIT License * * * @@ -26,7 +27,7 @@ import { getContext } from '../../../../context' import events from '../../../../events' import { getEventEmitter } from '../../../../utils/events' import { httpRequest } from './request' -import { BaseProber, DEFAULT_INCIDENT_THRESHOLD, NotificationType } from '..' +import { BaseProber, NotificationType } from '..' import { type ProbeRequestResponse, probeRequestResult, @@ -50,9 +51,6 @@ export class HTTPProber extends BaseProber { const requests = this.probeConfig.requests! 
// sending multiple http requests for request chaining const responses: ProbeRequestResponse[] = [] - const isIncidentThresholdMet = - incidentRetryAttempt === - (this.probeConfig.incidentThreshold || DEFAULT_INCIDENT_THRESHOLD) - 1 for (const requestConfig of requests) { responses.push( @@ -69,37 +67,35 @@ export class HTTPProber extends BaseProber { ) if (hasFailedRequest) { if (this.hasIncident()) { + // this probe is currently 'incident' state so no need to continue this.logMessage( false, getErrorMessage(hasFailedRequest.errMessage || 'Unknown error.') ) - throw new Error('There is an ongoing incident.') + return } - if (!isIncidentThresholdMet) { - this.logMessage( - false, - `Probe request failed. Attempt (${ - incidentRetryAttempt + 1 - }) with incident threshold (${this.probeConfig.incidentThreshold}).` - ) - throw new Error( - 'Probe request is failed but incident threshold is not met.' - ) - } + // if the incident threshold is not yet met, this will throw and return the execution to `retry` function in src/components/probe/index.ts + this.throwIncidentIfNeeded( + incidentRetryAttempt, + this.probeConfig.incidentThreshold + ) + // the threshold has been met, so let's log the message this.logMessage( false, getErrorMessage(hasFailedRequest.errMessage || 'Unknown error.'), getNotificationMessage({ isIncident: true }) ) + // this probe is definitely in incident state, so send notification, etc. 
this.handleFailedProbe( responses.map((requestResponse) => ({ requestResponse })) ) - throw new Error('Probe request is failed.') + return } + // from here on, the probe can be accessed but might still trigger the assertion for (const requestIndex of responses.keys()) { const response = responses[requestIndex] const validatedResponse = this.validateResponse( @@ -114,35 +110,31 @@ export class HTTPProber extends BaseProber { const { alert } = triggeredAlertResponse if (this.hasIncident()) { + // this probe is still in incident state this.logMessage(false, getAssertionMessage(alert.assertion)) - throw new Error(alert.message) + return } - if (!isIncidentThresholdMet) { - this.logMessage( - false, - `Probe assertion failed. Attempt (${ - incidentRetryAttempt + 1 - }) with incident threshold (${this.probeConfig.incidentThreshold}).` - ) - throw new Error( - 'Probe assertion is failed but incident threshold is not met.' - ) - } + // if the incident threshold is not yet met, this will throw and return the execution to `retry` function in src/components/probe/index.ts + this.throwIncidentIfNeeded( + incidentRetryAttempt, + this.probeConfig.incidentThreshold, + 'Probe assertion failed' + ) + // this probe is definitely in incident state because of fail assertion, so send notification, etc. 
this.handleAssertionFailed(response, requestIndex, alert) - throw new Error(alert.message) + return } } - const isRecovery = this.hasIncident() - if (isRecovery) { - this.logMessage(true, getNotificationMessage({ isIncident: false })) - this.handleRecovery( - responses.map((requestResponse) => ({ requestResponse })) - ) - } + // from here on, the probe is definitely healthy, but if it was incident, we don't want to immediately send notification + this.sendRecoveryNotificationIfNeeded( + incidentRetryAttempt, + responses.map((requestResponse) => ({ requestResponse })) + ) + // the probe is healthy and not recovery for (const requestIndex of responses.keys()) { const response = responses[requestIndex] getEventEmitter().emit(events.probe.response.received, { diff --git a/src/components/probe/prober/index.ts b/src/components/probe/prober/index.ts index 3fe8e8b86..18fad03e3 100644 --- a/src/components/probe/prober/index.ts +++ b/src/components/probe/prober/index.ts @@ -42,6 +42,10 @@ import { stopDowntimeCounter, } from '../../downtime-counter' import { FAILED_REQUEST_ASSERTION } from '../../../looper' +import { + DEFAULT_INCIDENT_THRESHOLD, + DEFAULT_RECOVERY_THRESHOLD, +} from '../../config/validation/validator/default-values' export type ProbeResult = { isAlertTriggered: boolean @@ -76,8 +80,6 @@ enum ProbeState { Down = 'DOWN', } -export const DEFAULT_INCIDENT_THRESHOLD = 5 - export class BaseProber implements Prober { protected readonly counter: number protected readonly notifications: Notification[] @@ -112,32 +114,25 @@ export class BaseProber implements Prober { ) ) { if (this.hasIncident()) { - throw new Error('There is an ongoing incident.') + // this probe is still in incident state + return } - const isIncidentThresholdMet = - incidentRetryAttempt === - (this.probeConfig.incidentThreshold || DEFAULT_INCIDENT_THRESHOLD) - 1 - if (!isIncidentThresholdMet) { - this.logMessage( - false, - `Probe request failed. 
Attempt (${ - incidentRetryAttempt + 1 - }) with incident threshold (${this.probeConfig.incidentThreshold}).` - ) - throw new Error( - 'Probe request is failed but incident threshold is not met.' - ) - } + // if the incident threshold is not yet met, this will throw and return the execution to `retry` function in src/components/probe/index.ts + this.throwIncidentIfNeeded( + incidentRetryAttempt, + this.probeConfig.incidentThreshold + ) + // this probe is definitely in incident state because of fail assertion, so send notification, etc. this.handleFailedProbe(probeResults) - throw new Error('Probe request is failed.') + return } - if (this.hasIncident()) { - this.handleRecovery(probeResults) - } + // from here on, the probe is definitely healthy, but if it was incident, we don't want to immediately send notification + this.sendRecoveryNotificationIfNeeded(incidentRetryAttempt, probeResults) + // the probe is healthy and not recovery for (const index of probeResults.keys()) { const { requestResponse } = probeResults[index] getEventEmitter().emit(events.probe.response.received, { @@ -200,6 +195,69 @@ export class BaseProber implements Prober { ) } + /** + * If the probe is healthy and previously was not (so it's a recovery), this function will call the function to send the recovery notification only when the number of retry attempts equals the recovery threshold - 1. + * Otherwise, it will throw and return the execution to the retry function in src/components/probe/index.ts. + * If the probe is healthy just like before, this function does nothing. 
+ * @param incidentRetryAttempt The number of retry attempts + * @param probeResults The probe results + * @returns void + */ + protected sendRecoveryNotificationIfNeeded( + incidentRetryAttempt: number, + probeResults: Pick[] + ) { + const isRecoveryThresholdMet = + incidentRetryAttempt === + (this.probeConfig.recoveryThreshold || DEFAULT_RECOVERY_THRESHOLD) - 1 + const isRecovery = this.hasIncident() + if (isRecovery) { + if (!isRecoveryThresholdMet) { + this.logMessage( + false, + `Probing succeeds but previously incident. Will retry. Attempt (${ + incidentRetryAttempt + 1 + }) with recovery threshold (${this.probeConfig.recoveryThreshold}).` + ) + // throw here so that the retry function in src/components/probe/index.ts can retry again + throw new Error('Probing succeeds but recovery threshold is not met.') + } + + // at this state, the probe has definitely recovered, so send notifications, etc. + this.handleRecovery(probeResults) + } + } + + /** + * If the number of attempts is not yet equal to incidentThreshold - 1 (i.e. the incident threshold is not met), this function logs the attempt and throws, which returns execution to the retry function in src/components/probe/index.ts. + * Otherwise (the threshold is met), it does nothing. + * @param incidentRetryAttempt How many times Monika has retried probing + * @param incidentThreshold The incident threshold of the probe + * @param message Message to display to stdout + * @throws Error when the incident threshold is not yet met + * @returns void + */ + protected throwIncidentIfNeeded( + incidentRetryAttempt: number, + incidentThreshold: number = DEFAULT_INCIDENT_THRESHOLD, + message: string = 'Probing failed' + ) { + const isIncidentThresholdMet = + incidentRetryAttempt === incidentThreshold - 1 + + if (!isIncidentThresholdMet) { + this.logMessage( + false, + `${message}. Will try again. 
Attempt (${ + incidentRetryAttempt + 1 + }) with incident threshold (${incidentThreshold}).` + ) + + // throw here so that the retry function in src/components/probe/index.ts can retry again + throw new Error(`${message} but incident threshold is not met.`) + } + } + protected async sendNotification({ requestURL, notificationType, diff --git a/src/flag.ts b/src/flag.ts index 40120a45f..c39185804 100644 --- a/src/flag.ts +++ b/src/flag.ts @@ -81,7 +81,7 @@ export const monikaFlagsDefaultValue: MonikaFlags = { logs: false, 'one-probe': false, repeat: 0, - retryInitialDelayMs: 128, + retryInitialDelayMs: 2000, retryMaxDelayMs: 30_000, // default is 20s interval lookup stun: 20, @@ -98,12 +98,11 @@ export const symonAPIVersion = Flags.custom({ }) export const retryInitialDelayMs = Flags.integer({ - default: 128, - description: - 'The initial, first delay of the backoff retry when probe request is failed, in milliseconds. Defaults to 128ms.', + default: monikaFlagsDefaultValue.retryInitialDelayMs, + description: `The initial, first delay of the backoff retry when probe request is failed, in milliseconds. Defaults to ${monikaFlagsDefaultValue.retryInitialDelayMs}ms`, }) export const retryMaxDelayMs = Flags.integer({ - default: 30_000, - description: 'Maximum backoff retry delay, in milliseconds. Defaults to 30s.', + default: monikaFlagsDefaultValue.retryMaxDelayMs, + description: `Maximum backoff retry delay, in milliseconds. 
Defaults to ${monikaFlagsDefaultValue.retryMaxDelayMs}ms.`, }) diff --git a/src/looper/index.ts b/src/looper/index.ts index c37ef24cd..ad9d88473 100644 --- a/src/looper/index.ts +++ b/src/looper/index.ts @@ -38,13 +38,18 @@ import { initializeProbeStates, } from '../utils/probe-state' import { getPublicIp, isConnectedToSTUNServer } from '../utils/public-ip' +import { + DEFAULT_INCIDENT_THRESHOLD, + DEFAULT_RECOVERY_THRESHOLD, +} from '../components/config/validation/validator/default-values' let checkSTUNinterval: NodeJS.Timeout const DISABLE_STUN = -1 // -1 is disable stun checking export function sanitizeProbe(isSymonMode: boolean, probe: Probe): Probe { - const { id, name, requests, incidentThreshold, alerts } = probe + const { id, name, requests, incidentThreshold, recoveryThreshold, alerts } = + probe if (!name) { log.warn( @@ -52,12 +57,6 @@ export function sanitizeProbe(isSymonMode: boolean, probe: Probe): Probe { ) } - if (!incidentThreshold) { - log.warn( - `Warning: Probe ${id} has no incidentThreshold configuration defined. Using the default threshold: 5` - ) - } - const isHTTPProbe = Boolean(requests) const isAlertsEmpty = alerts === undefined || alerts.length === 0 if (!isSymonMode && isHTTPProbe && isAlertsEmpty) { @@ -68,6 +67,8 @@ export function sanitizeProbe(isSymonMode: boolean, probe: Probe): Probe { return { ...probe, + incidentThreshold: incidentThreshold || DEFAULT_INCIDENT_THRESHOLD, + recoveryThreshold: recoveryThreshold || DEFAULT_RECOVERY_THRESHOLD, alerts: isSymonMode ? 
[] : addFailedRequestAssertions(alerts), } } diff --git a/test/others/fakes.test.ts b/test/others/fakes.test.ts index 126dc1bce..96ba258d7 100644 --- a/test/others/fakes.test.ts +++ b/test/others/fakes.test.ts @@ -144,7 +144,12 @@ describe('Fake data', () => { it('should returns a timestamp', () => { const case1 = Handlebars.compile('{{ timestamp }}') - expect(case1({})).to.be.equals(Date.now().toString()) + // sometimes the rendered timestamp and current date differs by 1 ms which causes the test to fail intermittently. + // so it's better to check the diff with lessThanOrEqual. + const now = Date.now() + const rendered = Number.parseInt(case1({}), 10) + const diff = Math.abs(now - rendered) + expect(diff).to.be.lessThanOrEqual(1) }) it('should returns a isodate string', () => { diff --git a/tsconfig.json b/tsconfig.json index 7547928b7..61375d3b4 100644 --- a/tsconfig.json +++ b/tsconfig.json @@ -12,7 +12,8 @@ "sourceMap": true, "baseUrl": ".", "typeRoots": ["node_modules/@types", "custom_types"], - "lib": ["ES2021.String"] + "lib": ["ES2021.String"], + "composite": true }, "include": [ "src/**/*",