From 4e053a49a8cb23db75ba78891baadaec9c2302b2 Mon Sep 17 00:00:00 2001
From: Tom Wey
Date: Thu, 30 Jan 2025 12:26:53 +0000
Subject: [PATCH] Add the concept of a higher threshold alarm metric

For example, we don't care about every instance of "email already taken" from Identity, but we want to know if there are suddenly lots of them.
---
 .../app/actions/CustomActionBuilders.scala    | 29 +++++++++++++++----
 .../com/gu/aws/AwsCloudWatchMetricSetup.scala |  8 +++++
 2 files changed, 31 insertions(+), 6 deletions(-)

diff --git a/support-frontend/app/actions/CustomActionBuilders.scala b/support-frontend/app/actions/CustomActionBuilders.scala
index 8cdd023345..d8fad3b071 100644
--- a/support-frontend/app/actions/CustomActionBuilders.scala
+++ b/support-frontend/app/actions/CustomActionBuilders.scala
@@ -62,28 +62,45 @@ class CustomActionBuilders(
 
   case class LoggingAndAlarmOnFailure[A](chainedAction: Action[A]) extends EssentialAction with SafeLogging {
 
+    private def pushMetric(cloudwatchEvent: AwsCloudWatchMetricPut.MetricRequest) = {
+      AwsCloudWatchMetricPut(AwsCloudWatchMetricPut.client)(cloudwatchEvent)
+    }
     private def pushAlarmMetric = {
       val cloudwatchEvent = AwsCloudWatchMetricSetup.serverSideCreateFailure(stage)
-      AwsCloudWatchMetricPut(AwsCloudWatchMetricPut.client)(cloudwatchEvent)
+      pushMetric(cloudwatchEvent)
+    }
+
+    private def pushHighThresholdAlarmMetric = {
+      val cloudwatchEvent = AwsCloudWatchMetricSetup.serverSideHighThresholdCreateFailure(stage)
+      pushMetric(cloudwatchEvent)
     }
 
     private def maybePushAlarmMetric(result: Result) = {
+      // We'll never alarm on these
       val ignoreList = Set(
         emailProviderRejectedCode,
         invalidEmailAddressCode,
         recaptchaFailedCode,
+      )
+      // We'll alarm on these, but only over a certain threshold
+      val highThresholdList = Set(
         emailAddressAlreadyTakenCode,
       )
       if (result.header.status == 500) {
-        if (!ignoreList.contains(result.header.reasonPhrase.getOrElse(""))) {
+        if (ignoreList.contains(result.header.reasonPhrase.getOrElse(""))) {
+          logger.info(
+            s"not pushing alarm metric for ${result.header.status} ${result.header.reasonPhrase} as it is in our ignore list",
+          )
+        } else if (highThresholdList.contains(result.header.reasonPhrase.getOrElse(""))) {
+          logger.info(
+            s"pushing higher threshold alarm metric for ${result.header.status} ${result.header.reasonPhrase}",
+          )
+          pushHighThresholdAlarmMetric
+        } else {
           logger.error(
             scrub"pushing alarm metric - non 2xx response. Http code: ${result.header.status}, reason: ${result.header.reasonPhrase}",
           )
           pushAlarmMetric
-        } else {
-          logger.info(
-            s"not pushing alarm metric for ${result.header.status} ${result.header.reasonPhrase} as it is in our ignore list",
-          )
         }
       }
     }
diff --git a/support-services/src/main/scala/com/gu/aws/AwsCloudWatchMetricSetup.scala b/support-services/src/main/scala/com/gu/aws/AwsCloudWatchMetricSetup.scala
index e0a40c80c9..eea49006a1 100644
--- a/support-services/src/main/scala/com/gu/aws/AwsCloudWatchMetricSetup.scala
+++ b/support-services/src/main/scala/com/gu/aws/AwsCloudWatchMetricSetup.scala
@@ -47,6 +47,14 @@ object AwsCloudWatchMetricSetup {
       ),
     )
 
+  def serverSideHighThresholdCreateFailure(stage: Stage): MetricRequest =
+    getMetricRequest(
+      MetricName("ServerSideHighThresholdCreateFailure"),
+      Map(
+        MetricDimensionName("Stage") -> MetricDimensionValue(stage.toString),
+      ),
+    )
+
   def defaultPromotionsLoadingFailure(stage: Stage): MetricRequest =
     getMetricRequest(
       MetricName("DefaultPromotionsLoadingFailure"),