From 5141d85a35718363f8412ca207f11bca51137d26 Mon Sep 17 00:00:00 2001 From: Khurram Nizami Date: Thu, 20 Oct 2022 20:58:07 -0500 Subject: [PATCH] Support manual execution without EC2 stop / start. Support custom identifier prefix for alarms. Provide error checking skip platform specific alarms when the platform can't be identified from image. Remove default alarm for CPUCreditBalance. --- CloudWatchAutoAlarms.yaml | 8 +++- README.md | 21 +++++++-- src/actions.py | 93 ++++++++++++++++++++++++++++++--------- src/cw_auto_alarms.py | 32 ++++++++------ 4 files changed, 115 insertions(+), 39 deletions(-) diff --git a/CloudWatchAutoAlarms.yaml b/CloudWatchAutoAlarms.yaml index 0ab5040..1134094 100755 --- a/CloudWatchAutoAlarms.yaml +++ b/CloudWatchAutoAlarms.yaml @@ -22,6 +22,11 @@ Parameters: Description: Enter the Amazon SNS Notification ARN for alarm notifications, leave blank to disable notifications. Type: String Default: "" + AlarmIdentifierPrefix: + Description: Enter the prefix that should be added to the beginning of each alarm created by the solution, (e.g. 
AutoAlarm-i-00e4f327736cb077f-CPUUtilization-GreaterThanThreshold-80-5m) + Type: String + Default: AutoAlarm + Conditions: ConfigureAlarmNotifications: !Not [!Equals ["", !Ref AlarmNotificationARN]] @@ -48,6 +53,7 @@ Resources: ALARM_CPU_CREDIT_BALANCE_LOW_THRESHOLD: 100 ALARM_MEMORY_HIGH_THRESHOLD: 75 ALARM_DISK_PERCENT_LOW_THRESHOLD: 20 + ALARM_IDENTIFIER_PREFIX: !Ref AlarmIdentifierPrefix CLOUDWATCH_APPEND_DIMENSIONS: 'InstanceId, ImageId, InstanceType' ALARM_LAMBDA_ERROR_THRESHOLD: 0 @@ -102,7 +108,7 @@ Resources: - cloudwatch:DescribeAlarms - cloudwatch:DeleteAlarms - cloudwatch:PutMetricAlarm - Resource: !Sub "arn:aws:cloudwatch:${AWS::Region}:${AWS::AccountId}:alarm:AutoAlarm-*" + Resource: !Sub "arn:aws:cloudwatch:${AWS::Region}:${AWS::AccountId}:alarm:${AlarmIdentifierPrefix}-*" - Effect: Allow Action: - cloudwatch:DescribeAlarms diff --git a/README.md b/README.md index 6777f3c..ae49331 100755 --- a/README.md +++ b/README.md @@ -65,7 +65,11 @@ The following list provides a description of the setting along with the environm * You can add EC2 metric dimensions to all metrics collected by the CloudWatch agent. This environment variable aligns to your CloudWatch configuration setting for [**append_dimensions**](https://docs.aws.amazon.com/AmazonCloudWatch/latest/monitoring/CloudWatch-Agent-Configuration-File-Details.html#CloudWatch-Agent-Configuration-File-Metricssection). The default setting includes all the supported dimensions: InstanceId, ImageId, InstanceType, AutoScalingGroupName * **DEFAULT_ALARM_SNS_TOPIC_ARN**: arn:aws:sns:${AWS::Region}:${AWS::AccountId}:CloudWatchAutoAlarmsSNSTopic * You can define an Amazon Simple Notification Service (Amazon SNS) topic that the Lambda function will specify as the notification target for created alarms. You provide the Amazon SNS Topic Amazon Resource Name (ARN) with the **AlarmNotificationARN** parameter when you deploy the CloudWatchAutoAlarms.yaml CloudFormation template.  
If you leave the **AlarmNotificationARN** parameter value blank, then this environment variable is not set and created alarms won't use notifications. -* You can update the thresholds for the default alarms by updating the following environment variables: +* **ALARM_IDENTIFIER_PREFIX**: AutoAlarm + * The prefix that is added to the beginning of the name of each CloudWatch alarm created by the solution (e.g. with the prefix "AutoAlarm": AutoAlarm-i-00e4f327736cb077f-CPUUtilization-GreaterThanThreshold-80-5m). You should update this variable via the **AlarmIdentifierPrefix** parameter in the [CloudWatchAutoAlarms.yaml](./CloudWatchAutoAlarms.yaml) CloudFormation template so that the IAM policy is updated to align with your custom name. + +You can update the thresholds for the default alarms by updating the following environment variables: + **For Amazon EC2**: * **ALARM_CPU_HIGH_THRESHOLD**: 75 @@ -85,9 +89,9 @@ The following list provides a description of the setting along with the environm 2. Configure the AWS CLI with credentials for your AWS account. This walkthrough uses temporary credentials provided by AWS Single Sign On using the **Command line or programmatic access** option. This sets the **AWS_ACCESS_KEY_ID**, **AWS_SECRET_ACCESS_KEY**, and **AWS_SESSION_TOKEN** AWS environment variables with the appropriate credentials for use with the AWS CLI. 3. Create an Amazon SNS topic that CloudWatchAutoAlarms will use for notifications. You can use this sample Amazon SNS CloudFormation template to create an SNS topic.  Leave the OrganizationID parameter blank, it is used for multi-account deployments. 
- aws cloudformation create-stack --stack-name amazon-cloudwatch-auto-alarms-sns-topic \ - --template-body file://CloudWatchAutoAlarms-SNS.yaml \ - --parameters ParameterKey=OrganizationID,ParameterValue="" \ + aws cloudformation create-stack --stack-name amazon-cloudwatch-auto-alarms-sns-topic \ + --template-body file://CloudWatchAutoAlarms-SNS.yaml \ + --parameters ParameterKey=OrganizationID,ParameterValue="" \ --region 4. Create an S3 bucket that will be used to store and access the CloudWatchAutoAlarms lambda function deployment package if you don't have one. You can use [this sample S3 CloudFormation template](./CloudWatchAutoAlarms-S3.yaml). You can leave the AWS Organizations ID parameter blank if this lambda function will only be deployed in your current account: @@ -134,6 +138,15 @@ In order to create the default alarm set for an Amazon EC2 instance or AWS Lambd For Amazon EC2 instances, you must add this tag during instance launch or you can add this tag at any time to an instance and then stop and start the instance in order to create the default alarm set as well as any custom, instance specific alarms. +You can also manually invoke the CloudWatchAutoAlarms lambda function with the following event payload to create / update EC2 alarms without having to stop and start your EC2 instances: + +```json +{ + "manual_update": "aws.ec2" +} +``` +You can do this with a test execution of the CloudWatchAutoAlarms AWS Lambda function. Open the AWS Lambda Management Console and perform a test invocation from the **Test** tab with the payload provided here. + For AWS Lambda, you can add this tag to an AWS Lambda function at any time in order to create the default alarm set as well as any custom, function specific alarms. 
diff --git a/src/actions.py b/src/actions.py index 3b0e408..7401cfa 100755 --- a/src/actions.py +++ b/src/actions.py @@ -35,6 +35,46 @@ def boto3_client(resource, assumed_credentials=None): return client +def retrieve_ec2_instances(tag_key): + try: + ec2_client = boto3_client('ec2') + paginator = ec2_client.get_paginator('describe_instances') + response_iterator = paginator.paginate( + Filters=[ + { + 'Name': 'tag-key', + 'Values': [ + tag_key + ] + } + ], + ) + instance_list = [] + for i in response_iterator: + if 'Reservations' in i and len(i['Reservations']) > 0: + for reservation in i['Reservations']: + instance_list.extend(reservation['Instances']) + instance_ids = [instance['InstanceId'] for instance in reservation['Instances']] + logger.debug("Instance IDs matching alarm tag: {}".format(instance_ids)) + # can handle up to 1K resource ids... + # https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/ec2.html#EC2.ServiceResource.create_tags + ec2_client.create_tags( + Resources=instance_ids, + Tags=[ + { + 'Key': tag_key, + 'Value': str(datetime.utcnow()) + } + ] + ) + return instance_list + except Exception as e: + # If any other exceptions which we didn't expect are raised + # then fail and log the exception message. 
+ logger.error('Failure describing instances with tag key: {} : {}'.format(tag_key, e)) + raise + + def check_alarm_tag(instance_id, tag_key): try: ec2_client = boto3_client('ec2') @@ -78,7 +118,8 @@ def check_alarm_tag(instance_id, tag_key): raise -def process_lambda_alarms(function_name, tags, activation_tag, default_alarms, sns_topic_arn, alarm_separator): +def process_lambda_alarms(function_name, tags, activation_tag, default_alarms, sns_topic_arn, alarm_separator, + alarm_identifier): activation_tag = tags.get(activation_tag, 'not_found') if activation_tag == 'not_found': logger.debug('Activation tag not found for {}, nothing to do'.format(function_name)) @@ -86,7 +127,7 @@ def process_lambda_alarms(function_name, tags, activation_tag, default_alarms, s else: logger.debug('Processing function specific alarms for: {}'.format(default_alarms)) for tag_key in tags: - if tag_key.startswith('AutoAlarm'): + if tag_key.startswith(alarm_identifier): default_alarms['AWS/Lambda'].append({'Key': tag_key, 'Value': tags[tag_key]}) # get the default dimensions for AWS/EC2 @@ -106,14 +147,16 @@ def process_lambda_alarms(function_name, tags, activation_tag, default_alarms, s Period = alarm_properties[4] Statistic = alarm_properties[5] - AlarmName = 'AutoAlarm-{}-{}-{}-{}-{}-{}'.format(function_name, Namespace, MetricName, ComparisonOperator, - Period, - Statistic) + AlarmName = '{}-{}-{}-{}-{}-{}-{}'.format(alarm_identifier, function_name, Namespace, MetricName, + ComparisonOperator, + Period, + Statistic) create_alarm(AlarmName, MetricName, ComparisonOperator, Period, tag['Value'], Statistic, Namespace, dimensions, sns_topic_arn) -def create_alarm_from_tag(id, alarm_tag, instance_info, metric_dimensions_map, sns_topic_arn, alarm_separator): +def create_alarm_from_tag(id, alarm_tag, instance_info, metric_dimensions_map, sns_topic_arn, alarm_separator, + alarm_identifier): alarm_properties = alarm_tag['Key'].split(alarm_separator) namespace = alarm_properties[1] MetricName = 
alarm_properties[2] @@ -158,7 +201,7 @@ def create_alarm_from_tag(id, alarm_tag, instance_info, metric_dimensions_map, s logger.error('Unable to determine the dimensions for alarm tag: {}'.format(alarm_tag)) raise Exception - AlarmName = 'AutoAlarm-{}-{}-{}'.format(id, namespace, MetricName) + AlarmName = '{}-{}-{}-{}'.format(alarm_identifier, id, namespace, MetricName) properties_offset = 0 if additional_dimensions: for num, dim in enumerate(additional_dimensions[::2]): @@ -182,10 +225,10 @@ def create_alarm_from_tag(id, alarm_tag, instance_info, metric_dimensions_map, s dimensions, sns_topic_arn) -def process_alarm_tags(instance_id, instance_info, default_alarms, metric_dimensions_map, sns_topic_arn, cw_namespace, - create_default_alarms_flag, alarm_separator): +def process_alarm_tags(instance_info, default_alarms, metric_dimensions_map, sns_topic_arn, cw_namespace, + create_default_alarms_flag, alarm_separator, alarm_identifier): tags = instance_info['Tags'] - + instance_id = instance_info['InstanceId'] ImageId = instance_info['ImageId'] logger.info('ImageId is: {}'.format(ImageId)) platform = determine_platform(ImageId) @@ -194,15 +237,21 @@ def process_alarm_tags(instance_id, instance_info, default_alarms, metric_dimens custom_alarms = dict() # get all alarm tags from instance and add them into a custom tag list for instance_tag in tags: - if instance_tag['Key'].startswith('AutoAlarm'): - create_alarm_from_tag(instance_id, instance_tag, instance_info, metric_dimensions_map, sns_topic_arn, alarm_separator) + if instance_tag['Key'].startswith(alarm_identifier): + create_alarm_from_tag(instance_id, instance_tag, instance_info, metric_dimensions_map, sns_topic_arn, + alarm_separator, alarm_identifier) if create_default_alarms_flag == 'true': for alarm_tag in default_alarms['AWS/EC2']: - create_alarm_from_tag(instance_id, alarm_tag, instance_info, metric_dimensions_map, sns_topic_arn, alarm_separator) - - for alarm_tag in default_alarms[cw_namespace][platform]: - 
create_alarm_from_tag(instance_id, alarm_tag, instance_info, metric_dimensions_map, sns_topic_arn, alarm_separator) + create_alarm_from_tag(instance_id, alarm_tag, instance_info, metric_dimensions_map, sns_topic_arn, + alarm_separator, alarm_identifier) + # unable to determine platform, don't create platform specific alarms... + if not platform: + logger.error("unable to determine platform, no platform specific alarms created.") + else: + for alarm_tag in default_alarms[cw_namespace][platform]: + create_alarm_from_tag(instance_id, alarm_tag, instance_info, metric_dimensions_map, sns_topic_arn, + alarm_separator, alarm_identifier) else: logger.info("Default alarm creation is turned off") @@ -229,8 +278,10 @@ def determine_platform(imageid): elif 'SUSE' in platform_details: return 'SUSE' elif 'Linux/UNIX' in platform_details: - if 'ubuntu' in image_info['Images'][0]['Description'].lower() or 'ubuntu' in image_info['Images'][0][ - 'Name'].lower(): + description = image_info['Images'][0]['Description'].lower() + name = image_info['Images'][0]['Name'].lower() + logger.debug("Linux image name is: {} with description: {}".format(name, description)) + if 'ubuntu' in description or 'ubuntu' in name: return 'Ubuntu' else: return 'Amazon Linux' @@ -257,7 +308,7 @@ def convert_to_seconds(s): raise -# Alarm Name Format: AutoAlarm------ +# Alarm Name Format: ------ # Example: AutoAlarm-i-00e4f327736cb077f-CPUUtilization-GreaterThanThreshold-80-5m def create_alarm(AlarmName, MetricName, ComparisonOperator, Period, Threshold, Statistic, Namespace, Dimensions, sns_topic_arn): @@ -302,9 +353,9 @@ def create_alarm(AlarmName, MetricName, ComparisonOperator, Period, Threshold, S 'Error creating alarm {}!: {}'.format(AlarmName, e)) -def delete_alarms(name): +def delete_alarms(name, alarm_identifier): try: - AlarmNamePrefix = "AutoAlarm-{}".format(name) + AlarmNamePrefix = "{}-{}".format(alarm_identifier, name) cw_client = boto3_client('cloudwatch') logger.info('calling describe alarms 
with prefix {}'.format(AlarmNamePrefix)) response = cw_client.describe_alarms( diff --git a/src/cw_auto_alarms.py b/src/cw_auto_alarms.py index ba54808..68a87d6 100755 --- a/src/cw_auto_alarms.py +++ b/src/cw_auto_alarms.py @@ -1,8 +1,11 @@ import logging -from actions import check_alarm_tag, process_alarm_tags, delete_alarms, process_lambda_alarms +from actions import check_alarm_tag, process_alarm_tags, delete_alarms, process_lambda_alarms, retrieve_ec2_instances from os import getenv logger = logging.getLogger() +log_level = getenv("LOGLEVEL", "INFO") +level = logging.getLevelName(log_level) +logger.setLevel(level) create_alarm_tag = getenv("ALARM_TAG", "Create_Auto_Alarms") @@ -14,7 +17,6 @@ append_dimensions = [dimension.strip() for dimension in append_dimensions.split(',')] alarm_cpu_high_default_threshold = getenv("ALARM_CPU_HIGH_THRESHOLD", "75") -alarm_credit_balance_low_default_threshold = getenv("ALARM_CPU_CREDIT_BALANCE_LOW_THRESHOLD", "100") alarm_memory_high_default_threshold = getenv("ALARM_MEMORY_HIGH_THRESHOLD", "75") alarm_disk_space_percent_free_threshold = getenv("ALARM_DISK_PERCENT_LOW_THRESHOLD", "20") alarm_disk_used_percent_threshold = 100 - int(alarm_disk_space_percent_free_threshold) @@ -27,7 +29,7 @@ sns_topic_arn = getenv("DEFAULT_ALARM_SNS_TOPIC_ARN", None) alarm_separator = '-' -alarm_identifier = 'AutoAlarm' +alarm_identifier = getenv("ALARM_IDENTIFIER_PREFIX", 'AutoAlarm') # For Redhat, the default device is xvda2, xfs, for Ubuntu, the default fstype is ext4, # for Amazon Linux, the default device is xvda1, xfs default_alarms = { @@ -36,11 +38,6 @@ 'Key': alarm_separator.join( [alarm_identifier, 'AWS/EC2', 'CPUUtilization', 'GreaterThanThreshold', '5m', 'Average']), 'Value': alarm_cpu_high_default_threshold - }, - { - 'Key': alarm_separator.join( - [alarm_identifier, 'AWS/EC2', 'CPUCreditBalance', 'LessThanThreshold', '5m', 'Average']), - 'Value': alarm_credit_balance_low_default_threshold } ], 'AWS/Lambda': [ @@ -142,23 +139,32 @@ 
def lambda_handler(event, context): # instance has been tagged for alarming, confirm an alarm doesn't already exist if instance_info: - process_alarm_tags(instance_id, instance_info, default_alarms, metric_dimensions_map, sns_topic_arn, - cw_namespace, create_default_alarms_flag, alarm_separator) + process_alarm_tags(instance_info, default_alarms, metric_dimensions_map, sns_topic_arn, + cw_namespace, create_default_alarms_flag, alarm_separator, alarm_identifier) + elif 'manual_update' in event and event['manual_update'] == 'aws.ec2': + logger.debug("manual invocation started") + instances = retrieve_ec2_instances(create_alarm_tag) + logger.debug("Instance Info with tag {} are: {}".format(create_alarm_tag, instances)) + + for instance_info in instances: + process_alarm_tags(instance_info, default_alarms, metric_dimensions_map, sns_topic_arn, + cw_namespace, create_default_alarms_flag, alarm_separator, alarm_identifier) elif 'source' in event and event['source'] == 'aws.ec2' and event['detail']['state'] == 'terminated': instance_id = event['detail']['instance-id'] - result = delete_alarms(instance_id) + result = delete_alarms(instance_id, alarm_identifier) elif 'source' in event and event['source'] == 'aws.lambda' and event['detail'][ 'eventName'] == 'TagResource20170331v2': logger.debug( 'Tag Lambda Function event occurred, tags are: {}'.format(event['detail']['requestParameters']['tags'])) tags = event['detail']['requestParameters']['tags'] function = event['detail']['requestParameters']['resource'].split(":")[-1] - process_lambda_alarms(function, tags, create_alarm_tag, default_alarms, sns_topic_arn, alarm_separator) + process_lambda_alarms(function, tags, create_alarm_tag, default_alarms, sns_topic_arn, alarm_separator, + alarm_identifier) elif 'source' in event and event['source'] == 'aws.lambda' and event['detail'][ 'eventName'] == 'DeleteFunction20150331': function = event['detail']['requestParameters']['functionName'] logger.debug('Delete Lambda Function 
event occurred for: {}'.format(function)) - result = delete_alarms(function) + result = delete_alarms(function, alarm_identifier) except Exception as e: # If any other exceptions which we didn't expect are raised