Skip to content

Commit

Permalink
Merge pull request #583 from airbnb/jacknaglieri-firehose-req-id-and-…
Browse files Browse the repository at this point in the history
…configurable-monitoring

Add Firehose Request Id and Support Custom CW Alarm Params
  • Loading branch information
jacknagz authored Jan 25, 2018
2 parents c7c690d + e5d78e9 commit 538e92a
Show file tree
Hide file tree
Showing 8 changed files with 154 additions and 24 deletions.
5 changes: 3 additions & 2 deletions stream_alert/rule_processor/firehose.py
Original file line number Diff line number Diff line change
Expand Up @@ -208,9 +208,10 @@ def firehose_request_wrapper():
MetricLogger.log_metric(FUNCTION_NAME,
MetricLogger.FIREHOSE_RECORDS_SENT,
record_batch_size)
LOGGER.info('[Firehose] Successfully sent %d messages to %s',
LOGGER.info('[Firehose] Successfully sent %d messages to %s with RequestId [%s]',
record_batch_size,
stream_name)
stream_name,
resp.get('ResponseMetadata', {}).get('RequestId', ''))

def firehose_log_name(self, log_name):
"""Convert conventional log names into Firehose delivery stream names
Expand Down
6 changes: 4 additions & 2 deletions stream_alert_cli/terraform/cloudtrail.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,8 +33,10 @@ def generate_cloudtrail(cluster_name, cluster_dict, config):
cloudtrail_module = 'cloudtrail_{}'.format(cluster_name)

enabled_legacy = modules['cloudtrail'].get('enabled')
cloudtrail_enabled = modules['cloudtrail'].get('enable_logging')
kinesis_enabled = modules['cloudtrail'].get('enable_kinesis')

cloudtrail_enabled = modules['cloudtrail'].get('enable_logging', True)
kinesis_enabled = modules['cloudtrail'].get('enable_kinesis', True)

account_ids = list(
set([config['global']['account']['aws_account_id']] + modules['cloudtrail'].get(
'cross_account_ids', [])))
Expand Down
25 changes: 25 additions & 0 deletions stream_alert_cli/terraform/monitoring.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,19 @@
def generate_monitoring(cluster_name, cluster_dict, config):
"""Add the CloudWatch Monitoring module to the Terraform cluster dict.
Example configuration:
"cloudwatch_monitoring": {
"enabled": true,
"kinesis_alarms_enabled": true,
"lambda_alarms_enabled": true,
"settings": {
"lambda_invocation_error_period": "600",
"kinesis_iterator_age_error_period": "600",
"kinesis_write_throughput_exceeded_threshold": "100"
}
}
Args:
cluster_name (str): The name of the currently generating cluster
cluster_dict (defaultdict): The dict containing all Terraform config for a given cluster.
Expand All @@ -36,6 +49,10 @@ def generate_monitoring(cluster_name, cluster_dict, config):
LOGGER_CLI.error('Invalid config: Make sure you declare global infrastructure options!')
return False

if not monitoring_config.get('enabled', False):
LOGGER_CLI.info('CloudWatch Monitoring not enabled, skipping...')
return True

if infrastructure_config['monitoring'].get('create_sns_topic'):
topic_name = 'stream_alert_monitoring'

Expand Down Expand Up @@ -69,4 +86,12 @@ def generate_monitoring(cluster_name, cluster_dict, config):
'kinesis_alarms_enabled': True
})

# Add support for custom settings for tweaking alarm thresholds, eval periods, and periods
# Note: This does not strictly check for proper variable names, since there are so many.
# Instead, Terraform will error out if an improper name is used.
# Also, every value in these settings should be a string, so cast for safety.
for setting_name, setting_value in monitoring_config.get('settings', {}).iteritems():
cluster_dict['module']['cloudwatch_monitoring_{}'.format(
cluster_name)][setting_name] = str(setting_value)

return True
30 changes: 15 additions & 15 deletions terraform/modules/tf_stream_alert_monitoring/main.tf
Original file line number Diff line number Diff line change
Expand Up @@ -9,9 +9,9 @@ resource "aws_cloudwatch_metric_alarm" "streamalert_lambda_invocation_errors" {
metric_name = "Errors"
statistic = "Sum"
comparison_operator = "GreaterThanThreshold"
threshold = "0"
evaluation_periods = "1"
period = "300"
threshold = "${var.lambda_invocation_error_threshold}"
evaluation_periods = "${var.lambda_invocation_error_evaluation_periods}"
period = "${var.lambda_invocation_error_period}"
alarm_description = "StreamAlert Lambda Invocation Errors: ${element(var.lambda_functions, count.index)}"
alarm_actions = ["${var.sns_topic_arn}"]

Expand All @@ -29,9 +29,9 @@ resource "aws_cloudwatch_metric_alarm" "streamalert_lambda_throttles" {
metric_name = "Throttles"
statistic = "Sum"
comparison_operator = "GreaterThanThreshold"
threshold = "0"
evaluation_periods = "1"
period = "300"
threshold = "${var.lambda_throttle_error_threshold}"
evaluation_periods = "${var.lambda_throttle_error_evaluation_periods}"
period = "${var.lambda_throttle_error_period}"
alarm_description = "StreamAlert Lambda Throttles: ${element(var.lambda_functions, count.index)}"
alarm_actions = ["${var.sns_topic_arn}"]

Expand All @@ -49,9 +49,9 @@ resource "aws_cloudwatch_metric_alarm" "streamalert_lambda_iterator_age" {
metric_name = "IteratorAge"
statistic = "Maximum"
comparison_operator = "GreaterThanThreshold"
threshold = "1000000"
evaluation_periods = "1"
period = "300"
threshold = "${var.lambda_iterator_age_error_threshold}"
evaluation_periods = "${var.lambda_iterator_age_error_evaluation_periods}"
period = "${var.lambda_iterator_age_error_period}"
alarm_description = "StreamAlert Lambda High Iterator Age: ${element(var.lambda_functions, count.index)}"
alarm_actions = ["${var.sns_topic_arn}"]

Expand All @@ -69,9 +69,9 @@ resource "aws_cloudwatch_metric_alarm" "streamalert_kinesis_iterator_age" {
metric_name = "GetRecords.IteratorAgeMilliseconds"
statistic = "Maximum"
comparison_operator = "GreaterThanThreshold"
threshold = "1000000"
evaluation_periods = "1"
period = "300"
threshold = "${var.kinesis_iterator_age_error_threshold}"
evaluation_periods = "${var.kinesis_iterator_age_error_evaluation_periods}"
period = "${var.kinesis_iterator_age_error_period}"
alarm_description = "StreamAlert Kinesis High Iterator Age: ${var.kinesis_stream}"
alarm_actions = ["${var.sns_topic_arn}"]

Expand All @@ -88,9 +88,9 @@ resource "aws_cloudwatch_metric_alarm" "streamalert_kinesis_write_exceeded" {
metric_name = "WriteProvisionedThroughputExceeded"
statistic = "Sum"
comparison_operator = "GreaterThanThreshold"
threshold = "10"
evaluation_periods = "6"
period = "300"
threshold = "${var.kinesis_write_throughput_exceeded_threshold}"
evaluation_periods = "${var.kinesis_write_throughput_exceeded_evaluation_periods}"
period = "${var.kinesis_write_throughput_exceeded_period}"
alarm_description = "StreamAlert Kinesis Write Throughput Exceeded: ${var.kinesis_stream}"
alarm_actions = ["${var.sns_topic_arn}"]

Expand Down
66 changes: 66 additions & 0 deletions terraform/modules/tf_stream_alert_monitoring/variables.tf
Original file line number Diff line number Diff line change
Expand Up @@ -19,3 +19,69 @@ variable "lambda_functions" {
type = "list"
default = []
}

// Lambda Invocation Error Alarm Settings

variable "lambda_invocation_error_threshold" {
default = "0"
}

variable "lambda_invocation_error_evaluation_periods" {
default = "1"
}

variable "lambda_invocation_error_period" {
default = "300"
}

// Lambda Throttling Alarm Settings
variable "lambda_throttle_error_threshold" {
default = "0"
}

variable "lambda_throttle_error_evaluation_periods" {
default = "1"
}

variable "lambda_throttle_error_period" {
default = "300"
}

// Lambda Iterator Age Alarm Settings
variable "lambda_iterator_age_error_threshold" {
default = "1000000"
}

variable "lambda_iterator_age_error_evaluation_periods" {
default = "1"
}

variable "lambda_iterator_age_error_period" {
default = "300"
}

// Kinesis Iterator Age Alarm Settings
variable "kinesis_iterator_age_error_threshold" {
default = "1000000"
}

variable "kinesis_iterator_age_error_evaluation_periods" {
default = "1"
}

variable "kinesis_iterator_age_error_period" {
default = "300"
}

// Kinesis Write Throughput Alarm Settings
variable "kinesis_write_throughput_exceeded_threshold" {
default = "10"
}

variable "kinesis_write_throughput_exceeded_evaluation_periods" {
default = "6"
}

variable "kinesis_write_throughput_exceeded_period" {
default = "300"
}
5 changes: 4 additions & 1 deletion tests/unit/conf/clusters/advanced.json
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,10 @@
"enable_logging": true
},
"cloudwatch_monitoring": {
"enabled": true
"enabled": true,
"settings": {
"kinesis_iterator_age_error_threshold": 3000000
}
},
"flow_logs": {
"enabled": true,
Expand Down
2 changes: 1 addition & 1 deletion tests/unit/conf/clusters/trusted.json
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
"id": "trusted",
"modules": {
"cloudwatch_monitoring": {
"enabled": true
"enabled": false
},
"kinesis": {
"firehose": {
Expand Down
39 changes: 36 additions & 3 deletions tests/unit/stream_alert_cli/terraform/test_monitoring.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ def test_generate_cloudwatch_monitoring():
cluster_dict = _common.infinitedict()
result = monitoring.generate_monitoring('test', cluster_dict, CONFIG)

# Test a the default SNS topic option
# Test the default SNS topic option
expected_cloudwatch_tf = {
'source': 'modules/tf_stream_alert_monitoring',
'sns_topic_arn': 'arn:aws:sns:us-west-1:12345678910:stream_alert_monitoring',
Expand All @@ -44,14 +44,47 @@ def test_generate_cloudwatch_monitoring():
cluster_dict['module']['cloudwatch_monitoring_test'],
expected_cloudwatch_tf)

def test_generate_cloudwatch_monitoring_with_settings():
    """CLI - Terraform Generate Cloudwatch Monitoring with Custom Settings"""
    # The 'advanced' cluster fixture supplies a custom alarm setting; the
    # generated module is expected to carry it through as a string next to
    # the standard monitoring keys.
    expected_module = {
        'source': 'modules/tf_stream_alert_monitoring',
        'sns_topic_arn': 'arn:aws:sns:us-west-1:12345678910:stream_alert_monitoring',
        'lambda_functions': [
            'unit-testing_advanced_streamalert_rule_processor',
            'unit-testing_advanced_streamalert_alert_processor',
        ],
        'kinesis_stream': 'unit-testing_advanced_stream_alert_kinesis',
        'lambda_alarms_enabled': True,
        'kinesis_alarms_enabled': True,
        'kinesis_iterator_age_error_threshold': '3000000',
    }

    generated = _common.infinitedict()
    outcome = monitoring.generate_monitoring('advanced', generated, CONFIG)

    assert_true(outcome)
    assert_equal(
        generated['module']['cloudwatch_monitoring_advanced'],
        expected_module)

def test_generate_cloudwatch_monitoring_disabled():
    """CLI - Terraform Generate Cloudwatch Monitoring Disabled"""
    cluster = 'trusted'
    generated = _common.infinitedict()

    outcome = monitoring.generate_monitoring(cluster, generated, CONFIG)

    # Generation should still report success, but no monitoring module may be
    # emitted for a cluster whose monitoring is switched off.
    module_key = 'cloudwatch_monitoring_{}'.format(cluster)
    assert_true(outcome)
    assert_true(module_key not in generated['module'])

def test_generate_cloudwatch_monitoring_no_kinesis():
"""CLI - Terraform Generate Cloudwatch Monitoring - Kinesis Disabled"""
cluster_dict = _common.infinitedict()
CONFIG['clusters']['test']['modules']['cloudwatch_monitoring']['kinesis_alarms_enabled'] = False
CONFIG['clusters']['test']['modules']['cloudwatch_monitoring']['lambda_alarms_enabled'] = True
result = monitoring.generate_monitoring('test', cluster_dict, CONFIG)

# Test a the default SNS topic option
# Test the default SNS topic option
expected_cloudwatch_tf = {
'source': 'modules/tf_stream_alert_monitoring',
'sns_topic_arn': 'arn:aws:sns:us-west-1:12345678910:stream_alert_monitoring',
Expand All @@ -75,7 +108,7 @@ def test_generate_cloudwatch_monitoring_no_lambda():
CONFIG['clusters']['test']['modules']['cloudwatch_monitoring']['kinesis_alarms_enabled'] = True
result = monitoring.generate_monitoring('test', cluster_dict, CONFIG)

# Test a the default SNS topic option
# Test the default SNS topic option
expected_cloudwatch_tf = {
'source': 'modules/tf_stream_alert_monitoring',
'sns_topic_arn': 'arn:aws:sns:us-west-1:12345678910:stream_alert_monitoring',
Expand Down

0 comments on commit 538e92a

Please sign in to comment.