Added Cloudwatch reporting #144

Open
wants to merge 2 commits into master
Changes from 1 commit
36 changes: 36 additions & 0 deletions check_mongodb.py
@@ -18,18 +18,23 @@
# - @Andor on github
# - Steven Richards - Captainkrtek on github
# - Max Vernimmen
# - @burdandrei Added CloudWatch monitoring for Mongodb
#
# USAGE
#
# See the README.md
#


import sys
import time
import datetime
import optparse
import textwrap
import re
import os
import commands
from boto.ec2.cloudwatch import CloudWatchConnection
Owner

This will break the plugin for a lot of people who don't have boto installed. We should wrap it in a try/except so we can decide later whether to enable the check.

Author

Got it, I will wrap it inside a def.
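For reference, a minimal sketch of such an import guard (the flag name cloudwatch_available is an assumption, not the code from the follow-up commit):

try:
    from boto.ec2.cloudwatch import CloudWatchConnection
    cloudwatch_available = True   # boto is present, CloudWatch reporting can be enabled
except ImportError:
    CloudWatchConnection = None   # boto missing; the plugin keeps working, reporting is skipped
    cloudwatch_available = False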


try:
import pymongo
@@ -44,6 +49,7 @@
else:
import pymongo.son as son

cloudwatch_report = False

#
# thanks to http://stackoverflow.com/a/1229667/72987
@@ -82,6 +88,17 @@ def numeric_type(param):
return True
return False

#Get the instanceId for our machine. This is important later for
#autoscaling. The dimensions we select here when publishing
#must be matched later by our autoscale policy
def get_instance_id():
ret, instanceId = commands.getstatusoutput("wget -q -O - http://169.254.169.254/latest/meta-data/instance-id")
return instanceId

def put_data(namespace, name, value, unit, dimensions):
c = CloudWatchConnection()
now = datetime.datetime.now()
c.put_metric_data(namespace, name, value, now, unit, dimensions)

def check_levels(param, warning, critical, message, ok=[]):
if (numeric_type(critical) and numeric_type(warning)):
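A side note on get_instance_id() above: it shells out to wget via the commands module, so it also depends on wget being installed on the host. A hedged alternative sketch (not part of this PR) that queries the EC2 instance metadata service with the standard library only:

import urllib2

def get_instance_id():
    # Fetch the EC2 instance id from the metadata service without spawning wget
    return urllib2.urlopen('http://169.254.169.254/latest/meta-data/instance-id', timeout=2).read()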
@@ -145,6 +162,7 @@ def main(argv):
p.add_option('-q', '--querytype', action='store', dest='query_type', default='query', help='The query type to check [query|insert|update|delete|getmore|command] from queries_per_second')
p.add_option('-c', '--collection', action='store', dest='collection', default='admin', help='Specify the collection to check')
p.add_option('-T', '--time', action='store', type='int', dest='sample_time', default=1, help='Time used to sample number of pages faults')
p.add_option('--cloudwatch-report', action='store_true', dest='cloudwatch_report', default=False, help='Report sampled data to CloudWatch')

options, arguments = p.parse_args()
host = options.host
@@ -167,6 +185,8 @@ def main(argv):
database = options.database
ssl = options.ssl
replicaset = options.replicaset
global cloudwatch_report
cloudwatch_report = options.cloudwatch_report

if action == 'replica_primary' and replicaset is None:
return "replicaset must be passed in when using replica_primary check"
@@ -438,6 +458,12 @@ def check_rep_lag(con, host, port, warning, critical, percent, perf_data, max_la
except:
lag = float(optime_lag.seconds + optime_lag.days * 24 * 3600)

if cloudwatch_report:
replicaset = rs_status["set"]
instanceId = get_instance_id()
put_data('Mongo', 'replicationLag', lag, 'Seconds',{'replicaSet': replicaset})
put_data('Mongo', 'replicationLag', lag, 'Seconds',{'InstanceId': instanceId, 'replicaSet': replicaset})

if percent:
err, con = mongo_connect(primary_node['name'].split(':')[0], int(primary_node['name'].split(':')[1]), False, user, passwd)
if err != 0:
@@ -617,6 +643,11 @@ def check_lock(con, warning, critical, perf_data):
lock_percentage = float(lockTime) / float(totalTime) * 100
message = "Lock Percentage: %.2f%%" % lock_percentage
message += performance_data(perf_data, [("%.2f" % lock_percentage, "lock_percentage", warning, critical)])
if cloudwatch_report:
instanceId = get_instance_id()
replicaset = data['repl']['setName']
put_data('Mongo', 'LockPercentage', lock_percentage, 'Percent',{'replicaSet': replicaset})
put_data('Mongo', 'LockPercentage', lock_percentage, 'Percent',{'InstanceId': instanceId, 'replicaSet': replicaset})
return check_levels(lock_percentage, warning, critical, message)

except Exception, e:
@@ -1130,6 +1161,11 @@ def check_current_lock(con, host, warning, critical, perf_data):
lock_percentage = delta[2] / delta[1] * 100 # lockTime/totalTime*100
message = "Current Lock Percentage: %.2f%%" % lock_percentage
message += performance_data(perf_data, [("%.2f" % lock_percentage, "current_lock_percentage", warning, critical)])
if cloudwatch_report:
instanceId = get_instance_id()
replicaset = data['repl']['setName']
put_data('Mongo', 'CurrentLockPercentage', lock_percentage, 'Percent',{'replicaSet': replicaset})
put_data('Mongo', 'CurrentLockPercentage', lock_percentage, 'Percent',{'InstanceId': instanceId, 'replicaSet': replicaset})
return check_levels(lock_percentage, warning, critical, message)
else:
return exit_with_general_warning("problem reading data from temp file")
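For anyone trying this patch out, a hedged verification sketch (the replica set name 'rs0' and the time window are placeholders) that reads back the replicationLag metric using the same boto 2 CloudWatch API the patch imports:

import datetime
from boto.ec2.cloudwatch import CloudWatchConnection

c = CloudWatchConnection()
end = datetime.datetime.utcnow()
start = end - datetime.timedelta(minutes=15)
# Average replication lag over the last 15 minutes, matching the dimensions published by put_data()
datapoints = c.get_metric_statistics(60, start, end, 'replicationLag', 'Mongo',
                                     ['Average'], dimensions={'replicaSet': 'rs0'}, unit='Seconds')
print datapoints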