-
Notifications
You must be signed in to change notification settings - Fork 10
/
Copy pathcheck_puppet_agent
executable file
·210 lines (184 loc) · 6.46 KB
/
check_puppet_agent
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
#!/bin/bash
#
# An NRPE plugin that polls the state of the Puppet agent on a machine. We
# want different notification intervals for different states, so this
# script operates in two different modes:
#
# -c: Check for catalog compile failures only
# -t: Check the last time puppet ran and check for run time errors
#
# Based on code by Alexander Swen <[email protected]>
#
# CHANGELOG
# 5/15/2013 bhourigan Initial commit
# 6/27/2013 bhourigan Addressing bug 887439, looking at /var/lib/puppet/state/agent_catalog_run.lock
# 2/3/2014 bhourigan Bug 967265 - not properly parsing multi-line YAML messages
#
# Search path used for every command below: the system directories plus
# /opt/puppetlabs/bin.
PATH=/usr/bin:/usr/sbin:/opt/puppetlabs/bin

# Ask puppet for its version (shown in the OK message) and for the state
# directory that the default summary/report locations are derived from.
puppetversion="$(puppet -V)"
statedir="$(puppet config print statedir)"

# Default input files; the -s and -r options may replace these later on.
lastrunfile="${statedir}/last_run_summary.yaml"
reportfile="${statedir}/last_run_report.yaml"
# Print an optional error line followed by the command-line synopsis, then
# terminate the script with exit status 1.
#
# Arguments: $* - optional error text; when present it is printed first on
#                 an "ERROR:" line followed by a blank line
# Globals:   reportfile, lastrunfile (read) - shown as the current defaults
usage() {
  if (( $# > 0 )); then
    printf 'ERROR: %s\n\n' "$*"
  fi

  # %b expands backslash escapes in its argument the same way `echo -e` does.
  printf '%b\n\n' "Usage: $0 [-c] [-t <threshold>] [-s <lastrunfile>] [-r <reportfile>]"
  printf '%b\n' \
    "\t-c check for catalog compilation failures only" \
    "\t-t last run alert threshold" \
    "\t-r report file location (default: ${reportfile})" \
    "\t-s summary file location (default: ${lastrunfile})"
  printf '\n'
  exit 1
}
# Emit the plugin's one-line status message and terminate the script.
#
# Arguments: $1 - message to print on stdout
#            $2 - exit status to terminate with; defaults to 3 (the status
#                 this script uses for indeterminate states) when omitted
# Never returns.
result() {
  # printf rather than echo, so a message that happens to look like an echo
  # option (e.g. "-n" or "-e") is still printed verbatim.
  printf '%s\n' "${1-}"
  exit "${2:-3}"
}
# Report (exit status 3) when the agent's catalog-run lock file is older
# than the alert threshold.
#
# Globals:   statedir  (read) - directory holding agent_catalog_run.lock;
#                               falls back to /var/lib/puppet/state if empty
#            threshold (read) - maximum tolerated lock age in seconds
# Returns:   0 when there is no lock file or it is not older than the
#            threshold; otherwise does not return (calls result).
check_stale_lock_files() {
  # Follow the state directory the rest of the script uses instead of a
  # fixed path, so the check still finds the lock when statedir differs.
  local lockfile="${statedir:-/var/lib/puppet/state}/agent_catalog_run.lock"
  local mtime now age

  [ -f "${lockfile}" ] || return 0

  # The lock may be removed between the test above and stat; treat that the
  # same as "no lock file".
  mtime=$(stat -c '%Y' "${lockfile}" 2>/dev/null) || return 0
  now=$(date '+%s')
  age=$((now - mtime))

  if [ "${age}" -gt "${threshold}" ]; then
    result "Agent lockfile ${lockfile} ${age} seconds old" 3
  fi
}
# Abort with exit status 3 when no catalog version was extracted.
#
# Globals:   configuration_version (read)
# Returns:   0 when a version is present; otherwise prints a message and
#            exits the script.
check_catalog_version() {
  # Guard clause: a non-empty version means there is nothing to report.
  [ -n "$configuration_version" ] && return

  echo "Catalog version is unknown"
  exit 3
}
# Report (exit status 3) when the agent has not run within the threshold,
# or when the last run time cannot be read from the summary file.
#
# Globals:   lastrunfile (read) - summary file containing a last_run entry
#            threshold   (read) - maximum tolerated seconds since last run
# Returns:   0 when the last run is recent enough; otherwise does not
#            return (calls result).
check_last_run_time() {
  local disabled_reason last_run epoch_now elapsed

  if ! puppetctl is-enabled >/dev/null; then
    # Someone disabled puppet on purpose, so a human is in control: pass the
    # lock status along with exit status 0 rather than raising an alert.
    disabled_reason=$(puppetctl lock-status | grep disabled)
    result "${disabled_reason}" 0
  else
    # Pull the last_run value out of the summary file.
    last_run=$(awk '/\s*last_run:/ {print $2}' "$lastrunfile")
    if [ "${last_run:-0}" -eq 0 ]; then
      result "Can't get last_run from $lastrunfile" 3
    fi

    epoch_now=$(date "+%s")
    elapsed=$((epoch_now - last_run))
    if [ "$elapsed" -gt "$threshold" ]; then
      result "Last run was ${elapsed} seconds ago" 3
    fi
  fi
}
# Report (exit status 1) when the last run summary records failed resources.
#
# Globals:   lastrunfile (read) - summary YAML with a 'resources' section
# Returns:   0 when no failures are counted; otherwise does not return
#            (calls result).
# Note:      if ruby fails or prints nothing, the count is treated as 0.
check_last_run_errors() {
  local sum_of_fail
  # Plural suffix; kept local so a stray 's' variable inherited from the
  # environment cannot end up in the message.
  local plural=""

  # We add /opt/puppetlabs/puppet/bin because we want access to the ruby
  # binary (for parsing the last-run yaml), since trying to parse YAML in
  # bash is insane. The PATH change is made inside the command substitution
  # so it does not leak into the rest of the script.
  #
  # We used to capture the value down at 'events.failure' but that's
  # basically a dupe of resource failures so we just count resource failures.
  #
  # stderr is discarded inside the substitution; a redirection attached to
  # the bare assignment is not reliably applied to the substituted command.
  sum_of_fail=$(
    PATH=/opt/puppetlabs/puppet/bin:/usr/local/bin:/usr/bin:/usr/local/sbin:/usr/sbin
    lastrunfile=$lastrunfile ruby -rpuppet -ryaml \
      -e "output = File.open(ENV['lastrunfile']){ |data| YAML::load(data) }; \
          failed = output.fetch('resources').fetch('failed'); \
          failed_to_restart = output.fetch('resources').fetch('failed_to_restart'); \
          sum_of_fail = failed + failed_to_restart; \
          puts sum_of_fail" 2>/dev/null
  )

  if [ "${sum_of_fail:-0}" -gt 0 ]; then
    if [ "${sum_of_fail:-0}" -gt 1 ]; then
      plural="s"
    fi
    result "Last run had ${sum_of_fail} error${plural}" 1
  fi
}
# Report the first 'err'-level log entry of the last run report, with
# anything from the word "secret" onwards redacted.
#
# Globals:   reportfile (read) - last run report YAML
# Returns:   0 when no qualifying error is found; otherwise does not return
#            (calls result with status 3 for "execution expired" messages
#            and status 2 for everything else).
check_catalog_compile() {
  local failed_catalog status

  # We add /opt/puppetlabs/puppet/bin because we want access to the ruby
  # binary (for parsing the last-run yaml), since trying to parse YAML in
  # bash is insane. The PATH change is made inside the command substitution
  # so it does not leak into the rest of the script.
  #
  # see https://puppet.com/docs/puppet/latest/format_report.html for hints
  # on how we go through the object from the YAML file.
  #
  # Loop through the logs, and print the first log at the 'err' level
  # that ISN'T a transient file issue when puppetserver restarts on you
  # in the middle of your puppet run.
  #
  # Future caution: the failure message is presently always the first error.
  # This might need better logic in the future.
  #
  # NOTE(review): YAML::load instantiates the objects named in the file;
  # this assumes the report is trusted, locally generated agent output.
  failed_catalog=$(
    PATH=/opt/puppetlabs/puppet/bin:/usr/local/bin:/usr/bin:/usr/local/sbin:/usr/sbin
    reportfile=$reportfile ruby -rpuppet -ryaml \
      -e "output = File.open(ENV['reportfile']){ |data| YAML::load(data) }; \
          for line in output.logs do \
            next if line.level.to_s() != 'err'; \
            next if line.message.to_s().include? 'Attempted to borrow a JRubyInstance from the pool during a shutdown'; \
            puts line; \
            break; \
          end" 2>/dev/null
  )

  # Nothing at 'err' level: the catalog compiled.
  [ -n "${failed_catalog}" ] || return 0

  # Decide the status from the full message, before any text is redacted.
  if [[ "${failed_catalog}" == *"execution expired"* ]]; then
    status=3
  else
    status=2
  fi

  # Strip the portion of the message that mentions a secret. Parameter
  # expansion takes glob patterns, not regular expressions: '?' stands for
  # the single character in front of "secret" and '*' for the rest.
  case "${failed_catalog}" in
    secret*) failed_catalog='<redacted>' ;;
    *secret*) failed_catalog="${failed_catalog%%?secret*}<redacted>" ;;
  esac

  result "${failed_catalog}" "${status}"
}
# Emit the all-clear message with exit status 0.
#
# Globals:   puppetversion (read), configuration_version (read; shown as
#            UNKNOWN when empty or unset)
# Never returns (calls result).
result_ok() {
  local catalog="${configuration_version:-UNKNOWN}"
  result "Puppet agent ${puppetversion} running catalog ${catalog}" 0
}
# Parse the command-line options into the script's global settings.
#
# Arguments: "$@" - the script's command-line arguments
# Globals:   threshold (-t), lastrunfile (-s), reportfile (-r) and
#            check_catalog (-c, set to 1) are written
# Unknown options and options missing their argument end the script via
# usage.
parse_options() {
  local opt
  local OPTIND=1

  # The leading ':' selects getopts' silent error reporting. Only in that
  # mode is OPTARG set to the offending option letter, and only in that mode
  # is ':' returned for a missing argument, so both error arms below can
  # print a meaningful message.
  while getopts ":t:s:r:c" opt; do
    case "${opt}" in
      t)
        threshold=$OPTARG
        ;;
      s)
        lastrunfile=$OPTARG
        ;;
      r)
        reportfile=$OPTARG
        ;;
      c)
        check_catalog=1
        ;;
      \?)
        usage "Invalid option: -$OPTARG"
        ;;
      :)
        usage "Option -$OPTARG requires an argument."
        ;;
    esac
  done
}
parse_options "$@"
# Both input files must exist before any check can run.
if [ ! -f "$lastrunfile" ]; then
  result "Summary file $lastrunfile doesn't exist" 3
fi
if [ ! -f "$reportfile" ]; then
  result "Report file $reportfile doesn't exist" 3
fi

# Nagios won't allow us to have different alerting timers for different
# states (we want to see catalog compile failures and recoveries
# immediately, and run time errors daily), so this script will operate
# in two modes which will be defined with two different notification
# intervals in Nagios

# Catalog version recorded in the report file. cut leaves the blank that
# follows the colon, so strip leading whitespace; a value consisting only of
# whitespace then counts as missing.
configuration_version=$(grep '^configuration_version:' "$reportfile" | cut -d ':' -f 2)
configuration_version="${configuration_version#"${configuration_version%%[![:space:]]*}"}"

# Reject a non-numeric threshold up front. Otherwise the numeric tests below
# merely print an error and evaluate to false, and the script would go on to
# report OK without having checked anything.
if [ -n "${threshold:-}" ] && ! [[ "${threshold}" =~ ^[0-9]+$ ]]; then
  usage "Invalid time threshold ${threshold}"
fi

if [ "${check_catalog:-0}" -eq 1 ]; then
  # Catalog mode: any usable threshold (1 or more) is refused.
  if [ "${threshold:-0}" -ge 1 ]; then
    usage "Threshold argument not allowed with -c"
  fi
  check_catalog_compile
  check_catalog_version
else
  # Run-time mode: a threshold of at least one second is required.
  if [ "${threshold:-0}" -lt 1 ]; then
    usage "Invalid time threshold ${threshold:-}"
  fi
  check_stale_lock_files
  check_last_run_time
  check_last_run_errors
  check_catalog_version
fi
result_ok