Skip to content

Commit

Permalink
TM-500: add endpoint monitoring via collectd (#1096)
Browse files Browse the repository at this point in the history
* add collectd-endpoint-monitoring role

* remove old endpoint-monitoring role

* TM-500: move endpoint-monitoring role to collectd

* fix

* Commit changes made by code formatters

---------

Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
  • Loading branch information
drobinson-moj and github-actions[bot] authored Nov 7, 2024
1 parent 0b2c5de commit 7d34882
Show file tree
Hide file tree
Showing 17 changed files with 321 additions and 136 deletions.
28 changes: 28 additions & 0 deletions ansible/group_vars/environment_name_hmpps_oem_preproduction.yml
Original file line number Diff line number Diff line change
Expand Up @@ -72,3 +72,31 @@ housekeeping_cron:
emcli: /u01/app/oracle/product/mw135/bin/emcli
emctl_oem: /u01/app/oracle/product/mw135/bin/emctl
emctl_agent: /u01/app/oracle/product/oem-agent/agent_inst/bin/emctl

collectd_endpoint_monitoring:
- metric_dimension: c.pp-nomis.az.justice.gov.uk
url: https://c.pp-nomis.az.justice.gov.uk/forms/frmservlet?config=tag
- metric_dimension: c.lsast-nomis.az.justice.gov.uk
url: https://c.lsast-nomis.az.justice.gov.uk/forms/frmservlet?config=tag
- metric_dimension: pp-oasys.az.justice.gov.uk
url: https://pp-oasys.az.justice.gov.uk/eor/f?p=100
- metric_dimension: onr.pp-oasys.az.justice.gov.uk
url: https://onr.pp-oasys.az.justice.gov.uk/InfoViewApp
- metric_dimension: r1.pp.csr.service.justice.gov.uk
url: http://r1.pp.csr.service.justice.gov.uk:7770/isps/index.html?2057
- metric_dimension: r2.pp.csr.service.justice.gov.uk
url: http://r2.pp.csr.service.justice.gov.uk:7771/isps/index.html?2057
- metric_dimension: r3.pp.csr.service.justice.gov.uk
url: http://r3.pp.csr.service.justice.gov.uk:7770/isps/index.html?2057
- metric_dimension: r4.pp.csr.service.justice.gov.uk
url: http://r4.pp.csr.service.justice.gov.uk:7771/isps/index.html?2057
- metric_dimension: r5.pp.csr.service.justice.gov.uk
url: http://r5.pp.csr.service.justice.gov.uk:7770/isps/index.html?2057
- metric_dimension: r6.pp.csr.service.justice.gov.uk
url: http://r6.pp.csr.service.justice.gov.uk:7771/isps/index.html?2057
- metric_dimension: traina.csr.service.justice.gov.uk
url: http://traina.csr.service.justice.gov.uk/isps/index.html?2057
- metric_dimension: cafmwebx.pp.planetfm.service.justice.gov.uk
url: https://cafmwebx.pp.planetfm.service.justice.gov.uk/PlanetPortal
- metric_dimension: hpa-preprod.service.hmpps.dsd.io
url: https://hpa-preprod.service.hmpps.dsd.io/
38 changes: 38 additions & 0 deletions ansible/group_vars/environment_name_hmpps_oem_production.yml
Original file line number Diff line number Diff line change
Expand Up @@ -90,3 +90,41 @@ endpoint_monitoring_targets:
emcli: /u01/app/oracle/product/mw135/bin/emcli
emctl_oem: /u01/app/oracle/product/mw135/bin/emctl
emctl_agent: /u01/app/oracle/product/oem-agent/agent_inst/bin/emctl

collectd_endpoint_monitoring:
- metric_dimension: c.nomis.az.justice.gov.uk
url: https://c.nomis.az.justice.gov.uk/forms/frmservlet?config=tag
- metric_dimension: reporting.nomis.az.justice.gov.uk
url: https://reporting.nomis.az.justice.gov.uk/BOE/BI
- metric_dimension: oasys.az.justice.gov.uk
url: https://oasys.az.justice.gov.uk/eor/f?p=100
- metric_dimension: training.oasys.az.justice.gov.uk
url: https://training.oasys.az.justice.gov.uk/eor/f?p=100
- metric_dimension: practice.oasys.az.justice.gov.uk
url: https://practice.oasys.az.justice.gov.uk/eor/f?p=100
- metric_dimension: bridge-oasys.az.justice.gov.uk
url: https://bridge-oasys.az.justice.gov.uk/
- metric_dimension: onr.oasys.az.justice.gov.uk
url: https://onr.oasys.az.justice.gov.uk/InfoViewApp
- metric_dimension: r1.csr.service.justice.gov.uk
url: http://r1.csr.service.justice.gov.uk:7770/isps/index.html?2057
- metric_dimension: r2.csr.service.justice.gov.uk
url: http://r2.csr.service.justice.gov.uk:7771/isps/index.html?2057
- metric_dimension: r3.csr.service.justice.gov.uk
url: http://r3.csr.service.justice.gov.uk:7770/isps/index.html?2057
- metric_dimension: r4.csr.service.justice.gov.uk
url: http://r4.csr.service.justice.gov.uk:7771/isps/index.html?2057
- metric_dimension: r5.csr.service.justice.gov.uk
url: http://r5.csr.service.justice.gov.uk:7770/isps/index.html?2057
- metric_dimension: r6.csr.service.justice.gov.uk
url: http://r6.csr.service.justice.gov.uk:7771/isps/index.html?2057
- metric_dimension: cafmwebx2.az.justice.gov.uk
url: https://cafmwebx2.az.justice.gov.uk/PlanetPortal
- metric_dimension: cafmtrainweb.az.justice.gov.uk
url: https://cafmtrainweb.az.justice.gov.uk/PlanetPortal
- metric_dimension: www.offloc.service.justice.gov.uk
url: https://www.offloc.service.justice.gov.uk/health
- metric_dimension: hpa.service.hmpps.dsd.io
url: https://hpa.service.hmpps.dsd.io/
- metric_dimension: hmpps-az-gw1.justice.gov.uk
url: https://hmpps-az-gw1.justice.gov.uk/RDWeb
16 changes: 16 additions & 0 deletions ansible/group_vars/environment_name_hmpps_oem_test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -71,3 +71,19 @@ housekeeping_cron:
emcli: /u01/app/oracle/product/mw135/bin/emcli
emctl_oem: /u01/app/oracle/product/mw135/bin/emctl
emctl_agent: /u01/app/oracle/product/oem-agent/agent_inst/bin/emctl

collectd_endpoint_monitoring:
- metric_dimension: c-t1.test.nomis.service.justice.gov.uk
url: https://c-t1.test.nomis.service.justice.gov.uk/forms/frmservlet?config=tag
- metric_dimension: c-t2.test.nomis.service.justice.gov.uk
url: https://c-t2.test.nomis.service.justice.gov.uk/forms/frmservlet?config=tag
- metric_dimension: c-t3.test.nomis.service.justice.gov.uk
url: https://c-t3.test.nomis.service.justice.gov.uk/forms/frmservlet?config=tag
- metric_dimension: t1-int.oasys.service.justice.gov.uk
url: https://t1-int.oasys.service.justice.gov.uk/
- metric_dimension: t2-int.oasys.service.justice.gov.uk
url: https://t2-int.oasys.service.justice.gov.uk/
- metric_dimension: stage.offloc.service.justice.gov.uk
url: https://stage.offloc.service.justice.gov.uk/health
- metric_dimension: hmppgw1.justice.gov.uk
url: https://hmppgw1.justice.gov.uk/RDWeb
2 changes: 1 addition & 1 deletion ansible/group_vars/server_type_hmpps_oem.yml
Original file line number Diff line number Diff line change
Expand Up @@ -65,8 +65,8 @@ server_type_roles_list:
- collectd-service-metrics
- collectd-oracle-db-connected
- collectd-textfile-monitoring
- collectd-endpoint-monitoring
- oracle-db-refresh
- endpoint-monitoring

collectd_monitored_services_servertype:
- metric_name: service_status_os
Expand Down
32 changes: 32 additions & 0 deletions ansible/roles/collectd-endpoint-monitoring/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
# Role to configure endpoint monitoring via collectd

Monitor the status of endpoints via collectd and cloudwatch.

The role installs a collectd configuration file for using an exec plugin,
and a script for checking the status of an endpoint.

Use this if you cannot use an alternative solution such as pingdom due
to IP allow listing restrictions, and you already have a linux EC2 that
can be used for this kind of monitoring.

Use `collectd-connectivity-tests` role if you just want to check
connectivity to an IP/port.

Why collectd? This is the Amazon-recommended approach for collecting custom
metrics from an EC2 instance via the CloudWatch agent (CWAgent).

## Finding metrics in Cloudwatch

Metrics collected by the Cloudwatch agent will appear in the 'metrics' panel under the CWAgent namespace

```
metric: collectd_endpoint_monitoring_status (the metric_name)
type: exitcode (fixed, 0 = ok, non-zero = error)
type_instance: Friendly name of URL, e.g. c.nomis.service.justice.gov.uk (the metric_dimension)
metric: collectd_endpoint_monitoring_cert_days_to_expiry (the metric_name)
type: gauge (number of days until cert expires)
type_instance: Friendly name of URL, e.g. c.nomis.service.justice.gov.uk (the metric_dimension)
```

Cloudwatch metrics are easily filtered by `instance_id` so you can see all the metrics for a particular instance.
15 changes: 15 additions & 0 deletions ansible/roles/collectd-endpoint-monitoring/defaults/main.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
---
collectd_script_path: /usr/local/bin
collectd_script_name: collectd_endpoint_monitoring
collectd_script_user: ec2-user
collectd_script_interval: 30

# define in relevant group vars
collectd_endpoint_monitoring:
# for example
# collectd_endpoint_monitoring:
# - metric_dimension: oasys.az.justice.gov.uk
# url: https://oasys.az.justice.gov.uk
# follow_redirect: 0 # optionally include and set to 0 if you don't want to follow redirects
# timeout_sec: 5 # optionally include to change the curl timeout (in seconds) from the default of 5
# time_ranges: "1.0900-1.1700,2.0900-2.1700,3.0900-3.1700,4.0900-4.1700,5.0900-5.1700" # optionally include to limit monitoring to 9-5pm weekdays
10 changes: 10 additions & 0 deletions ansible/roles/collectd-endpoint-monitoring/handlers/main.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
---
- name: restart collectd
ansible.builtin.service:
name: collectd
state: restarted

- name: restart plugin script
ansible.builtin.shell: |
pkill -u {{ collectd_script_user }} -f {{ collectd_script_path }}/{{ collectd_script_name }}.sh
failed_when: false
3 changes: 3 additions & 0 deletions ansible/roles/collectd-endpoint-monitoring/meta/main.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
---
dependencies:
- role: amazon-cloudwatch-agent-collectd
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
---
- name: copy collectd config
ansible.builtin.template:
src: "{{ collectd_script_name }}.conf.j2"
dest: "/etc/collectd.d/{{ collectd_script_name }}.conf"
owner: root
mode: 0644
notify:
- restart collectd

- name: copy collectd plugin script
ansible.builtin.template:
src: "{{ collectd_script_name }}.sh.j2"
dest: "{{ collectd_script_path }}/{{ collectd_script_name }}.sh"
owner: root
mode: 0755
notify:
- restart plugin script
6 changes: 6 additions & 0 deletions ansible/roles/collectd-endpoint-monitoring/tasks/main.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
---
- import_tasks: configure_collectd.yml
tags:
- ec2provision
- ec2patch
when: ansible_distribution in ['RedHat', 'OracleLinux']
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
LoadPlugin exec
<Plugin exec>
Exec "{{ collectd_script_user }}" "{{ collectd_script_path }}/{{ collectd_script_name }}.sh"
</Plugin>
Original file line number Diff line number Diff line change
@@ -0,0 +1,150 @@
#!/bin/bash
# Managed by collectd-endpoint-monitoring ansible role
# If manually editing, just kill script and collectd will respawn
# e.g. pkill -u {{ collectd_script_user }} -f {{ collectd_script_path }}/{{ collectd_script_name }}.sh
#
# To debug, run INTERVAL=5 LOGGER_INTERVAL_FOR_ERRORS=0 {{ collectd_script_path }}/{{ collectd_script_name }}.sh
#
# Each ENDPOINTS entry is one space-separated record:
#   follow_redirect timeout_secs url metric_dimension [time_ranges]

ENDPOINTS=()
# Re-send the certificate expiry metric at least this often (seconds), even if unchanged
CERT_EXPIRY_METRIC_INTERVAL=3600
LOGGER_INTERVAL_FOR_ERRORS="${LOGGER_INTERVAL_FOR_ERRORS:-3600}" # set to 0 to display to stdout
DEFAULT_INTERVAL="{{ collectd_script_interval }}"
HOSTNAME="${HOSTNAME:-localhost}"
INTERVAL="${INTERVAL:-$DEFAULT_INTERVAL}"

# "or []" keeps the template renderable when the role default (null) has not
# been overridden in group vars. Both "timeout_sec" and the "timeout" key shown
# in the role defaults are honoured, with timeout_sec taking precedence.
{% for item in collectd_endpoint_monitoring or [] %}
ENDPOINTS+=("{{ item.follow_redirect|default(1) }} {{ item.timeout_sec|default(item.timeout|default(5)) }} {{ item.url }} {{ item.metric_dimension }} {{ item.time_ranges|default('') }}")
{% endfor %}
{% raw %}

# Uncomment the lines below for testing
#INTERVAL=5
#LOGGER_INTERVAL_FOR_ERRORS=0
#ENDPOINTS+=("1 5 https://www.google.com www.google.com 1.0900-1.1700,2.0900-2.1700,3.0900-3.1700,4.0900-4.1700,5.0900-5.1700")
#ENDPOINTS+=("1 5 https://www.microsoft.com www.microsoft.com")
#ENDPOINTS+=("1 5 https://www.amazon.com www.amazon.com 1.0900-5.1700")

#######################################
# Check whether a point in the week falls inside any of a list of time ranges.
# Arguments:
#   $1 - current time as day-of-week (1=Monday) followed by HHMM, with or
#        without a dot separator, e.g. "10930" or "1.0930"
#   $2 - comma separated ranges of the form START-END in the same notation,
#        e.g. "1.0900-1.1700,2.0900-2.1700". START is inclusive, END exclusive.
# Returns:
#   0 if $1 lies within at least one range, 1 otherwise (including when a
#   value is empty or malformed).
#######################################
check_within_timeranges() {
  local now
  local timeranges
  local timerange
  local start
  local end

  # Dots are stripped from every value so that "1.0930" and "10930" are
  # equivalent, and the comparison is numeric. A locale-dependent string
  # comparison of "10930" against "1.0900" gives the wrong answer in the
  # C locale because "." sorts before "0".
  now="${1//./}"
  timeranges="$2"
  if [[ ! $now =~ ^[0-9]+$ ]]; then
    return 1
  fi
  for timerange in ${timeranges//,/ }; do
    start="${timerange%%-*}"
    end="${timerange#*-}"
    start="${start//./}"
    end="${end//./}"
    if [[ ! $start =~ ^[0-9]+$ || ! $end =~ ^[0-9]+$ ]]; then
      continue # skip malformed range rather than abort the arithmetic below
    fi
    # 10# forces base 10 so values with leading zeros are not read as octal
    if (( 10#$now >= 10#$start && 10#$now < 10#$end )); then
      return 0
    fi
  done
  return 1
}

#######################################
# Probe a URL with curl and report its health.
# Arguments:
#   $1 - follow_redirect: 1 to follow redirects (curl -L); any other value
#        means redirects are not followed and a 30x response counts as success
#   $2 - timeout in seconds for the whole request (curl -m)
#   $3 - URL to check
# Outputs:
#   stdout: "days_to_expiry=N" for https URLs when the check succeeds
#   stderr: a description of the failure
# Returns:
#   0 on HTTP 200 (or 30x when not following redirects), 1 otherwise
#######################################
check_endpoint() {
  local follow_redirect
  local timeout_secs
  local url
  local curl_args
  local output
  local http_code
  local expiry
  local expiry_epoch_secs
  local now_epoch_secs
  local secs_to_expiry
  local days_to_expiry

  follow_redirect="$1"
  timeout_secs="$2"
  url="$3"

  # -v is needed for the certificate details; the write-out is wrapped in
  # newlines so that "http_code=" always starts its own line even though
  # verbose output (stderr) is interleaved with it.
  curl_args=(-sSv -m "$timeout_secs" -o /dev/null -w '\nhttp_code=%{http_code}\n')
  if [[ $follow_redirect == 1 ]]; then
    curl_args+=(-L)
  fi
  if ! output=$(curl "${curl_args[@]}" "$url" 2>&1); then
    # Drop verbose noise (info lines, headers, TLS data markers, blank lines)
    # and keep only curl's actual error message
    grep -Ev '^(\*|http_code=|[<>{}]|$)' <<<"$output" >&2
    return 1
  fi
  http_code=$(grep "^http_code=" <<<"$output" | tail -n 1 | cut -d= -f2)
  if [[ -z $http_code ]]; then
    echo "missing http_code in curl output" >&2
    return 1
  fi
  if [[ $http_code != 200 ]]; then
    # A 30x response is only acceptable when redirects are deliberately not followed
    if [[ $follow_redirect == 1 || ! $http_code =~ ^30[0-9]$ ]]; then
      echo "unexpected http_code $http_code" >&2
      return 1
    fi
  fi
  if [[ $url =~ ^https: ]]; then
    # When redirects are followed curl prints one "expire date" line per TLS
    # connection; use the first, which belongs to the host of the monitored URL.
    expiry=$(grep -E -m 1 '^\*[[:space:]]+expire date:' <<<"$output" | cut -d: -f2-)
    if [[ -z $expiry ]]; then
      echo "could not find expiry date in curl output" >&2
      return 1
    fi
    if [[ "$(uname)" == "Darwin" ]]; then
      expiry_epoch_secs=$(date -j -f " %b %d %T %Y %Z" "$expiry" +%s)
    else
      expiry_epoch_secs=$(date +%s -d "$expiry" 2>/dev/null)
    fi
    if [[ -z $expiry_epoch_secs ]]; then
      echo "could not parse expiry date $expiry" >&2
      return 1
    fi
    now_epoch_secs=$(date +%s)
    secs_to_expiry=$((expiry_epoch_secs - now_epoch_secs))
    days_to_expiry=$((secs_to_expiry / 86400))
    echo "days_to_expiry=$days_to_expiry"
  fi
  return 0
}


# Main polling loop: every INTERVAL seconds check each endpoint and print
# collectd PUTVAL lines on stdout for the collectd exec plugin to consume.
n=${#ENDPOINTS[@]}

# Per-endpoint state, indexed the same way as ENDPOINTS
last_error_log_timestamp=()
last_days_to_expiry=()
last_expiry_metric_timestamp=()
for ((i = 0; i < n; i++)); do
  last_days_to_expiry[i]=
  last_expiry_metric_timestamp[i]=0
  last_error_log_timestamp[i]=0
done

while true; do
  now_epoch_secs=$(date +%s)
  for ((i = 0; i < n; i++)); do
    # Split the record into fields with read rather than unquoted expansion:
    # the URLs contain "?" which would otherwise be subject to pathname expansion.
    # args: 0=follow_redirect 1=timeout_secs 2=url 3=metric_dimension 4=time_ranges (optional)
    read -r -a args <<<"${ENDPOINTS[i]}"
    timeranges="${args[4]:-}"
    if [[ -n $timeranges ]]; then
      # Same D.HHMM notation as the configured time ranges (1 = Monday)
      now_dayhourminute=$(date +%u.%H%M)
      if ! check_within_timeranges "$now_dayhourminute" "$timeranges"; then
        continue
      fi
    fi
    output=$(check_endpoint "${args[0]}" "${args[1]}" "${args[2]}" 2>&1)
    exitcode=$?
    days_to_expiry=$(grep "^days_to_expiry=" <<<"$output" | cut -d= -f2)
    echo "PUTVAL $HOSTNAME/endpoint_status/exitcode-${args[3]} interval=$INTERVAL N:$exitcode"
    if [[ -n $days_to_expiry ]]; then
      # Only emit the expiry metric when the value changes or the refresh interval has elapsed
      if [[ ${last_days_to_expiry[i]} != "$days_to_expiry" || $((now_epoch_secs - last_expiry_metric_timestamp[i])) -gt $CERT_EXPIRY_METRIC_INTERVAL ]]; then
        echo "PUTVAL $HOSTNAME/endpoint_cert_expiry/gauge-${args[3]} interval=$INTERVAL N:$days_to_expiry"
        last_expiry_metric_timestamp[i]="$now_epoch_secs"
        last_days_to_expiry[i]="$days_to_expiry"
      fi
    fi
    if [[ $exitcode -ne 0 ]]; then
      if [[ $LOGGER_INTERVAL_FOR_ERRORS -eq 0 ]]; then
        # Debug mode: show the error directly
        echo "${args[3]}: $output"
      elif [[ $((now_epoch_secs - last_error_log_timestamp[i])) -gt $LOGGER_INTERVAL_FOR_ERRORS ]]; then
        # Rate-limit syslog entries per endpoint
        echo "${args[3]}: $output" | logger -p local3.info -t collectd_endpoint_monitoring
        last_error_log_timestamp[i]="$now_epoch_secs"
      fi
    fi
  done
  # Sleep for whatever remains of the interval after the checks have run
  new_epoch_secs=$(date +%s)
  elapsed=$((new_epoch_secs - now_epoch_secs))
  if ((elapsed >= 0 && elapsed < INTERVAL)); then
    sleep "$((INTERVAL - elapsed))"
  fi
done

{% endraw %}
2 changes: 0 additions & 2 deletions ansible/roles/endpoint-monitoring/defaults/main.yml

This file was deleted.

3 changes: 0 additions & 3 deletions ansible/roles/endpoint-monitoring/meta/main.yml

This file was deleted.

36 changes: 0 additions & 36 deletions ansible/roles/endpoint-monitoring/tasks/endpoint-monitoring.yml

This file was deleted.

Loading

0 comments on commit 7d34882

Please sign in to comment.