Skip to content

Commit

Permalink
Nomis: DSOS-2318: add collectd db monitoring (#384)
Browse files Browse the repository at this point in the history
* tidy up

* readme

* add collectd-oracle-db-connected role

* Add audit role

* Commit changes made by code formatters

---------

Co-authored-by: github-actions[bot] <github-actions[bot]@users.noreply.github.com>
  • Loading branch information
drobinson-moj and github-actions[bot] authored Nov 2, 2023
1 parent 0db24ea commit af38c1b
Show file tree
Hide file tree
Showing 15 changed files with 163 additions and 17 deletions.
1 change: 1 addition & 0 deletions ansible/roles/audit/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Role for enabling audit daemon
13 changes: 13 additions & 0 deletions ansible/roles/audit/tasks/audit.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
---
# Installs the Linux audit daemon package and makes sure the service is
# running now and enabled at boot.
- name: Install audit package
  ansible.builtin.yum:
    name: audit
    state: present
    lock_timeout: 60
  # `retries`/`delay` only form a retry loop on all supported Ansible versions
  # when paired with `until`; retry the install until it succeeds.
  register: audit_package_install
  until: audit_package_install is succeeded
  retries: 3
  delay: 10

- name: Start auditd
  ansible.builtin.service:
    name: auditd
    state: started
    enabled: true
6 changes: 6 additions & 0 deletions ansible/roles/audit/tasks/main.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
---
# Entry point for the audit role: pulls in audit.yml for the AMI build,
# provisioning and patching tags, on RedHat / OracleLinux hosts only.
- name: Install and enable audit daemon
  ansible.builtin.import_tasks: audit.yml
  tags:
    - amibuild
    - ec2provision
    - ec2patch
  when: ansible_distribution in ['RedHat', 'OracleLinux']
5 changes: 5 additions & 0 deletions ansible/roles/collectd-oracle-db-connected/defaults/main.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
---
# Defaults for the collectd-oracle-db-connected role.
# Directory the plugin script is installed into.
collectd_script_path: /usr/local/bin
# Base name shared by the plugin script (<name>.sh) and its collectd config (<name>.conf).
collectd_script_name: collectd_oracle_db_connected
# User collectd's exec plugin runs the script as (also used to scope the pkill in the handler).
collectd_script_user: oracle
# Default value of INTERVAL in the plugin script when it is not already set in
# the script's environment; the script passes it to `sleep`, i.e. seconds.
collectd_script_interval: 60
10 changes: 10 additions & 0 deletions ansible/roles/collectd-oracle-db-connected/handlers/main.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
---
# Handlers for the collectd-oracle-db-connected role. Handler names are
# referenced by `notify` in the role's tasks and must not change.
- name: restart collectd
  ansible.builtin.service:
    name: collectd
    state: restarted

# Kills the running plugin script so that an updated copy is picked up; the
# script's own header notes that collectd respawns it after it is killed.
- name: restart plugin script
  # No shell features are needed, so run pkill directly with an argv list.
  ansible.builtin.command:
    argv:
      - pkill
      - "-u"
      - "{{ collectd_script_user }}"
      - "-f"
      - "{{ collectd_script_path }}/{{ collectd_script_name }}.sh"
  register: collectd_plugin_script_pkill
  # pkill exit status: 0 = a process was signalled, 1 = nothing matched.
  # Anything else is a real error (bad usage / fatal) and should surface
  # instead of being silently ignored.
  changed_when: collectd_plugin_script_pkill.rc == 0
  failed_when: collectd_plugin_script_pkill.rc not in [0, 1]
4 changes: 4 additions & 0 deletions ansible/roles/collectd-oracle-db-connected/meta/main.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
---
# Roles that are applied before this one.
dependencies:
# NOTE(review): presumably supplies ansible_ec2_instance_id, which the plugin script template references — confirm
- role: get-ec2-facts
# NOTE(review): presumably installs collectd and the CloudWatch agent that this role adds a plugin to — confirm
- role: amazon-cloudwatch-agent-collectd
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
---
# Deploys the collectd exec-plugin config and the script it runs. Each file
# notifies its own handler so that only the affected process is restarted.
- name: copy collectd config
  ansible.builtin.template:
    src: "{{ collectd_script_name }}.conf.j2"
    dest: "/etc/collectd.d/{{ collectd_script_name }}.conf"
    owner: root
    # Quoted so the mode is passed as an octal string rather than being
    # re-interpreted as a number by the YAML parser.
    mode: "0644"
  notify:
    - restart collectd

- name: copy collectd plugin script
  ansible.builtin.template:
    src: "{{ collectd_script_name }}.sh.j2"
    dest: "{{ collectd_script_path }}/{{ collectd_script_name }}.sh"
    owner: root
    mode: "0755"
  notify:
    - restart plugin script
6 changes: 6 additions & 0 deletions ansible/roles/collectd-oracle-db-connected/tasks/main.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
---
# Entry point for the role: configures the collectd DB-connected plugin during
# provisioning and patching, on RedHat / OracleLinux hosts only.
- name: Configure collectd oracle db connected plugin
  ansible.builtin.import_tasks: configure_collectd.yml
  tags:
    - ec2provision
    - ec2patch
  when: ansible_distribution in ['RedHat', 'OracleLinux']
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
#!/bin/bash
# Managed by collectd-oracle-db-connected ansible role
# If manually editing, just kill script and collectd will respawn
# e.g. pkill -u {{ collectd_script_user }} -f {{ collectd_script_path }}/{{ collectd_script_name }}.sh

# Allow HOSTNAME / INTERVAL to be supplied by the environment, otherwise fall
# back to localhost and the interval templated in by the role.
HOSTNAME="${HOSTNAME:-localhost}"
INTERVAL="${INTERVAL:-{{ collectd_script_interval }}}"

# The expected user is templated from collectd_script_user (the same variable
# the collectd Exec line uses) so that overriding the user does not make the
# script exit immediately.
if [[ "$(whoami)" != "{{ collectd_script_user }}" ]]
then
  echo "This script is expected to be run as the {{ collectd_script_user }} user" 1>&2
  exit 1
fi

# We need to make sure this is in the path
export PATH=${PATH}:/usr/local/bin

# Print the value of this instance's "oracle-sids" EC2 tag.
# With --output=text the AWS CLI prints "None" when the query finds no such
# tag, which the caller checks for.
get_sids() {
  # The JMESPath query is quoted so the shell cannot treat [0] as a glob pattern.
  aws ec2 describe-tags \
    --filters "Name=resource-id,Values={{ ansible_ec2_instance_id }}" "Name=key,Values=oracle-sids" \
    --query 'Tags[0].Value' \
    --output=text
}

# Check via Oracle Clusterware (crsctl) whether the database for a SID is up.
# Arguments:
#   $1 - SID to check; falls back to the global $SID when omitted
# Returns:
#   0 when STATE_DETAILS starts with "Open" or "Mounted (Closed)"
#   1 when no database resource is found or the state is anything else
db_connected() {
  local sid="${1:-$SID}"
  local db status

  # DB resources names are usually 'ora.${DB}.db' but some have a suffix after ${DB}
  # The pattern is quoted so the escaped dots reach grep as literal dots
  # instead of being stripped (and glob-expanded) by the shell.
  db="$(crsctl status resource | grep -m1 -i "ora\.${sid}.*\.db" | cut -f2 -d=)"

  # Check added to alert on not having a database resource BEFORE trying to get its status
  if [[ -z "$db" ]]
  then
    echo "Failed to find a database resource for ${sid}" 1>&2
    return 1
  fi

  # Worth noting here that crsctl exits with code 0 even if you try and find details of a database that doesn't exist
  # Only the text before the first comma is kept, so a state such as
  # "Open,Readonly" arrives here as plain "Open".
  status=$(timeout "$INTERVAL" crsctl status resource "${db}" -v | grep STATE_DETAILS | cut -f2 -d= | cut -f1 -d,)

  case "${status}" in
    "Open" | "Mounted (Closed)")
      return 0
      ;;
    *)
      # Any other state (including empty output, e.g. after a timeout) means
      # the database is not connected
      return 1
      ;;
  esac
}

# Load the Oracle environment for the +ASM instance without prompting, so that
# crsctl is available on the PATH.
ORACLE_SID="+ASM"
ORAENV_ASK="NO"
. oraenv > /dev/null

# Every INTERVAL seconds, look up the SIDs tagged on this instance and emit one
# PUTVAL line per SID whose value is db_connected's exit status
# (0 = connected, 1 = not connected).
while sleep "$INTERVAL"; do
  SIDS=$(get_sids)
  if [[ "$SIDS" != "None" ]]; then
    # Iterate over the result already fetched rather than calling the AWS API a
    # second time; this also guarantees the "None" check applies to the same
    # data that is looped over. Left unquoted on purpose to split on whitespace.
    for SID in $SIDS; do
      db_connected "$SID" >/dev/null 2>&1
      echo "PUTVAL $HOSTNAME/exec-db_connected/bool-$SID interval=$INTERVAL N:$?"
    done
  fi
done
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
---
- name: copy collectd config
ansible.builtin.template:
src: "collectd.conf.j2"
src: "{{ collectd_script_name }}.conf.j2"
dest: "/etc/collectd.d/{{ collectd_script_name }}.conf"
owner: root
mode: 0644
Expand All @@ -11,7 +11,7 @@
- name: copy collectd plugin script
ansible.builtin.template:
src: "{{ collectd_script_name }}.sh.j2"
dest: "/usr/local/bin/{{ collectd_script_name }}.sh"
dest: "{{ collectd_script_path }}/{{ collectd_script_name }}.sh"
owner: root
mode: 0755
notify:
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
# Loads collectd's exec plugin and runs the templated plugin script as the
# configured user.
LoadPlugin exec
<Plugin exec>
Exec "{{ collectd_script_user }}" "{{ collectd_script_path }}/{{ collectd_script_name }}.sh"
</Plugin>
23 changes: 8 additions & 15 deletions ansible/roles/collectd/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,20 +4,20 @@ Installs collectd and configures it based on the values in group_vars `collectd_

Collectd is able to run scripts and perform other tasks based on plugins. The scripts are run by the exec plugin and the results are made available to the Cloudwatch agent on the same host via the network plugin. The Cloudwatch agent then sends the metrics to Cloudwatch.

The common plugins are defined in collectd.conf.j2 (network plugin being the most important) with additional plugins pulled in by the statement
`Include "/etc/collectd.d` in the main collectd.conf file.
The common plugins are defined in collectd.conf.j2 (network plugin being the most important) with additional plugins pulled in by the statement
`Include "/etc/collectd.d"` in the main collectd.conf file.

The collectd_configure task does the following:

1. reads values of `collectd_metric_configs` from group_vars, for example:

```
```
collectd_metric_configs:
- nomis-db
```

2. loops through values of files/[collectd_metric_configs] and templates/[collectd_metric_configs] deploys them to the host if the relevant files exist

3. files/linux.conf and templates/linux.sh.j2 are deployed to the host by default if additional collectd_metric_configs are not defined


Expand All @@ -35,15 +35,15 @@ Further collectd Troubleshooting [here](https://collectd.org/wiki/index.php/Trou

1. *.conf files must have an empty line at the end to load, otherwise collectd won't start...

2. formatting for the exec message (sent to localhost udp port 25826) is very important. It MUST be in the format "PUTVAL $HOSTNAME/exec-<name_of_metric>/guage-$signifier. Values after exec- and guage- (or other value type) cannot use additional '-' characters or spaces otherwise the exec plugin will deliver a mal-formed message.
2. Formatting for the exec message (sent to localhost UDP port 25826) is very important. It MUST be in the format "PUTVAL $HOSTNAME/exec-<name_of_metric>/gauge-$signifier". Values after exec- and gauge- (or other value type) cannot use additional '-' characters or spaces, otherwise the exec plugin will deliver a malformed message.

## Collectd and Selinux

There is an additional task specifically to create a selinux policy for collectd. This is because collectd runs scripts via the exec plugin and selinux will block this by default.
There is an additional task specifically to create a selinux policy for collectd. This is because collectd runs scripts via the exec plugin and selinux will block this by default.

Logging for collectd is NOT enabled. Most of the useful information goes to /var/log/messages anyway, or, with selinux, to /var/log/audit/audit.log, where you can see what's being blocked in relation to collectd.

There are selinux exceptions for collectd when it comes to Rhel 7 & 8. It _seems_ this isn't needed for Rhel 6 but there is an existing task to automatically scan the audit.log for issues and then create a policy file.
There are selinux exceptions for collectd when it comes to Rhel 7 & 8. It _seems_ this isn't needed for Rhel 6 but there is an existing task to automatically scan the audit.log for issues and then create a policy file.

### Some useful selinux commands for troubleshooting

Expand All @@ -63,14 +63,7 @@ Once you have found an AVC denial message in /var/log/audit/audit.log you can us

If/when there are additional instances of this please add the settings back to the relevant collectd_selinux_policy_rhel_(version).te file and re-run the ansible task to create the policy file.

At some point we may simply decide to place the whole collectd_t domain into permissive mode.

```
- name: change the collectd_t domain to permissive
community.general.selinux_permissive:
type: collectd_t
permissive: true
```
Although we create a specific collectd policy, it is unlikely to cover everything, especially when scripts are triggered from collectd. For this reason, we set the collectd domain to permissive mode by default.

You can also grab AVC rules like this:

Expand Down
2 changes: 2 additions & 0 deletions ansible/roles/collectd/defaults/main.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
---
# When true, the SELinux policy tasks add the collectd_t domain to the
# permissive list (`semanage permissive -a collectd_t`).
collectd_selinux_permissive: true
19 changes: 19 additions & 0 deletions ansible/roles/collectd/tasks/collectd_selinux_policy.yml
Original file line number Diff line number Diff line change
Expand Up @@ -32,3 +32,22 @@

# block
when: collectd_selinux_mode.stdout|lower == "enforcing" or collectd_selinux_mode.stdout|lower == "permissive"

# /root/.ansible-collectd-selinux is a marker file recording that collectd_t
# has already been made permissive, so semanage is only run once per host.
- name: Check if permissive state applied already
  ansible.builtin.stat:
    path: /root/.ansible-collectd-selinux
  register: ansible_collectd_selinux_installed

- name: Enable permissive mode for collectd
  ansible.builtin.shell: |
    set -eo pipefail
    main() {
      local marker=/root/.ansible-collectd-selinux
      if [[ ! -e "${marker}" ]]; then
        # Redirecting straight to the marker would create it even when
        # semanage fails, and every later run would then skip this step.
        # Write to a temp file and only promote it to the marker on success.
        if ! semanage permissive -a collectd_t > "${marker}.tmp"; then
          rm -f "${marker}.tmp"
          return 1
        fi
        mv "${marker}.tmp" "${marker}"
      fi
    }
    main 2>&1 | logger -p local3.info -t ansible-collectd
  args:
    # The script uses bash-only syntax ([[ ]], pipefail, local).
    executable: /bin/bash
  when:
    - collectd_selinux_mode.stdout|lower == "enforcing" or collectd_selinux_mode.stdout|lower == "permissive"
    - not ansible_collectd_selinux_installed.stat.exists
    - collectd_selinux_permissive|bool

0 comments on commit af38c1b

Please sign in to comment.