Skip to content

Commit

Permalink
Monitor feature (#8)
Browse files Browse the repository at this point in the history
* Add monitor

* monitor script

* monitor

* check ansible

* Test monitor
  • Loading branch information
fewensa authored Jul 29, 2024
1 parent d0f11b3 commit 74fc944
Show file tree
Hide file tree
Showing 9 changed files with 309 additions and 1 deletion.
2 changes: 1 addition & 1 deletion .editorconfig
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ charset = utf-8
end_of_line = lf
insert_final_newline = true
indent_style = space
indent_size = 4
indent_size = 2
trim_trailing_whitespace = true

[*.{yml,yaml}]
Expand Down
55 changes: 55 additions & 0 deletions .github/workflows/check-ansible.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
# Pull-request check: optionally (re)deploys the "essentials" playbook when its
# inputs changed, then dry-runs every playbook in the matrix with --check.
name: Check

on:
  pull_request:
    branches: [main]

env:
  SLACK_INCOMING_WEBHOOK_URL: ${{ secrets.SLACK_INCOMING_WEBHOOK_URL }}

jobs:
  deploy-essentials:
    name: Deploy essentials
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4
        with:
          # Full history, so the changed-files step can diff against the base.
          fetch-depth: 0

      - name: Verify essentials file changed
        # NOTE(review): the original version pin was lost in this capture
        # (the reference was mangled into an "[email protected]" placeholder).
        # Confirm the intended release and, ideally, pin to a full commit SHA.
        uses: tj-actions/changed-files@v44
        id: changed_files
        with:
          files: |
            ansible/inventories/hosts.ini
            ansible/playbooks/_essentials/*
            ansible/playbooks/essentials.yml

      - name: Deploy essentials
        id: deploy-essentials
        # Only touch the hosts when one of the files listed above changed.
        if: steps.changed_files.outputs.any_changed == 'true'
        uses: dawidd6/action-ansible-playbook@v2
        with:
          directory: ansible
          playbook: playbooks/essentials.yml
          key: "${{ secrets.SSH_PRIVATE_KEY }}"
          options: --user ansible

  check-playbooks:
    name: Check playbook
    runs-on: ubuntu-latest
    needs: [deploy-essentials]
    strategy:
      matrix:
        playbook:
          - snapshots_crab
    steps:
      - uses: actions/checkout@v4

      - name: Run playbook
        uses: dawidd6/action-ansible-playbook@v2
        with:
          directory: ansible
          playbook: playbooks/${{ matrix.playbook }}/playbook.yml
          key: "${{ secrets.SSH_PRIVATE_KEY }}"
          # --check keeps this a dry run; --diff shows what would change.
          options: --user ansible --verbose --diff --check
25 changes: 25 additions & 0 deletions .github/workflows/monitor.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
# Scheduled host monitoring: runs the monitor playbook against the inventory.
name: Monitor

on:
  schedule:
    # Minute 0 of every second hour.
    - cron: "0 */2 * * *"
  workflow_dispatch:

# The monitor playbook's group_vars resolve the Slack webhook with
# lookup('env', 'SLACK_INCOMING_WEBHOOK_URL') on the runner. Without this
# export the rendered script receives an empty webhook and cannot post alerts.
env:
  SLACK_INCOMING_WEBHOOK_URL: ${{ secrets.SLACK_INCOMING_WEBHOOK_URL }}

jobs:
  monitor:
    name: monitor
    runs-on: ubuntu-latest
    strategy:
      matrix:
        playbook:
          - monitor
    steps:
      - uses: actions/checkout@v4

      - name: Run playbook
        uses: dawidd6/action-ansible-playbook@v2
        with:
          directory: ansible
          playbook: playbooks/${{ matrix.playbook }}/playbook.yml
          key: "${{ secrets.SSH_PRIVATE_KEY }}"
          options: --user ansible --verbose --diff
23 changes: 23 additions & 0 deletions ansible/inventories/hosts.ini
Original file line number Diff line number Diff line change
Expand Up @@ -4,3 +4,26 @@ g1.crab2.darwinia.network
[darwinia_nodes]
g1.darwinia2.darwinia.network

# Hosts in the "monitor" inventory group; the monitor playbook targets this
# group by name.
[monitor]
g1.crab2.darwinia.network
c1.crab2.darwinia.network
g1.darwinia2.darwinia.network
c1.darwinia2.darwinia.network
g1.testnets.darwinia.network
g2.testnets.darwinia.network
g3.testnets.darwinia.network
g1.generic.darwinia.network
g2.generic.darwinia.network

c1.darwinia-rpc.itering.io
c2.darwinia-rpc.itering.io
c1.crab-rpc.itering.io
c2.crab-rpc.itering.io

c1.collator.itering.io
c2.collator.itering.io
c3.collator.itering.io
c4.collator.itering.io

c5.collator.itering.io
c6.collator.itering.io
3 changes: 3 additions & 0 deletions ansible/playbooks/monitor/group_vars/monitor.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@

---
# Variables for hosts in the "monitor" group.
#
# The webhook is read from the controller's environment when the play runs,
# so the secret never has to be committed to the repository.
#
# NOTE(review): this defines the same top-level `monitor` dict as the role
# defaults. Unless Ansible is configured with hash_behaviour = merge, this
# dict replaces the defaults instead of extending them - confirm the setting.
monitor:
  notify_slack_webhook: "{{ lookup('env', 'SLACK_INCOMING_WEBHOOK_URL') }}"
3 changes: 3 additions & 0 deletions ansible/playbooks/monitor/playbook.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
---
# Apply the monitor role to every host in the "monitor" inventory group.
- hosts: monitor
  roles:
    - role: monitor
16 changes: 16 additions & 0 deletions ansible/roles/monitor/defaults/main.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@

---
# Default settings for the monitor role. Every value below is substituted
# into the crawl.sh template when the role renders it.
monitor:
  # Directory on the target host that receives the rendered script.
  workdir: /tmp/monitor

  # Slack delivery settings; the webhook is empty unless overridden.
  notify_slack_webhook: ""
  notify_slack_channel: "darwinia-alert-notification"

  # Name shown in alerts; when empty the script falls back to `hostname`.
  server_name: ""

  # Device-name prefixes whose `df` usage is checked.
  check_disks:
    - /dev/sda
    - /dev/sdb

  # Usage percentages above which an alert is raised (p2) or escalated to
  # priority P1 (p1). NOTE(review): "thread" presumably means "threshold".
  alert_thread_cpu_p2: 90
  alert_thread_cpu_p1: 98
  alert_thread_ram_p2: 90
  alert_thread_ram_p1: 98
  alert_thread_disk_p2: 90
  alert_thread_disk_p1: 98

15 changes: 15 additions & 0 deletions ansible/roles/monitor/tasks/main.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@

---
# Render the monitoring script onto the host and execute it once.

- name: Creates workdir
  file:
    path: "{{ monitor.workdir }}"
    state: directory
    # The directory holds a script with an embedded secret; keep it private.
    mode: "0700"

- name: Generate scripts file
  template:
    src: crawl.sh
    dest: "{{ monitor.workdir }}/crawl.sh"
    # The rendered script contains the Slack webhook URL, so it must not be
    # world-readable. It is run through `bash <path>`, so no execute bit is
    # needed.
    mode: "0600"

- name: Run monitor script
  command: bash {{ monitor.workdir }}/crawl.sh
  # The script only reads metrics and posts alerts; it does not change the
  # host, so do not report the task as "changed" on every run.
  changed_when: false

168 changes: 168 additions & 0 deletions ansible/roles/monitor/templates/crawl.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,168 @@
#!/bin/bash
#
# Host resource monitor. This file is an Ansible template: the values below
# are filled in from the `monitor` role variables when the script is rendered.

# Device-name prefixes to look up in `df` output (bash array).
DISKS_TO_MONITOR=({{ monitor.check_disks | join(' ') }})

# Alert identity and Slack delivery settings.
SERVER_NAME='{{ monitor.server_name }}'
NOTIFY_SLACK_WEBHOOK='{{ monitor.notify_slack_webhook }}'
NOTIFY_SLACK_CHANNEL='{{ monitor.notify_slack_channel }}'

# Usage percentages: above *_P2 a metric is reported, above *_P1 the alert is
# additionally marked as priority P1.
ALERT_THREAD_CPU_P2={{ monitor.alert_thread_cpu_p2 }}
ALERT_THREAD_CPU_P1={{ monitor.alert_thread_cpu_p1 }}
ALERT_THREAD_RAM_P2={{ monitor.alert_thread_ram_p2 }}
ALERT_THREAD_RAM_P1={{ monitor.alert_thread_ram_p1 }}
ALERT_THREAD_DISK_P2={{ monitor.alert_thread_disk_p2 }}
ALERT_THREAD_DISK_P1={{ monitor.alert_thread_disk_p1 }}

# Print the current local time as "YYYY-MM-DD HH:MM:SS".
timestamp() {
  local format="%Y-%m-%d %H:%M:%S"
  date +"$format"
}

# Print the combined user + system CPU percentage from one batch run of top.
#
# Values are located by their "us" / "sy" labels rather than by field
# position: when a value reaches 100.0, top prints it fused to the label
# ("%Cpu(s):100.0 us"), which shifts every positional field and would make a
# fully loaded host report 0. The older "1.2%us," layout is handled as well.
# LC_ALL=C keeps the decimal separator a dot so awk can parse the numbers.
cpu_usage() {
  LC_ALL=C top -bn1 | awk '
    /Cpu\(s\)/ {
      sub(/^.*Cpu\(s\):/, "")
      gsub(/[%,]/, " ")
      for (i = 1; i < NF; i++) {
        if ($(i + 1) == "us" || $(i + 1) == "sy") total += $i
      }
      print total + 0
      exit
    }'
}

# Print used memory as a percentage of total, taken from the "Mem" row of
# `free` (third column divided by second column).
memory_usage() {
  free | awk '/Mem/ {print $3/$2 * 100.0}'
}

# Print one "<filesystem> <use%>" line (percent sign removed) for every df
# entry whose device name starts with an entry of DISKS_TO_MONITOR.
#
# A prefix such as /dev/sda usually matches several partitions. Each match
# gets its own line with its real device name, so callers that read the
# output line by line see every partition instead of only the first one.
# A prefix with no match still yields "<prefix> " (empty usage), which
# callers treat as "nothing to check".
#
# `df -P` forces the single-line POSIX layout so long device names cannot
# wrap and shift the columns; the prefix is compared literally, not as a
# regular expression.
disk_usage() {
  local disk
  for disk in "${DISKS_TO_MONITOR[@]}"; do
    df -P | awk -v prefix="$disk" '
      index($1, prefix) == 1 { sub(/%/, "", $5); print $1, $5; found = 1 }
      END { if (!found) print prefix " " }
    '
  done
}

# Print the second field of the `ss -s` summary line that mentions "estab".
request_count() {
  ss -s | awk '/estab/ {print $2}'
}

# Build the Slack "fields" JSON array for CPU / RAM alerts and print it.
#
# Prints "[]" when neither metric exceeds its P2 threshold. Otherwise the
# array holds, in order: a Priority pair (only when a P1 threshold is
# exceeded), a CPU pair and/or a RAM pair, and finally a TCP pair for context.
# Comparisons go through `bc -l` because the readings may be fractional.
generate_alert_message() {
  local cpu ram tcp
  cpu=$(cpu_usage)
  ram=$(memory_usage)
  tcp=$(request_count)
  local alert_message='[]'
  local priority='P2'

  # Escalate to P1 when either metric is above its P1 threshold.
  if (( $(bc -l <<< "$cpu > $ALERT_THREAD_CPU_P1") )); then
    priority='P1'
  fi
  if (( $(bc -l <<< "$ram > $ALERT_THREAD_RAM_P1") )); then
    priority='P1'
  fi
  if [[ "$priority" == "P1" ]]; then
    priority_alert=$(jq -n --arg priority "${priority}" '[{"type":"mrkdwn","text":"*Priority*"},{"type":"plain_text","text":$priority}]')
    alert_message=$(jq --argjson priority_alert "$priority_alert" '. += $priority_alert' <<< "$alert_message")
  fi

  # Report each metric that is above its P2 threshold.
  if (( $(bc -l <<< "$cpu > $ALERT_THREAD_CPU_P2") )); then
    cpu_alert=$(jq -n --arg cpu "${cpu}%" '[{"type":"mrkdwn","text":"*CPU*"},{"type":"plain_text","text":$cpu}]')
    alert_message=$(jq --argjson cpu_alert "$cpu_alert" '. += $cpu_alert' <<< "$alert_message")
  fi

  if (( $(bc -l <<< "$ram > $ALERT_THREAD_RAM_P2") )); then
    ram_alert=$(jq -n --arg ram "${ram}%" '[{"type":"mrkdwn","text":"*RAM*"},{"type":"plain_text","text":$ram}]')
    alert_message=$(jq --argjson ram_alert "$ram_alert" '. += $ram_alert' <<< "$alert_message")
  fi

  # The TCP figure is only attached when something else is already reported.
  if [[ "$alert_message" != "[]" ]]; then
    tcp_alert=$(jq -n --arg tcp "${tcp}" '[{"type":"mrkdwn","text":"*TCP*"},{"type":"plain_text","text":$tcp}]')
    alert_message=$(jq --argjson tcp_alert "$tcp_alert" '. += $tcp_alert' <<< "$alert_message")
  fi

  echo "$alert_message"
}


# Build the Slack "fields" JSON array for disk alerts and print it.
#
# Reads "<disk> <usage>" lines from disk_usage. Lines without a usage value
# are ignored. Every disk above the P2 threshold contributes a pair; if any
# disk is above the P1 threshold a Priority pair is appended at the end.
# Prints "[]" when no disk needs reporting.
generate_disk_alert_message() {
  local alert_message='[]'
  local priority='P2'
  local disk usage _rest

  # Process substitution keeps the loop in this shell, so updates to
  # `priority` and `alert_message` survive past `done`.
  while read -r disk usage _rest; do
    [[ -n "$usage" ]] || continue

    if (( $(bc -l <<< "$usage > $ALERT_THREAD_DISK_P1") )); then
      priority='P1'
    fi
    if (( $(bc -l <<< "$usage > $ALERT_THREAD_DISK_P2") )); then
      disk_alert=$(jq -n --arg disk "*DISK* ($disk)" --arg usage "${usage}%" '[{"type":"mrkdwn","text":$disk},{"type":"plain_text","text":$usage}]')
      alert_message=$(jq --argjson disk_alert "$disk_alert" '. += $disk_alert' <<< "$alert_message")
    fi
  done < <(disk_usage)

  if [[ "$priority" == "P1" ]]; then
    priority_alert=$(jq -n --arg priority "${priority}" '[{"type":"mrkdwn","text":"*Priority*"},{"type":"plain_text","text":$priority}]')
    alert_message=$(jq --argjson priority_alert "$priority_alert" '. += $priority_alert' <<< "$alert_message")
  fi

  echo "$alert_message"
}


# Collect the CPU/RAM and disk alert fields, wrap each non-empty set in a
# Slack "section" block, and post the result through send_alert.
# Nothing is sent when both field sets are empty ("[]").
check_and_send_alert() {
  local alert_message disk_alert_message
  alert_message=$(generate_alert_message)
  disk_alert_message=$(generate_disk_alert_message)

  # Prefer the configured server name; fall back to the machine's hostname.
  local server_label=${SERVER_NAME:-$(hostname)}
  local blocks='[]'

  if [[ "$alert_message" != "[]" ]]; then
    alert_block=$(
      jq -n \
        --arg warning "[*WARNING*]: New server alert > $server_label" \
        --argjson msg "$alert_message" \
        '{ "type": "section", "text": {"type": "mrkdwn", "text": $warning}, "fields": $msg }'
    )
    blocks=$(jq --argjson block "$alert_block" '. += [$block]' <<< "$blocks")
  fi

  if [[ "$disk_alert_message" != "[]" ]]; then
    disk_block=$(
      jq -n \
        --arg warning "[*WARNING*]: New disk alert > $server_label" \
        --argjson msg "$disk_alert_message" \
        '{ "type": "section", "text": {"type": "mrkdwn", "text": $warning}, "fields": $msg }'
    )
    blocks=$(jq --argjson block "$disk_block" '. += [$block]' <<< "$blocks")
  fi

  # No block was produced: there is nothing to report.
  if [[ "$blocks" == "[]" ]]; then
    return 0
  fi

  local payload=$(
    jq -n \
      --arg channel "$NOTIFY_SLACK_CHANNEL" \
      --argjson blocks "$blocks" \
      '{
        "username": "ServerBot",
        "icon_emoji": ":loudspeaker:",
        "channel": $channel,
        "blocks": $blocks
      }'
  )

  send_alert "$payload"
}

# POST a JSON payload to the configured Slack incoming webhook.
#
# Arguments:
#   $1 - JSON document to send as the request body.
# Returns:
#   1 when no webhook is configured; otherwise curl's exit status. With
#   --fail, an HTTP error response also yields a non-zero status, so a
#   rejected alert is not mistaken for a delivered one.
send_alert() {
  local message=$1

  # Without a URL curl only fails with an opaque usage error; say why instead.
  if [[ -z "$NOTIFY_SLACK_WEBHOOK" ]]; then
    echo "send_alert: NOTIFY_SLACK_WEBHOOK is empty; alert not sent" >&2
    return 1
  fi

  # The URL is quoted so characters such as "&" or "?" are passed verbatim.
  # --max-time bounds the call so a stalled connection cannot hang the run.
  curl --silent --show-error --fail --max-time 30 \
    -X POST \
    -H "Content-type: application/json" \
    --data "$message" \
    "$NOTIFY_SLACK_WEBHOOK"
}

# Entry point: log one line with the current readings, then evaluate the
# thresholds and send alerts if required.
main() {
  local cpu ram disk requests
  cpu=$(cpu_usage)
  ram=$(memory_usage)
  disk=$(disk_usage)
  requests=$(request_count)

  printf '%s\n' "$(timestamp) CPU: ${cpu}% RAM: ${ram}% Disk: ${disk}% Requests: ${requests}"

  check_and_send_alert
}

main

0 comments on commit 74fc944

Please sign in to comment.