Skip to content

Commit

Permalink
concourse-worker: add watchdog process
Browse files Browse the repository at this point in the history
  • Loading branch information
Steve Durrheimer committed May 29, 2020
1 parent fb0f13c commit 26a3bcf
Show file tree
Hide file tree
Showing 6 changed files with 61 additions and 3 deletions.
2 changes: 2 additions & 0 deletions defaults/main.yml
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,8 @@ concourse_worker: no
concourse_worker_name: "{{ ansible_hostname }}"
concourse_worker_launcher_path: "{{ concourse_install_dir }}/concourse-worker"
concourse_retire_worker_path: "{{ concourse_install_dir }}/concourse-retire-worker"
concourse_worker_watchdog_sec: 10
concourse_worker_watchdog_path: "{{ concourse_install_dir }}/concourse-worker-watchdog"
concourse_work_dir: "{{ concourse_install_dir }}/work"
concourse_tsa_public_key_path: "{{ concourse_install_dir }}/host_key.pub"
concourse_tsa_worker_key_path: "{{ concourse_install_dir }}/worker_key"
Expand Down
11 changes: 9 additions & 2 deletions tasks/install-worker.yml
Original file line number Diff line number Diff line change
Expand Up @@ -24,14 +24,21 @@
dest: "{{ concourse_worker_launcher_path }}"
- src: concourse-retire-worker.j2
dest: "{{ concourse_retire_worker_path }}"
- src: concourse-worker-watchdog.j2
dest: "{{ concourse_worker_watchdog_path }}"

- name: create worker service | concourse
  # Render each systemd unit template listed under with_items to its
  # destination. src/dest come from the loop item only: the mapping must not
  # also carry literal src/dest keys, since duplicate keys are invalid YAML
  # and most parsers silently keep just the last one.
  template:
    src: "{{ item['src'] }}"
    dest: "{{ item['dest'] }}"
    owner: root
    force: yes
  become: yes
  become_user: root
  with_items:
    - src: concourse-worker.service.j2
      dest: /etc/systemd/system/concourse-worker.service
    - src: concourse-worker-watchdog.service.j2
      dest: /etc/systemd/system/concourse-worker-watchdog.service
  notify:
    - restart concourse worker
2 changes: 1 addition & 1 deletion templates/concourse-retire-worker.j2
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ export {{ key }}="{{ value }}"
# If $1 PID of concourse worker is provided, do a kill instead of an api call
# Mostly used by systemd for concourse compatibility issues https://github.com/concourse/concourse/pull/3929

until ! curl --fail 127.0.0.1:7777/ping; do
until ! curl --silent --fail 127.0.0.1:7777/ping; do

if [[ -z "$1" ]]; then
{{ concourse_binary_path }} retire-worker \
Expand Down
26 changes: 26 additions & 0 deletions templates/concourse-worker-watchdog.j2
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
#!/bin/bash

# {{ ansible_managed }}

# Watchdog loop for concourse-worker.service.
#
# Polls the HTTP endpoint on 127.0.0.1:8888 and, while it answers
# successfully, sends WATCHDOG=1 keep-alive notifications to systemd on
# behalf of the worker's main PID. When the check fails no keep-alive is
# sent, so the WatchdogSec= timeout configured on the worker unit can expire.

export NOTIFY_SOCKET=/run/systemd/notify

while true; do
  WORKER_PID=$(systemctl show -p MainPID concourse-worker.service 2>/dev/null | cut -d= -f2)
  # MainPID is 0 when the service has no main process; an empty value (for
  # example when systemctl itself fails) also compares equal to 0 here.
  if [[ $WORKER_PID -eq 0 ]]; then
    echo "can't get concourse-worker systemd MainPID: service is not running"
    sleep 1
    continue
  fi

  # --fail makes curl exit non-zero on HTTP error statuses; without it an
  # error response from the endpoint would still count as healthy.
  # --output /dev/null keeps the response body out of the service log.
  if curl --silent --fail --output /dev/null 127.0.0.1:8888; then
    /bin/systemd-notify --pid="$WORKER_PID" "WATCHDOG=1"
    # Sleep for half the configured watchdog interval (whole seconds,
    # integer division) so a keep-alive is sent before the timeout expires.
    sleep $(({{ concourse_worker_watchdog_sec * 1000000 }} / 2000000))
  else
    echo "concourse-worker healthcheck failed"
    sleep 1
  fi
done
12 changes: 12 additions & 0 deletions templates/concourse-worker-watchdog.service.j2
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
# {{ ansible_managed }}

# systemd unit that runs the concourse-worker watchdog script.

[Unit]
Description=concourse-worker-watchdog
# Bind this unit to the worker service it monitors.
BindsTo=concourse-worker.service

[Service]
# Script path is rendered from the concourse_worker_watchdog_path variable.
ExecStart={{ concourse_worker_watchdog_path }}
# Start the script again if it terminates with a failure.
Restart=on-failure

[Install]
WantedBy=multi-user.target
11 changes: 11 additions & 0 deletions templates/concourse-worker.service.j2
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,8 @@
Description=concourse-worker
Requires=network-online.target
After=network-online.target
Wants=concourse-worker-watchdog.service
Before=concourse-worker-watchdog.service

[Service]
ExecStart={{ concourse_worker_launcher_path }}
Expand All @@ -17,5 +19,14 @@ TasksMax=infinity
Delegate=yes
KillMode=process

## Watchdog
WatchdogSec={{ concourse_worker_watchdog_sec }}
NotifyAccess=all
# If there are `StartLimitBurst` failed restart attempts
# within `StartLimitInterval`, force a poweroff
StartLimitInterval=5min
StartLimitBurst=4
StartLimitAction=poweroff-force

[Install]
WantedBy=multi-user.target

0 comments on commit 26a3bcf

Please sign in to comment.