Skip to content

Commit

Permalink
watchdog: restart worker if failing
Browse files Browse the repository at this point in the history
Workaround of #5
This commit will need to be reverted once the real watchdog is unblocked
  • Loading branch information
talset committed Jul 27, 2020
1 parent fb0f13c commit dc885bd
Show file tree
Hide file tree
Showing 5 changed files with 50 additions and 3 deletions.
11 changes: 9 additions & 2 deletions tasks/install-worker.yml
Original file line number Diff line number Diff line change
Expand Up @@ -24,14 +24,21 @@
dest: "{{ concourse_worker_launcher_path }}"
- src: concourse-retire-worker.j2
dest: "{{ concourse_retire_worker_path }}"
- src: concourse-worker-watchdog.j2
dest: "{{ concourse_install_dir }}/concourse-worker-watchdog"

- name: create worker service | concourse
template:
src: concourse-worker.service.j2
dest: /etc/systemd/system/concourse-worker.service
src: "{{ item['src'] }}"
dest: "{{ item['dest'] }}"
owner: root
force: yes
become: yes
become_user: root
with_items:
- src: concourse-worker.service.j2
dest: /etc/systemd/system/concourse-worker.service
- src: concourse-worker-watchdog.service.j2
dest: /etc/systemd/system/concourse-worker-watchdog.service
notify:
- restart concourse worker
2 changes: 1 addition & 1 deletion templates/concourse-retire-worker.j2
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ export {{ key }}="{{ value }}"
# If $1 PID of concourse worker is provided, do a kill instead of an api call
# Mostly used by systemd for concourse compatibility issues https://github.com/concourse/concourse/pull/3929

until ! curl --fail 127.0.0.1:7777/ping; do
until ! curl --silent --fail 127.0.0.1:7777/ping; do

if [[ -z "$1" ]]; then
{{ concourse_binary_path }} retire-worker \
Expand Down
24 changes: 24 additions & 0 deletions templates/concourse-worker-watchdog.j2
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
#!/bin/bash
#
# Poll-based watchdog for concourse-worker.
#
# Polls the worker healthcheck on 127.0.0.1:8888 and restarts the
# concourse-worker systemd unit whenever the probe fails. Runs until killed.

# watchdog: endless probe/restart loop. Takes no arguments and never returns.
watchdog() {
    # Kept from the original as a hint at the systemd-notify based variant:
    #WORKER_PID=$1

    while true; do
        # --fail     : treat an HTTP error status as a failed probe, not only
        #              a refused connection (same flag the retire-worker
        #              script uses for its /ping check).
        # --max-time : a worker that accepts the connection but never answers
        #              must not block the watchdog forever.
        if curl --silent --fail --max-time 10 127.0.0.1:8888; then
            echo "watchdog: concourse-worker healthcheck ok"
            #sleep 1
            sleep 15
        else
            #/bin/systemd-notify --pid=$WORKER_PID "WATCHDOG=1";
            /bin/systemctl restart concourse-worker
            # NOTE(review): the next probe runs 1s after the restart; if the
            # worker needs longer than that to serve its healthcheck it will
            # be restarted again - confirm this delay is long enough.
            sleep 1
        fi
    done
}

# Call the function directly: `exec` only replaces the shell with an external
# program and cannot run a shell function, so `exec watchdog` aborts the
# script with "watchdog: not found" instead of starting the loop.
watchdog
15 changes: 15 additions & 0 deletions templates/concourse-worker-watchdog.service.j2
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
# {{ ansible_managed }}
#
# Runs the concourse-worker-watchdog script as a long-lived service, started
# after concourse-worker.service.

[Unit]
Description=concourse-worker-watchdog
Requires=network-online.target
# Requires= alone does not order units; list network-online.target in After=
# as concourse-worker.service does.
After=network-online.target concourse-worker.service

[Service]
ExecStart={{ concourse_install_dir }}/concourse-worker-watchdog
ExecStop=/bin/kill $MAINPID
# Deliberately no ExecReload: the watchdog is a plain shell script without a
# SIGHUP handler, so "kill -HUP $MAINPID" would terminate it, and because
# systemd counts SIGHUP as a clean exit, Restart=on-failure would not bring
# it back.
Restart=on-failure

[Install]
WantedBy=multi-user.target
1 change: 1 addition & 0 deletions templates/concourse-worker.service.j2
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
Description=concourse-worker
Requires=network-online.target
After=network-online.target
Before=concourse-worker-watchdog.service

[Service]
ExecStart={{ concourse_worker_launcher_path }}
Expand Down

0 comments on commit dc885bd

Please sign in to comment.