-
Notifications
You must be signed in to change notification settings - Fork 6
/
Copy pathfailsafe.py
160 lines (135 loc) · 5.28 KB
/
failsafe.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
import multiprocessing
import os
import signal
import time
import atexit
import threading
import traceback
from setproctitle import setproctitle
import config
import logsupport as L
from controlevents import CEvent, PostEvent, ConsoleEvent
from utils.utilfuncs import disptime
# Event that MasterWatchDog waits on (with a FailsafeInterval timeout) and clears each time it is seen set.
# NOTE(review): presumably set by the console when it handles a FailSafePing event - the setter is not in this file.
KeepAlive = multiprocessing.Event()
# Interval passed to KeepAlive.wait() and time.sleep() in MasterWatchDog (i.e. seconds);
# NoEventInjector posts a FailSafePing every half interval.
FailsafeInterval = 60 # 1000000 if utils.utilfuncs.isdevsystem else 60
def DevPrint(msg):
    """
    Append one timestamped diagnostic record to the watchdog history log.

    The record has the form ``<time.time()>: <msg>`` followed by a newline. The log file is opened in
    append mode for each call and flushed before it is closed.
    """
    with open('/home/pi/Console/.HistoryBuffer/hlogW', 'a') as logfile:
        stamp = time.time()
        logfile.write(f'{stamp}: {msg}\n')
        logfile.flush()
def TempThreadList():
    """
    Housekeeping loop that exists only to cooperate cleanly with the PyCharm IDE.

    When a console launched from PyCharm is left running and the controlling PC goes to sleep, the console
    is killed only partially, leaving zombies and live threads behind. Those hold connections to the ISY
    open and eventually drive it to its connection limit unless someone intervenes by hand. This routine
    makes sure the whole process goes down once the main thread has died.

    After an initial 10 second delay it repeats forever, every 30 seconds: reap finished child processes,
    then send SIGINT to this process if the thread named 'MainThread' is no longer alive.
    """
    time.sleep(10)
    while True:
        multiprocessing.active_children()  # side effect: joins (reaps) any zombie failsafe process
        for candidate in threading.enumerate():
            if candidate.name != 'MainThread' or candidate.is_alive():
                continue  # not the main thread, or the main thread is still running
            DevPrint('Main Thread died')
            os.kill(os.getpid(), signal.SIGINT)  # kill myself
        time.sleep(30)
def NoEventInjector():
    """
    Post a FailSafePing console event every half FailsafeInterval while config.Running is true.

    Each ping carries the posting time as its ``inject`` value. Any exception raised while logging,
    posting or sleeping is recorded with DevPrint after a half-interval pause, and the loop carries on.
    """
    L.Logs.Log('Starting watchdog activity injector')
    while config.Running:
        # noinspection PyBroadException
        try:
            stamp = time.time()
            L.Logs.Log(f'Inject: {stamp}', severity=L.ConsoleDetailHigh)
            PostEvent(ConsoleEvent(CEvent.FailSafePing, inject=stamp))
            time.sleep(FailsafeInterval / 2)
        except Exception as err:
            # spurious exceptions show up during shutdown - pause, note it, keep going
            time.sleep(FailsafeInterval / 2)
            DevPrint(f'Inject Exception {err!r}')
    DevPrint('Injector exiting')
# noinspection PyProtectedMember,PyUnusedLocal
def EndWatchDog(signum, frame):
    """
    Signal handler: log the signal number and terminate the watchdog immediately with exit status 94.

    MasterWatchDog installs this for SIGINT and SIGUSR1. os._exit is used, so no exit hooks run.
    """
    DevPrint(f'Watchdog ending on shutdown {signum}')
    os._exit(94)
# noinspection PyUnusedLocal
def AbortWatchDog(signum, frame):
    """
    Signal handler (installed for SIGABRT by MasterWatchDog): leave a tombstone and exit with status 93.

    Appends a line with the display time, this process id and the signal number to /home/pi/.tombstoneW,
    followed by the current stack trace.

    The tombstone is written inside a ``with`` block so the file is closed - and the buffered stack trace
    flushed - before os._exit runs, since os._exit does not flush open file buffers. The exit itself sits in
    ``finally`` so the watchdog still terminates with status 93 when the tombstone cannot be written.
    """
    try:
        with open('/home/pi/.tombstoneW', 'a') as tomb:
            print(f'{disptime()} Watchdog {os.getpid()} exiting for signal {signum}', file=tomb, flush=True)
            traceback.print_stack(file=tomb)
    finally:
        # noinspection PyProtectedMember
        os._exit(93)
# noinspection PyProtectedMember
def WatchdogDying(signum, frame):
    """
    Signal handler for a watchdog that is being told to die; every path ends in os._exit(0).

    SIGTERM (taken to come from systemd): the console should have received it as well, so just log,
    allow 30 seconds for the console to shut down (a USR1 from the console is expected in that window)
    and exit.

    Any other signal: log it, ask the console to stop with SIGUSR1, wait 3 seconds, then SIGKILL it.
    Both signals are best effort - failures are ignored because the console has probably already gone.

    An unexpected exception anywhere above is logged, followed by a 1 second pause and exit.
    """

    def _signal_console(sig):
        # Best-effort delivery of sig to the console process.
        # noinspection PyBroadException
        try:
            os.kill(config.sysStore.Console_pid, sig)
        except Exception:
            pass  # probably main console already gone

    try:
        if signum != signal.SIGTERM:
            DevPrint(f'Watchdog dying signum: {signum} frame: {frame}')
            _signal_console(signal.SIGUSR1)
            time.sleep(3)
            _signal_console(signal.SIGKILL)  # with prejudice
        else:
            DevPrint('Watchdog saw SIGTERM - must be from systemd')
            # console should have also seen this - give it time to shut down
            time.sleep(30)  # we should see a USR1 from console
        os._exit(0)
    except Exception as err:
        DevPrint(f'Exception in WatchdogDying: {err}')
        time.sleep(1)
        os._exit(0)
def failsafedeath():
    """
    Exit hook (registered with atexit by MasterWatchDog): make sure the console dies with the watchdog.

    Logs the hook invocation, sends SIGUSR1 to the console process, waits 3 seconds and then sends SIGKILL.

    A console that has already exited is the expected situation on a normal shutdown, so a
    ProcessLookupError from either signal is treated as "nothing left to do" instead of escaping from
    the exit hook. This matches WatchdogDying, which also tolerates a missing console. Other errors
    (e.g. PermissionError) still propagate.
    """
    DevPrint('Failsafe exit hook')
    console_pid = config.sysStore.Console_pid
    DevPrint('failsafedeath {} watching {} at {}'.format(os.getpid(), console_pid, time.time()))
    try:
        os.kill(console_pid, signal.SIGUSR1)
    except ProcessLookupError:
        DevPrint('failsafedeath: console {} already gone'.format(console_pid))
        return
    time.sleep(3)  # give the console a chance to shut down on its own
    try:
        os.kill(console_pid, signal.SIGKILL)  # with prejudice
    except ProcessLookupError:
        pass  # console exited in response to the USR1
# noinspection PyUnusedLocal
def IgnoreHUP(signum, frame):
    """Signal handler installed for SIGHUP by MasterWatchDog: record the hangup in the log and do nothing else."""
    DevPrint('Watchdog got HUP - ignoring')
def MasterWatchDog():
    """
    Main routine of the console watchdog.

    Setup: sets the process title to 'Console Watchdog', installs the watchdog's own signal handlers and
    registers failsafedeath as an exit hook.

    Monitoring: waits on KeepAlive with a FailsafeInterval timeout. Each time the event is seen it is
    cleared and the watchdog sleeps a full FailsafeInterval before waiting again. The first wait that
    times out ends monitoring.

    Recovery: if the console process no longer exists this is a normal exit and the routine returns.
    Otherwise the console is sent SIGUSR1, given 3 seconds, and sent SIGKILL if it is still present.
    """
    DevPrint('Watchdog')
    setproctitle('Console Watchdog')

    # don't want the sig handlers from the main console
    watchdog_handlers = (
        (signal.SIGTERM, WatchdogDying),
        (signal.SIGINT, EndWatchDog),
        (signal.SIGUSR1, EndWatchDog),
        (signal.SIGHUP, IgnoreHUP),
        (signal.SIGABRT, AbortWatchDog),
    )
    for signo, handler in watchdog_handlers:
        signal.signal(signo, handler)
    atexit.register(failsafedeath)
    DevPrint(f'Master Watchdog Started {os.getpid()} for console pid: {config.sysStore.Console_pid}')

    # Stay here for as long as a keepalive turns up within each failsafe interval.
    while KeepAlive.wait(FailsafeInterval):
        KeepAlive.clear()
        time.sleep(FailsafeInterval)
    # no keepalive seen for failsafe interval - try to restart
    DevPrint('No keepalive in failsafe interval')
    DevPrint(f'Watchdog loop exit: {time.time()}')

    # Signal 0 only probes for existence; an exception means there is no console left to deal with.
    # noinspection PyBroadException
    try:
        os.kill(config.sysStore.Console_pid, 0)
    except Exception:
        DevPrint('Normal watchdog exit')
        return

    DevPrint(f'Failsafe interrupt {config.sysStore.Console_pid}')
    os.kill(config.sysStore.Console_pid, signal.SIGUSR1)
    time.sleep(3)  # wait for exit to complete
    try:
        os.kill(config.sysStore.Console_pid, 0)  # check if console exited - raises exception if it is gone
        DevPrint(f"Failsafe watchdog interrupt didn't reset - killing {config.sysStore.Console_pid}")
        os.kill(config.sysStore.Console_pid, signal.SIGKILL)
        DevPrint("Failsafe exiting after kill attempt")
    except Exception as err:
        print('Failsafe exiting')
        DevPrint(
            f"Failsafe successfully ended console (pid: {config.sysStore.Console_pid}), "
            f"failsafe (pid: {os.getpid()}) exiting (Exc: {err!r})")
    DevPrint('Watchdog exiting')