Skip to content

Commit

Permalink
Remove the pose-detection timeout kill and improve status email messages
Browse files: browse the repository at this point in the history
  • Loading branch information
antoinefalisse committed Jul 5, 2024
1 parent fb43d60 commit 650f246
Show file tree
Hide file tree
Showing 4 changed files with 15 additions and 13 deletions.
12 changes: 6 additions & 6 deletions app.py
Original file line number Diff line number Diff line change
Expand Up @@ -149,12 +149,12 @@
r = requests.patch(trial_url, data={"status": "error"},
headers = {"Authorization": "Token {}".format(API_TOKEN)})
traceback.print_exc()
args_as_strings = [str(arg) for arg in e.args]
if len(args_as_strings) > 1 and 'pose detection timed out' in args_as_strings[1].lower():
logging.info("Worker failed. Stopping machine.")
message = "A backend OpenCap machine timed out during pose detection. It has been stopped."
sendStatusEmail(message=message)
raise Exception('Worker failed. Stopped.')
# args_as_strings = [str(arg) for arg in e.args]
# if len(args_as_strings) > 1 and 'pose detection timed out' in args_as_strings[1].lower():
# logging.info("Worker failed. Stopping machine.")
# message = "A backend OpenCap machine timed out during pose detection. It has been stopped."
# sendStatusEmail(message=message)
# raise Exception('Worker failed. Stopped.')
justProcessed = True

# Clean data directory
Expand Down
12 changes: 7 additions & 5 deletions utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -1494,9 +1494,13 @@ def sendStatusEmail(message=None,subject=None):
emailInfo = getStatusEmails()
if emailInfo is None:
return('No email info or wrong email info in env file.')

if 'ip' in emailInfo:
ip = emailInfo['ip']
message = message + ' IP: ' + ip

if message is None:
message = "A backend server is down and has been stopped."
message = "A backend server is down and has been stopped.".format(ip)
if subject is None:
subject = "OpenCap backend server down"

Expand Down Expand Up @@ -1533,8 +1537,7 @@ def checkResourceUsage(stop_machine_and_email=True):

if stop_machine_and_email and resourceUsage['disk_perc'] > 95:

message = "Disc is full on an OpenCap machine backend machine: " \
+ socket.gethostname() + ". It has been stopped. Data: " \
message = "Disc is full on an OpenCap machine backend machine. It has been stopped. Data: " \
+ json.dumps(resourceUsage)
sendStatusEmail(message=message)

Expand All @@ -1551,8 +1554,7 @@ def checkCudaTF():
for gpu in gpus:
print(f"GPU: {gpu.name}")
else:
message = "Cuda check failed on an OpenCap machine backend machine: " \
+ socket.gethostname() + ". It has been stopped."
message = "Cuda check failed on an OpenCap machine backend machine. It has been stopped."
sendStatusEmail(message=message)
raise Exception("No GPU detected. Exiting.")

Expand Down
1 change: 1 addition & 0 deletions utilsAPI.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@ def getStatusEmails():
emailInfo['fromEmail'] = config("STATUS_EMAIL_FROM")
emailInfo['password'] = config("STATUS_EMAIL_FROM_PW")
emailInfo['toEmails'] = json.loads(config("STATUS_EMAIL_TO"))
emailInfo['ip'] = json.loads(config("STATUS_EMAIL_IP"))
except:
emailInfo = None

Expand Down
3 changes: 1 addition & 2 deletions utilsServer.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,6 @@
import requests
import json
import logging
import socket

from main import main
from utils import getDataDirectory
Expand Down Expand Up @@ -469,7 +468,7 @@ def runTestSession(pose='all',isDocker=True):
except:
logging.info("test trial failed. stopping machine.")
# send email
message = "A backend OpenCap machine failed the status check: " + socket.gethostname() + ". It has been stopped."
message = "A backend OpenCap machine failed the status check. It has been stopped."
sendStatusEmail(message=message)
raise Exception('Failed status check. Stopped.')

Expand Down

0 comments on commit 650f246

Please sign in to comment.