Skip to content

Commit

Permalink
Merge pull request #1308 from mskcc/feature/alert_email
Browse files Browse the repository at this point in the history
Hanging alert email
  • Loading branch information
sivkovic authored Jan 10, 2024
2 parents 6c71158 + 0bc71dc commit 66d63a8
Show file tree
Hide file tree
Showing 2 changed files with 26 additions and 0 deletions.
1 change: 1 addition & 0 deletions beagle/settings.py
Original file line number Diff line number Diff line change
Expand Up @@ -459,6 +459,7 @@

PERMISSION_DENIED_CC = json.loads(os.environ.get("BEAGLE_PERMISSION_DENIED_CC", "{}"))
PERMISSION_DENIED_EMAILS = json.loads(os.environ.get("BEAGLE_PERMISSION_DENIED_EMAIL", "{}"))
JOB_HANGING_ALERT_EMAILS = os.environ.get("BEAGLE_JOB_HANGING_ALERT_EMAILS", "").split(",")

# Tempo

Expand Down
25 changes: 25 additions & 0 deletions runner/tasks.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@
SetRunTicketInImportEvent,
SetDeliveryDateFieldEvent,
VoyagerActionRequiredForRunningEvent,
SendEmailEvent,
OperatorErrorEvent,
)
from notifier.tasks import send_notification, notifier_start
Expand Down Expand Up @@ -747,6 +748,21 @@ def check_job_timeouts():
fail_job(run.id, "Run timedout after %s days" % TIMEOUT_BY_DAYS)


def send_hanging_job_alert(run_id, message):
for email in settings.JOB_HANGING_ALERT_EMAILS:
content = f"""Run {settings.BEAGLE_URL}/v0/run/api/{run_id}/ possible hanging.
{message}"""
email = SendEmailEvent(
job_notifier=settings.BEAGLE_NOTIFIER_EMAIL_GROUP,
email_to=email,
email_from=settings.BEAGLE_NOTIFIER_EMAIL_FROM,
subject=f"ALERT: Hanging job detected {settings.BEAGLE_URL}/v0/run/api/{run_id}/",
content=content,
)
send_notification.delay(email.to_dict())


@shared_task
@memcache_lock("check_jobs_status")
def check_jobs_status():
Expand All @@ -771,6 +787,15 @@ def check_jobs_status():
continue

status = remote_statuses[str(run.execution_id)]
message = dict(details=status.get("message", {}))
new_alert = message.get("details", {}).get("alerts")
old_alert = run.message.get("details", {}).get("alerts")
if old_alert != new_alert:
logger.error(format_log("Hanging Job detected", obj=run))
run.message = dict(details=status.get("message", {}))
run.save(update_fields=("message",))
send_hanging_job_alert(str(run.id), new_alert[0]["message"])

if status["started"] and not run.started:
run.started = status["started"]
run.save(update_fields=("started",))
Expand Down

0 comments on commit 66d63a8

Please sign in to comment.