diff --git a/enterprise_access/settings/base.py b/enterprise_access/settings/base.py index de1a696e..6574f05c 100644 --- a/enterprise_access/settings/base.py +++ b/enterprise_access/settings/base.py @@ -411,7 +411,7 @@ def root(*path_fragments): 'fanout_prefix': True, } -TASK_MAX_RETRIES = 3 +TASK_MAX_RETRIES = 5 """############################# END CELERY CONFIG ##################################""" diff --git a/enterprise_access/tasks.py b/enterprise_access/tasks.py index 841b745f..6894b5d1 100644 --- a/enterprise_access/tasks.py +++ b/enterprise_access/tasks.py @@ -24,8 +24,16 @@ class LoggedTaskWithRetry(LoggedTask): # pylint: disable=abstract-method OperationalError, BrazeClientError, ) + # The default number of max_retries is 3, but Braze times out a lot so we will use 5 instead. + # 5 retries means retrying potentially up to ~25 minutes, because the final delay is capped at 600 seconds: + # (2⁰ + 2¹ + 2² + 2³) × (60 seconds) + 600 seconds = 1500 seconds = 25 min retry_kwargs = {'max_retries': settings.TASK_MAX_RETRIES} # Use exponential backoff for retrying tasks - retry_backoff = 5 # delay factor of 5 seconds + # see https://docs.celeryq.dev/en/stable/userguide/tasks.html#Task.retry_backoff + # First retry will delay up to 60 seconds, second up to 120 seconds, third up to 240 seconds (jitter may shorten each). + # The retry_backoff_max default value is 600 seconds -> 10 minutes + # https://docs.celeryq.dev/en/stable/userguide/tasks.html#Task.retry_backoff_max + # This will result in the final backoff time reducing from 2⁴ × 60 seconds = 960 seconds to 600 seconds + retry_backoff = 60 # Add randomness to backoff delays to prevent all tasks in queue from executing simultaneously retry_jitter = True