Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Retry making rundirs a few times with randomised exponential backoff #3409

Draft
wants to merge 1 commit into
base: master
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions parsl/dataflow/errors.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,3 +63,7 @@ def __init__(self, dependent_exceptions_tids: Sequence[Tuple[BaseException, Opti
def __str__(self) -> str:
    """Describe the join failure, listing the task ids of the failed dependencies."""
    # Each entry is an (exception, task id) pair; only the task id is reported.
    failed_task_ids = [tid for _, tid in self.dependent_exceptions_tids]
    message = "Join failure for task {} with failed join dependencies from tasks {}"
    return message.format(self.task_id, failed_task_ids)


class RundirCreateError(ParslError):
    """Signals that a run directory could not be created."""
62 changes: 41 additions & 21 deletions parsl/dataflow/rundirs.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,15 @@
import os
from glob import glob
import logging
import random
import time

from parsl.dataflow.errors import RundirCreateError

logger = logging.getLogger(__name__)


def make_rundir(path: str, *, max_tries: int = 3) -> str:
    """Create the next numbered run directory underneath ``path``.

    ``path`` itself is created if it does not exist yet. Run directories are
    zero-padded, sequentially numbered subdirectories of it::

        path/000
        path/001
        ...

    Each attempt targets the number one above the highest existing numbered
    entry. If that directory already exists by the time it is created (for
    example because another process claimed it between the scan and the
    ``makedirs`` call), the attempt is retried after a randomised,
    exponentially growing sleep.

    Args:
        - path (str): Directory under which the numbered run directories live.

    Kwargs:
        - max_tries (int): Number of creation attempts before giving up.
          At least one attempt is always made.

    Returns:
        - str: Absolute path of the newly created run directory.

    Raises:
        - FileExistsError: the chosen directory already existed on the
          final permitted attempt.
    """
    # Randomised initial delay so that competing processes do not retry in lockstep.
    backoff_time_s = 1 + random.random()

    os.makedirs(path, exist_ok=True)

    # try_count is 1-based for human readability
    try_count = 1
    while True:

        # Python 3.10 introduces root_dir argument to glob which in future
        # can be used to simplify this code, something like:
        # prev_rundirs = glob("[0-9]*[0-9]", root_dir=path)
        full_prev_rundirs = glob(os.path.join(path, "[0-9]*[0-9]"))
        prev_rundirs = [os.path.basename(d) for d in full_prev_rundirs]

        # The glob pattern only constrains the first and last character, so
        # names such as "1-old-2" also match; keep purely decimal names only,
        # otherwise int() would raise ValueError on an unrelated entry.
        prev_indices = [int(d) for d in prev_rundirs if d.isdecimal()]
        next_index = max(prev_indices, default=-1) + 1

        current_rundir = os.path.join(path, '{0:03}'.format(next_index))

        try:
            os.makedirs(current_rundir)
        except FileExistsError:
            logger.warning("Could not create rundir %s on try %s", current_rundir, try_count)

            if try_count >= max_tries:
                raise

            # logging uses %-style placeholders; "{}" would fail to format.
            logger.debug("Backing off %ss", backoff_time_s)
            time.sleep(backoff_time_s)
            # Grow the delay by a random factor in [2, 3) on each retry.
            backoff_time_s *= 2 + random.random()
            try_count += 1
        else:
            logger.debug("rundir created: %s", current_rundir)
            return os.path.abspath(current_rundir)

    # this should never be reached - the above loop should have either returned
    # or raised an exception on the last try. Kept as a defensive guard in case
    # the loop is ever edited to fall through.
    raise RundirCreateError()
Loading