From 24cbcbd1e4420fa8103f180f753053990e8f8a80 Mon Sep 17 00:00:00 2001 From: Mike Lin Date: Sun, 24 Nov 2019 23:40:51 -1000 Subject: [PATCH] runner scalability adjustments from applied testing --- WDL/runtime/task.py | 40 ++++++++++++++++++++++++++++++++-------- tests/test_4taskrun.py | 4 ++-- 2 files changed, 34 insertions(+), 10 deletions(-) diff --git a/WDL/runtime/task.py b/WDL/runtime/task.py index 2cee2217..03dcbff3 100644 --- a/WDL/runtime/task.py +++ b/WDL/runtime/task.py @@ -10,6 +10,7 @@ import traceback import glob import time +import random import multiprocessing import threading import shutil @@ -317,11 +318,15 @@ def _run( # stream stderr into log with PygtailLogger(logger, os.path.join(self.host_dir, "stderr.txt")) as poll_stderr: # poll for container exit + running = False while exit_code is None: - time.sleep(1) + time.sleep(random.uniform(1.0, 2.0)) # spread out work over the GIL if terminating(): raise Terminated() from None if "running" in self._observed_states: + if not running: + logger.notice("container running") # pyre-fixme + running = True poll_stderr() exit_code = self.poll_service(logger, svc) logger.debug( @@ -478,14 +483,33 @@ def chown(self, logger: logging.Logger, client: docker.DockerClient) -> None: chown -RP {os.geteuid()}:{os.getegid()} {shlex.quote(os.path.join(self.container_dir, 'work'))} """.strip() volumes = {self.host_dir: {"bind": self.container_dir, "mode": "rw"}} + logger.debug(_("post-task chown", script=script, volumes=volumes)) try: - logger.debug(_("post-task chown", script=script, volumes=volumes)) - client.containers.run( - "alpine:3", - command=["/bin/ash", "-c", script], - volumes=volumes, - auto_remove=True, - ) + chowner = None + try: + chowner = client.containers.run( + "alpine:3", + command=["/bin/ash", "-c", script], + volumes=volumes, + detach=True, + ) + chowner_status = None + while chowner_status is None: + try: + chowner_status = chowner.wait() + except Exception as exn: + s_exn = str(exn) + if "timed out" not in s_exn and "Timeout" not in s_exn: + raise + logger.warning("post-task chown is taking a long time") + + assert ( + isinstance(chowner_status, dict) + and chowner_status.get("StatusCode", -1) == 0 + ), str(chowner_status) + finally: + if chowner: + chowner.remove() except: logger.exception("post-task chown failed") diff --git a/tests/test_4taskrun.py b/tests/test_4taskrun.py index b38184cd..a8c0aafe 100644 --- a/tests/test_4taskrun.py +++ b/tests/test_4taskrun.py @@ -145,8 +145,8 @@ def test_logging_std_err(self, capture): for record in std_error_msgs: line_written = int(record.msg.split('=')[1]) self.assertGreater(record.created, line_written) - # check line logged within 3 seconds of being written - self.assertGreater(line_written+3, record.created) + # check line logged within 4 seconds of being written + self.assertGreater(line_written+4, record.created) @log_capture() def test_logging_std_err_captures_full_line(self, capture):