From 99408b30f0a9192694231d9bab750b52a565eec0 Mon Sep 17 00:00:00 2001 From: Tian Xia Date: Tue, 23 Apr 2024 16:54:32 +0800 Subject: [PATCH] [Core][BugFix] Fix GPU detach when using docker container as runtime env (#3436) * fix * Apply suggestions from code review * fix and add smoke test * increase timeout * longer timeout * Fix docker test to avoid using python 3.12 * Update tests/test_smoke.py Co-authored-by: Zhanghao Wu * Update tests/test_smoke.py Co-authored-by: Zhanghao Wu * fix smoketest * change time to sleep --------- Co-authored-by: Zhanghao Wu --- examples/job_queue/job_docker.yaml | 5 ++++- sky/provision/docker_utils.py | 13 +++++++++++-- sky/skylet/providers/command_runner.py | 11 +++++++++++ tests/test_smoke.py | 17 +++++++++++++---- 4 files changed, 39 insertions(+), 7 deletions(-) diff --git a/examples/job_queue/job_docker.yaml b/examples/job_queue/job_docker.yaml index da604125865..37f23f3eef1 100644 --- a/examples/job_queue/job_docker.yaml +++ b/examples/job_queue/job_docker.yaml @@ -8,6 +8,9 @@ name: job_docker +envs: + TIME_TO_SLEEP: 180 + resources: accelerators: T4:0.5 image_id: docker:ubuntu:20.04 @@ -18,7 +21,7 @@ setup: | run: | timestamp=$(date +%s) conda env list - for i in {1..180}; do + for i in $(seq 1 $TIME_TO_SLEEP); do echo "$timestamp $i" sleep 1 done diff --git a/sky/provision/docker_utils.py b/sky/provision/docker_utils.py index b5394cd560c..ab6c92d558a 100644 --- a/sky/provision/docker_utils.py +++ b/sky/provision/docker_utils.py @@ -218,14 +218,23 @@ def initialize(self) -> str: f'{specific_image}') container_running = self._check_container_status() if container_running: - running_image = (self._run( - check_docker_image(self.container_name, self.docker_cmd))) + running_image = self._run( + check_docker_image(self.container_name, self.docker_cmd)) if running_image != specific_image: logger.error( f'A container with name {self.container_name} is running ' f'image {running_image} instead of {specific_image} (which ' 'was 
provided in the YAML)') else: + # Edit docker config first to avoid disconnecting the container + # from GPUs when a systemctl command is called. This is a known + # issue with nvidia container toolkit: + # https://github.com/NVIDIA/nvidia-container-toolkit/issues/48 + self._run( + 'sudo jq \'.["exec-opts"] = ["native.cgroupdriver=cgroupfs"]\' ' + '/etc/docker/daemon.json > /tmp/daemon.json;' + 'sudo mv /tmp/daemon.json /etc/docker/daemon.json;' + 'sudo systemctl restart docker') user_docker_run_options = self.docker_config.get('run_options', []) start_command = docker_start_cmds( specific_image, diff --git a/sky/skylet/providers/command_runner.py b/sky/skylet/providers/command_runner.py index b6ea52c6eeb..ae54abe4a6b 100644 --- a/sky/skylet/providers/command_runner.py +++ b/sky/skylet/providers/command_runner.py @@ -229,6 +229,17 @@ def run_init(self, *, as_head: bool, file_mounts: Dict[str, str], f'The `image_env` is:\n{image_env}') raise e + # Edit docker config first to avoid disconnecting the container + # from GPUs when a systemctl command is called. 
This is a known + # issue with nvidia container toolkit: + # https://github.com/NVIDIA/nvidia-container-toolkit/issues/48 + self.run( + 'sudo jq \'.["exec-opts"] = ["native.cgroupdriver=cgroupfs"]\' ' + '/etc/docker/daemon.json > /tmp/daemon.json;' + 'sudo mv /tmp/daemon.json /etc/docker/daemon.json;' + 'sudo systemctl restart docker', + run_env='host') + user_docker_run_options = self.docker_config.get( 'run_options', []) + self.docker_config.get( f'{"head" if as_head else "worker"}_run_options', []) diff --git a/tests/test_smoke.py b/tests/test_smoke.py index 63179e0be68..bcfe96cdbac 100644 --- a/tests/test_smoke.py +++ b/tests/test_smoke.py @@ -1200,13 +1200,15 @@ def test_job_queue(generic_cloud: str): ]) def test_job_queue_with_docker(generic_cloud: str, image_id: str): name = _get_cluster_name() + image_id[len('docker:'):][:4] + total_timeout_minutes = 40 if generic_cloud == 'azure' else 15 + time_to_sleep = 300 if generic_cloud == 'azure' else 180 test = Test( 'job_queue_with_docker', [ f'sky launch -y -c {name} --cloud {generic_cloud} --image-id {image_id} examples/job_queue/cluster_docker.yaml', - f'sky exec {name} -n {name}-1 -d --image-id {image_id} examples/job_queue/job_docker.yaml', - f'sky exec {name} -n {name}-2 -d --image-id {image_id} examples/job_queue/job_docker.yaml', - f'sky exec {name} -n {name}-3 -d --image-id {image_id} examples/job_queue/job_docker.yaml', + f'sky exec {name} -n {name}-1 -d --image-id {image_id} --env TIME_TO_SLEEP={time_to_sleep} examples/job_queue/job_docker.yaml', + f'sky exec {name} -n {name}-2 -d --image-id {image_id} --env TIME_TO_SLEEP={time_to_sleep} examples/job_queue/job_docker.yaml', + f'sky exec {name} -n {name}-3 -d --image-id {image_id} --env TIME_TO_SLEEP={time_to_sleep} examples/job_queue/job_docker.yaml', f's=$(sky queue {name}); echo "$s"; echo; echo; echo "$s" | grep {name}-1 | grep RUNNING', f's=$(sky queue {name}); echo "$s"; echo; echo; echo "$s" | grep {name}-2 | grep RUNNING', f's=$(sky queue 
{name}); echo "$s"; echo; echo; echo "$s" | grep {name}-3 | grep PENDING', @@ -1214,6 +1216,9 @@ def test_job_queue_with_docker(generic_cloud: str, image_id: str): 'sleep 5', f's=$(sky queue {name}); echo "$s"; echo; echo; echo "$s" | grep {name}-3 | grep RUNNING', f'sky cancel -y {name} 3', + # Make sure the GPU is still visible to the container. + f'sky exec {name} --image-id {image_id} nvidia-smi | grep "Tesla T4"', + f'sky logs {name} 4 --status', f'sky stop -y {name}', # Make sure the job status preserve after stop and start the # cluster. This is also a test for the docker container to be @@ -1224,10 +1229,14 @@ def test_job_queue_with_docker(generic_cloud: str, image_id: str): f's=$(sky queue {name}); echo "$s"; echo; echo; echo "$s" | grep {name}-3 | grep CANCELLED', f'sky exec {name} --gpus T4:0.2 "[[ \$SKYPILOT_NUM_GPUS_PER_NODE -eq 1 ]] || exit 1"', f'sky exec {name} --gpus T4:1 "[[ \$SKYPILOT_NUM_GPUS_PER_NODE -eq 1 ]] || exit 1"', - f'sky logs {name} 4 --status', f'sky logs {name} 5 --status', + f'sky logs {name} 6 --status', + # Make sure it is still visible after a stop & start cycle. + f'sky exec {name} --image-id {image_id} nvidia-smi | grep "Tesla T4"', + f'sky logs {name} 7 --status' ], f'sky down -y {name}', + timeout=total_timeout_minutes * 60, ) run_one_test(test)