From 99408b30f0a9192694231d9bab750b52a565eec0 Mon Sep 17 00:00:00 2001
From: Tian Xia <cblmemo@gmail.com>
Date: Tue, 23 Apr 2024 16:54:32 +0800
Subject: [PATCH] [Core][BugFix] Fix GPU detach when using docker container as
 runtime env (#3436)

* fix

* Apply suggestions from code review

* fix and add smoke test

* increase timeout

* longer timeout

* Fix docker test to avoid using python 3.12

* Update tests/test_smoke.py

Co-authored-by: Zhanghao Wu <zhanghao.wu@outlook.com>

* Update tests/test_smoke.py

Co-authored-by: Zhanghao Wu <zhanghao.wu@outlook.com>

* fix smoketest

* change time to sleep

---------

Co-authored-by: Zhanghao Wu <zhanghao.wu@outlook.com>
---
 examples/job_queue/job_docker.yaml     |  5 ++++-
 sky/provision/docker_utils.py          | 13 +++++++++++--
 sky/skylet/providers/command_runner.py | 11 +++++++++++
 tests/test_smoke.py                    | 17 +++++++++++++----
 4 files changed, 39 insertions(+), 7 deletions(-)

diff --git a/examples/job_queue/job_docker.yaml b/examples/job_queue/job_docker.yaml
index da604125865..37f23f3eef1 100644
--- a/examples/job_queue/job_docker.yaml
+++ b/examples/job_queue/job_docker.yaml
@@ -8,6 +8,9 @@
 
 name: job_docker
 
+envs:
+  TIME_TO_SLEEP: 180
+
 resources:
   accelerators: T4:0.5
   image_id: docker:ubuntu:20.04
@@ -18,7 +21,7 @@ setup: |
 run: |
   timestamp=$(date +%s)
   conda env list
-  for i in {1..180}; do
+  for i in $(seq 1 $TIME_TO_SLEEP); do
     echo "$timestamp $i"
     sleep 1
   done
diff --git a/sky/provision/docker_utils.py b/sky/provision/docker_utils.py
index b5394cd560c..ab6c92d558a 100644
--- a/sky/provision/docker_utils.py
+++ b/sky/provision/docker_utils.py
@@ -218,14 +218,23 @@ def initialize(self) -> str:
                     f'{specific_image}')
         container_running = self._check_container_status()
         if container_running:
-            running_image = (self._run(
-                check_docker_image(self.container_name, self.docker_cmd)))
+            running_image = self._run(
+                check_docker_image(self.container_name, self.docker_cmd))
             if running_image != specific_image:
                 logger.error(
                     f'A container with name {self.container_name} is running '
                     f'image {running_image} instead of {specific_image} (which '
                     'was provided in the YAML)')
         else:
+            # Edit docker config first to avoid disconnecting the container
+            # from GPUs when a systemctl command is called. This is a known
+            # issue with nvidia container toolkit:
+            # https://github.com/NVIDIA/nvidia-container-toolkit/issues/48
+            self._run(
+                'sudo jq \'.["exec-opts"] = ["native.cgroupdriver=cgroupfs"]\' '
+                '/etc/docker/daemon.json > /tmp/daemon.json;'
+                'sudo mv /tmp/daemon.json /etc/docker/daemon.json;'
+                'sudo systemctl restart docker')
             user_docker_run_options = self.docker_config.get('run_options', [])
             start_command = docker_start_cmds(
                 specific_image,
diff --git a/sky/skylet/providers/command_runner.py b/sky/skylet/providers/command_runner.py
index b6ea52c6eeb..ae54abe4a6b 100644
--- a/sky/skylet/providers/command_runner.py
+++ b/sky/skylet/providers/command_runner.py
@@ -229,6 +229,17 @@ def run_init(self, *, as_head: bool, file_mounts: Dict[str, str],
                     f'The `image_env` is:\n{image_env}')
                 raise e
 
+            # Edit docker config first to avoid disconnecting the container
+            # from GPUs when a systemctl command is called. This is a known
+            # issue with nvidia container toolkit:
+            # https://github.com/NVIDIA/nvidia-container-toolkit/issues/48
+            self.run(
+                'sudo jq \'.["exec-opts"] = ["native.cgroupdriver=cgroupfs"]\' '
+                '/etc/docker/daemon.json > /tmp/daemon.json;'
+                'sudo mv /tmp/daemon.json /etc/docker/daemon.json;'
+                'sudo systemctl restart docker',
+                run_env='host')
+
             user_docker_run_options = self.docker_config.get(
                 'run_options', []) + self.docker_config.get(
                     f'{"head" if as_head else "worker"}_run_options', [])
diff --git a/tests/test_smoke.py b/tests/test_smoke.py
index 63179e0be68..bcfe96cdbac 100644
--- a/tests/test_smoke.py
+++ b/tests/test_smoke.py
@@ -1200,13 +1200,15 @@ def test_job_queue(generic_cloud: str):
     ])
 def test_job_queue_with_docker(generic_cloud: str, image_id: str):
     name = _get_cluster_name() + image_id[len('docker:'):][:4]
+    total_timeout_minutes = 40 if generic_cloud == 'azure' else 15
+    time_to_sleep = 300 if generic_cloud == 'azure' else 180
     test = Test(
         'job_queue_with_docker',
         [
             f'sky launch -y -c {name} --cloud {generic_cloud} --image-id {image_id} examples/job_queue/cluster_docker.yaml',
-            f'sky exec {name} -n {name}-1 -d --image-id {image_id} examples/job_queue/job_docker.yaml',
-            f'sky exec {name} -n {name}-2 -d --image-id {image_id} examples/job_queue/job_docker.yaml',
-            f'sky exec {name} -n {name}-3 -d --image-id {image_id} examples/job_queue/job_docker.yaml',
+            f'sky exec {name} -n {name}-1 -d --image-id {image_id} --env TIME_TO_SLEEP={time_to_sleep} examples/job_queue/job_docker.yaml',
+            f'sky exec {name} -n {name}-2 -d --image-id {image_id} --env TIME_TO_SLEEP={time_to_sleep} examples/job_queue/job_docker.yaml',
+            f'sky exec {name} -n {name}-3 -d --image-id {image_id} --env TIME_TO_SLEEP={time_to_sleep} examples/job_queue/job_docker.yaml',
             f's=$(sky queue {name}); echo "$s"; echo; echo; echo "$s" | grep {name}-1 | grep RUNNING',
             f's=$(sky queue {name}); echo "$s"; echo; echo; echo "$s" | grep {name}-2 | grep RUNNING',
             f's=$(sky queue {name}); echo "$s"; echo; echo; echo "$s" | grep {name}-3 | grep PENDING',
@@ -1214,6 +1216,9 @@ def test_job_queue_with_docker(generic_cloud: str, image_id: str):
             'sleep 5',
             f's=$(sky queue {name}); echo "$s"; echo; echo; echo "$s" | grep {name}-3 | grep RUNNING',
             f'sky cancel -y {name} 3',
+            # Make sure the GPU is still visible to the container.
+            f'sky exec {name} --image-id {image_id} nvidia-smi | grep "Tesla T4"',
+            f'sky logs {name} 4 --status',
             f'sky stop -y {name}',
             # Make sure the job status preserve after stop and start the
             # cluster. This is also a test for the docker container to be
@@ -1224,10 +1229,14 @@ def test_job_queue_with_docker(generic_cloud: str, image_id: str):
             f's=$(sky queue {name}); echo "$s"; echo; echo; echo "$s" | grep {name}-3 | grep CANCELLED',
             f'sky exec {name} --gpus T4:0.2 "[[ \$SKYPILOT_NUM_GPUS_PER_NODE -eq 1 ]] || exit 1"',
             f'sky exec {name} --gpus T4:1 "[[ \$SKYPILOT_NUM_GPUS_PER_NODE -eq 1 ]] || exit 1"',
-            f'sky logs {name} 4 --status',
             f'sky logs {name} 5 --status',
+            f'sky logs {name} 6 --status',
+            # Make sure the GPU is still visible after a stop & start cycle.
+            f'sky exec {name} --image-id {image_id} nvidia-smi | grep "Tesla T4"',
+            f'sky logs {name} 7 --status'
         ],
         f'sky down -y {name}',
+        timeout=total_timeout_minutes * 60,
     )
     run_one_test(test)