[Core][BugFix] Fix GPU detach when using docker container as runtime env (#3436)

* fix

* Apply suggestions from code review

* fix and add smoke test

* increase timeout

* longer timeout

* Fix docker test to avoid using python 3.12

* Update tests/test_smoke.py

Co-authored-by: Zhanghao Wu <[email protected]>

* Update tests/test_smoke.py

Co-authored-by: Zhanghao Wu <[email protected]>

* fix smoketest

* change time to sleep

---------

Co-authored-by: Zhanghao Wu <[email protected]>
cblmemo and Michaelvll authored Apr 23, 2024
1 parent 6eb7203 commit 99408b3
Showing 4 changed files with 39 additions and 7 deletions.
5 changes: 4 additions & 1 deletion examples/job_queue/job_docker.yaml
@@ -8,6 +8,9 @@
 
 name: job_docker
 
+envs:
+  TIME_TO_SLEEP: 180
+
 resources:
   accelerators: T4:0.5
   image_id: docker:ubuntu:20.04
@@ -18,7 +21,7 @@ setup: |
 run: |
   timestamp=$(date +%s)
   conda env list
-  for i in {1..180}; do
+  for i in $(seq 1 $TIME_TO_SLEEP); do
     echo "$timestamp $i"
     sleep 1
   done
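
The new TIME_TO_SLEEP variable defaults to 180 seconds in the YAML and can be overridden per submission with SkyPilot's --env flag (the same mechanism the updated smoke test uses below). A minimal usage sketch; the cluster name my-cluster is a placeholder:

    # Run the job with a shorter sleep by overriding the default TIME_TO_SLEEP=180.
    sky exec my-cluster --image-id docker:ubuntu:20.04 --env TIME_TO_SLEEP=60 examples/job_queue/job_docker.yaml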
13 changes: 11 additions & 2 deletions sky/provision/docker_utils.py
@@ -218,14 +218,23 @@ def initialize(self) -> str:
                       f'{specific_image}')
         container_running = self._check_container_status()
         if container_running:
-            running_image = (self._run(
-                check_docker_image(self.container_name, self.docker_cmd)))
+            running_image = self._run(
+                check_docker_image(self.container_name, self.docker_cmd))
             if running_image != specific_image:
                 logger.error(
                     f'A container with name {self.container_name} is running '
                     f'image {running_image} instead of {specific_image} (which '
                     'was provided in the YAML)')
         else:
+            # Edit docker config first to avoid disconnecting the container
+            # from GPUs when a systemctl command is called. This is a known
+            # issue with nvidia container toolkit:
+            # https://github.com/NVIDIA/nvidia-container-toolkit/issues/48
+            self._run(
+                'sudo jq \'.["exec-opts"] = ["native.cgroupdriver=cgroupfs"]\' '
+                '/etc/docker/daemon.json > /tmp/daemon.json;'
+                'sudo mv /tmp/daemon.json /etc/docker/daemon.json;'
+                'sudo systemctl restart docker')
         user_docker_run_options = self.docker_config.get('run_options', [])
         start_command = docker_start_cmds(
             specific_image,
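
For context, the command added above rewrites Docker's daemon configuration so the daemon uses the cgroupfs cgroup driver instead of systemd, then restarts Docker; with the systemd driver, a later systemctl call can detach GPUs from running containers (the nvidia-container-toolkit issue linked in the comment). A hand-run sketch of the same edit, assuming /etc/docker/daemon.json already exists and contains valid JSON:

    # Set the cgroup driver to cgroupfs and restart the Docker daemon.
    sudo jq '.["exec-opts"] = ["native.cgroupdriver=cgroupfs"]' \
        /etc/docker/daemon.json > /tmp/daemon.json
    sudo mv /tmp/daemon.json /etc/docker/daemon.json
    sudo systemctl restart docker

    # Afterwards daemon.json contains at least:
    # {
    #   "exec-opts": ["native.cgroupdriver=cgroupfs"]
    # }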
11 changes: 11 additions & 0 deletions sky/skylet/providers/command_runner.py
@@ -229,6 +229,17 @@ def run_init(self, *, as_head: bool, file_mounts: Dict[str, str],
                     f'The `image_env` is:\n{image_env}')
                 raise e
 
+        # Edit docker config first to avoid disconnecting the container
+        # from GPUs when a systemctl command is called. This is a known
+        # issue with nvidia container toolkit:
+        # https://github.com/NVIDIA/nvidia-container-toolkit/issues/48
+        self.run(
+            'sudo jq \'.["exec-opts"] = ["native.cgroupdriver=cgroupfs"]\' '
+            '/etc/docker/daemon.json > /tmp/daemon.json;'
+            'sudo mv /tmp/daemon.json /etc/docker/daemon.json;'
+            'sudo systemctl restart docker',
+            run_env='host')
+
         user_docker_run_options = self.docker_config.get(
             'run_options', []) + self.docker_config.get(
                 f'{"head" if as_head else "worker"}_run_options', [])
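
Here the same daemon.json edit runs on the host (run_env='host') before the SkyPilot container is started. A small verification sketch one could run on the host after provisioning; the container name sky_container is a placeholder:

    # Docker should now report the cgroupfs driver.
    docker info --format '{{.CgroupDriver}}'    # expected output: cgroupfs

    # GPUs should remain visible inside the running container, even after
    # later systemctl calls on the host.
    docker exec sky_container nvidia-smi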
17 changes: 13 additions & 4 deletions tests/test_smoke.py
@@ -1200,20 +1200,25 @@ def test_job_queue(generic_cloud: str):
     ])
 def test_job_queue_with_docker(generic_cloud: str, image_id: str):
     name = _get_cluster_name() + image_id[len('docker:'):][:4]
+    total_timeout_minutes = 40 if generic_cloud == 'azure' else 15
+    time_to_sleep = 300 if generic_cloud == 'azure' else 180
     test = Test(
         'job_queue_with_docker',
         [
             f'sky launch -y -c {name} --cloud {generic_cloud} --image-id {image_id} examples/job_queue/cluster_docker.yaml',
-            f'sky exec {name} -n {name}-1 -d --image-id {image_id} examples/job_queue/job_docker.yaml',
-            f'sky exec {name} -n {name}-2 -d --image-id {image_id} examples/job_queue/job_docker.yaml',
-            f'sky exec {name} -n {name}-3 -d --image-id {image_id} examples/job_queue/job_docker.yaml',
+            f'sky exec {name} -n {name}-1 -d --image-id {image_id} --env TIME_TO_SLEEP={time_to_sleep} examples/job_queue/job_docker.yaml',
+            f'sky exec {name} -n {name}-2 -d --image-id {image_id} --env TIME_TO_SLEEP={time_to_sleep} examples/job_queue/job_docker.yaml',
+            f'sky exec {name} -n {name}-3 -d --image-id {image_id} --env TIME_TO_SLEEP={time_to_sleep} examples/job_queue/job_docker.yaml',
             f's=$(sky queue {name}); echo "$s"; echo; echo; echo "$s" | grep {name}-1 | grep RUNNING',
             f's=$(sky queue {name}); echo "$s"; echo; echo; echo "$s" | grep {name}-2 | grep RUNNING',
             f's=$(sky queue {name}); echo "$s"; echo; echo; echo "$s" | grep {name}-3 | grep PENDING',
             f'sky cancel -y {name} 2',
             'sleep 5',
             f's=$(sky queue {name}); echo "$s"; echo; echo; echo "$s" | grep {name}-3 | grep RUNNING',
             f'sky cancel -y {name} 3',
+            # Make sure the GPU is still visible to the container.
+            f'sky exec {name} --image-id {image_id} nvidia-smi | grep "Tesla T4"',
+            f'sky logs {name} 4 --status',
             f'sky stop -y {name}',
             # Make sure the job status preserve after stop and start the
             # cluster. This is also a test for the docker container to be
@@ -1224,10 +1229,14 @@ def test_job_queue_with_docker(generic_cloud: str, image_id: str):
             f's=$(sky queue {name}); echo "$s"; echo; echo; echo "$s" | grep {name}-3 | grep CANCELLED',
             f'sky exec {name} --gpus T4:0.2 "[[ \$SKYPILOT_NUM_GPUS_PER_NODE -eq 1 ]] || exit 1"',
             f'sky exec {name} --gpus T4:1 "[[ \$SKYPILOT_NUM_GPUS_PER_NODE -eq 1 ]] || exit 1"',
             f'sky logs {name} 4 --status',
             f'sky logs {name} 5 --status',
+            f'sky logs {name} 6 --status',
+            # Make sure it is still visible after an stop & start cycle.
+            f'sky exec {name} --image-id {image_id} nvidia-smi | grep "Tesla T4"',
+            f'sky logs {name} 7 --status'
         ],
         f'sky down -y {name}',
+        timeout=total_timeout_minutes * 60,
     )
     run_one_test(test)
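
The updated smoke test can be run on its own with pytest; a hedged invocation sketch, assuming cloud credentials are configured and that the --generic-cloud option is provided by the repository's tests/conftest.py (extra flags may be required depending on the local test setup):

    # Run only the docker job-queue smoke test against GCP.
    pytest tests/test_smoke.py -k test_job_queue_with_docker --generic-cloud gcp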
