Skip to content

Commit

Permalink
[Core] Add docker run options (#3682)
Browse files Browse the repository at this point in the history
* Add docker run options

* Add docs

* Add warning for docker run options in kubernetes

* Update docs/source/reference/config.rst

Co-authored-by: Romil Bhardwaj <[email protected]>

* update

* update doc

* Stream logs

* allow changing the `run_options`

---------

Co-authored-by: Romil Bhardwaj <[email protected]>
  • Loading branch information
Michaelvll and romilbhardwaj authored Jun 26, 2024
1 parent ea4506a commit bd383e9
Show file tree
Hide file tree
Showing 9 changed files with 105 additions and 26 deletions.
25 changes: 25 additions & 0 deletions docs/source/reference/config.rst
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,31 @@ Available fields and semantics:
- gcp
- kubernetes
docker:
# Additional Docker run options (optional).
#
# When image_id: docker:<docker_image> is used in a task YAML, additional
# run options for starting the Docker container can be specified here.
# These options will be passed directly as command line args to `docker run`,
# see: https://docs.docker.com/reference/cli/docker/container/run/
#
# The following run options are applied by default and cannot be overridden:
# --net=host
# --cap-add=SYS_ADMIN
# --device=/dev/fuse
# --security-opt=apparmor:unconfined
# --runtime=nvidia # Applied if nvidia GPUs are detected on the host
#
# This field can be useful for mounting volumes and other advanced Docker
# configurations. You can specify a list of arguments or a string, where the
# former will be combined into a single string with spaces. The following is
# an example that allows running Docker inside Docker and increases the
# size of /dev/shm:
# sky launch --cloud aws --image-id docker:continuumio/miniconda3 "apt update; apt install -y docker.io; docker run hello-world"
run_options:
- -v /var/run/docker.sock:/var/run/docker.sock
- --shm-size=2g
nvidia_gpus:
# Disable ECC for NVIDIA GPUs (optional).
#
Expand Down
15 changes: 15 additions & 0 deletions sky/backends/backend_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -146,6 +146,7 @@
# Clouds with the new provisioner have docker_login_config in the
# docker field, instead of the provider field.
('docker', 'docker_login_config'),
('docker', 'run_options'),
# Other clouds
('provider', 'docker_login_config'),
('provider', 'firewall_rule'),
Expand Down Expand Up @@ -873,6 +874,17 @@ def write_cluster_config(
f'open(os.path.expanduser("{constants.SKY_REMOTE_RAY_PORT_FILE}"), "w", encoding="utf-8"))\''
)

# Docker run options
docker_run_options = skypilot_config.get_nested(('docker', 'run_options'),
[])
if isinstance(docker_run_options, str):
docker_run_options = [docker_run_options]
if docker_run_options and isinstance(to_provision.cloud, clouds.Kubernetes):
logger.warning(f'{colorama.Style.DIM}Docker run options are specified, '
'but ignored for Kubernetes: '
f'{" ".join(docker_run_options)}'
f'{colorama.Style.RESET_ALL}')

# Use a tmp file path to avoid incomplete YAML file being re-used in the
# future.
initial_setup_commands = []
Expand Down Expand Up @@ -923,6 +935,9 @@ def write_cluster_config(
wheel_hash).replace('{cloud}',
str(cloud).lower())),

# Docker
'docker_run_options': docker_run_options,

# Port of Ray (GCS server).
# Ray's default port 6379 conflicts with Redis.
'ray_port': constants.SKY_REMOTE_RAY_PORT,
Expand Down
6 changes: 4 additions & 2 deletions sky/provision/docker_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -176,8 +176,10 @@ def _run(self,
subprocess_utils.handle_returncode(
rc,
cmd,
error_msg='Failed to run docker setup commands',
stderr=stdout + stderr)
error_msg='Failed to run docker setup commands.',
stderr=stdout + stderr,
# Print out the error message if the command failed.
stream_logs=True)
return stdout.strip()

def initialize(self) -> str:
Expand Down
55 changes: 31 additions & 24 deletions sky/provision/instance_setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,9 @@
import os
import resource
import time
from typing import Any, Dict, List, Optional, Tuple
from typing import Any, Callable, Dict, List, Optional, Tuple

from sky import exceptions
from sky import provision
from sky import sky_logging
from sky.provision import common
Expand Down Expand Up @@ -68,29 +69,34 @@
'sky.skylet.attempt_skylet;')


def _auto_retry(func):
def _auto_retry(should_retry: Callable[[Exception], bool] = lambda _: True):
"""Decorator that retries the function if it fails.
This decorator is mostly for SSH disconnection issues, which might happen
during the setup of instances.
"""

@functools.wraps(func)
def retry(*args, **kwargs):
backoff = common_utils.Backoff(initial_backoff=1, max_backoff_factor=5)
for retry_cnt in range(_MAX_RETRY):
try:
return func(*args, **kwargs)
except Exception as e: # pylint: disable=broad-except
if retry_cnt >= _MAX_RETRY - 1:
raise e
sleep = backoff.current_backoff()
logger.info(
f'{func.__name__}: Retrying in {sleep:.1f} seconds, '
f'due to {e}')
time.sleep(sleep)

return retry
def decorator(func):

@functools.wraps(func)
def retry(*args, **kwargs):
backoff = common_utils.Backoff(initial_backoff=1,
max_backoff_factor=5)
for retry_cnt in range(_MAX_RETRY):
try:
return func(*args, **kwargs)
except Exception as e: # pylint: disable=broad-except
if not should_retry(e) or retry_cnt >= _MAX_RETRY - 1:
raise
sleep = backoff.current_backoff()
logger.info(
f'{func.__name__}: Retrying in {sleep:.1f} seconds, '
f'due to {e}')
time.sleep(sleep)

return retry

return decorator


def _log_start_end(func):
Expand Down Expand Up @@ -156,7 +162,8 @@ def initialize_docker(cluster_name: str, docker_config: Dict[str, Any],
return None
_hint_worker_log_path(cluster_name, cluster_info, 'initialize_docker')

@_auto_retry
@_auto_retry(should_retry=lambda e: isinstance(e, exceptions.CommandError)
and e.returncode == 255)
def _initialize_docker(runner: command_runner.CommandRunner, log_path: str):
docker_user = docker_utils.DockerInitializer(docker_config, runner,
log_path).initialize()
Expand Down Expand Up @@ -193,7 +200,7 @@ def setup_runtime_on_cluster(cluster_name: str, setup_commands: List[str],
hasher.update(d)
digest = hasher.hexdigest()

@_auto_retry
@_auto_retry()
def _setup_node(runner: command_runner.CommandRunner, log_path: str):
for cmd in setup_commands:
returncode, stdout, stderr = runner.run(
Expand Down Expand Up @@ -254,7 +261,7 @@ def _ray_gpu_options(custom_resource: str) -> str:


@_log_start_end
@_auto_retry
@_auto_retry()
def start_ray_on_head_node(cluster_name: str, custom_resource: Optional[str],
cluster_info: common.ClusterInfo,
ssh_credentials: Dict[str, Any]) -> None:
Expand Down Expand Up @@ -314,7 +321,7 @@ def start_ray_on_head_node(cluster_name: str, custom_resource: Optional[str],


@_log_start_end
@_auto_retry
@_auto_retry()
def start_ray_on_worker_nodes(cluster_name: str, no_restart: bool,
custom_resource: Optional[str], ray_port: int,
cluster_info: common.ClusterInfo,
Expand Down Expand Up @@ -411,7 +418,7 @@ def _setup_ray_worker(runner_and_id: Tuple[command_runner.CommandRunner,


@_log_start_end
@_auto_retry
@_auto_retry()
def start_skylet_on_head_node(cluster_name: str,
cluster_info: common.ClusterInfo,
ssh_credentials: Dict[str, Any]) -> None:
Expand All @@ -437,7 +444,7 @@ def start_skylet_on_head_node(cluster_name: str,
f'===== stderr ====={stderr}')


@_auto_retry
@_auto_retry()
def _internal_file_mounts(file_mounts: Dict,
runner: command_runner.CommandRunner,
log_path: str) -> None:
Expand Down
3 changes: 3 additions & 0 deletions sky/templates/aws-ray.yml.j2
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,9 @@ docker:
{%- if custom_resources is not none %}
--gpus all
{%- endif %}
{%- for run_option in docker_run_options %}
- {{run_option}}
{%- endfor %}
{%- if docker_login_config is not none %}
docker_login_config:
username: |-
Expand Down
3 changes: 3 additions & 0 deletions sky/templates/azure-ray.yml.j2
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,9 @@ docker:
{%- if custom_resources is not none %}
--gpus all
{%- endif %}
{%- for run_option in docker_run_options %}
- {{run_option}}
{%- endfor %}
{%- endif %}

provider:
Expand Down
3 changes: 3 additions & 0 deletions sky/templates/gcp-ray.yml.j2
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,9 @@ docker:
{%- if gpu is not none %}
--gpus all
{%- endif %}
{%- for run_option in docker_run_options %}
- {{run_option}}
{%- endfor %}
{%- if docker_login_config is not none %}
docker_login_config:
username: |-
Expand Down
3 changes: 3 additions & 0 deletions sky/templates/paperspace-ray.yml.j2
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,9 @@ docker:
{%- if custom_resources is not none %}
--gpus all
{%- endif %}
{%- for run_option in docker_run_options %}
- {{run_option}}
{%- endfor %}
{%- if docker_login_config is not none %}
docker_login_config:
username: |-
Expand Down
18 changes: 18 additions & 0 deletions sky/utils/schemas.py
Original file line number Diff line number Diff line change
Expand Up @@ -757,6 +757,23 @@ def get_config_schema():
}
}

# JSON schema for the `docker` section of the config. The only key it
# accepts is `run_options`, given either as one string or as a list of
# strings.
docker_configs = {
'type': 'object',
# No key is mandatory.
'required': [],
# Reject any key other than the ones listed under 'properties'.
'additionalProperties': False,
'properties': {
'run_options': {
# Accept either a single string or an array of strings.
'anyOf': [{
'type': 'string',
}, {
'type': 'array',
'items': {
'type': 'string',
}
}]
}
}
}
gpu_configs = {
'type': 'object',
'required': [],
Expand Down Expand Up @@ -785,6 +802,7 @@ def get_config_schema():
'spot': controller_resources_schema,
'serve': controller_resources_schema,
'allowed_clouds': allowed_clouds,
'docker': docker_configs,
'nvidia_gpus': gpu_configs,
**cloud_configs,
},
Expand Down

0 comments on commit bd383e9

Please sign in to comment.