From bd383e912a55f0afbd9cc3c239771dbbf3dcb900 Mon Sep 17 00:00:00 2001 From: Zhanghao Wu Date: Wed, 26 Jun 2024 01:47:21 -0700 Subject: [PATCH] [Core] Add docker run options (#3682) * Add docker run options * Add docs * Add warning for docker run options in kubernetes * Update docs/source/reference/config.rst Co-authored-by: Romil Bhardwaj * update * update doc * Stream logs * allow changing the `run_options` --------- Co-authored-by: Romil Bhardwaj --- docs/source/reference/config.rst | 25 +++++++++++++ sky/backends/backend_utils.py | 15 ++++++++ sky/provision/docker_utils.py | 6 ++-- sky/provision/instance_setup.py | 55 ++++++++++++++++------------- sky/templates/aws-ray.yml.j2 | 3 ++ sky/templates/azure-ray.yml.j2 | 3 ++ sky/templates/gcp-ray.yml.j2 | 3 ++ sky/templates/paperspace-ray.yml.j2 | 3 ++ sky/utils/schemas.py | 18 ++++++++++ 9 files changed, 105 insertions(+), 26 deletions(-) diff --git a/docs/source/reference/config.rst b/docs/source/reference/config.rst index 96be48e71e3..ea744f925f1 100644 --- a/docs/source/reference/config.rst +++ b/docs/source/reference/config.rst @@ -40,6 +40,31 @@ Available fields and semantics: - gcp - kubernetes + docker: + # Additional Docker run options (optional). + # + # When image_id: docker: is used in a task YAML, additional + # run options for starting the Docker container can be specified here. + # These options will be passed directly as command line args to `docker run`, + # see: https://docs.docker.com/reference/cli/docker/container/run/ + # + # The following run options are applied by default and cannot be overridden: + # --net=host + # --cap-add=SYS_ADMIN + # --device=/dev/fuse + # --security-opt=apparmor:unconfined + # --runtime=nvidia # Applied if nvidia GPUs are detected on the host + # + # This field can be useful for mounting volumes and other advanced Docker + # configurations. You can specify a list of arguments or a string, where the + # former will be combined into a single string with spaces. 
The following is + # an example option for allowing running Docker inside Docker and increasing + # the size of /dev/shm: + # sky launch --cloud aws --image-id docker:continuumio/miniconda3 "apt update; apt install -y docker.io; docker run hello-world" + run_options: + - -v /var/run/docker.sock:/var/run/docker.sock + - --shm-size=2g + nvidia_gpus: # Disable ECC for NVIDIA GPUs (optional). # diff --git a/sky/backends/backend_utils.py b/sky/backends/backend_utils.py index 0989a3f9122..e760132068b 100644 --- a/sky/backends/backend_utils.py +++ b/sky/backends/backend_utils.py @@ -146,6 +146,7 @@ # Clouds with new provisioner has docker_login_config in the # docker field, instead of the provider field. ('docker', 'docker_login_config'), + ('docker', 'run_options'), # Other clouds ('provider', 'docker_login_config'), ('provider', 'firewall_rule'), @@ -873,6 +874,17 @@ def write_cluster_config( f'open(os.path.expanduser("{constants.SKY_REMOTE_RAY_PORT_FILE}"), "w", encoding="utf-8"))\'' ) + # Docker run options + docker_run_options = skypilot_config.get_nested(('docker', 'run_options'), + []) + if isinstance(docker_run_options, str): + docker_run_options = [docker_run_options] + if docker_run_options and isinstance(to_provision.cloud, clouds.Kubernetes): + logger.warning(f'{colorama.Style.DIM}Docker run options are specified, ' + 'but ignored for Kubernetes: ' + f'{" ".join(docker_run_options)}' + f'{colorama.Style.RESET_ALL}') + # Use a tmp file path to avoid incomplete YAML file being re-used in the # future. initial_setup_commands = [] @@ -923,6 +935,9 @@ def write_cluster_config( wheel_hash).replace('{cloud}', str(cloud).lower())), + # Docker + 'docker_run_options': docker_run_options, + # Port of Ray (GCS server). # Ray's default port 6379 is conflicted with Redis.
'ray_port': constants.SKY_REMOTE_RAY_PORT, diff --git a/sky/provision/docker_utils.py b/sky/provision/docker_utils.py index 046800ca9d1..9fbc19c2959 100644 --- a/sky/provision/docker_utils.py +++ b/sky/provision/docker_utils.py @@ -176,8 +176,10 @@ def _run(self, subprocess_utils.handle_returncode( rc, cmd, - error_msg='Failed to run docker setup commands', - stderr=stdout + stderr) + error_msg='Failed to run docker setup commands.', + stderr=stdout + stderr, + # Print out the error message if the command failed. + stream_logs=True) return stdout.strip() def initialize(self) -> str: diff --git a/sky/provision/instance_setup.py b/sky/provision/instance_setup.py index c81ecd78db4..1fb80ba542a 100644 --- a/sky/provision/instance_setup.py +++ b/sky/provision/instance_setup.py @@ -6,8 +6,9 @@ import os import resource import time -from typing import Any, Dict, List, Optional, Tuple +from typing import Any, Callable, Dict, List, Optional, Tuple +from sky import exceptions from sky import provision from sky import sky_logging from sky.provision import common @@ -68,29 +69,34 @@ 'sky.skylet.attempt_skylet;') -def _auto_retry(func): +def _auto_retry(should_retry: Callable[[Exception], bool] = lambda _: True): """Decorator that retries the function if it fails. This decorator is mostly for SSH disconnection issues, which might happen during the setup of instances. 
""" - @functools.wraps(func) - def retry(*args, **kwargs): - backoff = common_utils.Backoff(initial_backoff=1, max_backoff_factor=5) - for retry_cnt in range(_MAX_RETRY): - try: - return func(*args, **kwargs) - except Exception as e: # pylint: disable=broad-except - if retry_cnt >= _MAX_RETRY - 1: - raise e - sleep = backoff.current_backoff() - logger.info( - f'{func.__name__}: Retrying in {sleep:.1f} seconds, ' - f'due to {e}') - time.sleep(sleep) - - return retry + def decorator(func): + + @functools.wraps(func) + def retry(*args, **kwargs): + backoff = common_utils.Backoff(initial_backoff=1, + max_backoff_factor=5) + for retry_cnt in range(_MAX_RETRY): + try: + return func(*args, **kwargs) + except Exception as e: # pylint: disable=broad-except + if not should_retry(e) or retry_cnt >= _MAX_RETRY - 1: + raise + sleep = backoff.current_backoff() + logger.info( + f'{func.__name__}: Retrying in {sleep:.1f} seconds, ' + f'due to {e}') + time.sleep(sleep) + + return retry + + return decorator def _log_start_end(func): @@ -156,7 +162,8 @@ def initialize_docker(cluster_name: str, docker_config: Dict[str, Any], return None _hint_worker_log_path(cluster_name, cluster_info, 'initialize_docker') - @_auto_retry + @_auto_retry(should_retry=lambda e: isinstance(e, exceptions.CommandError) + and e.returncode == 255) def _initialize_docker(runner: command_runner.CommandRunner, log_path: str): docker_user = docker_utils.DockerInitializer(docker_config, runner, log_path).initialize() @@ -193,7 +200,7 @@ def setup_runtime_on_cluster(cluster_name: str, setup_commands: List[str], hasher.update(d) digest = hasher.hexdigest() - @_auto_retry + @_auto_retry() def _setup_node(runner: command_runner.CommandRunner, log_path: str): for cmd in setup_commands: returncode, stdout, stderr = runner.run( @@ -254,7 +261,7 @@ def _ray_gpu_options(custom_resource: str) -> str: @_log_start_end -@_auto_retry +@_auto_retry() def start_ray_on_head_node(cluster_name: str, custom_resource: Optional[str], 
cluster_info: common.ClusterInfo, ssh_credentials: Dict[str, Any]) -> None: @@ -314,7 +321,7 @@ def start_ray_on_head_node(cluster_name: str, custom_resource: Optional[str], @_log_start_end -@_auto_retry +@_auto_retry() def start_ray_on_worker_nodes(cluster_name: str, no_restart: bool, custom_resource: Optional[str], ray_port: int, cluster_info: common.ClusterInfo, @@ -411,7 +418,7 @@ def _setup_ray_worker(runner_and_id: Tuple[command_runner.CommandRunner, @_log_start_end -@_auto_retry +@_auto_retry() def start_skylet_on_head_node(cluster_name: str, cluster_info: common.ClusterInfo, ssh_credentials: Dict[str, Any]) -> None: @@ -437,7 +444,7 @@ def start_skylet_on_head_node(cluster_name: str, f'===== stderr ====={stderr}') -@_auto_retry +@_auto_retry() def _internal_file_mounts(file_mounts: Dict, runner: command_runner.CommandRunner, log_path: str) -> None: diff --git a/sky/templates/aws-ray.yml.j2 b/sky/templates/aws-ray.yml.j2 index 778c64f6926..ac84f8a4fd3 100644 --- a/sky/templates/aws-ray.yml.j2 +++ b/sky/templates/aws-ray.yml.j2 @@ -14,6 +14,9 @@ docker: {%- if custom_resources is not none %} --gpus all {%- endif %} + {%- for run_option in docker_run_options %} + - {{run_option}} + {%- endfor %} {%- if docker_login_config is not none %} docker_login_config: username: |- diff --git a/sky/templates/azure-ray.yml.j2 b/sky/templates/azure-ray.yml.j2 index 803327f1032..66eac439453 100644 --- a/sky/templates/azure-ray.yml.j2 +++ b/sky/templates/azure-ray.yml.j2 @@ -14,6 +14,9 @@ docker: {%- if custom_resources is not none %} --gpus all {%- endif %} + {%- for run_option in docker_run_options %} + - {{run_option}} + {%- endfor %} {%- endif %} provider: diff --git a/sky/templates/gcp-ray.yml.j2 b/sky/templates/gcp-ray.yml.j2 index 42f1d179498..e01ed351bfa 100644 --- a/sky/templates/gcp-ray.yml.j2 +++ b/sky/templates/gcp-ray.yml.j2 @@ -15,6 +15,9 @@ docker: {%- if gpu is not none %} --gpus all {%- endif %} + {%- for run_option in docker_run_options %} + - {{run_option}} 
+ {%- endfor %} {%- if docker_login_config is not none %} docker_login_config: username: |- diff --git a/sky/templates/paperspace-ray.yml.j2 b/sky/templates/paperspace-ray.yml.j2 index 005f30b5233..400714978b9 100644 --- a/sky/templates/paperspace-ray.yml.j2 +++ b/sky/templates/paperspace-ray.yml.j2 @@ -14,6 +14,9 @@ docker: {%- if custom_resources is not none %} --gpus all {%- endif %} + {%- for run_option in docker_run_options %} + - {{run_option}} + {%- endfor %} {%- if docker_login_config is not none %} docker_login_config: username: |- diff --git a/sky/utils/schemas.py b/sky/utils/schemas.py index 97b46113da4..2f1dd649ade 100644 --- a/sky/utils/schemas.py +++ b/sky/utils/schemas.py @@ -757,6 +757,23 @@ def get_config_schema(): } } + docker_configs = { + 'type': 'object', + 'required': [], + 'additionalProperties': False, + 'properties': { + 'run_options': { + 'anyOf': [{ + 'type': 'string', + }, { + 'type': 'array', + 'items': { + 'type': 'string', + } + }] + } + } + } gpu_configs = { 'type': 'object', 'required': [], @@ -785,6 +802,7 @@ def get_config_schema(): 'spot': controller_resources_schema, 'serve': controller_resources_schema, 'allowed_clouds': allowed_clouds, + 'docker': docker_configs, 'nvidia_gpus': gpu_configs, **cloud_configs, },