From 5ed33f9ce7f3ec2b143b61540183dd14244adb3d Mon Sep 17 00:00:00 2001 From: Zhanghao Wu Date: Wed, 25 Oct 2023 13:27:16 -0700 Subject: [PATCH 01/17] [Setup] Make setup more robust when there is no permission to get/write commit hash (#2731) * avoid breaking installation * Add logging * format * constant for message --- sky/setup_files/setup.py | 49 ++++++++++++++++++++++++++++------------ 1 file changed, 35 insertions(+), 14 deletions(-) diff --git a/sky/setup_files/setup.py b/sky/setup_files/setup.py index fed9ccc1cf3..4e8dc145219 100644 --- a/sky/setup_files/setup.py +++ b/sky/setup_files/setup.py @@ -19,12 +19,18 @@ import platform import re import subprocess +import sys from typing import Dict, List import setuptools ROOT_DIR = os.path.dirname(__file__) INIT_FILE_PATH = os.path.join(ROOT_DIR, 'sky', '__init__.py') +_COMMIT_FAILURE_MESSAGE = ( + 'WARNING: SkyPilot fail to {verb} the commit hash in ' + f'{INIT_FILE_PATH!r} (SkyPilot can still be normally used): ' + '{error}') + original_init_content = None system = platform.system() @@ -70,28 +76,43 @@ def get_commit_hash(): if changes: commit_hash += '-dirty' return commit_hash - except Exception: # pylint: disable=broad-except + except Exception as e: # pylint: disable=broad-except + print(_COMMIT_FAILURE_MESSAGE.format(verb='get', error=str(e)), + file=sys.stderr) return commit_hash def replace_commit_hash(): """Fill in the commit hash in the __init__.py file.""" - with open(INIT_FILE_PATH, 'r') as fp: - content = fp.read() - global original_init_content - original_init_content = content - content = re.sub(r'^_SKYPILOT_COMMIT_SHA = [\'"]([^\'"]*)[\'"]', - f'_SKYPILOT_COMMIT_SHA = \'{get_commit_hash()}\'', - content, - flags=re.M) - with open(INIT_FILE_PATH, 'w') as fp: - fp.write(content) + try: + with open(INIT_FILE_PATH, 'r') as fp: + content = fp.read() + global original_init_content + original_init_content = content + content = re.sub(r'^_SKYPILOT_COMMIT_SHA = [\'"]([^\'"]*)[\'"]', + f'_SKYPILOT_COMMIT_SHA = \'{get_commit_hash()}\'', + content, + flags=re.M) + with open(INIT_FILE_PATH, 'w') as fp: + fp.write(content) + except Exception as e: # pylint: disable=broad-except + # Avoid breaking the installation when there is no permission to write + # the file. + print(_COMMIT_FAILURE_MESSAGE.format(verb='replace', error=str(e)), + file=sys.stderr) + pass def revert_commit_hash(): - if original_init_content is not None: - with open(INIT_FILE_PATH, 'w') as fp: - fp.write(original_init_content) + try: + if original_init_content is not None: + with open(INIT_FILE_PATH, 'w') as fp: + fp.write(original_init_content) + except Exception as e: # pylint: disable=broad-except + # Avoid breaking the installation when there is no permission to write + # the file. 
+ print(_COMMIT_FAILURE_MESSAGE.format(verb='replace', error=str(e)), + file=sys.stderr) def parse_readme(readme: str) -> str: From acd5ab70f5ac0249e0f18a05855164091fba796a Mon Sep 17 00:00:00 2001 From: Zhanghao Wu Date: Wed, 25 Oct 2023 15:51:09 -0700 Subject: [PATCH 02/17] [Dependency] Fix pandas dependency and dev requirements (#2679) dependency fix --- requirements-dev.txt | 4 +++- sky/setup_files/setup.py | 2 +- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/requirements-dev.txt b/requirements-dev.txt index 8fbf9828f08..6d2007d556b 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -11,7 +11,9 @@ isort==5.12.0 # type checking mypy==0.991 types-PyYAML -types-requests +# 2.31 requires urlib3>2, which is incompatible with SkyPilot, IBM and +# kubernetes packages, which require urllib3<2. +types-requests<2.31 types-setuptools types-cachetools diff --git a/sky/setup_files/setup.py b/sky/setup_files/setup.py index 4e8dc145219..920faf66132 100644 --- a/sky/setup_files/setup.py +++ b/sky/setup_files/setup.py @@ -145,7 +145,7 @@ def parse_readme(readme: str) -> str: 'jinja2 >= 3.0', 'jsonschema', 'networkx', - 'pandas', + 'pandas>=1.3.0', 'pendulum', # PrettyTable with version >=2.0.0 is required for the support of # `add_rows` method. From 1ec056ee6c96f5e61a70bc76f7cb99716853a288 Mon Sep 17 00:00:00 2001 From: Zhanghao Wu Date: Fri, 27 Oct 2023 17:35:14 -0700 Subject: [PATCH 03/17] [Provisioner] Fix cache for internal file mounts (#2715) * Fix file mounts * print out the skylet version for better debugging ability * Add skypilot version --- sky/backends/cloud_vm_ray_backend.py | 2 -- sky/provision/instance_setup.py | 25 +++++++++++++++---------- sky/provision/metadata_utils.py | 8 ++++++-- sky/provision/provisioner.py | 17 +++-------------- sky/skylet/skylet.py | 5 ++++- 5 files changed, 28 insertions(+), 29 deletions(-) diff --git a/sky/backends/cloud_vm_ray_backend.py b/sky/backends/cloud_vm_ray_backend.py index a37b0d4708f..e98a82ea5e9 100644 --- a/sky/backends/cloud_vm_ray_backend.py +++ b/sky/backends/cloud_vm_ray_backend.py @@ -2904,8 +2904,6 @@ def _provision( provisioner.ClusterName(handle.cluster_name, handle.cluster_name_on_cloud), handle.cluster_yaml, - local_wheel_path=local_wheel_path, - wheel_hash=wheel_hash, provision_record=provision_record, custom_resource=resources_vars.get('custom_resources'), log_dir=self.log_dir) diff --git a/sky/provision/instance_setup.py b/sky/provision/instance_setup.py index ca62d949844..9a21f6ed9ca 100644 --- a/sky/provision/instance_setup.py +++ b/sky/provision/instance_setup.py @@ -93,7 +93,8 @@ def _hint_worker_log_path(cluster_name: str, cluster_info: common.ClusterInfo, def _parallel_ssh_with_cache(func, cluster_name: str, stage_name: str, - digest: str, cluster_info: common.ClusterInfo, + digest: Optional[str], + cluster_info: common.ClusterInfo, ssh_credentials: Dict[str, Any]) -> List[Any]: with futures.ThreadPoolExecutor(max_workers=32) as pool: results = [] @@ -140,7 +141,7 @@ def _initialize_docker(runner: command_runner.SSHCommandRunner, stage_name='initialize_docker', # Should not cache docker setup, as it needs to be # run every time a cluster is restarted. 
- digest=str(time.time()), + digest=None, cluster_info=cluster_info, ssh_credentials=ssh_credentials) logger.debug(f'All docker users: {docker_users}') @@ -372,8 +373,7 @@ def _internal_file_mounts(file_mounts: Dict, @_log_start_end def internal_file_mounts(cluster_name: str, common_file_mounts: Dict, cluster_info: common.ClusterInfo, - ssh_credentials: Dict[str, - str], wheel_hash: str) -> None: + ssh_credentials: Dict[str, str]) -> None: """Executes file mounts - rsyncing internal local files""" _hint_worker_log_path(cluster_name, cluster_info, 'internal_file_mounts') @@ -382,9 +382,14 @@ def _setup_node(runner: command_runner.SSHCommandRunner, del metadata _internal_file_mounts(common_file_mounts, runner, log_path) - _parallel_ssh_with_cache(_setup_node, - cluster_name, - stage_name='internal_file_mounts', - digest=wheel_hash, - cluster_info=cluster_info, - ssh_credentials=ssh_credentials) + _parallel_ssh_with_cache( + _setup_node, + cluster_name, + stage_name='internal_file_mounts', + # Do not cache the file mounts, as the cloud + # credentials may change, and we should always + # update the remote files. The internal file_mounts + # is minimal and should not take too much time. + digest=None, + cluster_info=cluster_info, + ssh_credentials=ssh_credentials) diff --git a/sky/provision/metadata_utils.py b/sky/provision/metadata_utils.py index e534e24d37a..4a9d3c90ffc 100644 --- a/sky/provision/metadata_utils.py +++ b/sky/provision/metadata_utils.py @@ -4,6 +4,7 @@ import functools import pathlib import shutil +from typing import Optional from sky import sky_logging @@ -30,7 +31,7 @@ def _get_instance_metadata_dir(cluster_name: str, def cache_func(cluster_name: str, instance_id: str, stage_name: str, - hash_str: str): + hash_str: Optional[str]): """A helper function for caching function execution.""" def decorator(function): @@ -51,8 +52,11 @@ def wrapper(*args, **kwargs): @contextlib.contextmanager def check_cache_hash_or_update(cluster_name: str, instance_id: str, - stage_name: str, hash_str: str): + stage_name: str, hash_str: Optional[str]): """A decorator for 'cache_func'.""" + if hash_str is None: + yield True + return path = get_instance_cache_dir(cluster_name, instance_id) / stage_name if path.exists(): with open(path) as f: diff --git a/sky/provision/provisioner.py b/sky/provision/provisioner.py index 094ef33d81f..cc80ce0dbd5 100644 --- a/sky/provision/provisioner.py +++ b/sky/provision/provisioner.py @@ -3,7 +3,6 @@ import dataclasses import json import os -import pathlib import shlex import socket import subprocess @@ -311,7 +310,6 @@ def wait_for_ssh(cluster_info: provision_common.ClusterInfo, def _post_provision_setup( cloud_name: str, cluster_name: ClusterName, cluster_yaml: str, - local_wheel_path: pathlib.Path, wheel_hash: str, provision_record: provision_common.ProvisionRecord, custom_resource: Optional[str]) -> provision_common.ClusterInfo: cluster_info = provision.get_cluster_info(cloud_name, @@ -388,21 +386,15 @@ def _post_provision_setup( # (3) all instances need permission to mount storage for all clouds # It is possible to have a "smaller" permission model, but we leave that # for later. 
- file_mounts = { - backend_utils.SKY_REMOTE_PATH + '/' + wheel_hash: - str(local_wheel_path), - **config_from_yaml.get('file_mounts', {}) - } + file_mounts = config_from_yaml.get('file_mounts', {}) runtime_preparation_str = ('[bold cyan]Preparing SkyPilot ' 'runtime ({step}/3 - {step_name})') status.update( runtime_preparation_str.format(step=1, step_name='initializing')) instance_setup.internal_file_mounts(cluster_name.name_on_cloud, - file_mounts, - cluster_info, - ssh_credentials, - wheel_hash=wheel_hash) + file_mounts, cluster_info, + ssh_credentials) status.update( runtime_preparation_str.format(step=2, step_name='dependencies')) @@ -464,7 +456,6 @@ def _post_provision_setup( def post_provision_runtime_setup( cloud_name: str, cluster_name: ClusterName, cluster_yaml: str, - local_wheel_path: pathlib.Path, wheel_hash: str, provision_record: provision_common.ProvisionRecord, custom_resource: Optional[str], log_dir: str) -> provision_common.ClusterInfo: @@ -483,8 +474,6 @@ def post_provision_runtime_setup( return _post_provision_setup(cloud_name, cluster_name, cluster_yaml=cluster_yaml, - local_wheel_path=local_wheel_path, - wheel_hash=wheel_hash, provision_record=provision_record, custom_resource=custom_resource) except Exception: # pylint: disable=broad-except diff --git a/sky/skylet/skylet.py b/sky/skylet/skylet.py index 6bbb51e7a37..a36eb921660 100644 --- a/sky/skylet/skylet.py +++ b/sky/skylet/skylet.py @@ -2,14 +2,17 @@ import time +import sky from sky import sky_logging +from sky.skylet import constants from sky.skylet import events # Use the explicit logger name so that the logger is under the # `sky.skylet.skylet` namespace when executed directly, so as # to inherit the setup from the `sky` logger. logger = sky_logging.init_logger('sky.skylet.skylet') -logger.info('skylet started') +logger.info(f'Skylet started with version {constants.SKYLET_VERSION}; ' + f'SkyPilot v{sky.__version__} (commit: {sky.__commit__})') EVENTS = [ events.AutostopEvent(), From da2d6ecf490c8503f55c5ee54fbdc544fc622ff1 Mon Sep 17 00:00:00 2001 From: aseriesof-tubes <110536383+aseriesof-tubes@users.noreply.github.com> Date: Wed, 1 Nov 2023 15:39:35 -0400 Subject: [PATCH 04/17] Added confirmation prompt for sky storage delete, and --yes flag to skip it (#2726) * for issue 2708 * for issue 2708 * fixed tests * formatting * format * format * Update sky/cli.py Co-authored-by: Doyoung Kim <34902420+landscapepainter@users.noreply.github.com> * cli changes * format * final touches to delete * final changes * additional changes made for more cases * Update tests/test_smoke.py Co-authored-by: Doyoung Kim <34902420+landscapepainter@users.noreply.github.com> * final changes * formatting * format * Update sky/cli.py Co-authored-by: Doyoung Kim <34902420+landscapepainter@users.noreply.github.com> * refactored --------- Co-authored-by: Andrew Kim Co-authored-by: Doyoung Kim <34902420+landscapepainter@users.noreply.github.com> Co-authored-by: Andrew Kim Co-authored-by: Andrew Kim --- sky/cli.py | 25 ++++++++++++++++++++----- tests/test_smoke.py | 11 ++++++----- 2 files changed, 26 insertions(+), 10 deletions(-) diff --git a/sky/cli.py b/sky/cli.py index 86396149685..33bfc27b232 100644 --- a/sky/cli.py +++ b/sky/cli.py @@ -136,9 +136,6 @@ def _get_glob_storages(storages: List[str]) -> List[str]: glob_storage = global_user_state.get_glob_storage_name(storage_object) if len(glob_storage) == 0: click.echo(f'Storage {storage_object} not found.') - else: - plural = 's' if len(glob_storage) > 1 else '' - click.echo(f'Deleting 
{len(glob_storage)} storage object{plural}.') glob_storages.extend(glob_storage) return list(set(glob_storages)) @@ -3457,8 +3454,14 @@ def storage_ls(all: bool): is_flag=True, required=False, help='Delete all storage objects.') +@click.option('--yes', + '-y', + default=False, + is_flag=True, + required=False, + help='Skip confirmation prompt.') @usage_lib.entrypoint -def storage_delete(names: List[str], all: bool): # pylint: disable=redefined-builtin +def storage_delete(names: List[str], all: bool, yes: bool): # pylint: disable=redefined-builtin """Delete storage objects. Examples: @@ -3477,11 +3480,23 @@ def storage_delete(names: List[str], all: bool): # pylint: disable=redefined-bu if sum([len(names) > 0, all]) != 1: raise click.UsageError('Either --all or a name must be specified.') if all: - click.echo('Deleting all storage objects.') storages = sky.storage_ls() + if not storages: + click.echo('No storage(s) to delete.') + return names = [s['name'] for s in storages] else: names = _get_glob_storages(names) + if names: + if not yes: + storage_names = ', '.join(names) + storage_str = 'storages' if len(names) > 1 else 'storage' + click.confirm( + f'Deleting {len(names)} {storage_str}: ' + f'{storage_names}. Proceed?', + default=True, + abort=True, + show_default=True) subprocess_utils.run_in_parallel(sky.storage_delete, names) diff --git a/tests/test_smoke.py b/tests/test_smoke.py index aecd5371de0..c9e9fe14ae5 100644 --- a/tests/test_smoke.py +++ b/tests/test_smoke.py @@ -3062,7 +3062,7 @@ def test_new_bucket_creation_and_deletion(self, tmp_local_storage_obj, # Run sky storage delete to delete the storage object subprocess.check_output( - ['sky', 'storage', 'delete', tmp_local_storage_obj.name]) + ['sky', 'storage', 'delete', tmp_local_storage_obj.name, '--yes']) # Run sky storage ls to check if storage object is deleted out = subprocess.check_output(['sky', 'storage', 'ls']) @@ -3094,7 +3094,7 @@ def test_multiple_buckets_creation_and_deletion( assert all([item in out for item in storage_obj_name]) # Run sky storage delete all to delete all storage objects - delete_cmd = ['sky', 'storage', 'delete'] + delete_cmd = ['sky', 'storage', 'delete', '--yes'] delete_cmd += storage_obj_name subprocess.check_output(delete_cmd) @@ -3129,7 +3129,7 @@ def test_bucket_external_deletion(self, tmp_scratch_storage_obj, # Run sky storage delete to delete the storage object out = subprocess.check_output( - ['sky', 'storage', 'delete', tmp_scratch_storage_obj.name]) + ['sky', 'storage', 'delete', tmp_scratch_storage_obj.name, '--yes']) # Make sure bucket was not created during deletion (see issue #1322) assert 'created' not in out.decode('utf-8').lower() @@ -3147,8 +3147,9 @@ def test_bucket_bulk_deletion(self, store_type, tmp_bulk_del_storage_obj): # files and folders to a new bucket, then delete bucket. 
tmp_bulk_del_storage_obj.add_store(store_type) - subprocess.check_output( - ['sky', 'storage', 'delete', tmp_bulk_del_storage_obj.name]) + subprocess.check_output([ + 'sky', 'storage', 'delete', tmp_bulk_del_storage_obj.name, '--yes' + ]) output = subprocess.check_output(['sky', 'storage', 'ls']) assert tmp_bulk_del_storage_obj.name not in output.decode('utf-8') From 53ba415bb2f6ee72dcbe23edee765bcf6f1e1fe3 Mon Sep 17 00:00:00 2001 From: Tian Xia Date: Thu, 2 Nov 2023 13:52:04 -0700 Subject: [PATCH 05/17] [Docs] Refactoring Docker Container Docs (#2747) * upd docs * Update * Apply suggestions from code review Co-authored-by: Zongheng Yang * move "Building containers remotely" and add private registry for containerized apps * add multi-node scenario * Updates * add ref link and requirements for image id --------- Co-authored-by: Zongheng Yang --- docs/source/examples/docker-containers.rst | 146 +++++++++++++++------ 1 file changed, 103 insertions(+), 43 deletions(-) diff --git a/docs/source/examples/docker-containers.rst b/docs/source/examples/docker-containers.rst index 27506b1d81c..b9713c2a703 100644 --- a/docs/source/examples/docker-containers.rst +++ b/docs/source/examples/docker-containers.rst @@ -3,73 +3,59 @@ Using Docker Containers ======================= -SkyPilot can run a Docker container either as the runtime environment for your task, or as the task itself. +SkyPilot can run a container either as a task, or as the runtime environment of a cluster. -Using Docker Containers as Runtime Environment ----------------------------------------------- +* If the container image is invocable / has an entrypoint: run it :ref:`as a task `. +* Otherwise, the container image is likely to be used as a runtime environment (e.g., ``ubuntu``) and you likely have extra commands to run inside the container: run it :ref:`as a runtime environment `. -When a container is used as the runtime environment, the SkyPilot task is executed inside the container. +.. _docker-containers-as-tasks: -This means all :code:`setup` and :code:`run` commands in the YAML file will be executed in the container, and any files created by the task will be stored inside the container. -Any GPUs assigned to the task will be automatically mapped to your Docker container and all future tasks on the cluster will also execute in the container. +Running Containers as Tasks +--------------------------- -To use a Docker image as your runtime environment, set the :code:`image_id` field in the :code:`resources` section of your task YAML file to :code:`docker:`. -For example, to use the :code:`ubuntu:20.04` image from Docker Hub: - -.. code-block:: yaml - - resources: - image_id: docker:ubuntu:20.04 +SkyPilot can run containerized applications directly as regular tasks. The default VM images provided by SkyPilot already have the Docker runtime pre-configured. - setup: | - # Will run inside container - - run: | - # Will run inside container +To launch a containerized application, you can directly invoke :code:`docker run` in the :code:`run` section of your task. -For Docker images hosted on private registries, you can provide the registry authentication details using :ref:`task environment variables `: +For example, to run a HuggingFace TGI serving container: .. 
code-block:: yaml - # ecr_private_docker.yaml resources: - image_id: docker:.dkr.ecr.us-east-1.amazonaws.com/: - # the following shorthand is also supported: - # image_id: docker:: - - envs: - SKYPILOT_DOCKER_USERNAME: AWS - # SKYPILOT_DOCKER_PASSWORD: - SKYPILOT_DOCKER_SERVER: .dkr.ecr.us-east-1.amazonaws.com + accelerators: A100:1 -We suggest setting the :code:`SKYPILOT_DOCKER_PASSWORD` environment variable through the CLI (see :ref:`passing secrets `): - -.. code-block:: console - - $ export SKYPILOT_DOCKER_PASSWORD=$(aws ecr get-login-password --region us-east-1) - $ sky launch ecr_private_docker.yaml --env SKYPILOT_DOCKER_PASSWORD - -Running Docker Containers as Tasks ----------------------------------- + run: | + docker run --gpus all --shm-size 1g -v ~/data:/data \ + ghcr.io/huggingface/text-generation-inference \ + --model-id lmsys/vicuna-13b-v1.5 -As an alternative, SkyPilot can run docker containers as tasks. Docker runtime is configured and ready for use on the default VM image used by SkyPilot. + # NOTE: Uncommon to have any commands after the above. + # `docker run` is blocking, so any commands after it + # will NOT be run inside the container. -To run a container as a task, you can directly invoke the :code:`docker run` command in the :code:`run` section of your task. +Private Registries +^^^^^^^^^^^^^^^^^^ -For example, to run a GPU-accelerated container that prints the output of :code:`nvidia-smi`: +When using this mode, to access Docker images hosted on private registries, +simply add a :code:`setup` section to your task YAML file to authenticate with +the registry: .. code-block:: yaml resources: - accelerators: V100:1 + accelerators: A100:1 + + setup: | + # Authenticate with private registry + docker login -u -p run: | - docker run --rm --gpus all nvidia/cuda:11.6.2-base-ubuntu20.04 nvidia-smi + docker run /: Building containers remotely ^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -If you are running the container as a task, the container image can also be built remotely on the cluster in the :code:`setup` phase of the task. +If you are running containerized applications, the container image can also be built remotely on the cluster in the :code:`setup` phase of the task. The :code:`echo_app` `example `_ provides an example on how to do this: @@ -100,3 +86,77 @@ The inputs to the app are copied to SkyPilot using :code:`file_mounts` and mount The output of the app produced at :code:`/outputs` path in the container is also volume mounted to :code:`/outputs` on the VM, which gets directly written to a S3 bucket through SkyPilot Storage mounting. Our GitHub repository has more examples, including running `Detectron2 in a Docker container `_ via SkyPilot. + +.. _docker-containers-as-runtime-environments: + +Using Containers as Runtime Environments +---------------------------------------- + +When a container is used as the runtime environment, everything happens inside the container: + +- The SkyPilot runtime is automatically installed and launched inside the container; +- :code:`setup` and :code:`run` commands are executed in the container; +- Any files created by the task will be stored inside the container. + +To use a Docker image as your runtime environment, set the :code:`image_id` field in the :code:`resources` section of your task YAML file to :code:`docker:`. +For example, to use the :code:`ubuntu:20.04` image from Docker Hub: + +.. 
code-block:: yaml + + resources: + image_id: docker:ubuntu:20.04 + + setup: | + # Commands to run inside the container + + run: | + # Commands to run inside the container + +Any GPUs assigned to the task will be automatically mapped to your Docker container, and all subsequent tasks within the cluster will also run inside the container. In a multi-node scenario, the container will be launched on all nodes, and the corresponding node's container will be assigned for task execution. + +.. tip:: + + **When to use this?** + + If you have a preconfigured development environment set up within a Docker + image, it can be convenient to use the runtime environment mode. This is + especially useful for launching development environments that are + challenging to configure on a new virtual machine, such as dependencies on + specific versions of CUDA or cuDNN. + +.. note:: + + Since we ``pip install skypilot`` inside the user-specified container image + as part of a launch, users should ensure dependency conflicts do not occur. + + Currently, the following requirements must be met: + + 1. The container image should be based on Debian; + + 2. The container image must grant sudo permissions without requiring password authentication for the user. Having a root user is also acceptable. + +Private Registries +^^^^^^^^^^^^^^^^^^ + +When using this mode, to access Docker images hosted on private registries, +you can provide the registry authentication details using :ref:`task environment variables `: + +.. code-block:: yaml + + # ecr_private_docker.yaml + resources: + image_id: docker:.dkr.ecr.us-east-1.amazonaws.com/: + # the following shorthand is also supported: + # image_id: docker:: + + envs: + SKYPILOT_DOCKER_USERNAME: AWS + # SKYPILOT_DOCKER_PASSWORD: + SKYPILOT_DOCKER_SERVER: .dkr.ecr.us-east-1.amazonaws.com + +We suggest setting the :code:`SKYPILOT_DOCKER_PASSWORD` environment variable through the CLI (see :ref:`passing secrets `): + +.. 
code-block:: console + + $ export SKYPILOT_DOCKER_PASSWORD=$(aws ecr get-login-password --region us-east-1) + $ sky launch ecr_private_docker.yaml --env SKYPILOT_DOCKER_PASSWORD From 2d59e3afb9f7f2128b44aa603ae4898a519f69fb Mon Sep 17 00:00:00 2001 From: Zhanghao Wu Date: Sun, 5 Nov 2023 10:36:35 -0600 Subject: [PATCH 06/17] [Config] Fix ssh proxy command to be null (#2756) * Fix the case where the ssh proxy command is null * Add test * format * format * fix test --- sky/utils/schemas.py | 9 ++++++++- tests/test_config.py | 25 +++++++++++++++++++++++++ 2 files changed, 33 insertions(+), 1 deletion(-) diff --git a/sky/utils/schemas.py b/sky/utils/schemas.py index 0267b7a26a7..20453c49e70 100644 --- a/sky/utils/schemas.py +++ b/sky/utils/schemas.py @@ -323,7 +323,14 @@ def get_config_schema(): 'type': 'object', 'required': [], 'additionalProperties': { - 'type': 'string', + 'anyOf': [ + { + 'type': 'string' + }, + { + 'type': 'null' + }, + ] } }] }, diff --git a/tests/test_config.py b/tests/test_config.py index b4053523dc3..52ff1767648 100644 --- a/tests/test_config.py +++ b/tests/test_config.py @@ -58,6 +58,31 @@ def test_empty_config(monkeypatch, tmp_path) -> None: _check_empty_config() +def test_valid_null_proxy_config(monkeypatch, tmp_path) -> None: + """Test that the config is not loaded if the config file is empty.""" + with open(tmp_path / 'valid.yaml', 'w') as f: + f.write(f"""\ + aws: + instance_tags: + mytag: myvalue + ssh_proxy_command: + eu-west-1: null + us-east-1: null + use_internal_ips: true + vpc_name: abc + + spot: + controller: + resources: + disk_size: 256 + """) + monkeypatch.setattr(skypilot_config, 'CONFIG_PATH', tmp_path / 'valid.yaml') + _reload_config() + proxy_config = skypilot_config.get_nested( + ('aws', 'ssh_proxy_command', 'eu-west-1'), 'default') + assert proxy_config is None, proxy_config + + def test_invalid_field_config(monkeypatch, tmp_path) -> None: """Test that the config is not loaded if the config file contains unknown field.""" config_path = tmp_path / 'invalid.yaml' From d80c47ba74506c625a54d87a8ebc855e57791b4b Mon Sep 17 00:00:00 2001 From: Zongheng Yang Date: Sun, 5 Nov 2023 18:53:12 -0800 Subject: [PATCH 07/17] Docs: add a hint to customizing spot controller. (#2753) --- docs/source/examples/spot-jobs.rst | 24 ++++++++++++++++-------- 1 file changed, 16 insertions(+), 8 deletions(-) diff --git a/docs/source/examples/spot-jobs.rst b/docs/source/examples/spot-jobs.rst index c62246f97dc..83e01fa4054 100644 --- a/docs/source/examples/spot-jobs.rst +++ b/docs/source/examples/spot-jobs.rst @@ -275,26 +275,34 @@ you can still tear it down manually with Customizing spot controller resources ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -You may customize the resources of the spot controller for the following reasons: +You may want to customize the resources of the spot controller for several reasons: -1. Enforcing the spot controller to run on a specific location. (Default: cheapest location) -2. Changing the maximum number of spot jobs that can be run concurrently. (Default: 16) -3. Changing the disk_size of the spot controller to store more logs. (Default: 50GB) +1. Use a lower-cost controller (if you have a low number of concurrent spot jobs). +2. Enforcing the spot controller to run on a specific location. (Default: cheapest location) +3. Changing the maximum number of spot jobs that can be run concurrently, which is 2x the vCPUs of the controller. (Default: 16) +4. Changing the disk_size of the spot controller to store more logs. 
(Default: 50GB) To achieve the above, you can specify custom configs in :code:`~/.sky/config.yaml` with the following fields: .. code-block:: yaml spot: + # NOTE: these settings only take effect for a new spot controller, not if + # you have an existing one. controller: resources: - # All configs below are optional - # 1. Specify the location of the spot controller. + # All configs below are optional. + # Specify the location of the spot controller. cloud: gcp region: us-central1 - # 2. Specify the maximum number of spot jobs that can be run concurrently. + # Specify the maximum number of spot jobs that can be run concurrently. cpus: 4+ # number of vCPUs, max concurrent spot jobs = 2 * cpus - # 3. Specify the disk_size of the spot controller. + # Specify the disk_size in GB of the spot controller. disk_size: 100 The :code:`resources` field has the same spec as a normal SkyPilot job; see `here `__. + +.. note:: + These settings will not take effect if you have an existing controller (either + stopped or live). For them to take effect, tear down the existing controller + first, which requires all in-progress spot jobs to finish or be canceled. From 5a35ab6a1f0c82a5dcecf280dfc1f28c51d8eeec Mon Sep 17 00:00:00 2001 From: Romil Bhardwaj Date: Mon, 6 Nov 2023 12:24:39 -0800 Subject: [PATCH 08/17] [Examples] Add docker compose example to run multiple containers (#2745) * Add docker compose example * newline * Add repo setup for AWS * Change CUDA to 11.5 for azure compat * Add device_id * typo --- examples/docker/compose/compose_example.yaml | 30 ++++++++++++++++++++ examples/docker/compose/docker-compose.yml | 24 ++++++++++++++++ 2 files changed, 54 insertions(+) create mode 100644 examples/docker/compose/compose_example.yaml create mode 100644 examples/docker/compose/docker-compose.yml diff --git a/examples/docker/compose/compose_example.yaml b/examples/docker/compose/compose_example.yaml new file mode 100644 index 00000000000..497ab105a23 --- /dev/null +++ b/examples/docker/compose/compose_example.yaml @@ -0,0 +1,30 @@ +# Minimal example using docker compose to run multiple containers +# +# Syncs the docker-compose.yml in the current workdir to the remote cluster and +# runs docker compose up. +# +# Usage: +# sky launch -c myclus compose_example.yaml +# sky down myclus + +resources: + accelerators: T4:2 + +workdir: . + +setup: | + sudo apt-get update + sudo apt-get install -y ca-certificates curl gnupg + sudo install -m 0755 -d /etc/apt/keyrings + curl -fsSL https://download.docker.com/linux/ubuntu/gpg | sudo gpg --dearmor -o /etc/apt/keyrings/docker.gpg + sudo chmod a+r /etc/apt/keyrings/docker.gpg + + echo \ + "deb [arch="$(dpkg --print-architecture)" signed-by=/etc/apt/keyrings/docker.gpg] https://download.docker.com/linux/ubuntu \ + "$(. 
/etc/os-release && echo "$VERSION_CODENAME")" stable" | \ + sudo tee /etc/apt/sources.list.d/docker.list > /dev/null + sudo apt-get update + sudo apt-get install -y docker-compose-plugin + +run: | + docker compose -f docker-compose.yml up diff --git a/examples/docker/compose/docker-compose.yml b/examples/docker/compose/docker-compose.yml new file mode 100644 index 00000000000..daf87bd704f --- /dev/null +++ b/examples/docker/compose/docker-compose.yml @@ -0,0 +1,24 @@ +version: '3.8' + +services: + gpu-app1: + image: nvidia/cuda:11.5.2-runtime-ubuntu20.04 + command: nvidia-smi # To keep running in a loop, add -l 1 + deploy: + resources: + reservations: + devices: + - driver: nvidia + device_ids: ['0'] + capabilities: [gpu] + + gpu-app2: + image: nvidia/cuda:11.5.2-runtime-ubuntu20.04 + command: nvidia-smi + deploy: + resources: + reservations: + devices: + - driver: nvidia + device_ids: ['1'] # Allocates GPU ID 1 to this container. Inside the container, this will be visible as device id 0 + capabilities: [gpu] From 3fbde3925cab66491107bee672d87c00be267776 Mon Sep 17 00:00:00 2001 From: Doyoung Kim <34902420+landscapepainter@users.noreply.github.com> Date: Mon, 6 Nov 2023 16:28:44 -0800 Subject: [PATCH 09/17] [k8s] Ensure Jump Pod is in "Running" Status Before Proceeding (#2589) * wait for jump pod to be running * testing images * update ssh-jump-pod-name * nit * re-read pod status from _rase_pod_scheduling_error * Update == 'Running' to != 'Pending' * comment * fix to remove pending jump pod when terminating * update comment --- .../providers/kubernetes/node_provider.py | 33 +++++++++++++------ sky/utils/kubernetes_utils.py | 13 ++++---- 2 files changed, 30 insertions(+), 16 deletions(-) diff --git a/sky/skylet/providers/kubernetes/node_provider.py b/sky/skylet/providers/kubernetes/node_provider.py index 8963225cc3f..81230ff4ef8 100644 --- a/sky/skylet/providers/kubernetes/node_provider.py +++ b/sky/skylet/providers/kubernetes/node_provider.py @@ -164,8 +164,16 @@ def _set_node_tags(self, node_id, tags): def _raise_pod_scheduling_errors(self, new_nodes): for new_node in new_nodes: - pod_status = new_node.status.phase - pod_name = new_node._metadata._name + pod = kubernetes.core_api().read_namespaced_pod( + new_node.metadata.name, self.namespace) + pod_status = pod.status.phase + # When there are multiple pods involved while launching instance, + # there may be a single pod causing issue while others are + # scheduled. In this case, we make sure to not surface the error + # message from the pod that is already scheduled. 
+ if pod_status != 'Pending': + continue + pod_name = pod._metadata._name events = kubernetes.core_api().list_namespaced_event( self.namespace, field_selector=(f'involvedObject.name={pod_name},' @@ -173,8 +181,8 @@ def _raise_pod_scheduling_errors(self, new_nodes): # Events created in the past hours are kept by # Kubernetes python client and we want to surface # the latest event message - events_desc_by_time = \ - sorted(events.items, + events_desc_by_time = sorted( + events.items, key=lambda e: e.metadata.creation_timestamp, reverse=True) for event in events_desc_by_time: @@ -200,8 +208,8 @@ def _raise_pod_scheduling_errors(self, new_nodes): lf.get_label_key() for lf in kubernetes_utils.LABEL_FORMATTER_REGISTRY ] - if new_node.spec.node_selector: - for label_key in new_node.spec.node_selector.keys(): + if pod.spec.node_selector: + for label_key in pod.spec.node_selector.keys(): if label_key in gpu_lf_keys: # TODO(romilb): We may have additional node # affinity selectors in the future - in that @@ -210,7 +218,7 @@ def _raise_pod_scheduling_errors(self, new_nodes): 'didn\'t match Pod\'s node affinity/selector' in event_message: raise config.KubernetesError( f'{lack_resource_msg.format(resource="GPU")} ' - f'Verify if {new_node.spec.node_selector[label_key]}' + f'Verify if {pod.spec.node_selector[label_key]}' ' is available in the cluster.') raise config.KubernetesError(f'{timeout_err_msg} ' f'Pod status: {pod_status}' @@ -257,9 +265,14 @@ def create_node(self, node_config, tags, count): self.namespace, service_spec) new_svcs.append(svc) - # Wait for all pods to be ready, and if it exceeds the timeout, raise an - # exception. If pod's container is ContainerCreating, then we can assume - # that resources have been allocated and we can exit. + # Wait for all pods including jump pod to be ready, and if it + # exceeds the timeout, raise an exception. If pod's container + # is ContainerCreating, then we can assume that resources have been + # allocated and we can exit. + ssh_jump_pod_name = conf['metadata']['labels']['skypilot-ssh-jump'] + jump_pod = kubernetes.core_api().read_namespaced_pod( + ssh_jump_pod_name, self.namespace) + new_nodes.append(jump_pod) start = time.time() while True: if time.time() - start > self.timeout: diff --git a/sky/utils/kubernetes_utils.py b/sky/utils/kubernetes_utils.py index 382ae7c23bf..b958e3f3477 100644 --- a/sky/utils/kubernetes_utils.py +++ b/sky/utils/kubernetes_utils.py @@ -923,12 +923,13 @@ def find(l, predicate): ssh_jump_name, namespace) cont_ready_cond = find(ssh_jump_pod.status.conditions, lambda c: c.type == 'ContainersReady') - if cont_ready_cond and \ - cont_ready_cond.status == 'False': - # The main container is not ready. To be on the safe side - # and prevent a dangling ssh jump pod, lets remove it and - # the service. Otherwise main container is ready and its lifecycle - # management script takes care of the cleaning. + if (cont_ready_cond and cont_ready_cond.status + == 'False') or ssh_jump_pod.status.phase == 'Pending': + # Either the main container is not ready or the pod failed + # to schedule. To be on the safe side and prevent a dangling + # ssh jump pod, lets remove it and the service. Otherwise, main + # container is ready and its lifecycle management script takes + # care of the cleaning. 
kubernetes.core_api().delete_namespaced_pod(ssh_jump_name, namespace) kubernetes.core_api().delete_namespaced_service( From 7355929ccbf1cc1c103aaeacbd50f0d02e4618d6 Mon Sep 17 00:00:00 2001 From: Zongheng Yang Date: Tue, 7 Nov 2023 10:22:31 -0800 Subject: [PATCH 10/17] Core: Fix syncing workdir with spaces in .git/info/exclude path. (#2762) Fix syncing workdir with spaces in .git/info/exclude path. --- sky/backends/backend_utils.py | 7 ++++++- sky/data/storage.py | 4 ++-- sky/utils/command_runner.py | 6 +++++- 3 files changed, 13 insertions(+), 4 deletions(-) diff --git a/sky/backends/backend_utils.py b/sky/backends/backend_utils.py index 65876ef194b..09f28bb6d65 100644 --- a/sky/backends/backend_utils.py +++ b/sky/backends/backend_utils.py @@ -7,6 +7,7 @@ import pathlib import pprint import re +import shlex import subprocess import tempfile import textwrap @@ -292,8 +293,12 @@ def path_size_megabytes(path: str) -> int: git_exclude_filter = '' if (resolved_path / command_runner.GIT_EXCLUDE).exists(): # Ensure file exists; otherwise, rsync will error out. + # + # We shlex.quote() because the path may contain spaces: + # 'my dir/.git/info/exclude' + # Without quoting rsync fails. git_exclude_filter = command_runner.RSYNC_EXCLUDE_OPTION.format( - str(resolved_path / command_runner.GIT_EXCLUDE)) + shlex.quote(str(resolved_path / command_runner.GIT_EXCLUDE))) rsync_command = (f'rsync {command_runner.RSYNC_DISPLAY_OPTION} ' f'{command_runner.RSYNC_FILTER_OPTION} ' f'{git_exclude_filter} --dry-run {path!r}') diff --git a/sky/data/storage.py b/sky/data/storage.py index 1064f06f9a2..af6d5c7143f 100644 --- a/sky/data/storage.py +++ b/sky/data/storage.py @@ -852,8 +852,8 @@ def warn_for_git_dir(source: str): warn_for_git_dir(source) store.upload() except exceptions.StorageUploadError: - logger.error(f'Could not upload {self.source} to store ' - f'name {store.name}.') + logger.error(f'Could not upload {self.source!r} to store ' + f'name {store.name!r}.') if store.is_sky_managed: global_user_state.set_storage_status( self.name, StorageStatus.UPLOAD_FAILED) diff --git a/sky/utils/command_runner.py b/sky/utils/command_runner.py index c3dd73eb345..4a538e8fe54 100644 --- a/sky/utils/command_runner.py +++ b/sky/utils/command_runner.py @@ -391,9 +391,13 @@ def rsync( resolved_source = pathlib.Path(source).expanduser().resolve() if (resolved_source / GIT_EXCLUDE).exists(): # Ensure file exists; otherwise, rsync will error out. + # + # We shlex.quote() because the path may contain spaces: + # 'my dir/.git/info/exclude' + # Without quoting rsync fails. 
rsync_command.append( RSYNC_EXCLUDE_OPTION.format( - str(resolved_source / GIT_EXCLUDE))) + shlex.quote(str(resolved_source / GIT_EXCLUDE)))) if self._docker_ssh_proxy_command is not None: docker_ssh_proxy_command = self._docker_ssh_proxy_command(['ssh']) From 4c4b7bdb840929a6093d70cef436747c22884778 Mon Sep 17 00:00:00 2001 From: Doyoung Kim <34902420+landscapepainter@users.noreply.github.com> Date: Wed, 8 Nov 2023 15:16:15 -0800 Subject: [PATCH 11/17] [Storage] Support for storage info in cluster table's metadata (#2322) * add storage in cluster's metadata * add functionality to run mounting with sky start * nit * update cluster metadata as pickle * format * nit * format * format * update metadata usage * format * nit * update from CSYNC PR * update add_or_update_cluster() * nit * nit * nti * nit * fix * raise error for when bucket is None * testing images * nit * nit * fix externally deleted storage edge case * nit * lint * err msg update * nit * nit * Update sky/exceptions.py Co-authored-by: Romil Bhardwaj * resolve comments * update Storage.from_metadata for valid format * update smoke test * nit * backward compatibility and check list/cloud source --------- Co-authored-by: Romil Bhardwaj --- sky/backends/backend.py | 8 +- sky/backends/backend_utils.py | 5 + sky/backends/cloud_vm_ray_backend.py | 93 ++++++++++++-- sky/backends/local_docker_backend.py | 4 +- sky/core.py | 6 + sky/data/storage.py | 181 ++++++++++++++++++++------- sky/exceptions.py | 6 + sky/global_user_state.py | 56 ++++++++- tests/test_smoke.py | 101 ++++++++++++++- 9 files changed, 397 insertions(+), 63 deletions(-) diff --git a/sky/backends/backend.py b/sky/backends/backend.py index 28aa981b078..c51ecd41d92 100644 --- a/sky/backends/backend.py +++ b/sky/backends/backend.py @@ -66,8 +66,8 @@ def sync_workdir(self, handle: _ResourceHandleType, workdir: Path) -> None: def sync_file_mounts( self, handle: _ResourceHandleType, - all_file_mounts: Dict[Path, Path], - storage_mounts: Dict[Path, 'storage_lib.Storage'], + all_file_mounts: Optional[Dict[Path, Path]], + storage_mounts: Optional[Dict[Path, 'storage_lib.Storage']], ) -> None: return self._sync_file_mounts(handle, all_file_mounts, storage_mounts) @@ -130,8 +130,8 @@ def _sync_workdir(self, handle: _ResourceHandleType, workdir: Path) -> None: def _sync_file_mounts( self, handle: _ResourceHandleType, - all_file_mounts: Dict[Path, Path], - storage_mounts: Dict[Path, 'storage_lib.Storage'], + all_file_mounts: Optional[Dict[Path, Path]], + storage_mounts: Optional[Dict[Path, 'storage_lib.Storage']], ) -> None: raise NotImplementedError diff --git a/sky/backends/backend_utils.py b/sky/backends/backend_utils.py index 09f28bb6d65..fa54505dd20 100644 --- a/sky/backends/backend_utils.py +++ b/sky/backends/backend_utils.py @@ -109,6 +109,11 @@ CLUSTER_STATUS_LOCK_PATH = os.path.expanduser('~/.sky/.{}.lock') CLUSTER_STATUS_LOCK_TIMEOUT_SECONDS = 20 +# Filelocks for updating cluster's file_mounts. +CLUSTER_FILE_MOUNTS_LOCK_PATH = os.path.expanduser( + '~/.sky/.{}_file_mounts.lock') +CLUSTER_FILE_MOUNTS_LOCK_TIMEOUT_SECONDS = 10 + # Remote dir that holds our runtime files. 
_REMOTE_RUNTIME_FILES_DIR = '~/.sky/.runtime_files' diff --git a/sky/backends/cloud_vm_ray_backend.py b/sky/backends/cloud_vm_ray_backend.py index e98a82ea5e9..11bf1bbf145 100644 --- a/sky/backends/cloud_vm_ray_backend.py +++ b/sky/backends/cloud_vm_ray_backend.py @@ -3158,12 +3158,13 @@ def _sync_workdir_node(runner: command_runner.SSHCommandRunner) -> None: def _sync_file_mounts( self, handle: CloudVmRayResourceHandle, - all_file_mounts: Dict[Path, Path], - storage_mounts: Dict[Path, storage_lib.Storage], + all_file_mounts: Optional[Dict[Path, Path]], + storage_mounts: Optional[Dict[Path, storage_lib.Storage]], ) -> None: """Mounts all user files to the remote nodes.""" self._execute_file_mounts(handle, all_file_mounts) self._execute_storage_mounts(handle, storage_mounts) + self._set_storage_mounts_metadata(handle.cluster_name, storage_mounts) def _setup(self, handle: CloudVmRayResourceHandle, task: task_lib.Task, detach_setup: bool) -> None: @@ -4414,7 +4415,7 @@ def _setup_tpu_name_on_node( subprocess_utils.run_in_parallel(_setup_tpu_name_on_node, runners) def _execute_file_mounts(self, handle: CloudVmRayResourceHandle, - file_mounts: Dict[Path, Path]): + file_mounts: Optional[Dict[Path, Path]]): """Executes file mounts. Rsyncing local files and copying from remote stores. @@ -4547,19 +4548,26 @@ def _symlink_node(runner: command_runner.SSHCommandRunner): end = time.time() logger.debug(f'File mount sync took {end - start} seconds.') - def _execute_storage_mounts(self, handle: CloudVmRayResourceHandle, - storage_mounts: Dict[Path, - storage_lib.Storage]): + def _execute_storage_mounts( + self, handle: CloudVmRayResourceHandle, + storage_mounts: Optional[Dict[Path, storage_lib.Storage]]): """Executes storage mounts: installing mounting tools and mounting.""" + # Handle cases where `storage_mounts` is None. This occurs when users + # initiate a 'sky start' command from a Skypilot version that predates + # the introduction of the `storage_mounts_metadata` feature. + if not storage_mounts: + return + # Process only mount mode objects here. COPY mode objects have been # converted to regular copy file mounts and thus have been handled - # in the '__execute_file_mounts' method. + # in the '_execute_file_mounts' method. storage_mounts = { path: storage_mount for path, storage_mount in storage_mounts.items() if storage_mount.mode == storage_lib.StorageMode.MOUNT } + # Handle cases when there aren't any Storages with MOUNT mode. if not storage_mounts: return @@ -4589,6 +4597,15 @@ def _execute_storage_mounts(self, handle: CloudVmRayResourceHandle, for dst, storage_obj in storage_mounts.items(): if not os.path.isabs(dst) and not dst.startswith('~/'): dst = f'{SKY_REMOTE_WORKDIR}/{dst}' + # Raised when the bucket is externall removed before re-mounting + # with sky start. + if not storage_obj.stores: + with ux_utils.print_exception_no_traceback(): + raise exceptions.StorageExternalDeletionError( + f'The bucket, {storage_obj.name!r}, could not be ' + f'mounted on cluster {handle.cluster_name!r}. Please ' + 'verify that the bucket exists. 
The cluster started ' + 'successfully without mounting the bucket.') # Get the first store and use it to mount store = list(storage_obj.stores.values())[0] mount_cmd = store.mount_command(dst) @@ -4628,6 +4645,68 @@ def _execute_storage_mounts(self, handle: CloudVmRayResourceHandle, end = time.time() logger.debug(f'Storage mount sync took {end - start} seconds.') + def _set_storage_mounts_metadata( + self, cluster_name: str, + storage_mounts: Optional[Dict[Path, storage_lib.Storage]]) -> None: + """Sets 'storage_mounts' object in cluster's storage_mounts_metadata. + + After converting Storage objects in 'storage_mounts' to metadata, + it stores {PATH: StorageMetadata} into the table. + """ + if not storage_mounts: + return + storage_mounts_metadata = {} + for dst, storage_obj in storage_mounts.items(): + storage_mounts_metadata[dst] = storage_obj.handle + lock_path = ( + backend_utils.CLUSTER_FILE_MOUNTS_LOCK_PATH.format(cluster_name)) + lock_timeout = backend_utils.CLUSTER_FILE_MOUNTS_LOCK_TIMEOUT_SECONDS + try: + with filelock.FileLock(lock_path, lock_timeout): + global_user_state.set_cluster_storage_mounts_metadata( + cluster_name, storage_mounts_metadata) + except filelock.Timeout as e: + raise RuntimeError( + f'Failed to store metadata for cluster {cluster_name!r} due to ' + 'a timeout when trying to access local database. Please ' + f'try again or manually remove the lock at {lock_path}. ' + f'{common_utils.format_exception(e)}') from None + + def get_storage_mounts_metadata( + self, + cluster_name: str) -> Optional[Dict[Path, storage_lib.Storage]]: + """Gets 'storage_mounts' object from cluster's storage_mounts_metadata. + + After retrieving storage_mounts_metadata, it converts back the + StorageMetadata to Storage object and restores 'storage_mounts.' + """ + lock_path = ( + backend_utils.CLUSTER_FILE_MOUNTS_LOCK_PATH.format(cluster_name)) + lock_timeout = backend_utils.CLUSTER_FILE_MOUNTS_LOCK_TIMEOUT_SECONDS + try: + with filelock.FileLock(lock_path, lock_timeout): + storage_mounts_metadata = ( + global_user_state.get_cluster_storage_mounts_metadata( + cluster_name)) + except filelock.Timeout as e: + raise RuntimeError( + f'Failed to retrieve metadata for cluster {cluster_name!r} ' + 'due to a timeout when trying to access local database. ' + f'Please try again or manually remove the lock at {lock_path}.' + f' {common_utils.format_exception(e)}') from None + + if storage_mounts_metadata is None: + return None + storage_mounts = {} + for dst, storage_metadata in storage_mounts_metadata.items(): + # Setting 'sync_on_reconstruction' to False prevents from Storage + # object creation to sync local source syncing to the bucket. Local + # source specified in Storage object is synced to the bucket only + # when it is created with 'sky launch'. 
+ storage_mounts[dst] = storage_lib.Storage.from_metadata( + storage_metadata, sync_on_reconstruction=False) + return storage_mounts + def _execute_task_one_node(self, handle: CloudVmRayResourceHandle, task: task_lib.Task, job_id: int, detach_run: bool) -> None: diff --git a/sky/backends/local_docker_backend.py b/sky/backends/local_docker_backend.py index 2b466dff4cf..bc3bfbbe253 100644 --- a/sky/backends/local_docker_backend.py +++ b/sky/backends/local_docker_backend.py @@ -185,8 +185,8 @@ def _sync_workdir(self, handle: LocalDockerResourceHandle, def _sync_file_mounts( self, handle: LocalDockerResourceHandle, - all_file_mounts: Dict[Path, Path], - storage_mounts: Dict[Path, storage_lib.Storage], + all_file_mounts: Optional[Dict[Path, Path]], + storage_mounts: Optional[Dict[Path, storage_lib.Storage]], ) -> None: """File mounts in Docker are implemented with volume mounts (-v).""" assert not storage_mounts, \ diff --git a/sky/core.py b/sky/core.py index c0babe0062e..383240517fb 100644 --- a/sky/core.py +++ b/sky/core.py @@ -208,6 +208,12 @@ def _start( stream_logs=True, cluster_name=cluster_name, retry_until_up=retry_until_up) + storage_mounts = backend.get_storage_mounts_metadata(handle.cluster_name) + # Passing all_file_mounts as None ensures the local source set in Storage + # to not redundantly sync source to the bucket. + backend.sync_file_mounts(handle=handle, + all_file_mounts=None, + storage_mounts=storage_mounts) if idle_minutes_to_autostop is not None: backend.set_autostop(handle, idle_minutes_to_autostop, down=down) return handle diff --git a/sky/data/storage.py b/sky/data/storage.py index af6d5c7143f..e18ae72df10 100644 --- a/sky/data/storage.py +++ b/sky/data/storage.py @@ -215,7 +215,8 @@ def __init__(self, sync_on_reconstruction: bool; Whether to sync data if the storage object is found in the global_user_state and reconstructed from there. This is set to false when the Storage object is created not - for direct use, e.g. for sky storage delete. + for direct use, e.g. for 'sky storage delete', or the storage is + being re-used, e.g., for `sky start` on a stopped cluster. Raises: StorageBucketCreateError: If bucket creation fails @@ -356,6 +357,7 @@ class StorageMetadata(object): - (required) Storage name. - (required) Source + - (optional) Storage mode. - (optional) Set of stores managed by sky added to the Storage object """ @@ -364,11 +366,13 @@ def __init__( *, storage_name: Optional[str], source: Optional[SourceType], + mode: Optional[StorageMode] = None, sky_stores: Optional[Dict[StoreType, AbstractStore.StoreMetadata]] = None): assert storage_name is not None or source is not None self.storage_name = storage_name self.source = source + self.mode = mode # Only stores managed by sky are stored here in the # global_user_state self.sky_stores = {} if sky_stores is None else sky_stores @@ -377,6 +381,7 @@ def __repr__(self): return (f'StorageMetadata(' f'\n\tstorage_name={self.storage_name},' f'\n\tsource={self.source},' + f'\n\tmode={self.mode},' f'\n\tstores={self.sky_stores})') def add_store(self, store: AbstractStore) -> None: @@ -430,7 +435,8 @@ def __init__(self, sync_on_reconstruction: bool; Whether to sync the data if the storage object is found in the global_user_state and reconstructed from there. This is set to false when the Storage object is created not - for direct use, e.g. for sky storage delete. + for direct use, e.g. for 'sky storage delete', or the storage is + being re-used, e.g., for `sky start` on a stopped cluster. 
""" self.name: str self.source = source @@ -458,34 +464,7 @@ def __init__(self, # Reconstruct the Storage object from the global_user_state logger.debug('Detected existing storage object, ' f'loading Storage: {self.name}') - for s_type, s_metadata in self.handle.sky_stores.items(): - # When initializing from global_user_state, we override the - # source from the YAML - if s_type == StoreType.S3: - store = S3Store.from_metadata( - s_metadata, - source=self.source, - sync_on_reconstruction=self.sync_on_reconstruction) - elif s_type == StoreType.GCS: - store = GcsStore.from_metadata( - s_metadata, - source=self.source, - sync_on_reconstruction=self.sync_on_reconstruction) - elif s_type == StoreType.R2: - store = R2Store.from_metadata( - s_metadata, - source=self.source, - sync_on_reconstruction=self.sync_on_reconstruction) - elif s_type == StoreType.IBM: - store = IBMCosStore.from_metadata( - s_metadata, - source=self.source, - sync_on_reconstruction=self.sync_on_reconstruction) - else: - with ux_utils.print_exception_no_traceback(): - raise ValueError(f'Unknown store type: {s_type}') - - self._add_store(store, is_reconstructed=True) + self._add_store_from_metadata(self.handle.sky_stores) # TODO(romilb): This logic should likely be in add_store to move # syncing to file_mount stage.. @@ -507,6 +486,7 @@ def __init__(self, } self.handle = self.StorageMetadata(storage_name=self.name, source=self.source, + mode=self.mode, sky_stores=sky_managed_stores) if self.source is not None: @@ -713,6 +693,79 @@ def validate_name(name): f'Validation failed for storage source {self.source}, name ' f'{self.name} and mode {self.mode}. Please check the arguments.') + def _add_store_from_metadata( + self, sky_stores: Dict[StoreType, + AbstractStore.StoreMetadata]) -> None: + """Reconstructs Storage.stores from sky_stores. + + Reconstruct AbstractStore objects from sky_store's metadata and + adds them into Storage.stores + """ + for s_type, s_metadata in sky_stores.items(): + # When initializing from global_user_state, we override the + # source from the YAML + try: + if s_type == StoreType.S3: + store = S3Store.from_metadata( + s_metadata, + source=self.source, + sync_on_reconstruction=self.sync_on_reconstruction) + elif s_type == StoreType.GCS: + store = GcsStore.from_metadata( + s_metadata, + source=self.source, + sync_on_reconstruction=self.sync_on_reconstruction) + elif s_type == StoreType.R2: + store = R2Store.from_metadata( + s_metadata, + source=self.source, + sync_on_reconstruction=self.sync_on_reconstruction) + elif s_type == StoreType.IBM: + store = IBMCosStore.from_metadata( + s_metadata, + source=self.source, + sync_on_reconstruction=self.sync_on_reconstruction) + else: + with ux_utils.print_exception_no_traceback(): + raise ValueError(f'Unknown store type: {s_type}') + # Following error is raised from _get_bucket and caught only when + # an externally removed storage is attempted to be fetched. + except exceptions.StorageExternalDeletionError: + logger.debug(f'Storage object, {self.name}, was attempted to ' + 'be reconstructed while the corresponding bucket' + ' was externally deleted.') + continue + + self._add_store(store, is_reconstructed=True) + + @classmethod + def from_metadata(cls, metadata: StorageMetadata, + **override_args) -> 'Storage': + """Create Storage from StorageMetadata object. + + Used when reconstructing Storage object and AbstractStore objects from + global_user_state. + """ + # Name should not be specified if the source is a cloud store URL. 
+ source = override_args.get('source', metadata.source) + name = override_args.get('name', metadata.storage_name) + # If the source is a list, it consists of local paths + if not isinstance(source, + list) and data_utils.is_cloud_store_url(source): + name = None + + storage_obj = cls(name=name, + source=source, + sync_on_reconstruction=override_args.get( + 'sync_on_reconstruction', True)) + + # For backward compatibility + if hasattr(metadata, 'mode'): + if metadata.mode: + storage_obj.mode = override_args.get('mode', metadata.mode) + + return storage_obj + def add_store(self, store_type: Union[str, StoreType]) -> AbstractStore: """Initializes and adds a new store to the storage. @@ -1169,7 +1222,7 @@ def _transfer_to_s3(self) -> None: elif self.source.startswith('r2://'): data_transfer.r2_to_s3(self.name, self.name) - def _get_bucket(self) -> Tuple[Optional[StorageHandle], bool]: + def _get_bucket(self) -> Tuple[StorageHandle, bool]: """Obtains the S3 bucket. If the bucket exists, this method will return the bucket. @@ -1182,6 +1235,9 @@ def _get_bucket(self) -> Tuple[Optional[StorageHandle], bool]: Raises: StorageBucketCreateError: If creating the bucket fails StorageBucketGetError: If fetching a bucket fails + StorageExternalDeletionError: If externally deleted storage is + attempted to be fetched while reconstructing the storage for + 'sky storage delete' or 'sky start' """ s3 = aws.resource('s3') bucket = s3.Bucket(self.name) @@ -1207,18 +1263,24 @@ def _get_bucket(self) -> Tuple[Optional[StorageHandle], bool]: if isinstance(self.source, str) and self.source.startswith('s3://'): with ux_utils.print_exception_no_traceback(): raise exceptions.StorageBucketGetError( - 'Attempted to connect to a non-existent bucket: ' + 'Attempted to use a non-existent bucket as a source: ' f'{self.source}. Consider using `aws s3 ls ' f'{self.source}` to debug.') # If bucket cannot be found in both private and public settings, # the bucket is to be created by Sky. However, creation is skipped if - # Store object is being reconstructed for deletion. + # Store object is being reconstructed for deletion or re-mount with + # sky start, and error is raised instead. if self.sync_on_reconstruction: bucket = self._create_s3_bucket(self.name) return bucket, True else: - return None, False + # Raised when Storage object is reconstructed for sky storage + # delete or to re-mount Storages with sky start but the storage + # is already removed externally. 
+ raise exceptions.StorageExternalDeletionError( + 'Attempted to fetch a non-existent bucket: ' + f'{self.name}') def _download_file(self, remote_path: str, local_path: str) -> None: """Downloads file from remote to local on s3 bucket @@ -1618,6 +1680,9 @@ def _get_bucket(self) -> Tuple[StorageHandle, bool]: Raises: StorageBucketCreateError: If creating the bucket fails StorageBucketGetError: If fetching a bucket fails + StorageExternalDeletionError: If externally deleted storage is + attempted to be fetched while reconstructing the storage for + 'sky storage delete' or 'sky start' """ try: bucket = self.client.get_bucket(self.name) @@ -1626,18 +1691,23 @@ def _get_bucket(self) -> Tuple[StorageHandle, bool]: if isinstance(self.source, str) and self.source.startswith('gs://'): with ux_utils.print_exception_no_traceback(): raise exceptions.StorageBucketGetError( - 'Attempted to connect to a non-existent bucket: ' + 'Attempted to use a non-existent bucket as a source: ' f'{self.source}') from e else: - # If bucket cannot be found (i.e., does not exist), it is to be # created by Sky. However, creation is skipped if Store object - # is being reconstructed for deletion. + # is being reconstructed for deletion or re-mount with + # sky start, and error is raised instead. if self.sync_on_reconstruction: bucket = self._create_gcs_bucket(self.name) return bucket, True else: - return None, False + # This is raised when Storage object is reconstructed for + # sky storage delete or to re-mount Storages with sky start + # but the storage is already removed externally. + raise exceptions.StorageExternalDeletionError( + 'Attempted to fetch a non-existent bucket: ' + f'{self.name}') from e except gcp.forbidden_exception(): # Try public bucket to see if bucket exists logger.info( @@ -1876,7 +1946,7 @@ def get_handle(self) -> StorageHandle: def batch_aws_rsync(self, source_path_list: List[Path], create_dirs: bool = False) -> None: - """Invokes aws s3 sync to batch upload a list of local paths to S3 + """Invokes aws s3 sync to batch upload a list of local paths to R2 AWS Sync by default uses 10 threads to upload files to the bucket. To increase parallelism, modify max_concurrent_requests in your aws config @@ -1963,6 +2033,9 @@ def _get_bucket(self) -> Tuple[StorageHandle, bool]: Raises: StorageBucketCreateError: If creating the bucket fails StorageBucketGetError: If fetching a bucket fails + StorageExternalDeletionError: If externally deleted storage is + attempted to be fetched while reconstructing the storage for + 'sky storage delete' or 'sky start' """ r2 = cloudflare.resource('s3') bucket = r2.Bucket(self.name) @@ -1992,7 +2065,7 @@ def _get_bucket(self) -> Tuple[StorageHandle, bool]: if isinstance(self.source, str) and self.source.startswith('r2://'): with ux_utils.print_exception_no_traceback(): raise exceptions.StorageBucketGetError( - 'Attempted to connect to a non-existent bucket: ' + 'Attempted to use a non-existent bucket as a source: ' f'{self.source}. Consider using ' '`AWS_SHARED_CREDENTIALS_FILE=' f'{cloudflare.R2_CREDENTIALS_PATH} aws s3 ls ' @@ -2002,13 +2075,19 @@ def _get_bucket(self) -> Tuple[StorageHandle, bool]: 'to debug.') # If bucket cannot be found in both private and public settings, - # the bucket is to be created by Sky. However, skip creation if - # Store object is being reconstructed for deletion. + # the bucket is to be created by Sky. 
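Editor's note: the storage changes above follow one pattern: during reconstruction (for `sky storage delete` or `sky start`), a missing bucket is no longer silently tolerated. The per-cloud `_get_bucket` methods raise `StorageExternalDeletionError` instead of returning None, and `_add_store_from_metadata` catches that error and simply skips the store. A minimal, runnable sketch of the pattern with made-up names (these are not the real SkyPilot classes):

class BucketDeletedExternally(Exception):
    """Hypothetical stand-in for exceptions.StorageExternalDeletionError."""


def get_bucket(name, existing_buckets, sync_on_reconstruction):
    """Return (bucket, created) the way the _get_bucket methods above do."""
    if name in existing_buckets:
        return name, False
    if sync_on_reconstruction:
        existing_buckets.add(name)  # Stand-in for creating the bucket.
        return name, True
    # Reconstructing for `sky storage delete` / `sky start`, but the bucket
    # was removed outside the tool.
    raise BucketDeletedExternally(
        f'Attempted to fetch a non-existent bucket: {name}')


def rebuild_stores(saved, existing_buckets):
    """Skip stores whose buckets no longer exist; keep the rest."""
    stores = {}
    for store_type, bucket_name in saved.items():
        try:
            stores[store_type] = get_bucket(bucket_name, existing_buckets,
                                            sync_on_reconstruction=False)
        except BucketDeletedExternally:
            continue
    return stores


print(rebuild_stores({'S3': 'kept-bucket', 'GCS': 'gone-bucket'},
                     {'kept-bucket'}))
# {'S3': ('kept-bucket', False)}
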
However, creation is skipped if + # Store object is being reconstructed for deletion or re-mount with + # sky start, and error is raised instead. if self.sync_on_reconstruction: bucket = self._create_r2_bucket(self.name) return bucket, True else: - return None, False + # Raised when Storage object is reconstructed for sky storage + # delete or to re-mount Storages with sky start but the storage + # is already removed externally. + raise exceptions.StorageExternalDeletionError( + 'Attempted to fetch a non-existent bucket: ' + f'{self.name}') def _download_file(self, remote_path: str, local_path: str) -> None: """Downloads file from remote to local on r2 bucket @@ -2365,6 +2444,13 @@ def _get_bucket(self) -> Tuple[StorageHandle, bool]: Returns: StorageHandle(str): bucket name bool: indicates whether a new bucket was created. + + Raises: + StorageBucketCreateError: If bucket creation fails. + StorageBucketGetError: If fetching a bucket fails + StorageExternalDeletionError: If externally deleted storage is + attempted to be fetched while reconstructing the storage for + 'sky storage delete' or 'sky start' """ bucket_profile_name = Rclone.RcloneClouds.IBM.value + self.name @@ -2397,7 +2483,7 @@ def _get_bucket(self) -> Tuple[StorageHandle, bool]: # bucket doesn't exist but source is a bucket URI with ux_utils.print_exception_no_traceback(): raise exceptions.StorageBucketGetError( - 'Attempted to connect to a non-existent bucket: ' + 'Attempted to use a non-existent bucket as a source: ' f'{self.name} by providing URI. Consider using ' '`rclone lsd ` on relevant remotes returned ' 'via `rclone listremotes` to debug.') @@ -2410,6 +2496,13 @@ def _get_bucket(self) -> Tuple[StorageHandle, bool]: if not bucket_region and self.sync_on_reconstruction: # bucket doesn't exist return self._create_cos_bucket(self.name, self.region), True + elif not bucket_region and not self.sync_on_reconstruction: + # Raised when Storage object is reconstructed for sky storage + # delete or to re-mount Storages with sky start but the storage + # is already removed externally. + raise exceptions.StorageExternalDeletionError( + 'Attempted to fetch a non-existent bucket: ' + f'{self.name}') else: # bucket exists return self.s3_resource.Bucket(self.name), False diff --git a/sky/exceptions.py b/sky/exceptions.py index b6c676ec700..723e769ef9d 100644 --- a/sky/exceptions.py +++ b/sky/exceptions.py @@ -173,6 +173,12 @@ class StorageModeError(StorageSpecError): pass +class StorageExternalDeletionError(StorageBucketGetError): + # Error raised when the bucket is attempted to be fetched while it has been + # deleted externally. + pass + + class FetchIPError(Exception): """Raised when fetching the IP fails.""" diff --git a/sky/global_user_state.py b/sky/global_user_state.py index 6f94aa8098d..b75457e5713 100644 --- a/sky/global_user_state.py +++ b/sky/global_user_state.py @@ -110,6 +110,9 @@ def create_table(cursor, conn): db_utils.add_column_to_table(cursor, conn, 'clusters', 'cluster_hash', 'TEXT DEFAULT null') + db_utils.add_column_to_table(cursor, conn, 'clusters', + 'storage_mounts_metadata', 'BLOB DEFAULT null') + conn.commit() @@ -174,7 +177,8 @@ def add_or_update_cluster(cluster_name: str, # the field of the existing row with the default value if not # specified. 
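Editor's note: the INSERT OR REPLACE statement being extended here preserves columns the caller did not supply by re-selecting their current value with COALESCE((SELECT ...), null). A self-contained sketch of that upsert pattern against a throwaway in-memory table (table and column names are simplified stand-ins):

import sqlite3

conn = sqlite3.connect(':memory:')
conn.execute('CREATE TABLE clusters '
             '(name TEXT PRIMARY KEY, status TEXT, '
             'storage_mounts_metadata BLOB)')
conn.execute('INSERT INTO clusters VALUES (?, ?, ?)',
             ('my-cluster', 'UP', b'old-metadata'))

# Upsert a new status while keeping the existing storage_mounts_metadata.
conn.execute(
    'INSERT OR REPLACE INTO clusters '
    '(name, status, storage_mounts_metadata) VALUES ('
    '?, ?, '
    'COALESCE((SELECT storage_mounts_metadata FROM clusters WHERE name=?), '
    'null))',
    ('my-cluster', 'STOPPED', 'my-cluster'))

print(conn.execute('SELECT * FROM clusters').fetchall())
# [('my-cluster', 'STOPPED', b'old-metadata')]
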
'(name, launched_at, handle, last_use, status, ' - 'autostop, to_down, metadata, owner, cluster_hash) ' + 'autostop, to_down, metadata, owner, cluster_hash, ' + 'storage_mounts_metadata) ' 'VALUES (' # name '?, ' @@ -206,7 +210,10 @@ def add_or_update_cluster(cluster_name: str, 'COALESCE(' '(SELECT owner FROM clusters WHERE name=?), null),' # cluster_hash - '?' + '?,' + # storage_mounts_metadata + 'COALESCE(' + '(SELECT storage_mounts_metadata FROM clusters WHERE name=?), null)' ')', ( # name @@ -233,6 +240,8 @@ def add_or_update_cluster(cluster_name: str, cluster_name, # cluster_hash cluster_hash, + # storage_mounts_metadata + cluster_name, )) launched_nodes = getattr(cluster_handle, 'launched_nodes', None) @@ -390,6 +399,32 @@ def set_cluster_info(cluster_name: str, metadata: Dict[str, Any]) -> None: raise ValueError(f'Cluster {cluster_name} not found.') +def get_cluster_storage_mounts_metadata( + cluster_name: str) -> Optional[Dict[str, Any]]: + rows = _DB.cursor.execute( + 'SELECT storage_mounts_metadata FROM clusters WHERE name=(?)', + (cluster_name,)) + for (storage_mounts_metadata,) in rows: + if storage_mounts_metadata is None: + return None + return pickle.loads(storage_mounts_metadata) + return None + + +def set_cluster_storage_mounts_metadata( + cluster_name: str, storage_mounts_metadata: Dict[str, Any]) -> None: + _DB.cursor.execute( + 'UPDATE clusters SET storage_mounts_metadata=(?) WHERE name=(?)', ( + pickle.dumps(storage_mounts_metadata), + cluster_name, + )) + count = _DB.cursor.rowcount + _DB.conn.commit() + assert count <= 1, count + if count == 0: + raise ValueError(f'Cluster {cluster_name} not found.') + + def _get_cluster_usage_intervals( cluster_hash: Optional[str] ) -> Optional[List[Tuple[int, Optional[int]]]]: @@ -507,6 +542,14 @@ def _load_owner(record_owner: Optional[str]) -> Optional[List[str]]: return [record_owner] +def _load_storage_mounts_metadata( + record_storage_mounts_metadata: Optional[bytes] +) -> Optional[Dict[str, 'Storage.StorageMetadata']]: + if not record_storage_mounts_metadata: + return None + return pickle.loads(record_storage_mounts_metadata) + + def get_cluster_from_name( cluster_name: Optional[str]) -> Optional[Dict[str, Any]]: rows = _DB.cursor.execute('SELECT * FROM clusters WHERE name=(?)', @@ -516,7 +559,7 @@ def get_cluster_from_name( # we can add new fields to the database in the future without # breaking the previous code. 
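Editor's note: the new storage_mounts_metadata column holds a pickled mapping, and reads treat a NULL or empty blob as "nothing recorded". A small sketch of that round trip (the helper names below are illustrative, not the real module functions):

import pickle
from typing import Any, Dict, Optional


def dump_mounts(mounts: Optional[Dict[str, Any]]) -> Optional[bytes]:
    return None if mounts is None else pickle.dumps(mounts)


def load_mounts(blob: Optional[bytes]) -> Optional[Dict[str, Any]]:
    # NULL / empty blob means no storage mounts were recorded.
    if not blob:
        return None
    return pickle.loads(blob)


blob = dump_mounts({'/data': {'store': 'S3', 'name': 'my-bucket'}})
assert load_mounts(blob) == {'/data': {'store': 'S3', 'name': 'my-bucket'}}
assert load_mounts(None) is None
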
(name, launched_at, handle, last_use, status, autostop, metadata, - to_down, owner, cluster_hash) = row[:10] + to_down, owner, cluster_hash, storage_mounts_metadata) = row[:11] # TODO: use namedtuple instead of dict record = { 'name': name, @@ -529,6 +572,8 @@ def get_cluster_from_name( 'owner': _load_owner(owner), 'metadata': json.loads(metadata), 'cluster_hash': cluster_hash, + 'storage_mounts_metadata': + _load_storage_mounts_metadata(storage_mounts_metadata), } return record return None @@ -540,9 +585,8 @@ def get_clusters() -> List[Dict[str, Any]]: records = [] for row in rows: (name, launched_at, handle, last_use, status, autostop, metadata, - to_down, owner, cluster_hash) = row[:10] + to_down, owner, cluster_hash, storage_mounts_metadata) = row[:11] # TODO: use namedtuple instead of dict - record = { 'name': name, 'launched_at': launched_at, @@ -554,6 +598,8 @@ def get_clusters() -> List[Dict[str, Any]]: 'owner': _load_owner(owner), 'metadata': json.loads(metadata), 'cluster_hash': cluster_hash, + 'storage_mounts_metadata': + _load_storage_mounts_metadata(storage_mounts_metadata), } records.append(record) diff --git a/tests/test_smoke.py b/tests/test_smoke.py index c9e9fe14ae5..4654528f817 100644 --- a/tests/test_smoke.py +++ b/tests/test_smoke.py @@ -944,6 +944,105 @@ def test_ibm_storage_mounts(): run_one_test(test) +@pytest.mark.aws +def test_aws_storage_mounts_with_stop(): + name = _get_cluster_name() + storage_name = f'sky-test-{int(time.time())}' + template_str = pathlib.Path( + 'tests/test_yamls/test_storage_mounting.yaml.j2').read_text() + template = jinja2.Template(template_str) + content = template.render(storage_name=storage_name) + with tempfile.NamedTemporaryFile(suffix='.yaml', mode='w') as f: + f.write(content) + f.flush() + file_path = f.name + test_commands = [ + *storage_setup_commands, + f'sky launch -y -c {name} --cloud aws {file_path}', + f'sky logs {name} 1 --status', # Ensure job succeeded. + f'aws s3 ls {storage_name}/hello.txt', + f'sky stop -y {name}', + f'sky start -y {name}', + # Check if hello.txt from mounting bucket exists after restart in + # the mounted directory + f'sky exec {name} -- "set -ex; ls /mount_private_mount/hello.txt"' + ] + test = Test( + 'aws_storage_mounts', + test_commands, + f'sky down -y {name}; sky storage delete {storage_name}', + timeout=20 * 60, # 20 mins + ) + run_one_test(test) + + +@pytest.mark.gcp +def test_gcp_storage_mounts_with_stop(): + name = _get_cluster_name() + storage_name = f'sky-test-{int(time.time())}' + template_str = pathlib.Path( + 'tests/test_yamls/test_storage_mounting.yaml.j2').read_text() + template = jinja2.Template(template_str) + content = template.render(storage_name=storage_name) + with tempfile.NamedTemporaryFile(suffix='.yaml', mode='w') as f: + f.write(content) + f.flush() + file_path = f.name + test_commands = [ + *storage_setup_commands, + f'sky launch -y -c {name} --cloud gcp {file_path}', + f'sky logs {name} 1 --status', # Ensure job succeeded. 
+ f'gsutil ls gs://{storage_name}/hello.txt', + f'sky stop -y {name}', + f'sky start -y {name}', + # Check if hello.txt from mounting bucket exists after restart in + # the mounted directory + f'sky exec {name} -- "set -ex; ls /mount_private_mount/hello.txt"' + ] + test = Test( + 'gcp_storage_mounts', + test_commands, + f'sky down -y {name}; sky storage delete {storage_name}', + timeout=20 * 60, # 20 mins + ) + run_one_test(test) + + +@pytest.mark.kubernetes +def test_kubernetes_storage_mounts_with_stop(): + # Tests bucket mounting on k8s, assuming S3 is configured. + # This test will fail if run on non x86_64 architecture, since goofys is + # built for x86_64 only. + name = _get_cluster_name() + storage_name = f'sky-test-{int(time.time())}' + template_str = pathlib.Path( + 'tests/test_yamls/test_storage_mounting.yaml.j2').read_text() + template = jinja2.Template(template_str) + content = template.render(storage_name=storage_name) + with tempfile.NamedTemporaryFile(suffix='.yaml', mode='w') as f: + f.write(content) + f.flush() + file_path = f.name + test_commands = [ + *storage_setup_commands, + f'sky launch -y -c {name} --cloud kubernetes {file_path}', + f'sky logs {name} 1 --status', # Ensure job succeeded. + f'aws s3 ls {storage_name}/hello.txt', + f'sky stop -y {name}', + f'sky start -y {name}', + # Check if hello.txt from mounting bucket exists after restart in + # the mounted directory + f'sky exec {name} -- "set -ex; ls /mount_private_mount/hello.txt"' + ] + test = Test( + 'kubernetes_storage_mounts', + test_commands, + f'sky down -y {name}; sky storage delete {storage_name}', + timeout=20 * 60, # 20 mins + ) + run_one_test(test) + + # ---------- CLI logs ---------- @pytest.mark.no_scp # SCP does not support num_nodes > 1 yet. Run test_scp_logs instead. def test_cli_logs(generic_cloud: str): @@ -3226,7 +3325,7 @@ def test_nonexistent_bucket(self, nonexist_bucket_url): with pytest.raises( sky.exceptions.StorageBucketGetError, - match='Attempted to connect to a non-existent bucket'): + match='Attempted to use a non-existent bucket as a source'): storage_obj = storage_lib.Storage(source=nonexist_bucket_url.format( random_name=nonexist_bucket_name)) From d1a8f264693a195eaabefebc4594a048cb4e2430 Mon Sep 17 00:00:00 2001 From: Zhanghao Wu Date: Sat, 11 Nov 2023 04:20:07 +0800 Subject: [PATCH 12/17] [Dependency] Update google-api-python-client version for discardLocalSsd (#2759) Update google-api-python-client version for discardLocalSsd --- sky/setup_files/setup.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/sky/setup_files/setup.py b/sky/setup_files/setup.py index 920faf66132..261c7c9ed4d 100644 --- a/sky/setup_files/setup.py +++ b/sky/setup_files/setup.py @@ -219,9 +219,10 @@ def parse_readme(readme: str) -> str: 'azure-cli>=2.31.0', 'azure-core', 'azure-identity>=1.13.0', 'azure-mgmt-network' ] + local_ray, - # We need google-api-python-client>=2.19.1 to enable 'reason' attribute - # of googleapiclient.errors.HttpError, which is widely used in our system. - 'gcp': ['google-api-python-client>=2.19.1', 'google-cloud-storage'] + + # We need google-api-python-client>=2.69.0 to enable 'discardLocalSsd' + # parameter for stopping instances. 
+ # Reference: https://github.com/googleapis/google-api-python-client/commit/f6e9d3869ed605b06f7cbf2e8cf2db25108506e6 + 'gcp': ['google-api-python-client>=2.69.0', 'google-cloud-storage'] + local_ray, 'ibm': [ 'ibm-cloud-sdk-core', 'ibm-vpc', 'ibm-platform-services', 'ibm-cos-sdk' From 51a831c6b8c58a3fb9c659ec647f38e780879dfb Mon Sep 17 00:00:00 2001 From: Tian Xia Date: Fri, 10 Nov 2023 12:32:29 -0800 Subject: [PATCH 13/17] [Core] Return `job_id` and `handle` for `sky/execution.py::_execute` (#2736) * add return value * fix type * Update sky/execution.py Co-authored-by: Zhanghao Wu * restore local docker backend * add return value for sky.launch and sky.exec * add smoke test * move dryrun tests to test_api * enable all clouds * directly call python api in smoke test --------- Co-authored-by: Zhanghao Wu --- .github/workflows/pytest.yml | 1 + sky/backends/backend.py | 9 +++- sky/backends/cloud_vm_ray_backend.py | 13 +++-- sky/cli.py | 6 ++- sky/execution.py | 76 +++++++++++++++++++--------- tests/test_api.py | 7 +++ tests/test_smoke.py | 36 +++++++++++-- 7 files changed, 114 insertions(+), 34 deletions(-) create mode 100644 tests/test_api.py diff --git a/.github/workflows/pytest.yml b/.github/workflows/pytest.yml index 4280f61ab52..019b2cb4564 100644 --- a/.github/workflows/pytest.yml +++ b/.github/workflows/pytest.yml @@ -17,6 +17,7 @@ jobs: python-version: [3.8] test-path: - tests/unit_tests + - tests/test_api.py - tests/test_cli.py - tests/test_config.py - tests/test_global_user_state.py diff --git a/sky/backends/backend.py b/sky/backends/backend.py index c51ecd41d92..59887bcc847 100644 --- a/sky/backends/backend.py +++ b/sky/backends/backend.py @@ -86,7 +86,12 @@ def execute(self, handle: _ResourceHandleType, task: 'task_lib.Task', detach_run: bool, - dryrun: bool = False) -> None: + dryrun: bool = False) -> Optional[int]: + """Execute the task on the cluster. + + Returns: + Job id if the task is submitted to the cluster, None otherwise. + """ usage_lib.record_cluster_name_for_current_operation( handle.get_cluster_name()) usage_lib.messages.usage.update_actual_task(task) @@ -143,7 +148,7 @@ def _execute(self, handle: _ResourceHandleType, task: 'task_lib.Task', detach_run: bool, - dryrun: bool = False) -> None: + dryrun: bool = False) -> Optional[int]: raise NotImplementedError def _post_execute(self, handle: _ResourceHandleType, down: bool) -> None: diff --git a/sky/backends/cloud_vm_ray_backend.py b/sky/backends/cloud_vm_ray_backend.py index 11bf1bbf145..c9c80da03e5 100644 --- a/sky/backends/cloud_vm_ray_backend.py +++ b/sky/backends/cloud_vm_ray_backend.py @@ -3473,10 +3473,15 @@ def _execute( task: task_lib.Task, detach_run: bool, dryrun: bool = False, - ) -> None: + ) -> Optional[int]: + """Executes the task on the cluster. + + Returns: + Job id if the task is submitted to the cluster, None otherwise. + """ if task.run is None: logger.info('Run commands not specified or empty.') - return + return None # Check the task resources vs the cluster resources. Since `sky exec` # will not run the provision and _check_existing_cluster # We need to check ports here since sky.exec shouldn't change resources @@ -3486,7 +3491,7 @@ def _execute( if dryrun: logger.info(f'Dryrun complete. 
Would have run:\n{task}') - return + return None job_id = self._add_job(handle, task.name, resources_str) @@ -3498,6 +3503,8 @@ def _execute( # Case: task_lib.Task(run, num_nodes=1) self._execute_task_one_node(handle, task, job_id, detach_run) + return job_id + def _post_execute(self, handle: CloudVmRayResourceHandle, down: bool) -> None: fore = colorama.Fore diff --git a/sky/cli.py b/sky/cli.py index 33bfc27b232..21c1d65616b 100644 --- a/sky/cli.py +++ b/sky/cli.py @@ -2043,7 +2043,7 @@ def queue(clusters: List[str], skip_finished: bool, all_users: bool): @usage_lib.entrypoint def logs( cluster: str, - job_ids: Tuple[str], + job_ids: Tuple[str, ...], sync_down: bool, status: bool, # pylint: disable=redefined-outer-name follow: bool, @@ -2082,6 +2082,7 @@ def logs( assert job_ids is None or len(job_ids) <= 1, job_ids job_id = None + job_ids_to_query: Optional[List[int]] = None if job_ids: # Already check that len(job_ids) <= 1. This variable is used later # in core.tail_logs. @@ -2091,7 +2092,8 @@ def logs( 'Job ID must be integers.') job_ids_to_query = [int(job_id)] else: - job_ids_to_query = job_ids + # job_ids is either None or empty list, so it is safe to cast it here. + job_ids_to_query = typing.cast(Optional[List[int]], job_ids) if status: job_statuses = core.job_status(cluster, job_ids_to_query) job_id = list(job_statuses.keys())[0] diff --git a/sky/execution.py b/sky/execution.py index ebf42cf83b6..a759908a22c 100644 --- a/sky/execution.py +++ b/sky/execution.py @@ -7,7 +7,7 @@ import getpass import os import tempfile -from typing import Any, List, Optional, Union +from typing import Any, List, Optional, Tuple, Union import uuid import colorama @@ -153,7 +153,7 @@ def _execute( dryrun: bool = False, down: bool = False, stream_logs: bool = True, - handle: Any = None, + handle: Optional[backends.ResourceHandle] = None, backend: Optional[backends.Backend] = None, retry_until_up: bool = False, optimize_target: optimizer.OptimizeTarget = optimizer.OptimizeTarget.COST, @@ -167,7 +167,7 @@ def _execute( # Internal only: # pylint: disable=invalid-name _is_launched_by_spot_controller: bool = False, -) -> None: +) -> Tuple[Optional[int], Optional[backends.ResourceHandle]]: """Execute an entrypoint. If sky.Task is given or DAG has not been optimized yet, this will call @@ -183,8 +183,8 @@ def _execute( Note that if errors occur during provisioning/data syncing/setting up, the cluster will not be torn down for debugging purposes. stream_logs: bool; whether to stream all tasks' outputs to the client. - handle: Any; if provided, execution will use an existing backend cluster - handle instead of provisioning a new one. + handle: Optional[backends.ResourceHandle]; if provided, execution will use + an existing backend cluster handle instead of provisioning a new one. backend: Backend; backend to use for executing the tasks. Defaults to CloudVmRayBackend() retry_until_up: bool; whether to retry the provisioning until the cluster @@ -205,6 +205,13 @@ def _execute( idle_minutes_to_autostop: int; if provided, the cluster will be set to autostop after this many minutes of idleness. no_setup: bool; whether to skip setup commands or not when (re-)launching. + + Returns: + job_id: Optional[int]; the job ID of the submitted job. None if the + backend is not CloudVmRayBackend, or no job is submitted to + the cluster. + handle: Optional[backends.ResourceHandle]; the handle to the cluster. None + if dryrun. """ dag = _convert_to_dag(entrypoint) assert len(dag) == 1, f'We support 1 task for now. 
{dag}' @@ -319,9 +326,11 @@ def _execute( cluster_name=cluster_name, retry_until_up=retry_until_up) - if dryrun and handle is None: + if handle is None: + assert dryrun, ('If not dryrun, handle must be set or ' + 'Stage.PROVISION must be included in stages.') logger.info('Dryrun finished.') - return + return None, None if Stage.SYNC_WORKDIR in stages and not dryrun: if task.workdir is not None: @@ -339,6 +348,7 @@ def _execute( if Stage.PRE_EXEC in stages and not dryrun: if idle_minutes_to_autostop is not None: assert isinstance(backend, backends.CloudVmRayBackend) + assert isinstance(handle, backends.CloudVmRayResourceHandle) backend.set_autostop(handle, idle_minutes_to_autostop, down=down) @@ -346,7 +356,10 @@ def _execute( if Stage.EXEC in stages: try: global_user_state.update_last_use(handle.get_cluster_name()) - backend.execute(handle, task, detach_run, dryrun=dryrun) + job_id = backend.execute(handle, + task, + detach_run, + dryrun=dryrun) finally: # Enables post_execute() to be run after KeyboardInterrupt. backend.post_execute(handle, down) @@ -370,6 +383,7 @@ def _execute( subprocess_utils.run('sky status --no-show-spot-jobs', env=env) print() print('\x1b[?25h', end='') # Show cursor. + return job_id, handle @timeline.event @@ -391,7 +405,7 @@ def launch( # Internal only: # pylint: disable=invalid-name _is_launched_by_spot_controller: bool = False, -) -> None: +) -> Tuple[Optional[int], Optional[backends.ResourceHandle]]: # NOTE(dev): Keep the docstring consistent between the Python API and CLI. """Launch a task. @@ -470,12 +484,19 @@ def launch( exceptions.CommandError: any ssh command error. exceptions.NoCloudAccessError: if all clouds are disabled. Other exceptions may be raised depending on the backend. + + Returns: + job_id: Optional[int]; the job ID of the submitted job. None if the + backend is not CloudVmRayBackend, or no job is submitted to + the cluster. + handle: Optional[backends.ResourceHandle]; the handle to the cluster. None + if dryrun. """ entrypoint = task backend_utils.check_cluster_name_not_reserved(cluster_name, operation_str='sky.launch') - _execute( + return _execute( entrypoint=entrypoint, dryrun=dryrun, down=down, @@ -503,7 +524,7 @@ def exec( # pylint: disable=redefined-builtin stream_logs: bool = True, backend: Optional[backends.Backend] = None, detach_run: bool = False, -) -> None: +) -> Tuple[Optional[int], Optional[backends.ResourceHandle]]: # NOTE(dev): Keep the docstring consistent between the Python API and CLI. """Execute a task on an existing cluster. @@ -548,6 +569,13 @@ def exec( # pylint: disable=redefined-builtin status. sky.exceptions.NotSupportedError: if the specified cluster is a reserved cluster that does not support this operation. + + Returns: + job_id: Optional[int]; the job ID of the submitted job. None if the + backend is not CloudVmRayBackend, or no job is submitted to + the cluster. + handle: Optional[backends.ResourceHandle]; the handle to the cluster. None + if dryrun. 
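Editor's note: with this patch, sky.launch and sky.exec return a (job_id, handle) pair. A hedged usage sketch mirroring the new tests/test_api.py (the cluster name is made up, and dryrun=True keeps the call from provisioning anything):

import sky

task = sky.Task(run='echo hello')
job_id, handle = sky.launch(task, cluster_name='demo-cluster', dryrun=True)
# A dry run provisions nothing and submits no job, so both values are None.
assert job_id is None and handle is None
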
""" entrypoint = task if isinstance(entrypoint, sky.Dag): @@ -563,18 +591,20 @@ def exec( # pylint: disable=redefined-builtin operation='executing tasks', check_cloud_vm_ray_backend=False, dryrun=dryrun) - _execute(entrypoint=entrypoint, - dryrun=dryrun, - down=down, - stream_logs=stream_logs, - handle=handle, - backend=backend, - stages=[ - Stage.SYNC_WORKDIR, - Stage.EXEC, - ], - cluster_name=cluster_name, - detach_run=detach_run) + return _execute( + entrypoint=entrypoint, + dryrun=dryrun, + down=down, + stream_logs=stream_logs, + handle=handle, + backend=backend, + stages=[ + Stage.SYNC_WORKDIR, + Stage.EXEC, + ], + cluster_name=cluster_name, + detach_run=detach_run, + ) @usage_lib.entrypoint diff --git a/tests/test_api.py b/tests/test_api.py new file mode 100644 index 00000000000..4d6658fcd05 --- /dev/null +++ b/tests/test_api.py @@ -0,0 +1,7 @@ +import sky + + +def test_sky_launch(enable_all_clouds): + task = sky.Task() + job_id, handle = sky.launch(task, dryrun=True) + assert job_id is None and handle is None diff --git a/tests/test_smoke.py b/tests/test_smoke.py index 4654528f817..6da0866b4c5 100644 --- a/tests/test_smoke.py +++ b/tests/test_smoke.py @@ -25,6 +25,7 @@ import inspect import os import pathlib +import shlex import shutil import subprocess import sys @@ -2724,13 +2725,40 @@ def test_user_ray_cluster(generic_cloud: str): run_one_test(test) +_CODE_PREFIX = ['import sky'] + + +def _build(code: List[str]) -> str: + code = _CODE_PREFIX + code + code = ';'.join(code) + return f'python3 -u -c {shlex.quote(code)}' + + # ------- Testing the core API -------- # Most of the core APIs have been tested in the CLI tests. # These tests are for testing the return value of the APIs not fully used in CLI. -def test_core_api(): - name = _get_cluster_name() - sky.launch - # TODO(zhwu): Add a test for core api. + + +@pytest.mark.gcp +def test_core_api_sky_launch_exec(): + name = _get_cluster_name() + task = sky.Task(run="whoami") + task.set_resources(sky.Resources(cloud=sky.GCP())) + job_id, handle = sky.launch(task, cluster_name=name) + assert job_id == 1 + assert handle is not None + assert handle.cluster_name == name + assert handle.launched_resources.cloud.is_same_cloud(sky.GCP()) + job_id_exec, handle_exec = sky.exec(task, cluster_name=name) + assert job_id_exec == 2 + assert handle_exec is not None + assert handle_exec.cluster_name == name + assert handle_exec.launched_resources.cloud.is_same_cloud(sky.GCP()) + # For dummy task (i.e. task.run is None), the job won't be submitted. + dummy_task = sky.Task() + job_id_dummy, _ = sky.exec(dummy_task, cluster_name=name) + assert job_id_dummy is None + sky.down(name) # ---------- Testing Storage ---------- From 31aa76c968e6c9ca8c21ac25051f002383c1baee Mon Sep 17 00:00:00 2001 From: Tian Xia Date: Fri, 10 Nov 2023 16:42:35 -0800 Subject: [PATCH 14/17] [BugFix] Fix GCP Cloud Permission doc url (#2770) fix --- sky/backends/cloud_vm_ray_backend.py | 2 +- sky/clouds/gcp.py | 2 +- sky/skylet/providers/gcp/constants.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/sky/backends/cloud_vm_ray_backend.py b/sky/backends/cloud_vm_ray_backend.py index c9c80da03e5..b1832d6581f 100644 --- a/sky/backends/cloud_vm_ray_backend.py +++ b/sky/backends/cloud_vm_ray_backend.py @@ -769,7 +769,7 @@ def _update_blocklist_on_gcp_error( 'having the required permissions and the user ' 'account does not have enough permission to ' 'update it. 
Please contact your administrator and ' - 'check out: https://skypilot.readthedocs.io/en/latest/cloud-setup/cloud-permissions.html#gcp\n' # pylint: disable=line-too-long + 'check out: https://skypilot.readthedocs.io/en/latest/cloud-setup/cloud-permissions/gcp.html\n' # pylint: disable=line-too-long f'Details: {httperror_str[0]}') self._blocked_resources.add( launchable_resources.copy(region=None, zone=None)) diff --git a/sky/clouds/gcp.py b/sky/clouds/gcp.py index df73d46316a..98d8403528e 100644 --- a/sky/clouds/gcp.py +++ b/sky/clouds/gcp.py @@ -860,7 +860,7 @@ def check_credentials(cls) -> Tuple[bool, Optional[str]]: 'The following permissions are not enabled for the current ' f'GCP identity ({identity_str}):\n ' f'{diffs}\n ' - 'For more details, visit: https://skypilot.readthedocs.io/en/latest/cloud-setup/cloud-permissions.html#gcp') # pylint: disable=line-too-long + 'For more details, visit: https://skypilot.readthedocs.io/en/latest/cloud-setup/cloud-permissions/gcp.html') # pylint: disable=line-too-long return True, None def get_credential_file_mounts(self) -> Dict[str, str]: diff --git a/sky/skylet/providers/gcp/constants.py b/sky/skylet/providers/gcp/constants.py index bcf3b02ef70..c6f02e8af9b 100644 --- a/sky/skylet/providers/gcp/constants.py +++ b/sky/skylet/providers/gcp/constants.py @@ -80,7 +80,7 @@ ] # A list of permissions required to run SkyPilot on GCP. -# Keep this in sync with https://skypilot.readthedocs.io/en/latest/cloud-setup/cloud-permissions.html#gcp # pylint: disable=line-too-long +# Keep this in sync with https://skypilot.readthedocs.io/en/latest/cloud-setup/cloud-permissions/gcp.html # pylint: disable=line-too-long VM_MINIMAL_PERMISSIONS = [ "compute.disks.create", "compute.disks.list", From aecb2de328cdff1e973380572c2c136683262206 Mon Sep 17 00:00:00 2001 From: Zongheng Yang Date: Sat, 11 Nov 2023 22:57:42 -0800 Subject: [PATCH 15/17] GCP: Support for custom VPC. (#2764) * GCP: Support for custom VPC. * yapf * format.sh * Address comments. * Update docs * format * comment --- .../cloud-setup/cloud-permissions/gcp.rst | 54 ++++++++++++- docs/source/reference/config.rst | 39 +++++++-- sky/backends/backend_utils.py | 19 ++++- sky/backends/cloud_vm_ray_backend.py | 37 +++++++-- sky/skylet/providers/gcp/config.py | 80 +++++++++++++++---- sky/skylet/providers/gcp/constants.py | 11 +++ sky/templates/aws-ray.yml.j2 | 7 +- sky/templates/gcp-ray.yml.j2 | 4 + sky/utils/schemas.py | 7 ++ tests/test_config.py | 7 ++ 10 files changed, 224 insertions(+), 41 deletions(-) diff --git a/docs/source/cloud-setup/cloud-permissions/gcp.rst b/docs/source/cloud-setup/cloud-permissions/gcp.rst index fa75750e711..82ad6249aa4 100644 --- a/docs/source/cloud-setup/cloud-permissions/gcp.rst +++ b/docs/source/cloud-setup/cloud-permissions/gcp.rst @@ -66,7 +66,7 @@ User compute.firewalls.create compute.firewalls.delete compute.firewalls.get - compute.instances.create + compute.instances.create compute.instances.delete compute.instances.get compute.instances.list @@ -148,8 +148,8 @@ User .. note:: - The user created with the above minimal permissions will not be able to create service accounts to be assigned to SkyPilot instances. - + The user created with the above minimal permissions will not be able to create service accounts to be assigned to SkyPilot instances. + The admin needs to follow the :ref:`instruction below ` to create a service account to be shared by all users in the project. @@ -182,3 +182,51 @@ Service Account :align: center :alt: Set Service Account Role + +.. 
_gcp-minimum-firewall-rules: + +Firewall Rules +~~~~~~~~~~~~~~~~~~~ + +By default, users do not need to set up any special firewall rules to start +using SkyPilot. If the default VPC does not satisfy the minimal required rules, +a new VPC ``skypilot-vpc`` with sufficient rules will be automatically created +and used. + +However, if you manually set up and instruct SkyPilot to use a VPC (see +:ref:`here `), ensure it has the following required firewall rules: + +.. code-block:: python + + # Allow internal connections between SkyPilot VMs: + # + # controller -> head node of a cluster + # head node of a cluster <-> worker node(s) of a cluster + # + # NOTE: these ports are more relaxed than absolute minimum, but the + # sourceRanges restrict the traffic to internal IPs. + { + "direction": "INGRESS", + "allowed": [ + {"IPProtocol": "tcp", "ports": ["0-65535"]}, + {"IPProtocol": "udp", "ports": ["0-65535"]}, + ], + "sourceRanges": ["10.128.0.0/9"], + }, + + # Allow SSH connections from user machine(s) + # + # NOTE: This can be satisfied using the following relaxed sourceRanges + # (0.0.0.0/0), but you can customize it if you want to restrict to certain + # known public IPs (useful when using internal VPN or proxy solutions). + { + "direction": "INGRESS", + "allowed": [ + {"IPProtocol": "tcp", "ports": ["22"]}, + ], + "sourceRanges": ["0.0.0.0/0"], + }, + +You can inspect and manage firewall rules at +``https://console.cloud.google.com/net-security/firewall-manager/firewall-policies/list?project=`` +or using any of GCP's SDKs. diff --git a/docs/source/reference/config.rst b/docs/source/reference/config.rst index 5c0a86ec66b..e21487d612d 100644 --- a/docs/source/reference/config.rst +++ b/docs/source/reference/config.rst @@ -57,7 +57,7 @@ Available fields and semantics: # with this name (provisioner automatically looks for such regions). # Regions without a VPC with this name will not be used to launch nodes. # - # Default: None (use the default VPC in each region). + # Default: null (use the default VPC in each region). vpc_name: skypilot-vpc # Should instances be assigned private IPs only? (optional) @@ -88,7 +88,7 @@ Available fields and semantics: # and any SkyPilot nodes. (This option is not used between SkyPilot nodes, # since they are behind the proxy / may not have such a proxy set up.) # - # Optional; default: None. + # Optional; default: null. ### Format 1 ### # A string; the same proxy command is used for all regions. ssh_proxy_command: ssh -W %h:%p -i ~/.ssh/sky-key -o StrictHostKeyChecking=no ec2-user@ @@ -103,6 +103,24 @@ Available fields and semantics: # Advanced GCP configurations (optional). # Apply to all new instances but not existing ones. gcp: + # VPC to use (optional). + # + # Default: null, which implies the following behavior. First, the VPC named + # 'default' is checked against minimal recommended firewall rules for + # SkyPilot to function. If it satisfies these rules, this VPC is used. + # Otherwise, a new VPC named 'skypilot-vpc' is automatically created with + # the minimal recommended firewall rules and will be used. + # + # If this field is set, SkyPilot will use the VPC with this name. Useful for + # when users want to manually set up a VPC and precisely control its + # firewall rules. If no region restrictions are given, SkyPilot only + # provisions in regions for which a subnet of this VPC exists. Errors are + # thrown if VPC with this name is not found. 
The VPC does not get modified + # in any way, except when opening ports (e.g., via `resources.ports`) in + # which case new firewall rules permitting public traffic to those ports + # will be added. + vpc_name: skypilot-vpc + # Reserved capacity (optional). # # The specific reservation to be considered when provisioning clusters on GCP. @@ -117,15 +135,24 @@ Available fields and semantics: # Advanced Kubernetes configurations (optional). kubernetes: # The networking mode for accessing SSH jump pod (optional). - # This must be either: 'nodeport' or 'portforward'. If not specified, defaults to 'portforward'. # - # nodeport: Exposes the jump pod SSH service on a static port number on each Node, allowing external access to using :. Using this mode requires opening multiple ports on nodes in the Kubernetes cluster. - # portforward: Uses `kubectl port-forward` to create a tunnel and directly access the jump pod SSH service in the Kubernetes cluster. Does not require opening ports the cluster nodes and is more secure. 'portforward' is used as default if 'networking' is not specified. + # This must be either: 'nodeport' or 'portforward'. If not specified, + # defaults to 'portforward'. + # + # nodeport: Exposes the jump pod SSH service on a static port number on each + # Node, allowing external access to using :. Using this + # mode requires opening multiple ports on nodes in the Kubernetes cluster. + # + # portforward: Uses `kubectl port-forward` to create a tunnel and directly + # access the jump pod SSH service in the Kubernetes cluster. Does not + # require opening ports the cluster nodes and is more secure. 'portforward' + # is used as default if 'networking' is not specified. networking: portforward # Advanced OCI configurations (optional). oci: - # A dict mapping region names to region-specific configurations, or `default` for the default configuration. + # A dict mapping region names to region-specific configurations, or + # `default` for the default configuration. default: # The OCID of the profile to use for launching instances (optional). oci_config_profile: DEFAULT diff --git a/sky/backends/backend_utils.py b/sky/backends/backend_utils.py index fa54505dd20..c7ea38bf071 100644 --- a/sky/backends/backend_utils.py +++ b/sky/backends/backend_utils.py @@ -1023,8 +1023,8 @@ def write_cluster_config( 'SKYPILOT_USER', '')), # AWS only: - 'vpc_name': skypilot_config.get_nested(('aws', 'vpc_name'), - None), + 'aws_vpc_name': skypilot_config.get_nested(('aws', 'vpc_name'), + None), 'use_internal_ips': skypilot_config.get_nested( ('aws', 'use_internal_ips'), False), # Not exactly AWS only, but we only test it's supported on AWS @@ -1038,6 +1038,8 @@ def write_cluster_config( 'resource_group': f'{cluster_name}-{region_name}', # GCP only: + 'gcp_vpc_name': skypilot_config.get_nested(('gcp', 'vpc_name'), + None), 'gcp_project_id': gcp_project_id, 'specific_reservations': filtered_specific_reservations, 'num_specific_reserved_workers': num_specific_reserved_workers, @@ -1126,10 +1128,21 @@ def write_cluster_config( user_file_dir = os.path.expanduser(f'{SKY_USER_FILE_PATH}/') + # We do not import the module under sky.skylet.providers globally as we + # need to avoid importing ray module (extras like skypilot[aws] has + # removed the Ray dependency). 
# pylint: disable=import-outside-toplevel from sky.skylet.providers.gcp import config as gcp_config config = common_utils.read_yaml(os.path.expanduser(config_dict['ray'])) - vpc_name = gcp_config.get_usable_vpc(config) + vpc_name = None + try: + vpc_name = gcp_config.get_usable_vpc(config) + except RuntimeError as e: + # Launching a TPU and encountering a bootstrap-phase error, no point + # in failover unless: + # TODO(zongheng): handle failover when multi-resource is added. + with ux_utils.print_exception_no_traceback(): + raise e scripts = [] for template_name in ('gcp-tpu-create.sh.j2', 'gcp-tpu-delete.sh.j2'): diff --git a/sky/backends/cloud_vm_ray_backend.py b/sky/backends/cloud_vm_ray_backend.py index b1832d6581f..d8b93bcd51a 100644 --- a/sky/backends/cloud_vm_ray_backend.py +++ b/sky/backends/cloud_vm_ray_backend.py @@ -664,8 +664,6 @@ def _update_blocklist_on_gcp_error( self, launchable_resources: 'resources_lib.Resources', region: 'clouds.Region', zones: Optional[List['clouds.Zone']], stdout: str, stderr: str): - - del region # unused style = colorama.Style assert zones and len(zones) == 1, zones zone = zones[0] @@ -673,7 +671,12 @@ def _update_blocklist_on_gcp_error( exception_list = [s for s in splits if s.startswith('Exception: ')] httperror_str = [ s for s in splits - if s.startswith('googleapiclient.errors.HttpError: ') + # GCP API errors + if s.startswith('googleapiclient.errors.HttpError: ') or + # 'SKYPILOT_ERROR_NO_NODES_LAUNCHED': skypilot's changes to the + # underlying provisioner provider; for errors prior to provisioning + # like VPC setup. + 'SKYPILOT_ERROR_NO_NODES_LAUNCHED: ' in s ] if len(exception_list) == 1: # Parse structured response {'errors': [...]}. @@ -756,9 +759,21 @@ def _update_blocklist_on_gcp_error( else: assert False, error elif len(httperror_str) >= 1: - logger.info(f'Got {httperror_str[0]}') - if ('Requested disk size cannot be smaller than the image size' - in httperror_str[0]): + messages = '\n\t'.join(httperror_str) + logger.warning( + f'Got error(s):\n\t{style.DIM}{messages}{style.RESET_ALL}') + if ('SKYPILOT_ERROR_NO_NODES_LAUNCHED: No VPC with name ' + in stderr): + # User has specified a VPC that does not exist. On GCP, VPC is + # global. So we skip the entire cloud. + self._blocked_resources.add( + launchable_resources.copy(region=None, zone=None)) + elif ('SKYPILOT_ERROR_NO_NODES_LAUNCHED: No subnet for region ' + in stderr): + self._blocked_resources.add( + launchable_resources.copy(region=region.name, zone=None)) + elif ('Requested disk size cannot be smaller than the image size' + in httperror_str[0]): logger.info('Skipping all regions due to disk size issue.') self._blocked_resources.add( launchable_resources.copy(region=None, zone=None)) @@ -773,7 +788,6 @@ def _update_blocklist_on_gcp_error( f'Details: {httperror_str[0]}') self._blocked_resources.add( launchable_resources.copy(region=None, zone=None)) - else: # Parse HttpError for unauthorized regions. Example: # googleapiclient.errors.HttpError: None: + """Logs an message then raises a specific RuntimeError to trigger failover. + + Mainly used for handling VPC/subnet errors before nodes are launched. + """ + # NOTE: keep. The backend looks for this to know no nodes are launched. + prefix = "SKYPILOT_ERROR_NO_NODES_LAUNCHED: " + raise RuntimeError(prefix + error) + + def get_node_type(node: dict) -> GCPNodeType: """Returns node type based on the keys in ``node``. 
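Editor's note: _skypilot_log_error_and_exit_for_failover raises a RuntimeError whose message carries the fixed SKYPILOT_ERROR_NO_NODES_LAUNCHED: prefix, and _update_blocklist_on_gcp_error looks for that prefix in captured output to decide whether to block the whole cloud (missing VPC, which is global on GCP) or only the region (missing subnet). A compact sketch of the sentinel-prefix pattern (function names are illustrative):

_NO_NODES_MARKER = 'SKYPILOT_ERROR_NO_NODES_LAUNCHED: '


def fail_before_provisioning(reason: str) -> None:
    # Analogous to _skypilot_log_error_and_exit_for_failover above.
    raise RuntimeError(_NO_NODES_MARKER + reason)


def classify_failure(stderr: str) -> str:
    # Analogous to the stderr checks in _update_blocklist_on_gcp_error.
    if _NO_NODES_MARKER + 'No VPC with name ' in stderr:
        return 'block-whole-cloud'   # VPCs are global on GCP.
    if _NO_NODES_MARKER + 'No subnet for region ' in stderr:
        return 'block-this-region'   # Only this region lacks a subnet.
    return 'unknown'


try:
    fail_before_provisioning("No VPC with name 'skypilot-vpc' is found.")
except RuntimeError as e:
    assert classify_failure(str(e)) == 'block-whole-cloud'
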
@@ -753,28 +763,56 @@ def _create_rules(config, compute, rules, VPC_NAME, PROJ_ID): wait_for_compute_global_operation(config["provider"]["project_id"], op, compute) -def get_usable_vpc(config): +def get_usable_vpc(config) -> str: """Return a usable VPC. + If config['provider']['vpc_name'] is set, return the VPC with the name + (errors out if not found). When this field is set, no firewall rules + checking or overrides will take place; it is the user's responsibility to + properly set up the VPC. + If not found, create a new one with sufficient firewall rules. + + Raises: + RuntimeError: if the user has specified a VPC name but the VPC is not found. """ _, _, compute, _ = construct_clients_from_provider_config(config["provider"]) - - # For backward compatibility, reuse the VPC if the VM is launched. - resource = GCPCompute( - compute, - config["provider"]["project_id"], - config["provider"]["availability_zone"], - config["cluster_name"], - ) - node = resource._list_instances(label_filters=None, status_filter=None) - if len(node) > 0: - netInterfaces = node[0].get("networkInterfaces", []) - if len(netInterfaces) > 0: - vpc_name = netInterfaces[0]["network"].split("/")[-1] - return vpc_name - - vpcnets_all = _list_vpcnets(config, compute) + specific_vpc_to_use = config["provider"].get("vpc_name", None) + if specific_vpc_to_use is None: + # For backward compatibility, reuse the VPC if the VM is launched. + resource = GCPCompute( + compute, + config["provider"]["project_id"], + config["provider"]["availability_zone"], + config["cluster_name"], + ) + node = resource._list_instances(label_filters=None, status_filter=None) + if len(node) > 0: + netInterfaces = node[0].get("networkInterfaces", []) + if len(netInterfaces) > 0: + vpc_name = netInterfaces[0]["network"].split("/")[-1] + return vpc_name + + vpcnets_all = _list_vpcnets(config, compute) + else: + vpcnets_all = _list_vpcnets( + config, compute, filter=f"name={specific_vpc_to_use}" + ) + # On GCP, VPC names are unique, so it'd be 0 or 1 VPC found. + assert ( + len(vpcnets_all) <= 1 + ), f"{len(vpcnets_all)} VPCs found with the same name {specific_vpc_to_use}" + if len(vpcnets_all) == 1: + # Skip checking any firewall rules if the user has specified a VPC. + logger.info(f"Using user-specified VPC {specific_vpc_to_use!r}.") + return specific_vpc_to_use + else: + # VPC with this name not found. Error out and let SkyPilot failover. + _skypilot_log_error_and_exit_for_failover( + f"No VPC with name {specific_vpc_to_use!r} is found. " + "To fix: specify a correct VPC name." + ) + # Should not reach here. usable_vpc_name = None for vpc in vpcnets_all: @@ -827,6 +865,14 @@ def _configure_subnet(config, compute): # SkyPilot: make sure there's a usable VPC usable_vpc_name = get_usable_vpc(config) subnets = _list_subnets(config, compute, filter=f'(name="{usable_vpc_name}")') + if not subnets: + # This can happen when e.g., region A is specified but the VPC has no + # subnet in region A. + _skypilot_log_error_and_exit_for_failover( + f"No subnet for region {config['provider']['region']} found (VPC {usable_vpc_name!r}). 
" + f"Check the subnets of VPC {usable_vpc_name!r} at https://console.cloud.google.com/networking/networks" + ) + default_subnet = subnets[0] default_interfaces = [ diff --git a/sky/skylet/providers/gcp/constants.py b/sky/skylet/providers/gcp/constants.py index c6f02e8af9b..97d9d14ad1c 100644 --- a/sky/skylet/providers/gcp/constants.py +++ b/sky/skylet/providers/gcp/constants.py @@ -9,6 +9,7 @@ "mtu": 1460, "routingConfig": {"routingMode": "GLOBAL"}, } + # Required firewall rules for SkyPilot to work. FIREWALL_RULES_REQUIRED = [ # Allow internal connections between GCP VMs for Ray multi-node cluster. @@ -29,9 +30,12 @@ "ports": ["22"], } ], + # TODO(skypilot): some users reported that this should be relaxed (e.g., + # allowlisting only certain IPs to have ssh access). "sourceRanges": ["0.0.0.0/0"], }, ] + # Template when creating firewall rules for a new VPC. FIREWALL_RULES_TEMPLATE = [ { @@ -61,6 +65,8 @@ "ports": ["22"], } ], + # TODO(skypilot): some users reported that this should be relaxed (e.g., + # allowlisting only certain IPs to have ssh access). "sourceRanges": ["0.0.0.0/0"], }, { @@ -84,6 +90,11 @@ VM_MINIMAL_PERMISSIONS = [ "compute.disks.create", "compute.disks.list", + # TODO(skypilot): some users reported that firewalls changes + # (create/delete/update) should be removed if VPC/firewalls are separately + # set up. It is undesirable for a normal account to have these permissions. + # Note that if these permissions are removed, opening ports (e.g., via + # `resources.ports`) would fail. "compute.firewalls.create", "compute.firewalls.delete", "compute.firewalls.get", diff --git a/sky/templates/aws-ray.yml.j2 b/sky/templates/aws-ray.yml.j2 index bbd96d30a2c..1494a2c7060 100644 --- a/sky/templates/aws-ray.yml.j2 +++ b/sky/templates/aws-ray.yml.j2 @@ -36,10 +36,9 @@ provider: security_group: # AWS config file must include security group name GroupName: {{security_group}} -{% if vpc_name is not none %} - # NOTE: This is a new field added by SkyPilot and parsed by our own - # AWSNodeProvider. - vpc_name: {{vpc_name}} +{% if aws_vpc_name is not none %} + # NOTE: This is a new field added by SkyPilot to force use a specific VPC. + vpc_name: {{aws_vpc_name}} {% endif %} {%- if docker_login_config is not none %} # We put docker login config in provider section because ray's schema disabled diff --git a/sky/templates/gcp-ray.yml.j2 b/sky/templates/gcp-ray.yml.j2 index 2ecb915f55e..c63340392ab 100644 --- a/sky/templates/gcp-ray.yml.j2 +++ b/sky/templates/gcp-ray.yml.j2 @@ -28,6 +28,10 @@ provider: cache_stopped_nodes: True # The GCP project ID. project_id: {{gcp_project_id}} +{% if gcp_vpc_name is not none %} + # NOTE: This is a new field added by SkyPilot to force use a specific VPC. + vpc_name: {{gcp_vpc_name}} +{% endif %} # The firewall rule name for customized firewall rules. Only enabled # if we have ports requirement. 
{% if firewall_rule is not none %} diff --git a/sky/utils/schemas.py b/sky/utils/schemas.py index 20453c49e70..0b7a1cc4223 100644 --- a/sky/utils/schemas.py +++ b/sky/utils/schemas.py @@ -349,6 +349,13 @@ def get_config_schema(): 'minItems': 1, 'maxItems': 1, }, + 'vpc_name': { + 'oneOf': [{ + 'type': 'string', + }, { + 'type': 'null', + }], + }, } }, 'kubernetes': { diff --git a/tests/test_config.py b/tests/test_config.py index 52ff1767648..dfe64f77f06 100644 --- a/tests/test_config.py +++ b/tests/test_config.py @@ -37,6 +37,10 @@ def _create_config_file(config_file_path: pathlib.Path) -> None: vpc_name: {VPC_NAME} use_internal_ips: true ssh_proxy_command: {PROXY_COMMAND} + + gcp: + vpc_name: {VPC_NAME} + kubernetes: networking: {NODEPORT_MODE_NAME} """)) @@ -167,6 +171,7 @@ def test_config_get_set_nested(monkeypatch, tmp_path) -> None: assert skypilot_config.get_nested(('aws', 'use_internal_ips'), None) assert skypilot_config.get_nested(('aws', 'ssh_proxy_command'), None) == PROXY_COMMAND + assert skypilot_config.get_nested(('gcp', 'vpc_name'), None) == VPC_NAME assert skypilot_config.get_nested(('kubernetes', 'networking'), None) == NODEPORT_MODE_NAME # Check set_nested() will copy the config dict and return a new dict @@ -189,6 +194,7 @@ def test_config_get_set_nested(monkeypatch, tmp_path) -> None: assert skypilot_config.get_nested(('aws', 'use_internal_ips'), None) assert skypilot_config.get_nested( ('aws', 'ssh_proxy_command'), None) is None + assert skypilot_config.get_nested(('gcp', 'vpc_name'), None) == VPC_NAME # Check config with only partial keys still works new_config3 = copy.copy(new_config2) @@ -223,3 +229,4 @@ def test_config_with_env(monkeypatch, tmp_path) -> None: assert skypilot_config.get_nested(('aws', 'use_internal_ips'), None) assert skypilot_config.get_nested(('aws', 'ssh_proxy_command'), None) == PROXY_COMMAND + assert skypilot_config.get_nested(('gcp', 'vpc_name'), None) == VPC_NAME From 1ce2dc18863ab4a182c6c550d0e66d80d0097b5d Mon Sep 17 00:00:00 2001 From: Ziming Mao Date: Sun, 12 Nov 2023 14:17:34 -0800 Subject: [PATCH 16/17] [Core] Support Multiple Resources (#2498) * merge master into multi-acc * debug and bash format.sh * fix bug initialize resources_pref_list * fix bug * format * improve printing * format * tmp * fix spot launch issue * fuzzing target * bug fix * fix bugs refactor * add in new_task_resources * fix list copy * len(self.resources * spacing * is_resources_ordered * Refactor a bit * remove print * fix bug * updated optimization table * comment out logger info * bug fix unordered list * format * debug and added launched accelerators in recovery strategy * bug fix * sort rows by resources_pref_list * UI tweak * multi_resources * using use_spot to sort the tables * added mixed spot and demand * address weilin comments * format.sh * fix bug * added default spot recovery to make * added test smoke, removed mixed spot, addressed comments * format.sh * added smoke test and resources_ordered * make the output log use 1 line vs. 2. 
* region lists, fix bug with managed spot dashboard * conditions for len(region) <= 1 * multi_resources.yaml * merge master * change yaml example * Refactor backend_utils and pytest * address comments * addressed comments * fix pytest * fix test * format * disable sky.exec on multiple resources * minor formatting changes * refactor * format.sh * style nit: avoid abbreviation * added check_resources_fit_cluster to exec * combine get_resources * added usage.lib collection * update comment * update comment * combine multi resources checks * fix multiple resources, same accelerator type * format.sh and move exec position * update list and set for resources * debugging * address comment * fix format and test * fix bug * update test * support exec * using dict to represent res * support {'A100-40GB:1', 'K80:1', 'V100:1', 'T4:1'} * address Zhanghao's code review * tmp * fix bugs * _get_resource_group_hash * added _execute * added tpu back * did the tests pass * added monkeypatch * enabled all clouds * comment out unordered * comment out unordered * fix get_task_resources_str * overwrite resources with valid_resource * removed regions from this PR * used v[chosen_resources * fix key hash issue * move check to cloud.py * assert * remove assert * refactor optimizer _optimize_objective * added in assert for optimize by cost * addressed most of the code review * fix resources import * optimize_dag_with_user * fix bug * update yaml config * fix node_to_cost_map out of sync with best_plan * check * fix bug * fix basic changes * refactor _optimize_dag * fix compare_optimization_results * format * fix bug * fix test * fix * addressed some pr comments * fix check schemas * added single resource schema * fix bug * added test_multiple_resources_unordered * address code reviews * fix ports issue * address code review * fix smoke test * smoke --- docs/source/reference/yaml-spec.rst | 12 +- examples/multi_accelerators.yaml | 12 + examples/multi_resources.yaml | 18 + sky/backends/backend_utils.py | 116 +++++-- sky/backends/cloud_vm_ray_backend.py | 120 ++++--- sky/backends/local_docker_backend.py | 20 +- sky/cli.py | 25 +- sky/clouds/cloud.py | 1 + sky/global_user_state.py | 4 - sky/optimizer.py | 312 ++++++++++++++---- sky/resources.py | 16 +- sky/spot/controller.py | 1 - sky/spot/recovery_strategy.py | 50 +-- sky/task.py | 117 +++++-- sky/utils/dag_utils.py | 44 ++- sky/utils/schemas.py | 132 +++++++- tests/test_optimizer_dryruns.py | 15 + tests/test_optimizer_random_dag.py | 10 +- tests/test_smoke.py | 41 +++ .../test_multiple_accelerators_ordered.yaml | 7 + .../test_multiple_accelerators_unordered.yaml | 7 + tests/test_yamls/test_multiple_resources.yaml | 13 + 22 files changed, 851 insertions(+), 242 deletions(-) create mode 100644 examples/multi_accelerators.yaml create mode 100644 examples/multi_resources.yaml create mode 100644 tests/test_yamls/test_multiple_accelerators_ordered.yaml create mode 100644 tests/test_yamls/test_multiple_accelerators_unordered.yaml create mode 100644 tests/test_yamls/test_multiple_resources.yaml diff --git a/docs/source/reference/yaml-spec.rst b/docs/source/reference/yaml-spec.rst index c8dc4b1ed4f..5498410c2fb 100644 --- a/docs/source/reference/yaml-spec.rst +++ b/docs/source/reference/yaml-spec.rst @@ -43,8 +43,16 @@ Available fields: # Accelerator name and count per node (optional). # # Use `sky show-gpus` to view available accelerator configurations. - # - # Format: : (or simply , short for a count of 1). 
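Editor's note: the spec and examples in this patch let a task carry several candidate accelerators or resources, either as an ordered list (try in order) or an unordered set (optimize jointly). A hedged Python-API sketch of the ordered form, mirroring examples/multi_resources.yaml (the exact API surface may differ slightly from this sketch):

import sky

task = sky.Task(run='nvidia-smi')
# Ordered preference: try A10g on AWS first, then L4 on GCP.
task.set_resources([
    sky.Resources(cloud=sky.AWS(), accelerators='A10g'),
    sky.Resources(cloud=sky.GCP(), accelerators='L4'),
])
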
+ # The following three ways are valid for specifying accelerators for a cluster: + # To specify a single accelerator: + # Format: : (or simply , short for a count of 1). + # accelerators: V100:4 + # To specify a ordered list of accelerators: Try the accelerators in the specified order. + # Format: [:, ...] + # accelerators: ['K80:1', 'V100:1', 'T4:1'] + # To specify an unordered set of accelerators: Optimize all specified accelerators together, and try accelerator with lowest cost first. + # Format: {:, ...} + # accelerators: {'K80:1', 'V100:1', 'T4:1'} accelerators: V100:4 # Number of vCPUs per node (optional). diff --git a/examples/multi_accelerators.yaml b/examples/multi_accelerators.yaml new file mode 100644 index 00000000000..d523c142a36 --- /dev/null +++ b/examples/multi_accelerators.yaml @@ -0,0 +1,12 @@ +name: multi-accelerators + +resources: + + # Ordered list of accelerators: Try the accelerators in the specified order. + # accelerators: ['A100-40GB:1', 'V100:1', 'K80:1', 'T4:1'] + + # Unordered set of accelerators: Optimize all specified accelerators together, and try accelerator with lowest cost first. + accelerators: {'A100-40GB:1', 'K80:1', 'V100:1', 'T4:1', 'T4:4'} + +run: | + nvidia-smi diff --git a/examples/multi_resources.yaml b/examples/multi_resources.yaml new file mode 100644 index 00000000000..56656b7cd1b --- /dev/null +++ b/examples/multi_resources.yaml @@ -0,0 +1,18 @@ +name: multi-resources + +resources: + ordered: + - cloud: AWS + accelerators: A10g + - cloud: GCP + accelerators: L4 + +# resources: +# any_of: + # - cloud: AWS + # accelerators: A10g + # - cloud: GCP + # accelerators: L4 + +run: | + nvidia-smi diff --git a/sky/backends/backend_utils.py b/sky/backends/backend_utils.py index c7ea38bf071..5317d4781af 100644 --- a/sky/backends/backend_utils.py +++ b/sky/backends/backend_utils.py @@ -1967,43 +1967,66 @@ def check_can_clone_disk_and_override_task( 'disk is only supported when creating a new cluster. To fix: specify ' 'a new target cluster name.') - assert len(task.resources) == 1, task.resources - task_resources = list(task.resources)[0] - if handle.launched_resources.disk_size > task_resources.disk_size: - # The target cluster's disk should be at least as large as the source. - with ux_utils.print_exception_no_traceback(): - target_cluster_name_str = f' {target_cluster_name!r}' - if target_cluster_name is None: - target_cluster_name_str = '' - raise exceptions.NotSupportedError( - f'The target cluster{target_cluster_name_str} should have a disk size ' - f'of at least {handle.launched_resources.disk_size} GB to clone the ' - f'disk from {cluster_name!r}.') - override_param = {} + new_task_resources = [] original_cloud = handle.launched_resources.cloud - assert original_cloud is not None, handle.launched_resources - if task_resources.cloud is None: - override_param['cloud'] = original_cloud - else: - if not original_cloud.is_same_cloud(task_resources.cloud): - with ux_utils.print_exception_no_traceback(): - raise ValueError( - f'Cannot clone disk across cloud from {original_cloud} to ' - f'{task_resources.cloud}.') original_cloud.check_features_are_supported( {clouds.CloudImplementationFeatures.CLONE_DISK_FROM_CLUSTER}) - if task_resources.region is None: - override_param['region'] = handle.launched_resources.region - - if override_param: - logger.info( - f'No cloud/region specified for the task. 
Using the same region ' - f'as source cluster {cluster_name!r}: ' - f'{handle.launched_resources.cloud}' - f'({handle.launched_resources.region}).') + assert original_cloud is not None, handle.launched_resources + has_override = False + has_disk_size_met = False + has_cloud_met = False + for task_resources in task.resources: + if handle.launched_resources.disk_size > task_resources.disk_size: + # The target cluster's disk should be at least as large as the source. + continue + has_disk_size_met = True + if task_resources.cloud is not None and not original_cloud.is_same_cloud( + task_resources.cloud): + continue + has_cloud_met = True + + override_param = {} + if task_resources.cloud is None: + override_param['cloud'] = original_cloud + if task_resources.region is None: + override_param['region'] = handle.launched_resources.region + + if override_param: + logger.info( + f'No cloud/region specified for the task {task_resources}. Using the same region ' + f'as source cluster {cluster_name!r}: ' + f'{handle.launched_resources.cloud}' + f'({handle.launched_resources.region}).') + has_override = True task_resources = task_resources.copy(**override_param) - task.set_resources({task_resources}) + new_task_resources.append(task_resources) + + if not new_task_resources: + if not has_disk_size_met: + with ux_utils.print_exception_no_traceback(): + target_cluster_name_str = f' {target_cluster_name!r}' + if target_cluster_name is None: + target_cluster_name_str = '' + raise exceptions.NotSupportedError( + f'The target cluster{target_cluster_name_str} should have a disk size ' + f'of at least {handle.launched_resources.disk_size} GB to clone the ' + f'disk from {cluster_name!r}.') + if not has_cloud_met: + task_resources_cloud_str = '[' + ','.join( + [f'{res.cloud}' for res in task.resources]) + ']' + task_resources_str = '[' + ','.join( + [f'{res}' for res in task.resources]) + ']' + with ux_utils.print_exception_no_traceback(): + raise ValueError( + f'Cannot clone disk across cloud from {original_cloud} to ' + f'{task_resources_cloud_str} for resources {task_resources_str}.' + ) + assert False, 'Should not reach here.' + # set the new_task_resources to be the same type (list or set) as the + # original task.resources + if has_override: + task.set_resources(type(task.resources)(new_task_resources)) # Reset the best_resources to triger re-optimization # later, so that the new task_resources will be used. 
task.best_resources = None @@ -2724,11 +2747,32 @@ def get_task_demands_dict(task: 'task_lib.Task') -> Optional[Dict[str, float]]: def get_task_resources_str(task: 'task_lib.Task') -> str: - resources_dict = get_task_demands_dict(task) - if resources_dict is None: - resources_str = f'CPU:{DEFAULT_TASK_CPU_DEMAND}' + if task.best_resources is not None: + accelerator_dict = task.best_resources.accelerators + if accelerator_dict is None: + resources_str = f'CPU:{DEFAULT_TASK_CPU_DEMAND}' + else: + resources_str = ', '.join( + f'{k}:{v}' for k, v in accelerator_dict.items()) + elif len(task.resources) == 1: + resources_dict = list(task.resources)[0].accelerators + if resources_dict is None: + resources_str = f'CPU:{DEFAULT_TASK_CPU_DEMAND}' + else: + resources_str = ', '.join( + f'{k}:{v}' for k, v in resources_dict.items()) else: - resources_str = ', '.join(f'{k}:{v}' for k, v in resources_dict.items()) + resource_accelerators = [] + for resource in task.resources: + if resource.accelerators is None: + continue + for k, v in resource.accelerators.items(): + resource_accelerators.append(f'{k}:{v}') + + if resource_accelerators: + resources_str = ', '.join(set(resource_accelerators)) + else: + resources_str = f'CPU:{DEFAULT_TASK_CPU_DEMAND}' resources_str = f'{task.num_nodes}x [{resources_str}]' return resources_str diff --git a/sky/backends/cloud_vm_ray_backend.py b/sky/backends/cloud_vm_ray_backend.py index d8b93bcd51a..5b30f4d72d8 100644 --- a/sky/backends/cloud_vm_ray_backend.py +++ b/sky/backends/cloud_vm_ray_backend.py @@ -2199,7 +2199,7 @@ def provision_with_retries( config_dict = self._retry_zones( to_provision, num_nodes, - requested_resources=task.resources, + requested_resources=set(task.resources), dryrun=dryrun, stream_logs=stream_logs, cluster_name=cluster_name, @@ -2706,21 +2706,23 @@ def check_resources_fit_cluster( handle: CloudVmRayResourceHandle, task: task_lib.Task, check_ports: bool = False, - ): + ) -> resources_lib.Resources: """Check if resources requested by the task fit the cluster. The resources requested by the task should be smaller than the existing cluster. + If multiple resources are specified, this checking will pass when + at least one resource fits the cluster. Raises: exceptions.ResourcesMismatchError: If the resources in the task does not match the existing cluster. """ - assert len(task.resources) == 1, task.resources launched_resources = handle.launched_resources - task_resources = list(task.resources)[0] cluster_name = handle.cluster_name + + # Usage Collection: usage_lib.messages.usage.update_cluster_resources( handle.launched_nodes, launched_resources) record = global_user_state.get_cluster_from_name(cluster_name) @@ -2739,40 +2741,55 @@ def check_resources_fit_cluster( launched_resources) mismatch_str = ('To fix: use accelerators/number of nodes that can ' 'be satisfied by the local cluster') - # Requested_resources <= actual_resources. - # Special handling for local cloud case, which assumes a cluster can - # be heterogeneous. Here, launched_resources is a list of custom - # accelerators per node, and Resources.less_demanding_than determines - # how many nodes satisfy task resource requirements. 
- if not (task.num_nodes <= handle.launched_nodes and - task_resources.less_demanding_than( - launched_resources, - requested_num_nodes=task.num_nodes, - check_ports=check_ports)): - if (task_resources.region is not None and - task_resources.region != launched_resources.region): - with ux_utils.print_exception_no_traceback(): - raise exceptions.ResourcesMismatchError( - 'Task requested resources in region ' - f'{task_resources.region!r}, but the existing cluster ' - f'is in region {launched_resources.region!r}.') - if (task_resources.zone is not None and - task_resources.zone != launched_resources.zone): - zone_str = (f'is in zone {launched_resources.zone!r}.' - if launched_resources.zone is not None else - 'does not have zone specified.') - with ux_utils.print_exception_no_traceback(): - raise exceptions.ResourcesMismatchError( - 'Task requested resources in zone ' - f'{task_resources.zone!r}, but the existing cluster ' - f'{zone_str}') + + valid_resource = None + requested_resource_list = [] + for resource in task.resources: + if (task.num_nodes <= handle.launched_nodes and + resource.less_demanding_than( + launched_resources, + requested_num_nodes=task.num_nodes, + check_ports=check_ports)): + valid_resource = resource + break + else: + requested_resource_list.append(f'{task.num_nodes}x {resource}') + + if valid_resource is None: + for example_resource in task.resources: + if (example_resource.region is not None and + example_resource.region != launched_resources.region): + with ux_utils.print_exception_no_traceback(): + raise exceptions.ResourcesMismatchError( + f'Task requested resources {example_resource} in region ' # pylint: disable=line-too-long + f'{example_resource.region!r}' + ', but the existing cluster ' + f'is in region {launched_resources.region!r}.') + if (example_resource.zone is not None and + example_resource.zone != launched_resources.zone): + zone_str = (f'is in zone {launched_resources.zone!r}.' + if launched_resources.zone is not None else + 'does not have zone specified.') + with ux_utils.print_exception_no_traceback(): + raise exceptions.ResourcesMismatchError( + f'Task requested resources {example_resource} in zone ' # pylint: disable=line-too-long + f'{example_resource.zone!r},' + 'but the existing cluster ' + f'{zone_str}') + requested_resource_str = ', '.join(requested_resource_list) + if isinstance(task.resources, list): + requested_resource_str = f'[{requested_resource_str}]' + elif isinstance(task.resources, set): + requested_resource_str = f'{{{requested_resource_str}}}' with ux_utils.print_exception_no_traceback(): raise exceptions.ResourcesMismatchError( - 'Requested resources do not match the existing cluster.\n' - f' Requested:\t{task.num_nodes}x {task_resources} \n' + 'Requested resources do not match the existing ' + 'cluster.\n' + f' Requested:\t{requested_resource_str}\n' f' Existing:\t{handle.launched_nodes}x ' f'{handle.launched_resources}\n' f'{mismatch_str}') + return valid_resource def _provision( self, @@ -3092,7 +3109,7 @@ def _update_after_cluster_provisioned( global_user_state.add_or_update_cluster( handle.cluster_name, handle, - task.resources, + set(task.resources), ready=True, ) usage_lib.messages.usage.update_final_cluster_status( @@ -3499,23 +3516,31 @@ def _execute( # Check the task resources vs the cluster resources. 
Since `sky exec` # will not run the provision and _check_existing_cluster # We need to check ports here since sky.exec shouldn't change resources - self.check_resources_fit_cluster(handle, task, check_ports=True) - - resources_str = backend_utils.get_task_resources_str(task) + valid_resource = self.check_resources_fit_cluster(handle, + task, + check_ports=True) + task_copy = copy.copy(task) + # Handle multiple resources exec case. + task_copy.set_resources(valid_resource) + if len(task.resources) > 1: + logger.info('Multiple resources are specified' + f'for the task, using: {valid_resource}') + task_copy.best_resources = None + resources_str = backend_utils.get_task_resources_str(task_copy) if dryrun: logger.info(f'Dryrun complete. Would have run:\n{task}') return None - job_id = self._add_job(handle, task.name, resources_str) + job_id = self._add_job(handle, task_copy.name, resources_str) is_tpu_vm_pod = tpu_utils.is_tpu_vm_pod(handle.launched_resources) # Case: task_lib.Task(run, num_nodes=N) or TPU VM Pods - if task.num_nodes > 1 or is_tpu_vm_pod: - self._execute_task_n_nodes(handle, task, job_id, detach_run) + if task_copy.num_nodes > 1 or is_tpu_vm_pod: + self._execute_task_n_nodes(handle, task_copy, job_id, detach_run) else: # Case: task_lib.Task(run, num_nodes=1) - self._execute_task_one_node(handle, task, job_id, detach_run) + self._execute_task_one_node(handle, task_copy, job_id, detach_run) return job_id @@ -4343,7 +4368,9 @@ def _check_existing_cluster( self.check_resources_fit_cluster(handle, task) # Use the existing cluster. assert handle.launched_resources is not None, (cluster_name, handle) - assert len(task.resources) == 1 + # Assume resources share the same ports. + for resource in task.resources: + assert resource.ports == list(task.resources)[0].ports all_ports = resources_utils.port_set_to_ranges( resources_utils.port_ranges_to_set( handle.launched_resources.ports) | @@ -4359,13 +4386,12 @@ def _check_existing_cluster( prev_cluster_status=prev_cluster_status, prev_handle=handle) usage_lib.messages.usage.set_new_cluster() - assert len(task.resources) == 1, task.resources # Use the task_cloud, because the cloud in `to_provision` can be changed # later during the retry. 
- resources = list(task.resources)[0] - task_cloud = (resources.cloud - if resources.cloud is not None else clouds.Cloud) - task_cloud.check_cluster_name_is_valid(cluster_name) + for resources in task.resources: + task_cloud = (resources.cloud + if resources.cloud is not None else clouds.Cloud) + task_cloud.check_cluster_name_is_valid(cluster_name) if to_provision is None: # The cluster is recently terminated either by autostop or manually diff --git a/sky/backends/local_docker_backend.py b/sky/backends/local_docker_backend.py index bc3bfbbe253..78619943e8c 100644 --- a/sky/backends/local_docker_backend.py +++ b/sky/backends/local_docker_backend.py @@ -164,11 +164,11 @@ def _provision( self.images[handle] = (image_tag, metadata) logger.info(f'Image {image_tag} built.') logger.info('Provisioning complete.') - global_user_state.add_or_update_cluster( - cluster_name, - cluster_handle=handle, - requested_resources=task.resources, - ready=False) + global_user_state.add_or_update_cluster(cluster_name, + cluster_handle=handle, + requested_resources=set( + task.resources), + ready=False) return handle def _sync_workdir(self, handle: LocalDockerResourceHandle, @@ -258,11 +258,11 @@ def _setup(self, handle: LocalDockerResourceHandle, task: 'task_lib.Task', f'-it {container.name} /bin/bash{style.RESET_ALL}.\n' f'You can debug the image by running: {style.BRIGHT}docker run -it ' f'{image_tag} /bin/bash{style.RESET_ALL}.\n') - global_user_state.add_or_update_cluster( - cluster_name, - cluster_handle=handle, - requested_resources=task.resources, - ready=True) + global_user_state.add_or_update_cluster(cluster_name, + cluster_handle=handle, + requested_resources=set( + task.resources), + ready=True) def _execute(self, handle: LocalDockerResourceHandle, diff --git a/sky/cli.py b/sky/cli.py index 21c1d65616b..55423ef8078 100644 --- a/sky/cli.py +++ b/sky/cli.py @@ -1125,11 +1125,7 @@ def _make_task_or_dag_from_entrypoint_with_overrides( if spot_recovery is not None: override_params['spot_recovery'] = spot_recovery - assert len(task.resources) == 1 - old_resources = list(task.resources)[0] - new_resources = old_resources.copy(**override_params) - - task.set_resources({new_resources}) + task.set_resources_override(override_params) if num_nodes is not None: task.num_nodes = num_nodes @@ -1137,11 +1133,12 @@ def _make_task_or_dag_from_entrypoint_with_overrides( task.name = name task.update_envs(env) # TODO(wei-lin): move this validation into Python API. - if new_resources.accelerators is not None: - acc, _ = list(new_resources.accelerators.items())[0] - if acc.startswith('tpu-') and task.num_nodes > 1: - raise ValueError('Multi-node TPU cluster is not supported. ' - f'Got num_nodes={task.num_nodes}.') + for resource in task.resources: + if resource.accelerators is not None: + acc, _ = list(resource.accelerators.items())[0] + if acc.startswith('tpu-') and task.num_nodes > 1: + raise ValueError('Multi-node TPU cluster is not supported. ' + f'Got num_nodes={task.num_nodes}.') return task @@ -3753,10 +3750,10 @@ def spot_launch( # cluster name against the regex, and the cloud-specific validation will # be done by the spot controller when actually launching the spot # cluster. 
- resources = list(task.resources)[0] - task_cloud = (resources.cloud - if resources.cloud is not None else clouds.Cloud) - task_cloud.check_cluster_name_is_valid(name) + for resources in task.resources: + task_cloud = (resources.cloud + if resources.cloud is not None else clouds.Cloud) + task_cloud.check_cluster_name_is_valid(name) sky.spot_launch(dag, name, diff --git a/sky/clouds/cloud.py b/sky/clouds/cloud.py index 29eb36de54f..625694f8780 100644 --- a/sky/clouds/cloud.py +++ b/sky/clouds/cloud.py @@ -294,6 +294,7 @@ def get_feasible_launchable_resources( if num_nodes > 1: resources_required_features.add( CloudImplementationFeatures.MULTI_NODE) + try: self.check_features_are_supported(resources_required_features) except exceptions.NotSupportedError: diff --git a/sky/global_user_state.py b/sky/global_user_state.py index b75457e5713..cf7b88f9726 100644 --- a/sky/global_user_state.py +++ b/sky/global_user_state.py @@ -166,10 +166,6 @@ def add_or_update_cluster(cluster_name: str, cluster_launched_at = int(time.time()) usage_intervals.append((cluster_launched_at, None)) - if requested_resources: - assert len(requested_resources) == 1, requested_resources - requested_resources = list(requested_resources)[0] - _DB.cursor.execute( 'INSERT or REPLACE INTO clusters' # All the fields need to exist here, even if they don't need diff --git a/sky/optimizer.py b/sky/optimizer.py index d190c97a246..f1a75e0dbc9 100644 --- a/sky/optimizer.py +++ b/sky/optimizer.py @@ -1,10 +1,13 @@ """Optimizer: assigns best resources to user tasks.""" import collections +import copy import enum +import json import typing from typing import Any, Dict, Iterable, List, Optional, Tuple import colorama +import networkx as nx import numpy as np import prettytable @@ -22,8 +25,6 @@ from sky.utils import ux_utils if typing.TYPE_CHECKING: - import networkx as nx - # pylint: disable=ungrouped-imports from sky import dag as dag_lib @@ -56,6 +57,15 @@ def _create_table(field_names: List[str]) -> prettytable.PrettyTable: return log_utils.create_table(field_names, **table_kwargs) +def _is_dag_resources_ordered(dag: 'dag_lib.Dag') -> bool: + graph = dag.get_graph() + topo_order = list(nx.topological_sort(graph)) + for node in topo_order: + if isinstance(node.resources, list): + return True + return False + + class Optimizer: """Optimizer: assigns best resources to user tasks.""" @@ -114,8 +124,8 @@ def optimize(dag: 'dag_lib.Dag', # node.best_resources if it is None. Optimizer._add_dummy_source_sink_nodes(dag) try: - unused_best_plan = Optimizer._optimize_objective( - dag, + unused_best_plan = Optimizer._optimize_dag( + dag=dag, minimize_cost=minimize == OptimizeTarget.COST, blocked_resources=blocked_resources, quiet=quiet) @@ -224,6 +234,7 @@ def _estimate_nodes_cost_or_time( topo_order: List[task_lib.Task], minimize_cost: bool = True, blocked_resources: Optional[Iterable[resources_lib.Resources]] = None, + quiet: bool = False ) -> Tuple[_TaskToCostMap, _TaskToPerCloudCandidates]: """Estimates the cost/time of each task-resource mapping in the DAG. @@ -245,7 +256,7 @@ def _estimate_nodes_cost_or_time( for node_i, node in enumerate(topo_order): if node_i == 0: # Base case: a special source node. - node_to_cost_map[node][list(node.get_resources())[0]] = 0 + node_to_cost_map[node][list(node.resources)[0]] = 0 continue # Don't print for the last node, Sink. @@ -257,14 +268,19 @@ def _estimate_nodes_cost_or_time( if node_i < len(topo_order) - 1: # Convert partial resource labels to launchable resources. 
launchable_resources, cloud_candidates, fuzzy_candidates = ( - _fill_in_launchable_resources(node, blocked_resources)) + _fill_in_launchable_resources( + task=node, + blocked_resources=blocked_resources, + try_fix_with_sky_check=True, + quiet=quiet)) node_to_candidate_map[node] = cloud_candidates else: # Dummy sink node. - node_resources = node.get_resources() - launchable_resources = {list(node_resources)[0]: node_resources} + launchable_resources = { + list(node.resources)[0]: list(node.resources) + } - num_resources = len(node.get_resources()) + num_resources = len(list(node.resources)) for orig_resources, launchable_list in launchable_resources.items(): if num_resources == 1 and node.time_estimator_func is None: @@ -282,7 +298,11 @@ def _estimate_nodes_cost_or_time( # FIXME(zongheng): take 'num_nodes' as an arg/into # account. It may be another reason to treat num_nodes as # part of a Resources. - estimated_runtime = node.estimate_runtime(orig_resources) + if node.time_estimator_func is None: + estimated_runtime = 1 * 3600 + else: + estimated_runtime = node.estimate_runtime( + orig_resources) for resources in launchable_list: if do_print: logger.debug(f'resources: {resources}') @@ -320,11 +340,11 @@ def _estimate_nodes_cost_or_time( enabled_clouds = global_user_state.get_enabled_clouds() if _cloud_in_list(clouds.Kubernetes(), enabled_clouds): if any(orig_resources.cloud is None - for orig_resources in node.get_resources()): + for orig_resources in node.resources): source_hint = 'catalog and kubernetes cluster' elif all( isinstance(orig_resources.cloud, clouds.Kubernetes) - for orig_resources in node.get_resources()): + for orig_resources in node.resources): source_hint = 'kubernetes cluster' # TODO(romilb): When `sky show-gpus` supports Kubernetes, @@ -375,7 +395,7 @@ def _optimize_by_dp( for node_i, node in enumerate(topo_order): if node_i == 0: # Base case: a special source node. 
- dp_best_objective[node][list(node.get_resources())[0]] = 0 + dp_best_objective[node][list(node.resources)[0]] = 0 continue parent = topo_order[node_i - 1] @@ -705,13 +725,8 @@ def print_optimized_plan( def _get_resources_element_list( resources: 'resources_lib.Resources') -> List[str]: - accelerators = resources.accelerators - if accelerators is None: - accelerators = '-' - elif isinstance(accelerators, dict) and len(accelerators) == 1: - accelerators, count = list(accelerators.items())[0] - accelerators = f'{accelerators}:{count}' - spot = '[Spot]' if resources.use_spot else '' + accelerators = resources.get_accelerators_str() + spot = resources.get_spot_str() cloud = resources.cloud vcpus, mem = cloud.get_vcpus_mem_from_instance_type( resources.instance_type) @@ -740,6 +755,55 @@ def format_number(x): str(region_or_zone), ] + Row = collections.namedtuple('Row', [ + 'cloud', 'instance', 'vcpus', 'mem', 'accelerators', + 'region_or_zone', 'cost_str', 'chosen_str' + ]) + + def _get_resources_named_tuple(resources: 'resources_lib.Resources', + cost_str: str, chosen: bool) -> Row: + + accelerators = resources.get_accelerators_str() + spot = resources.get_spot_str() + cloud = resources.cloud + vcpus, mem = cloud.get_vcpus_mem_from_instance_type( + resources.instance_type) + + def format_number(x): + if x is None: + return '-' + elif x.is_integer(): + return str(int(x)) + else: + return f'{x:.1f}' + + vcpus = format_number(vcpus) + mem = format_number(mem) + + if resources.zone is None: + region_or_zone = resources.region + else: + region_or_zone = resources.zone + + chosen_str = '' + if chosen: + chosen_str = (colorama.Fore.GREEN + ' ' + u'\u2714' + + colorama.Style.RESET_ALL) + row = Row(cloud, resources.instance_type + spot, vcpus, mem, + str(accelerators), str(region_or_zone), cost_str, + chosen_str) + + return row + + def _get_resource_group_hash(resources: 'resources_lib.Resources'): + return json.dumps( + { + 'cloud': f'{resources.cloud}', + 'accelerators': f'{resources.accelerators}', + 'use_spot': resources.use_spot + }, + sort_keys=True) + # Print the list of resouces that the optimizer considered. resource_fields = [ 'CLOUD', 'INSTANCE', 'vCPUs', 'Mem(GB)', 'ACCELERATORS', @@ -764,6 +828,13 @@ def format_number(x): num_tasks = len(ordered_node_to_cost_map) for task, v in ordered_node_to_cost_map.items(): + # Hack: convert the dictionary values + # (resources) to their yaml config + # For dictionary comparison later. + v_yaml = { + json.dumps(resource.to_yaml_config()): cost + for resource, cost in v.items() + } task_str = (f'for task {repr(task)!r} ' if num_tasks > 1 else '') plural = 's' if task.num_nodes > 1 else '' logger.info( @@ -772,46 +843,67 @@ def format_number(x): f'{colorama.Style.RESET_ALL}') # Only print 1 row per cloud. - best_per_cloud: Dict[str, Tuple[resources_lib.Resources, - float]] = {} + # The following code is to generate the table + # of optimizer table for display purpose. 
+ best_per_resource_group: Dict[str, Tuple[resources_lib.Resources, + float]] = {} for resources, cost in v.items(): - cloud = str(resources.cloud) - if cloud in best_per_cloud: - if cost < best_per_cloud[cloud][1]: - best_per_cloud[cloud] = (resources, cost) + resource_table_key = _get_resource_group_hash(resources) + if resource_table_key in best_per_resource_group: + if cost < best_per_resource_group[resource_table_key][1]: + best_per_resource_group[resource_table_key] = ( + resources, cost) else: - best_per_cloud[cloud] = (resources, cost) + best_per_resource_group[resource_table_key] = (resources, + cost) # If the DAG has multiple tasks, the chosen resources may not be # the best resources for the task. chosen_resources = best_plan[task] - best_per_cloud[str(chosen_resources.cloud)] = (chosen_resources, - v[chosen_resources]) - + resource_table_key = _get_resource_group_hash(chosen_resources) + best_per_resource_group[resource_table_key] = ( + chosen_resources, + v_yaml[json.dumps(chosen_resources.to_yaml_config())]) rows = [] - for resources, cost in best_per_cloud.values(): + for resources, cost in best_per_resource_group.values(): if minimize_cost: cost_str = f'{cost:.2f}' else: cost_str = f'{cost / 3600:.2f}' - row = [*_get_resources_element_list(resources), cost_str, ''] - if resources == best_plan[task]: - # Use tick sign for the chosen resources. - row[-1] = (colorama.Fore.GREEN + ' ' + u'\u2714' + - colorama.Style.RESET_ALL) + row = _get_resources_named_tuple(resources, cost_str, + resources == best_plan[task]) rows.append(row) # NOTE: we've converted the cost to a string above, so we should # convert it back to float for sorting. - rows = sorted(rows, key=lambda x: float(x[-2])) - # Highlight the chosen resources. + if isinstance(task.resources, list): + accelerator_spot_list = [ + r.get_accelerators_str() + r.get_spot_str() + for r in list(task.resources) + ] + + def sort_key(row, accelerator_spot_list=accelerator_spot_list): + accelerator_index = accelerator_spot_list.index( + row.accelerators + + ('[Spot]' if '[Spot]' in row.instance else '')) + cost = float(row.cost_str) + return (accelerator_index, cost) + + rows = sorted(rows, key=sort_key) + else: + rows = sorted(rows, key=lambda row: float(row.cost_str)) + + row_list = [] for row in rows: - if row[-1] != '': - for i, cell in enumerate(row): - row[i] = (f'{colorama.Style.BRIGHT}{cell}' - f'{colorama.Style.RESET_ALL}') - break + row_in_list = [] + if row.chosen_str != '': + for _, cell in enumerate(row): + row_in_list.append((f'{colorama.Style.BRIGHT}{cell}' + f'{colorama.Style.RESET_ALL}')) + else: + row_in_list = list(row) + row_list.append(row_in_list) table = _create_table(field_names) table.add_rows(rows) @@ -820,7 +912,10 @@ def format_number(x): @staticmethod def _print_candidates(node_to_candidate_map: _TaskToPerCloudCandidates): for node, candidate_set in node_to_candidate_map.items(): - accelerator = list(node.get_resources())[0].accelerators + if node.best_resources: + accelerator = node.best_resources.accelerators + else: + accelerator = list(node.resources)[0].accelerators is_multi_instances = False if accelerator: acc_name, acc_count = list(accelerator.items())[0] @@ -840,7 +935,7 @@ def _print_candidates(node_to_candidate_map: _TaskToPerCloudCandidates): f'To list more details, run \'sky show-gpus {acc_name}\'.') @staticmethod - def _optimize_objective( + def _optimize_dag( dag: 'dag_lib.Dag', minimize_cost: bool = True, blocked_resources: Optional[Iterable[resources_lib.Resources]] = None, @@ -851,40 
+946,126 @@ def _optimize_objective( The optimal mapping should consider the egress cost/time so that the total estimated cost/time of the DAG becomes the minimum. """ - import networkx as nx # pylint: disable=import-outside-toplevel # TODO: The output of this function is useful. Should generate a # text plan and print to both console and a log file. - graph = dag.get_graph() - topo_order = list(nx.topological_sort(graph)) + def ordinal_number(n): + if 10 <= n % 100 <= 20: + suffix = 'th' + else: + suffix = {1: 'st', 2: 'nd', 3: 'rd'}.get(n % 10, 'th') + return str(n) + suffix + + has_resources_ordered = _is_dag_resources_ordered(dag) + if has_resources_ordered: + # Honor the user's choice. + # The actual task dag can store dummy tasks. + for task_id, task in enumerate(dag.tasks): + if isinstance(task.resources, list): + resources_list = task.resources + accelerators_str = ', '.join( + [f'{r}' for r in resources_list]) + task_id_str = ordinal_number(task_id + 1) + if len(dag.tasks) > 3: + # User-provided dag has more than one task. + # Comparing with 3, + # as there are two dummy tasks added by the optimizer. + logger.info(f'{colorama.Fore.YELLOW}{task_id_str} ' + 'task is using user-specified ' + 'accelerators list ' + f'{colorama.Style.RESET_ALL}' + '(will be tried in the listed order): ' + f'{accelerators_str}') + else: + logger.info( + f'{colorama.Fore.YELLOW}Using user-specified ' + 'accelerators list ' + f'{colorama.Style.RESET_ALL}' + '(will be tried in the listed order): ' + f'{accelerators_str}') - node_to_cost_map, node_to_candidate_map = ( - Optimizer._estimate_nodes_cost_or_time(topo_order, minimize_cost, + graph = dag.get_graph() + local_dag = copy.deepcopy(dag) if has_resources_ordered else dag + for task_id in range(len(dag.tasks)): + task = dag.tasks[task_id] + if isinstance(task.resources, list): + local_task = local_dag.tasks[task_id] + for resources in task.resources: + # Check if there exists launchable resources + local_task.set_resources(resources) + launchable_resources_map, _ , _ = \ + _fill_in_launchable_resources( + task = local_task, + blocked_resources = blocked_resources, + try_fix_with_sky_check = True, + quiet = False + ) + if len(launchable_resources_map[resources]) != 0: + break + + local_graph = local_dag.get_graph() + local_topo_order = list(nx.topological_sort(local_graph)) + local_node_to_cost_map, local_node_to_candidate_map = ( + Optimizer._estimate_nodes_cost_or_time(local_topo_order, + minimize_cost, blocked_resources)) - - if dag.is_chain(): - best_plan, best_total_objective = Optimizer._optimize_by_dp( - topo_order, node_to_cost_map, minimize_cost) + if local_dag.is_chain(): + local_best_plan, best_total_objective = Optimizer._optimize_by_dp( + local_topo_order, local_node_to_cost_map, minimize_cost) else: - best_plan, best_total_objective = Optimizer._optimize_by_ilp( - graph, topo_order, node_to_cost_map, minimize_cost) + local_best_plan, best_total_objective = Optimizer._optimize_by_ilp( + local_graph, local_topo_order, local_node_to_cost_map, + minimize_cost) if minimize_cost: - total_time = Optimizer._compute_total_time(graph, topo_order, - best_plan) + total_time = Optimizer._compute_total_time(local_graph, + local_topo_order, + local_best_plan) total_cost = best_total_objective else: total_time = best_total_objective - total_cost = Optimizer._compute_total_cost(graph, topo_order, - best_plan) + total_cost = Optimizer._compute_total_cost(local_graph, + local_topo_order, + local_best_plan) + + if local_best_plan is None: + error_msg = (f'No 
launchable resource found for task {task}. ' + 'To fix: relax its resource requirements.\n' + 'Hint: \'sky show-gpus --all\' ' + 'to list available accelerators.\n' + ' \'sky check\' to check the enabled clouds.') + with ux_utils.print_exception_no_traceback(): + raise exceptions.ResourcesUnavailableError(error_msg) + + if has_resources_ordered: + best_plan = {} + # We have to manually set the best_resources for the tasks in the + # original dag, to pass the optimization results + # to the caller, as we deep copied the dag + # when the dag has nodes with ordered resources. + for task, resources in local_best_plan.items(): + task_idx = local_dag.tasks.index(task) + dag.tasks[task_idx].best_resources = resources + best_plan[dag.tasks[task_idx]] = resources + else: + best_plan = local_best_plan + + topo_order = list(nx.topological_sort(graph)) if has_resources_ordered \ + else local_topo_order + node_to_cost_map, _ = (Optimizer._estimate_nodes_cost_or_time( + topo_order=topo_order, + minimize_cost=minimize_cost, + blocked_resources=blocked_resources, + quiet=True)) if has_resources_ordered else ( + local_node_to_cost_map, local_node_to_candidate_map) if not quiet: Optimizer.print_optimized_plan(graph, topo_order, best_plan, total_time, total_cost, node_to_cost_map, minimize_cost) if not env_options.Options.MINIMIZE_LOGGING.get(): - Optimizer._print_candidates(node_to_candidate_map) + Optimizer._print_candidates(local_node_to_candidate_map) return best_plan @@ -971,6 +1152,7 @@ def _fill_in_launchable_resources( task: task_lib.Task, blocked_resources: Optional[Iterable[resources_lib.Resources]], try_fix_with_sky_check: bool = True, + quiet: bool = False ) -> Tuple[Dict[resources_lib.Resources, List[resources_lib.Resources]], _PerCloudCandidates, List[str]]: """Fills in the launchable resources for the task. 
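
Usage note (not part of the diff): the ordered-resources path handled above can be
exercised directly from the Python API. A minimal dry-run sketch, mirroring the
`test_ordered_resources` test added later in this patch; it assumes clouds are
enabled and the catalog is reachable (the test relies on its `enable_all_clouds`
fixture for this), and the task name is illustrative:

    import sky

    with sky.Dag() as dag:
        task = sky.Task('demo')
        # A list is an ordered preference: V100 is tried first, then T4, then K80.
        task.set_resources([
            sky.Resources(accelerators={'V100': 1}),
            sky.Resources(accelerators={'T4': 1}),
            sky.Resources(accelerators={'K80': 1}),
        ])
        dag = sky.optimize(dag)
    # Passing a set instead, e.g. {sky.Resources(accelerators={'V100': 1}),
    # sky.Resources(accelerators={'T4': 1})}, asks the optimizer to pick the
    # cheapest feasible option rather than honoring a fixed order.
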
@@ -990,7 +1172,7 @@ def _fill_in_launchable_resources( List[resources_lib.Resources]) if blocked_resources is None: blocked_resources = [] - for resources in task.get_resources(): + for resources in task.resources: if resources.cloud is not None and not _cloud_in_list( resources.cloud, enabled_clouds): if try_fix_with_sky_check: @@ -1036,8 +1218,10 @@ def _fill_in_launchable_resources( num_node_str = '' if task.num_nodes > 1: num_node_str = f'{task.num_nodes}x ' - logger.info(f'No resource satisfying {num_node_str}{resources} ' - f'on {clouds_str}.') + if not quiet: + logger.info( + f'No resource satisfying {num_node_str}{resources} ' + f'on {clouds_str}.') if len(all_fuzzy_candidates) > 0: logger.info('Did you mean: ' f'{colorama.Fore.CYAN}' diff --git a/sky/resources.py b/sky/resources.py index e74f25e9de7..e90b6b6700c 100644 --- a/sky/resources.py +++ b/sky/resources.py @@ -883,6 +883,18 @@ def get_cost(self, seconds: float) -> float: self.accelerators, self.use_spot, self._region, self._zone) return hourly_cost * hours + def get_accelerators_str(self) -> str: + accelerators = self.accelerators + if accelerators is None: + accelerators = '-' + elif isinstance(accelerators, dict) and len(accelerators) == 1: + accelerators, count = list(accelerators.items())[0] + accelerators = f'{accelerators}:{count}' + return accelerators + + def get_spot_str(self) -> str: + return '[Spot]' if self.use_spot else '' + def make_deploy_variables( self, cluster_name_on_cloud: str, region: clouds.Region, zones: Optional[List[clouds.Zone]]) -> Dict[str, Optional[str]]: @@ -1164,8 +1176,8 @@ def add_if_not_none(key, value): if self._use_spot_specified: add_if_not_none('use_spot', self.use_spot) - config['spot_recovery'] = self.spot_recovery - config['disk_size'] = self.disk_size + add_if_not_none('spot_recovery', self.spot_recovery) + add_if_not_none('disk_size', self.disk_size) add_if_not_none('region', self.region) add_if_not_none('zone', self.zone) add_if_not_none('image_id', self.image_id) diff --git a/sky/spot/controller.py b/sky/spot/controller.py index 9f1c7b40480..e0a61000eb7 100644 --- a/sky/spot/controller.py +++ b/sky/spot/controller.py @@ -50,7 +50,6 @@ def __init__(self, job_id: int, dag_yaml: str, self._job_id = job_id self._dag, self._dag_name = _get_dag_and_name(dag_yaml) logger.info(self._dag) - self._retry_until_up = retry_until_up # TODO(zhwu): this assumes the specific backend. self._backend = cloud_vm_ray_backend.CloudVmRayBackend() diff --git a/sky/spot/recovery_strategy.py b/sky/spot/recovery_strategy.py index 7ddc7e85df5..897899e55f8 100644 --- a/sky/spot/recovery_strategy.py +++ b/sky/spot/recovery_strategy.py @@ -8,7 +8,7 @@ import time import traceback import typing -from typing import Optional, Tuple +from typing import Optional import sky from sky import backends @@ -94,16 +94,20 @@ def __init_subclass__(cls, name: str, default: bool = False): def make(cls, cluster_name: str, backend: 'backends.Backend', task: 'task_lib.Task', retry_until_up: bool) -> 'StrategyExecutor': """Create a strategy from a task.""" - task_resources = task.resources - assert len(task_resources) == 1, 'Only one resource is supported.' 
- resources: 'sky.Resources' = list(task_resources)[0] - spot_recovery = resources.spot_recovery - assert spot_recovery is not None, ( - 'spot_recovery is required to use spot strategy.') + resource_list = list(task.resources) + spot_recovery = resource_list[0].spot_recovery + for resource in resource_list: + if resource.spot_recovery != spot_recovery: + raise ValueError( + 'The spot recovery strategy should be the same for all ' + 'resources.') # Remove the spot_recovery field from the resources, as the strategy # will be handled by the strategy class. - task.set_resources({resources.copy(spot_recovery=None)}) + new_resources_list = [r.copy(spot_recovery=None) for r in resource_list] + # set the new_task_resources to be the same type (list or set) as the + # original task.resources + task.set_resources(type(task.resources)(new_resources_list)) return SPOT_STRATEGIES[spot_recovery](cluster_name, backend, task, retry_until_up) @@ -378,8 +382,7 @@ def __init__(self, cluster_name: str, backend: 'backends.Backend', # first retry in the same cloud/region. (Inside recover() we may not # rely on cluster handle, as it can be None if the cluster is # preempted.) - self._launched_cloud_region: Optional[Tuple['sky.clouds.Cloud', - 'sky.clouds.Region']] = None + self._launched_resources: Optional['sky.resources.Resources'] = None def _launch(self, max_retry: Optional[int] = 3, @@ -392,10 +395,9 @@ def _launch(self, assert isinstance(handle, backends.CloudVmRayResourceHandle), ( 'Cluster should be launched.', handle) launched_resources = handle.launched_resources - self._launched_cloud_region = (launched_resources.cloud, - launched_resources.region) + self._launched_resources = launched_resources else: - self._launched_cloud_region = None + self._launched_resources = None return job_submitted_at def recover(self) -> float: @@ -411,19 +413,18 @@ def recover(self) -> float: while True: # Add region constraint to the task, to retry on the same region # first (if valid). - if self._launched_cloud_region is not None: + if self._launched_resources is not None: task = self.dag.tasks[0] - resources = list(task.resources)[0] - original_resources = resources - - launched_cloud, launched_region = self._launched_cloud_region - new_resources = resources.copy(cloud=launched_cloud, - region=launched_region) + original_resources = task.resources + launched_cloud = self._launched_resources.cloud + launched_region = self._launched_resources.region + new_resources = self._launched_resources.copy( + cloud=launched_cloud, region=launched_region, zone=None) task.set_resources({new_resources}) # Not using self.launch to avoid the retry until up logic. job_submitted_at = self._launch(raise_on_failure=False) # Restore the original dag, i.e. reset the region constraint. 
- task.set_resources({original_resources}) + task.set_resources(original_resources) if job_submitted_at is not None: return job_submitted_at @@ -499,16 +500,17 @@ def recover(self) -> float: # Step 2 logger.debug('Relaunch the cluster skipping the previously launched ' 'cloud/region.') - if self._launched_cloud_region is not None: + if self._launched_resources is not None: task = self.dag.tasks[0] - requested_resources = list(task.resources)[0] + requested_resources = self._launched_resources if (requested_resources.region is None and requested_resources.zone is None): # Optimization: We only block the previously launched region, # if the requested resources does not specify a region or zone, # because, otherwise, we will spend unnecessary time for # skipping the only specified region/zone. - launched_cloud, launched_region = self._launched_cloud_region + launched_cloud = self._launched_resources.cloud + launched_region = self._launched_resources.region task.blocked_resources = { requested_resources.copy(cloud=launched_cloud, region=launched_region) diff --git a/sky/task.py b/sky/task.py index 5adb3fe7ac5..5579b5e98fc 100644 --- a/sky/task.py +++ b/sky/task.py @@ -127,11 +127,12 @@ def _check_docker_login_config(task_envs: Dict[str, str]) -> bool: def _with_docker_login_config( - resources_set: Set['resources_lib.Resources'], + resources: Union[Set['resources_lib.Resources'], + List['resources_lib.Resources']], task_envs: Dict[str, str], -) -> Set['resources_lib.Resources']: +) -> Union[Set['resources_lib.Resources'], List['resources_lib.Resources']]: if not _check_docker_login_config(task_envs): - return resources_set + return resources docker_login_config = docker_utils.DockerLoginConfig.from_env_vars( task_envs) @@ -155,7 +156,10 @@ def _add_docker_login_config(resources: 'resources_lib.Resources'): return resources.copy(image_id={region: 'docker:' + docker_image}, _docker_login_config=docker_login_config) - return {_add_docker_login_config(resources) for resources in resources_set} + new_resources = [] + for r in resources: + new_resources.append(_add_docker_login_config(r)) + return type(resources)(new_resources) class Task: @@ -250,7 +254,8 @@ def __init__( self.estimated_inputs_size_gigabytes: Optional[float] = None self.estimated_outputs_size_gigabytes: Optional[float] = None # Default to CPUNode - self.resources = {sky.Resources()} + self.resources: Union[List[sky.Resources], + Set[sky.Resources]] = {sky.Resources()} # Resources that this task cannot run on. self.blocked_resources = blocked_resources @@ -422,10 +427,67 @@ def from_yaml_config( task.set_outputs(outputs=outputs, estimated_size_gigabytes=estimated_size_gigabytes) - resources = config.pop('resources', None) - resources = sky.Resources.from_yaml_config(resources) - - task.set_resources({resources}) + resources_config = config.pop('resources', None) + if resources_config and resources_config.get( + 'any_of') is not None and resources_config.get( + 'ordered') is not None: + with ux_utils.print_exception_no_traceback(): + raise ValueError( + 'Cannot specify both "any_of" and "ordered" in resources.') + if resources_config and resources_config.get('any_of') is not None: + # TODO(Ziming) In the future we can consider to allow + # additional field when any_of is specified, + # which means we override the fields in all the + # resources under any_of with the fields specified outsied any_of. 
+ if len(resources_config) > 1: + with ux_utils.print_exception_no_traceback(): + raise ValueError( + 'Cannot specify "any_of" with other resource fields.') + resources_set = set() + for resource in resources_config['any_of']: + resources_set.add(sky.Resources.from_yaml_config(resource)) + task.set_resources(resources_set) + elif resources_config and resources_config.get('ordered') is not None: + if len(resources_config) > 1: + with ux_utils.print_exception_no_traceback(): + raise ValueError( + 'Cannot specify "ordered" with other resource fields.') + resources_list = [] + for resource in resources_config['ordered']: + resources_list.append(sky.Resources.from_yaml_config(resource)) + task.set_resources(resources_list) + # Translate accelerators field to potential multiple resources. + elif resources_config and resources_config.get( + 'accelerators') is not None: + accelerators = resources_config.get('accelerators') + if isinstance(accelerators, str): + accelerators = {accelerators} + elif isinstance(accelerators, dict): + accelerators = [ + f'{k}:{v}' if v is not None else f'{k}' + for k, v in accelerators.items() + ] + accelerators = set(accelerators) + + # In yaml file, we store accelerators as a list. + # In Task, we store a list of resources, each with 1 accelerator. + # This for loop is for format conversion. + tmp_resources_list = [] + for acc in accelerators: + tmp_resource = resources_config.copy() + tmp_resource['accelerators'] = acc + tmp_resources_list.append( + sky.Resources.from_yaml_config(tmp_resource)) + + if isinstance(accelerators, list): + task.set_resources(tmp_resources_list) + elif isinstance(accelerators, set): + task.set_resources(set(tmp_resources_list)) + else: + raise RuntimeError('Accelerators must be a list or a set.') + else: + task.set_resources( + {sky.Resources.from_yaml_config(resources_config)}) assert not config, f'Invalid task args: {config.keys()}' return task @@ -571,6 +633,7 @@ def get_estimated_outputs_size_gigabytes(self) -> Optional[float]: def set_resources( self, resources: Union['resources_lib.Resources', + List['resources_lib.Resources'], Set['resources_lib.Resources']] ) -> 'Task': """Sets the required resources to execute this task. @@ -579,10 +642,9 @@ def set_resources( requirements will be used (8 vCPUs). Args: - resources: either a sky.Resources, or a set of them. The latter case - is EXPERIMENTAL and indicates asking the optimizer to "pick the + resources: either a sky.Resources, a set of them, or a list of them. + A set or a list of resources asks the optimizer to "pick the best of these resources" to run this task. - Returns: self: The current task, with resources set. """ @@ -592,8 +654,15 @@ def set_resources( self.resources = _with_docker_login_config(resources, self.envs) return self - def get_resources(self): - return self.resources + def set_resources_override(self, override_params: Dict[str, Any]) -> 'Task': + """Sets the override parameters for the resources.""" + new_resources_list = [] + for res in list(self.resources): + new_resources = res.copy(**override_params) + new_resources_list.append(new_resources) + + self.set_resources(type(self.resources)(new_resources_list)) + return self def set_time_estimator(self, func: Callable[['sky.Resources'], int]) -> 'Task': @@ -814,7 +883,10 @@ def get_preferred_store_type(self) -> storage_lib.StoreType: # 3. if not specified or the task's cloud does not support storage, # use the first enabled storage cloud. # This should be refactored and moved to the optimizer. 
- assert len(self.resources) == 1, self.resources + + # This check is not needed to support multiple accelerators; + # We just need to get the storage_cloud. + # assert len(self.resources) == 1, self.resources storage_cloud = None backend_utils.check_public_cloud_enabled() @@ -961,10 +1033,17 @@ def add_if_not_none(key, value, no_empty: bool = False): add_if_not_none('name', self.name) - if self.resources is not None: - assert len(self.resources) == 1 - resources = list(self.resources)[0] - add_if_not_none('resources', resources.to_yaml_config()) + tmp_resource_config = {} + if len(self.resources) > 1: + resource_list = [] + for r in self.resources: + resource_list.append(r.to_yaml_config()) + key = 'ordered' if isinstance(self.resources, list) else 'any_of' + tmp_resource_config[key] = resource_list + else: + tmp_resource_config = list(self.resources)[0].to_yaml_config() + + add_if_not_none('resources', tmp_resource_config) add_if_not_none('num_nodes', self.num_nodes) if self.inputs is not None: diff --git a/sky/utils/dag_utils.py b/sky/utils/dag_utils.py index d71f9d5e405..03ab3a72713 100644 --- a/sky/utils/dag_utils.py +++ b/sky/utils/dag_utils.py @@ -7,6 +7,7 @@ from sky import task as task_lib from sky.backends import backend_utils from sky.utils import common_utils +from sky.utils import ux_utils logger = sky_logging.init_logger(__name__) @@ -94,18 +95,31 @@ def maybe_infer_and_fill_dag_and_task_names(dag: dag_lib.Dag) -> None: def fill_default_spot_config_in_dag_for_spot_launch(dag: dag_lib.Dag) -> None: for task_ in dag.tasks: - assert len(task_.resources) == 1, task_ - resources = list(task_.resources)[0] - - change_default_value: Dict[str, Any] = {} - if resources.use_spot_specified and not resources.use_spot: - logger.info( - 'Field `use_spot` is set to false but a managed spot job is ' - 'being launched. Ignoring the field and proceeding to use spot ' - 'instance(s).') - change_default_value['use_spot'] = True - if resources.spot_recovery is None: - change_default_value['spot_recovery'] = spot.SPOT_DEFAULT_STRATEGY - - new_resources = resources.copy(**change_default_value) - task_.set_resources({new_resources}) + + new_resources_list = [] + for resources in list(task_.resources): + change_default_value: Dict[str, Any] = {} + if resources.use_spot_specified and not resources.use_spot: + logger.info( + 'Field `use_spot` is set to false but a managed spot job is ' # pylint: disable=line-too-long + 'being launched. Ignoring the field and proceeding to use spot ' # pylint: disable=line-too-long + 'instance(s).') + change_default_value['use_spot'] = True + if resources.spot_recovery is None: + change_default_value[ + 'spot_recovery'] = spot.SPOT_DEFAULT_STRATEGY + + new_resources = resources.copy(**change_default_value) + new_resources_list.append(new_resources) + + spot_recovery_strategy = new_resources_list[0].spot_recovery + for resource in new_resources_list: + if resource.spot_recovery != spot_recovery_strategy: + with ux_utils.print_exception_no_traceback(): + raise ValueError('All resources in the task must have' + 'the same spot recovery strategy.') + + if isinstance(task_.resources, list): + task_.set_resources(new_resources_list) + else: + task_.set_resources(set(new_resources_list)) diff --git a/sky/utils/schemas.py b/sky/utils/schemas.py index 0b7a1cc4223..2216f66de5d 100644 --- a/sky/utils/schemas.py +++ b/sky/utils/schemas.py @@ -5,7 +5,7 @@ """ -def get_resources_schema(): +def get_single_resources_schema(): # To avoid circular imports, only import when needed. 
# pylint: disable=import-outside-toplevel from sky.clouds import service_catalog @@ -110,6 +110,136 @@ def get_resources_schema(): } +def get_resources_schema(): + # To avoid circular imports, only import when needed. + # pylint: disable=import-outside-toplevel + from sky.clouds import service_catalog + return { + '$schema': 'http://json-schema.org/draft-07/schema#', + 'type': 'object', + 'required': [], + 'additionalProperties': False, + 'properties': { + 'cloud': { + 'type': 'string', + 'case_insensitive_enum': list(service_catalog.ALL_CLOUDS) + }, + 'region': { + 'type': 'string', + }, + 'zone': { + 'type': 'string', + }, + 'cpus': { + 'anyOf': [{ + 'type': 'string', + }, { + 'type': 'number', + }], + }, + 'memory': { + 'anyOf': [{ + 'type': 'string', + }, { + 'type': 'number', + }], + }, + 'accelerators': { + # {'V100:1', 'A100:1'} will be + # read as a string and converted to dict. + 'anyOf': [{ + 'type': 'string', + }, { + 'type': 'object', + 'required': [], + 'maxProperties': 1, + 'additionalProperties': { + 'type': 'number' + } + }, { + 'type': 'array', + 'items': { + 'type': 'string', + } + }] + }, + 'instance_type': { + 'type': 'string', + }, + 'use_spot': { + 'type': 'boolean', + }, + 'spot_recovery': { + 'type': 'string', + }, + 'disk_size': { + 'type': 'integer', + }, + 'disk_tier': { + 'type': 'string', + }, + 'ports': { + 'anyOf': [{ + 'type': 'string', + }, { + 'type': 'integer', + }, { + 'type': 'array', + 'items': { + 'anyOf': [{ + 'type': 'string', + }, { + 'type': 'integer', + }] + } + }], + }, + 'accelerator_args': { + 'type': 'object', + 'required': [], + 'additionalProperties': False, + 'properties': { + 'runtime_version': { + 'type': 'string', + }, + 'tpu_name': { + 'type': 'string', + }, + 'tpu_vm': { + 'type': 'boolean', + } + } + }, + 'image_id': { + 'anyOf': [{ + 'type': 'string', + }, { + 'type': 'object', + 'required': [], + }] + }, + 'any_of': { + 'type': 'array', + 'items': { + k: v + for k, v in get_single_resources_schema().items() + # Validation may fail if $schema is included. + if k != '$schema' + }, + }, + 'ordered': { + 'type': 'array', + 'items': { + k: v + for k, v in get_single_resources_schema().items() + # Validation may fail if $schema is included. + if k != '$schema' + }, + } + } + } + + def get_storage_schema(): # pylint: disable=import-outside-toplevel from sky.data import storage diff --git a/tests/test_optimizer_dryruns.py b/tests/test_optimizer_dryruns.py index c6ad64f9f20..5b45bd2deef 100644 --- a/tests/test_optimizer_dryruns.py +++ b/tests/test_optimizer_dryruns.py @@ -715,3 +715,18 @@ def test_infer_cloud_from_region_or_zone(monkeypatch): assert ( 'Invalid (region \'us-east1\', zone \'us-west2-a\') for any cloud among' in str(e)) + + +def test_ordered_resources(enable_all_clouds, monkeypatch): + + with sky.Dag() as dag: + task = sky.Task('test_task') + task.set_resources([ + sky.Resources(accelerators={'V100': 1}), + sky.Resources(accelerators={'T4': 1}), + sky.Resources(accelerators={'K80': 1}), + sky.Resources(accelerators={'T4': 4}), + ]) + dag = sky.optimize(dag) + # 'V100' is picked because it is the first in the list. + assert 'V100' in repr(task.best_resources) diff --git a/tests/test_optimizer_random_dag.py b/tests/test_optimizer_random_dag.py index c87a40f6a94..f3c12008552 100644 --- a/tests/test_optimizer_random_dag.py +++ b/tests/test_optimizer_random_dag.py @@ -97,7 +97,7 @@ def _optimize_by_brute_force(tasks, plan): # NOTE: Here we assume that the Sky DAG is topologically sorted. 
nonlocal final_plan, min_objective task = tasks[0] - for resources in task.get_resources(): + for resources in task.resources: assert task.name in DUMMY_NODES or resources.is_launchable() plan[task] = resources resources_stack.append(resources) @@ -125,8 +125,12 @@ def _optimize_by_brute_force(tasks, plan): def compare_optimization_results(dag: sky.Dag, minimize_cost: bool): copy_dag = copy.deepcopy(dag) - - optimizer_plan = sky.Optimizer._optimize_objective(dag, minimize_cost) + if minimize_cost: + optimizer_plan = sky.Optimizer._optimize_dag(dag, + sky.OptimizeTarget.COST) + else: + optimizer_plan = sky.Optimizer._optimize_dag(dag, + sky.OptimizeTarget.TIME) if minimize_cost: objective = sky.Optimizer._compute_total_cost(dag.get_graph(), dag.tasks, optimizer_plan) diff --git a/tests/test_smoke.py b/tests/test_smoke.py index 6da0866b4c5..8b8c72200cd 100644 --- a/tests/test_smoke.py +++ b/tests/test_smoke.py @@ -3545,6 +3545,7 @@ def _check_equivalent(self, yaml_path): task = sky.Task.from_yaml(yaml_path) new_task_config = task.to_yaml_config() # d1 <= d2 + print(origin_task_config, new_task_config) self._is_dict_subset(origin_task_config, new_task_config) def test_load_dump_yaml_config_equivalent(self): @@ -3558,3 +3559,43 @@ def test_load_dump_yaml_config_equivalent(self): exist_ok=True) for yaml_path in self._TEST_YAML_PATHS: self._check_equivalent(yaml_path) + + +# ---------- Testing Multiple Accelerators ---------- +def test_multiple_accelerators_ordered(): + name = _get_cluster_name() + test = Test( + 'multiple-accelerators-ordered', + [ + f'sky launch -y -c {name} tests/test_yamls/test_multiple_accelerators_ordered.yaml | grep "Using user-specified accelerators list"', + f'sky logs {name} 1 --status', # Ensure the job succeeded. + ], + f'sky down -y {name}', + ) + run_one_test(test) + + +def test_multiple_accelerators_unordered(): + name = _get_cluster_name() + test = Test( + 'multiple-accelerators-unordered', + [ + f'sky launch -y -c {name} tests/test_yamls/test_multiple_accelerators_unordered.yaml', + f'sky logs {name} 1 --status', # Ensure the job succeeded. + ], + f'sky down -y {name}', + ) + run_one_test(test) + + +def test_multiple_resources(): + name = _get_cluster_name() + test = Test( + 'multiple-resources', + [ + f'sky launch -y -c {name} tests/test_yamls/test_multiple_resources.yaml', + f'sky logs {name} 1 --status', # Ensure the job succeeded. 
+ ], + f'sky down -y {name}', + ) + run_one_test(test) diff --git a/tests/test_yamls/test_multiple_accelerators_ordered.yaml b/tests/test_yamls/test_multiple_accelerators_ordered.yaml new file mode 100644 index 00000000000..0286f929b51 --- /dev/null +++ b/tests/test_yamls/test_multiple_accelerators_ordered.yaml @@ -0,0 +1,7 @@ +name: multi-accelerators-ordered + +resources: + accelerators: ['A100-40GB:1', 'T4:1', 'V100:1', 'K80:1'] + +run: | + nvidia-smi diff --git a/tests/test_yamls/test_multiple_accelerators_unordered.yaml b/tests/test_yamls/test_multiple_accelerators_unordered.yaml new file mode 100644 index 00000000000..db0fc9c5f7c --- /dev/null +++ b/tests/test_yamls/test_multiple_accelerators_unordered.yaml @@ -0,0 +1,7 @@ +name: multi-accelerators-unordered + +resources: + accelerators: {'A100-40GB:1', 'T4:1', 'V100:1', 'K80:1'} + +run: | + nvidia-smi diff --git a/tests/test_yamls/test_multiple_resources.yaml b/tests/test_yamls/test_multiple_resources.yaml new file mode 100644 index 00000000000..37c0c25e867 --- /dev/null +++ b/tests/test_yamls/test_multiple_resources.yaml @@ -0,0 +1,13 @@ +name: multi-resources + +resources: + any_of: + - cloud: aws + region: us-east-1 + accelerators: A100:8 + - cloud: gcp + accelerators: T4:4 + - cloud: aws + +run: + echo hi \ No newline at end of file From 7f9a305630025008c51fd13f900cb80f8135e12b Mon Sep 17 00:00:00 2001 From: Romil Bhardwaj Date: Mon, 13 Nov 2023 16:01:10 +0530 Subject: [PATCH 17/17] [Storage] Storage tests cleanup and fixes (#2774) * storage delete confirmation prompt override for tests * remove kubernetes stop test * commit msg * remove redundant storage tests * repr * revert eksctl changes --- sky/data/storage.py | 2 +- tests/test_smoke.py | 123 ++++++-------------------------------------- 2 files changed, 18 insertions(+), 107 deletions(-) diff --git a/sky/data/storage.py b/sky/data/storage.py index e18ae72df10..d9f8156fa89 100644 --- a/sky/data/storage.py +++ b/sky/data/storage.py @@ -731,7 +731,7 @@ def _add_store_from_metadata( # Following error is raised from _get_bucket and caught only when # an externally removed storage is attempted to be fetched. except exceptions.StorageExternalDeletionError: - logger.debug(f'Storage object, {self.name}, was attempted to ' + logger.debug(f'Storage object {self.name!r} was attempted to ' 'be reconstructed while the corresponding bucket' ' was externally deleted.') continue diff --git a/tests/test_smoke.py b/tests/test_smoke.py index 8b8c72200cd..4070d2f4bfd 100644 --- a/tests/test_smoke.py +++ b/tests/test_smoke.py @@ -804,7 +804,7 @@ def test_using_file_mounts_with_env_vars(generic_cloud: str): # ---------- storage ---------- @pytest.mark.aws -def test_aws_storage_mounts(): +def test_aws_storage_mounts_with_stop(): name = _get_cluster_name() storage_name = f'sky-test-{int(time.time())}' template_str = pathlib.Path( @@ -820,18 +820,23 @@ def test_aws_storage_mounts(): f'sky launch -y -c {name} --cloud aws {file_path}', f'sky logs {name} 1 --status', # Ensure job succeeded. 
f'aws s3 ls {storage_name}/hello.txt', + f'sky stop -y {name}', + f'sky start -y {name}', + # Check if hello.txt from mounting bucket exists after restart in + # the mounted directory + f'sky exec {name} -- "set -ex; ls /mount_private_mount/hello.txt"' ] test = Test( 'aws_storage_mounts', test_commands, - f'sky down -y {name}; sky storage delete {storage_name}', + f'sky down -y {name}; sky storage delete -y {storage_name}', timeout=20 * 60, # 20 mins ) run_one_test(test) @pytest.mark.gcp -def test_gcp_storage_mounts(): +def test_gcp_storage_mounts_with_stop(): name = _get_cluster_name() storage_name = f'sky-test-{int(time.time())}' template_str = pathlib.Path( @@ -847,11 +852,16 @@ def test_gcp_storage_mounts(): f'sky launch -y -c {name} --cloud gcp {file_path}', f'sky logs {name} 1 --status', # Ensure job succeeded. f'gsutil ls gs://{storage_name}/hello.txt', + f'sky stop -y {name}', + f'sky start -y {name}', + # Check if hello.txt from mounting bucket exists after restart in + # the mounted directory + f'sky exec {name} -- "set -ex; ls /mount_private_mount/hello.txt"' ] test = Test( 'gcp_storage_mounts', test_commands, - f'sky down -y {name}; sky storage delete {storage_name}', + f'sky down -y {name}; sky storage delete -y {storage_name}', timeout=20 * 60, # 20 mins ) run_one_test(test) @@ -881,7 +891,7 @@ def test_kubernetes_storage_mounts(): test = Test( 'kubernetes_storage_mounts', test_commands, - f'sky down -y {name}; sky storage delete {storage_name}', + f'sky down -y {name}; sky storage delete -y {storage_name}', timeout=20 * 60, # 20 mins ) run_one_test(test) @@ -910,7 +920,7 @@ def test_cloudflare_storage_mounts(generic_cloud: str): test = Test( 'cloudflare_storage_mounts', test_commands, - f'sky down -y {name}; sky storage delete {storage_name}', + f'sky down -y {name}; sky storage delete -y {storage_name}', timeout=20 * 60, # 20 mins ) run_one_test(test) @@ -939,106 +949,7 @@ def test_ibm_storage_mounts(): test = Test( 'ibm_storage_mounts', test_commands, - f'sky down -y {name}; sky storage delete {storage_name}', - timeout=20 * 60, # 20 mins - ) - run_one_test(test) - - -@pytest.mark.aws -def test_aws_storage_mounts_with_stop(): - name = _get_cluster_name() - storage_name = f'sky-test-{int(time.time())}' - template_str = pathlib.Path( - 'tests/test_yamls/test_storage_mounting.yaml.j2').read_text() - template = jinja2.Template(template_str) - content = template.render(storage_name=storage_name) - with tempfile.NamedTemporaryFile(suffix='.yaml', mode='w') as f: - f.write(content) - f.flush() - file_path = f.name - test_commands = [ - *storage_setup_commands, - f'sky launch -y -c {name} --cloud aws {file_path}', - f'sky logs {name} 1 --status', # Ensure job succeeded. 
- f'aws s3 ls {storage_name}/hello.txt', - f'sky stop -y {name}', - f'sky start -y {name}', - # Check if hello.txt from mounting bucket exists after restart in - # the mounted directory - f'sky exec {name} -- "set -ex; ls /mount_private_mount/hello.txt"' - ] - test = Test( - 'aws_storage_mounts', - test_commands, - f'sky down -y {name}; sky storage delete {storage_name}', - timeout=20 * 60, # 20 mins - ) - run_one_test(test) - - -@pytest.mark.gcp -def test_gcp_storage_mounts_with_stop(): - name = _get_cluster_name() - storage_name = f'sky-test-{int(time.time())}' - template_str = pathlib.Path( - 'tests/test_yamls/test_storage_mounting.yaml.j2').read_text() - template = jinja2.Template(template_str) - content = template.render(storage_name=storage_name) - with tempfile.NamedTemporaryFile(suffix='.yaml', mode='w') as f: - f.write(content) - f.flush() - file_path = f.name - test_commands = [ - *storage_setup_commands, - f'sky launch -y -c {name} --cloud gcp {file_path}', - f'sky logs {name} 1 --status', # Ensure job succeeded. - f'gsutil ls gs://{storage_name}/hello.txt', - f'sky stop -y {name}', - f'sky start -y {name}', - # Check if hello.txt from mounting bucket exists after restart in - # the mounted directory - f'sky exec {name} -- "set -ex; ls /mount_private_mount/hello.txt"' - ] - test = Test( - 'gcp_storage_mounts', - test_commands, - f'sky down -y {name}; sky storage delete {storage_name}', - timeout=20 * 60, # 20 mins - ) - run_one_test(test) - - -@pytest.mark.kubernetes -def test_kubernetes_storage_mounts_with_stop(): - # Tests bucket mounting on k8s, assuming S3 is configured. - # This test will fail if run on non x86_64 architecture, since goofys is - # built for x86_64 only. - name = _get_cluster_name() - storage_name = f'sky-test-{int(time.time())}' - template_str = pathlib.Path( - 'tests/test_yamls/test_storage_mounting.yaml.j2').read_text() - template = jinja2.Template(template_str) - content = template.render(storage_name=storage_name) - with tempfile.NamedTemporaryFile(suffix='.yaml', mode='w') as f: - f.write(content) - f.flush() - file_path = f.name - test_commands = [ - *storage_setup_commands, - f'sky launch -y -c {name} --cloud kubernetes {file_path}', - f'sky logs {name} 1 --status', # Ensure job succeeded. - f'aws s3 ls {storage_name}/hello.txt', - f'sky stop -y {name}', - f'sky start -y {name}', - # Check if hello.txt from mounting bucket exists after restart in - # the mounted directory - f'sky exec {name} -- "set -ex; ls /mount_private_mount/hello.txt"' - ] - test = Test( - 'kubernetes_storage_mounts', - test_commands, - f'sky down -y {name}; sky storage delete {storage_name}', + f'sky down -y {name}; sky storage delete -y {storage_name}', timeout=20 * 60, # 20 mins ) run_one_test(test)
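Note: the AWS and GCP storage-mount-with-stop tests above are identical except for the cloud passed to sky launch and the command used to list the bucket. A minimal sketch of how that shared pattern could be factored into a single helper follows; it is illustrative only, not part of the patch, and assumes it lives in tests/test_smoke.py next to the existing helpers (_get_cluster_name, storage_setup_commands, Test, run_one_test). The ls_command_template parameter is a name introduced here for the example.

# Illustrative sketch, not part of the patch. Assumes the surrounding
# tests/test_smoke.py module already defines _get_cluster_name,
# storage_setup_commands, Test and run_one_test.
import pathlib
import tempfile
import time

import jinja2


def _run_storage_mounts_with_stop(cloud: str, ls_command_template: str):
    # ls_command_template is a hypothetical parameter, e.g.
    # 'aws s3 ls {storage_name}/hello.txt' or
    # 'gsutil ls gs://{storage_name}/hello.txt'.
    name = _get_cluster_name()
    storage_name = f'sky-test-{int(time.time())}'
    template_str = pathlib.Path(
        'tests/test_yamls/test_storage_mounting.yaml.j2').read_text()
    content = jinja2.Template(template_str).render(storage_name=storage_name)
    with tempfile.NamedTemporaryFile(suffix='.yaml', mode='w') as f:
        # Keep the rendered YAML on disk while the commands run.
        f.write(content)
        f.flush()
        test_commands = [
            *storage_setup_commands,
            f'sky launch -y -c {name} --cloud {cloud} {f.name}',
            f'sky logs {name} 1 --status',  # Ensure the job succeeded.
            ls_command_template.format(storage_name=storage_name),
            f'sky stop -y {name}',
            f'sky start -y {name}',
            # hello.txt should still be visible in the mounted directory
            # after the cluster restarts.
            f'sky exec {name} -- "set -ex; ls /mount_private_mount/hello.txt"',
        ]
        test = Test(
            f'{cloud}_storage_mounts',
            test_commands,
            f'sky down -y {name}; sky storage delete -y {storage_name}',
            timeout=20 * 60,  # 20 mins
        )
        run_one_test(test)

With such a helper, test_aws_storage_mounts_with_stop would reduce to a single call such as _run_storage_mounts_with_stop('aws', 'aws s3 ls {storage_name}/hello.txt'), with the GCP variant passing 'gcp' and the gsutil listing command.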