From 1d31671778d70de6338984884f79310d9a52670e Mon Sep 17 00:00:00 2001 From: Zhanghao Wu Date: Mon, 22 Apr 2024 22:23:23 -0700 Subject: [PATCH] [Core] Add internal configs in schema.py to avoid spot job and serve failure (#3459) * [Core] Add internal configs in schema.py to avoid spot job and serve failure * format * fix * rename schema * Fix comment error * fix comment --- sky/backends/cloud_vm_ray_backend.py | 3 +- sky/resources.py | 4 + sky/utils/schemas.py | 120 ++++++++------------------- 3 files changed, 40 insertions(+), 87 deletions(-) diff --git a/sky/backends/cloud_vm_ray_backend.py b/sky/backends/cloud_vm_ray_backend.py index 01cbefb42bd..3196c45da55 100644 --- a/sky/backends/cloud_vm_ray_backend.py +++ b/sky/backends/cloud_vm_ray_backend.py @@ -272,7 +272,8 @@ def get_or_fail(futures, pg) -> List[int]: # Keep invoking ray.wait if ready is empty. This is because # ray.wait with timeout=None will only wait for 10**6 seconds, # which will cause tasks running for more than 12 days to return - before becoming ready. (Such tasks are common in serving jobs.) + # before becoming ready. + # (Such tasks are common in serving jobs.) # Reference: https://github.com/ray-project/ray/blob/ray-2.9.3/python/ray/_private/worker.py#L2845-L2846 while not ready: ready, unready = ray.wait(futures) diff --git a/sky/resources.py b/sky/resources.py index 222be919d13..d024ec39bbc 100644 --- a/sky/resources.py +++ b/sky/resources.py @@ -1,4 +1,5 @@ """Resources: compute requirements of Tasks.""" +import dataclasses import functools import textwrap from typing import Any, Dict, List, Optional, Set, Tuple, Union @@ -1340,6 +1341,9 @@ def add_if_not_none(key, value): if self.disk_tier is not None: config['disk_tier'] = self.disk_tier.value add_if_not_none('ports', self.ports) + if self._docker_login_config is not None: + config['_docker_login_config'] = dataclasses.asdict( + self._docker_login_config) if self._is_image_managed is not None: config['_is_image_managed'] = self._is_image_managed if self._requires_fuse is not None: diff --git a/sky/utils/schemas.py b/sky/utils/schemas.py index 689905d7c71..740fe756230 100644 --- a/sky/utils/schemas.py +++ b/sky/utils/schemas.py @@ -5,7 +5,8 @@ """ -def get_single_resources_schema(): +def _get_single_resources_schema(): + """Schema for a single resource in a resources list.""" # To avoid circular imports, only import when needed. # pylint: disable=import-outside-toplevel from sky.clouds import service_catalog @@ -105,45 +106,47 @@ def get_single_resources_schema(): 'type': 'object', 'required': [], }] - } + }, + # The following fields are for internal use only. + '_docker_login_config': { + 'type': 'object', + 'required': ['username', 'password', 'server'], + 'additionalProperties': False, + 'properties': { + 'username': { + 'type': 'string', + }, + 'password': { + 'type': 'string', + }, + 'server': { + 'type': 'string', + } + } + }, + '_is_image_managed': { + 'type': 'boolean', + }, + '_requires_fuse': { + 'type': 'boolean', + }, } } def get_resources_schema(): - # To avoid circular imports, only import when needed. - # pylint: disable=import-outside-toplevel - from sky.clouds import service_catalog + """Resource schema in task config.""" + single_resources_schema = _get_single_resources_schema()['properties'] + single_resources_schema.pop('accelerators') return { '$schema': 'http://json-schema.org/draft-07/schema#', 'type': 'object', 'required': [], 'additionalProperties': False, 'properties': { - 'cloud': { - 'type': 'string', - 'case_insensitive_enum': list(service_catalog.ALL_CLOUDS) - }, - 'region': { - 'type': 'string', - }, - 'zone': { - 'type': 'string', - }, - 'cpus': { - 'anyOf': [{ - 'type': 'string', - }, { - 'type': 'number', - }], - }, - 'memory': { - 'anyOf': [{ - 'type': 'string', - }, { - 'type': 'number', - }], - }, + **single_resources_schema, + # We redefine the 'accelerators' field to allow one line list or + # a set of accelerators. 'accelerators': { # {'V100:1', 'A100:1'} will be # read as a string and converted to dict. @@ -166,66 +169,11 @@ def get_resources_schema(): } }] }, - 'instance_type': { - 'type': 'string', - }, - 'use_spot': { - 'type': 'boolean', - }, - 'spot_recovery': { - 'type': 'string', - }, - 'disk_size': { - 'type': 'integer', - }, - 'disk_tier': { - 'type': 'string', - }, - 'ports': { - 'anyOf': [{ - 'type': 'string', - }, { - 'type': 'integer', - }, { - 'type': 'array', - 'items': { - 'anyOf': [{ - 'type': 'string', - }, { - 'type': 'integer', - }] - } - }], - }, - 'accelerator_args': { - 'type': 'object', - 'required': [], - 'additionalProperties': False, - 'properties': { - 'runtime_version': { - 'type': 'string', - }, - 'tpu_name': { - 'type': 'string', - }, - 'tpu_vm': { - 'type': 'boolean', - } - } - }, - 'image_id': { - 'anyOf': [{ - 'type': 'string', - }, { - 'type': 'object', - 'required': [], - }] - }, 'any_of': { 'type': 'array', 'items': { k: v - for k, v in get_single_resources_schema().items() + for k, v in _get_single_resources_schema().items() # Validation may fail if $schema is included. if k != '$schema' }, @@ -234,7 +182,7 @@ def get_resources_schema(): 'type': 'array', 'items': { k: v - for k, v in get_single_resources_schema().items() + for k, v in _get_single_resources_schema().items() # Validation may fail if $schema is included. if k != '$schema' },