Skip to content

Commit

Permalink
[Core] Add internal configs in schema.py to avoid spot job and serve failure (#3459)
Browse files Browse the repository at this point in the history

* [Core] Add internal configs in schema.py to avoid spot job and serve failure

* format

* fix

* rename schema

* Fix comment error

* fix comment
  • Loading branch information
Michaelvll authored Apr 23, 2024
1 parent cc1c58b commit 1d31671
Show file tree
Hide file tree
Showing 3 changed files with 40 additions and 87 deletions.
3 changes: 2 additions & 1 deletion sky/backends/cloud_vm_ray_backend.py
Original file line number Diff line number Diff line change
Expand Up @@ -272,7 +272,8 @@ def get_or_fail(futures, pg) -> List[int]:
# Keep invoking ray.wait if ready is empty. This is because
# ray.wait with timeout=None will only wait for 10**6 seconds,
# which will cause tasks running for more than 12 days to return
before becoming ready. (Such tasks are common in serving jobs.)
# before becoming ready.
# (Such tasks are common in serving jobs.)
# Reference: https://github.com/ray-project/ray/blob/ray-2.9.3/python/ray/_private/worker.py#L2845-L2846
while not ready:
ready, unready = ray.wait(futures)
Expand Down
4 changes: 4 additions & 0 deletions sky/resources.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
"""Resources: compute requirements of Tasks."""
import dataclasses
import functools
import textwrap
from typing import Any, Dict, List, Optional, Set, Tuple, Union
Expand Down Expand Up @@ -1340,6 +1341,9 @@ def add_if_not_none(key, value):
if self.disk_tier is not None:
config['disk_tier'] = self.disk_tier.value
add_if_not_none('ports', self.ports)
if self._docker_login_config is not None:
config['_docker_login_config'] = dataclasses.asdict(
self._docker_login_config)
if self._is_image_managed is not None:
config['_is_image_managed'] = self._is_image_managed
if self._requires_fuse is not None:
Expand Down
120 changes: 34 additions & 86 deletions sky/utils/schemas.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,8 @@
"""


def get_single_resources_schema():
def _get_single_resources_schema():
"""Schema for a single resource in a resources list."""
# To avoid circular imports, only import when needed.
# pylint: disable=import-outside-toplevel
from sky.clouds import service_catalog
Expand Down Expand Up @@ -105,45 +106,47 @@ def get_single_resources_schema():
'type': 'object',
'required': [],
}]
}
},
# The following fields are for internal use only.
'_docker_login_config': {
'type': 'object',
'required': ['username', 'password', 'server'],
'additionalProperties': False,
'properties': {
'username': {
'type': 'string',
},
'password': {
'type': 'string',
},
'server': {
'type': 'string',
}
}
},
'_is_image_managed': {
'type': 'boolean',
},
'_requires_fuse': {
'type': 'boolean',
},
}
}


def get_resources_schema():
# To avoid circular imports, only import when needed.
# pylint: disable=import-outside-toplevel
from sky.clouds import service_catalog
"""Resource schema in task config."""
single_resources_schema = _get_single_resources_schema()['properties']
single_resources_schema.pop('accelerators')
return {
'$schema': 'http://json-schema.org/draft-07/schema#',
'type': 'object',
'required': [],
'additionalProperties': False,
'properties': {
'cloud': {
'type': 'string',
'case_insensitive_enum': list(service_catalog.ALL_CLOUDS)
},
'region': {
'type': 'string',
},
'zone': {
'type': 'string',
},
'cpus': {
'anyOf': [{
'type': 'string',
}, {
'type': 'number',
}],
},
'memory': {
'anyOf': [{
'type': 'string',
}, {
'type': 'number',
}],
},
**single_resources_schema,
# We redefine the 'accelerators' field to allow one line list or
# a set of accelerators.
'accelerators': {
# {'V100:1', 'A100:1'} will be
# read as a string and converted to dict.
Expand All @@ -166,66 +169,11 @@ def get_resources_schema():
}
}]
},
'instance_type': {
'type': 'string',
},
'use_spot': {
'type': 'boolean',
},
'spot_recovery': {
'type': 'string',
},
'disk_size': {
'type': 'integer',
},
'disk_tier': {
'type': 'string',
},
'ports': {
'anyOf': [{
'type': 'string',
}, {
'type': 'integer',
}, {
'type': 'array',
'items': {
'anyOf': [{
'type': 'string',
}, {
'type': 'integer',
}]
}
}],
},
'accelerator_args': {
'type': 'object',
'required': [],
'additionalProperties': False,
'properties': {
'runtime_version': {
'type': 'string',
},
'tpu_name': {
'type': 'string',
},
'tpu_vm': {
'type': 'boolean',
}
}
},
'image_id': {
'anyOf': [{
'type': 'string',
}, {
'type': 'object',
'required': [],
}]
},
'any_of': {
'type': 'array',
'items': {
k: v
for k, v in get_single_resources_schema().items()
for k, v in _get_single_resources_schema().items()
# Validation may fail if $schema is included.
if k != '$schema'
},
Expand All @@ -234,7 +182,7 @@ def get_resources_schema():
'type': 'array',
'items': {
k: v
for k, v in get_single_resources_schema().items()
for k, v in _get_single_resources_schema().items()
# Validation may fail if $schema is included.
if k != '$schema'
},
Expand Down

0 comments on commit 1d31671

Please sign in to comment.