Skip to content

Commit

Permalink
[Core] Fix optimizer for dag when some resources provided are not feasible (#2657)
Browse files Browse the repository at this point in the history

* Fix optimizer for dag when some of the resources provided are invalid

* format

* format

* address comments

* better output

* spacing

* Fix inconsistent repr

* use str instead of repr

* more robust replace for region, zone
  • Loading branch information
Michaelvll authored Oct 4, 2023
1 parent dc11e79 commit 9d1ff22
Show file tree
Hide file tree
Showing 3 changed files with 71 additions and 57 deletions.
93 changes: 44 additions & 49 deletions sky/optimizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -267,52 +267,6 @@ def _estimate_nodes_cost_or_time(
num_resources = len(node.get_resources())

for orig_resources, launchable_list in launchable_resources.items():
if not launchable_list:
location_hint = ''
source_hint = 'catalog'
if node.get_resources():
specified_resources = list(node.get_resources())[0]
if specified_resources.zone is not None:
location_hint = (
f' Zone: {specified_resources.zone}.')
elif specified_resources.region:
location_hint = (
f' Region: {specified_resources.region}.')

# If Kubernetes was included in the search space, then
# mention "kubernetes cluster" and/instead of "catalog"
# in the error message.
enabled_clouds = global_user_state.get_enabled_clouds()
if _cloud_in_list(clouds.Kubernetes(), enabled_clouds):
if specified_resources.cloud is None:
source_hint = 'catalog and kubernetes cluster'
elif specified_resources.cloud.is_same_cloud(
clouds.Kubernetes()):
source_hint = 'kubernetes cluster'

# TODO(romilb): When `sky show-gpus` supports Kubernetes,
# add a hint to run `sky show-gpus --kubernetes` to list
# available accelerators on Kubernetes.

bold = colorama.Style.BRIGHT
cyan = colorama.Fore.CYAN
reset = colorama.Style.RESET_ALL
fuzzy_candidates_str = ''
if fuzzy_candidates:
fuzzy_candidates_str = (
f'\nTry one of these offered accelerators: {cyan}'
f'{fuzzy_candidates}{reset}')
error_msg = (
f'{source_hint.capitalize()} does not contain any '
f'instances satisfying the request:\n{node}.'
f'{location_hint}\n\nTo fix: relax or change the '
f'resource requirements.{fuzzy_candidates_str}\n\n'
f'Hint: {bold}sky show-gpus{reset} '
'to list available accelerators.\n'
f' {bold}sky check{reset} to check the enabled '
'clouds.')
with ux_utils.print_exception_no_traceback():
raise exceptions.ResourcesUnavailableError(error_msg)
if num_resources == 1 and node.time_estimator_func is None:
logger.debug(
'Defaulting the task\'s estimated time to 1 hour.')
Expand Down Expand Up @@ -358,6 +312,44 @@ def _estimate_nodes_cost_or_time(
' estimated_cost (not incl. egress): ${:.1f}'.
format(estimated_cost_or_time))
node_to_cost_map[node][resources] = estimated_cost_or_time
if not node_to_cost_map[node]:
source_hint = 'catalog'
# If Kubernetes was included in the search space, then
# mention "kubernetes cluster" and/instead of "catalog"
# in the error message.
enabled_clouds = global_user_state.get_enabled_clouds()
if _cloud_in_list(clouds.Kubernetes(), enabled_clouds):
if any(orig_resources.cloud is None
for orig_resources in node.get_resources()):
source_hint = 'catalog and kubernetes cluster'
elif all(
isinstance(orig_resources.cloud, clouds.Kubernetes)
for orig_resources in node.get_resources()):
source_hint = 'kubernetes cluster'

# TODO(romilb): When `sky show-gpus` supports Kubernetes,
# add a hint to run `sky show-gpus --kubernetes` to list
# available accelerators on Kubernetes.

bold = colorama.Style.BRIGHT
cyan = colorama.Fore.CYAN
reset = colorama.Style.RESET_ALL
fuzzy_candidates_str = ''
if fuzzy_candidates:
fuzzy_candidates_str = (
f'\nTry one of these offered accelerators: {cyan}'
f'{fuzzy_candidates}{reset}')
error_msg = (
f'{source_hint.capitalize()} does not contain any '
f'instances satisfying the request:\n{node}.'
f'\n\nTo fix: relax or change the '
f'resource requirements.{fuzzy_candidates_str}\n\n'
f'Hint: {bold}sky show-gpus{reset} '
'to list available accelerators.\n'
f' {bold}sky check{reset} to check the enabled '
'clouds.')
with ux_utils.print_exception_no_traceback():
raise exceptions.ResourcesUnavailableError(error_msg)
return node_to_cost_map, node_to_candidate_map

@staticmethod
Expand Down Expand Up @@ -1008,8 +1000,8 @@ def _fill_in_launchable_resources(
False)
with ux_utils.print_exception_no_traceback():
raise exceptions.ResourcesUnavailableError(
f'task_lib.Task {task} requires {resources.cloud} which is '
'not enabled. To enable access, run '
f'Task requires {resources.cloud} which is '
f'not enabled: {task}.\nTo enable access, run '
f'{colorama.Style.BRIGHT}'
f'sky check {colorama.Style.RESET_ALL}, or change the '
'cloud requirement')
Expand Down Expand Up @@ -1041,7 +1033,10 @@ def _fill_in_launchable_resources(
if len(launchable[resources]) == 0:
clouds_str = str(clouds_list) if len(clouds_list) > 1 else str(
clouds_list[0])
logger.info(f'No resource satisfying {resources} '
num_node_str = ''
if task.num_nodes > 1:
num_node_str = f'{task.num_nodes}x '
logger.info(f'No resource satisfying {num_node_str}{resources} '
f'on {clouds_str}.')
if len(all_fuzzy_candidates) > 0:
logger.info('Did you mean: '
Expand Down
17 changes: 16 additions & 1 deletion sky/resources.py
Original file line number Diff line number Diff line change
Expand Up @@ -254,7 +254,7 @@ def __repr__(self) -> str:
if None in self.image_id:
image_id = f', image_id={self.image_id[None]}'
else:
image_id = f', image_id={self.image_id!r}'
image_id = f', image_id={self.image_id}'

disk_tier = ''
if self.disk_tier is not None:
Expand Down Expand Up @@ -291,6 +291,21 @@ def __repr__(self) -> str:

return f'{cloud_str}({hardware_str})'

@property
def repr_with_region_zone(self) -> str:
    """Return ``str(self)`` extended with the region and zone, if set.

    A ``, region=<region>`` and/or ``, zone=<zone>`` suffix is built from
    whichever of ``self.region`` / ``self.zone`` is not ``None``. When the
    plain string form ends with ``)``, the suffix is placed just before
    that closing parenthesis; otherwise it is appended to the end.
    """
    location_parts = []
    if self.region is not None:
        location_parts.append(f', region={self.region}')
    if self.zone is not None:
        location_parts.append(f', zone={self.zone}')
    location_suffix = ''.join(location_parts)

    base = str(self)
    if not base.endswith(')'):
        # No closing parenthesis to tuck the suffix into; append it as-is.
        return base + location_suffix
    # Keep the closing parenthesis last so the result still reads as a
    # single parenthesized description.
    return f'{base[:-1]}{location_suffix})'

@property
def cloud(self):
return self._cloud
Expand Down
18 changes: 11 additions & 7 deletions sky/task.py
Original file line number Diff line number Diff line change
Expand Up @@ -997,8 +997,6 @@ def __rshift__(self, b):
sky.dag.get_current_dag().add_edge(self, b)

def __repr__(self):
if self.name and self.name != 'sky-cmd': # CLI launch with a command
return self.name
if isinstance(self.run, str):
run_msg = self.run.replace('\n', '\\n')
if len(run_msg) > 20:
Expand All @@ -1010,18 +1008,24 @@ def __repr__(self):
else:
run_msg = 'run=<fn>'

s = f'Task({run_msg})'
name_str = ''
if self.name is not None:
name_str = f'<name={self.name}>'
s = f'Task{name_str}({run_msg})'
if self.inputs is not None:
s += f'\n inputs: {self.inputs}'
if self.outputs is not None:
s += f'\n outputs: {self.outputs}'
if self.num_nodes > 1:
s += f'\n nodes: {self.num_nodes}'
if len(self.resources) > 1:
s += f'\n resources: {self.resources}'
elif len(
self.resources) == 1 and not list(self.resources)[0].is_empty():
s += f'\n resources: {list(self.resources)[0]}'
resources_str = ('{' + ', '.join(
r.repr_with_region_zone for r in self.resources) + '}')
s += f'\n resources: {resources_str}'
elif (len(self.resources) == 1 and
not list(self.resources)[0].is_empty()):
s += (f'\n resources: '
f'{list(self.resources)[0].repr_with_region_zone}')
else:
s += '\n resources: default instances'
return s

0 comments on commit 9d1ff22

Please sign in to comment.