Skip to content

Commit

Permalink
[k8s] Show hints when requested resources don't fit in Kubernetes cluster (#3590)
Browse files Browse the repository at this point in the history

* Add hints when feasible resources are not found

* lint

* lint

* wip

* move to resources_utils

* fixes

* fix

* add TODO

* Add todos and fix comments
  • Loading branch information
romilbhardwaj authored Jul 22, 2024
1 parent 2d24022 commit 541efba
Show file tree
Hide file tree
Showing 17 changed files with 210 additions and 108 deletions.
19 changes: 12 additions & 7 deletions sky/clouds/aws.py
Original file line number Diff line number Diff line change
Expand Up @@ -444,7 +444,7 @@ def make_deploy_resources_variables(

def _get_feasible_launchable_resources(
self, resources: 'resources_lib.Resources'
) -> Tuple[List['resources_lib.Resources'], List[str]]:
) -> resources_utils.FeasibleResources:
if resources.instance_type is not None:
assert resources.is_launchable(), resources
# Check the instance type is valid in the cloud
Expand All @@ -455,10 +455,12 @@ def _get_feasible_launchable_resources(
region=resources.region,
zone=resources.zone)
if not regions:
return ([], [])
# TODO: Add hints to all return values in this method to help
# users understand why the resources are not launchable.
return resources_utils.FeasibleResources([], [], None)
# Treat Resources(AWS, p3.2x, V100) as Resources(AWS, p3.2x).
resources = resources.copy(accelerators=None)
return ([resources], [])
return resources_utils.FeasibleResources([resources], [], None)

def _make(instance_list):
resource_list = []
Expand All @@ -484,9 +486,10 @@ def _make(instance_list):
memory=resources.memory,
disk_tier=resources.disk_tier)
if default_instance_type is None:
return ([], [])
return resources_utils.FeasibleResources([], [], None)
else:
return (_make([default_instance_type]), [])
return resources_utils.FeasibleResources(
_make([default_instance_type]), [], None)

assert len(accelerators) == 1, resources
acc, acc_count = list(accelerators.items())[0]
Expand All @@ -501,8 +504,10 @@ def _make(instance_list):
zone=resources.zone,
clouds='aws')
if instance_list is None:
return ([], fuzzy_candidate_list)
return (_make(instance_list), fuzzy_candidate_list)
return resources_utils.FeasibleResources([], fuzzy_candidate_list,
None)
return resources_utils.FeasibleResources(_make(instance_list),
fuzzy_candidate_list, None)

@classmethod
@functools.lru_cache(maxsize=1) # Cache since getting identity is slow.
Expand Down
19 changes: 12 additions & 7 deletions sky/clouds/azure.py
Original file line number Diff line number Diff line change
Expand Up @@ -378,17 +378,19 @@ def _failover_disk_tier() -> Optional[resources_utils.DiskTier]:

def _get_feasible_launchable_resources(
self, resources: 'resources.Resources'
) -> Tuple[List['resources.Resources'], List[str]]:
) -> 'resources_utils.FeasibleResources':
if resources.instance_type is not None:
assert resources.is_launchable(), resources
ok, _ = Azure.check_disk_tier(resources.instance_type,
resources.disk_tier)
if not ok:
return ([], [])
# TODO: Add hints to all return values in this method to help
# users understand why the resources are not launchable.
return resources_utils.FeasibleResources([], [], None)
# Treat Resources(Azure, Standard_NC4as_T4_v3, T4) as
# Resources(Azure, Standard_NC4as_T4_v3).
resources = resources.copy(accelerators=None)
return ([resources], [])
return resources_utils.FeasibleResources([resources], [], None)

def _make(instance_list):
resource_list = []
Expand Down Expand Up @@ -418,9 +420,10 @@ def _make(instance_list):
memory=resources.memory,
disk_tier=resources.disk_tier)
if default_instance_type is None:
return ([], [])
return resources_utils.FeasibleResources([], [], None)
else:
return (_make([default_instance_type]), [])
return resources_utils.FeasibleResources(
_make([default_instance_type]), [], None)

assert len(accelerators) == 1, resources
acc, acc_count = list(accelerators.items())[0]
Expand All @@ -435,8 +438,10 @@ def _make(instance_list):
zone=resources.zone,
clouds='azure')
if instance_list is None:
return ([], fuzzy_candidate_list)
return (_make(instance_list), fuzzy_candidate_list)
return resources_utils.FeasibleResources([], fuzzy_candidate_list,
None)
return resources_utils.FeasibleResources(_make(instance_list),
fuzzy_candidate_list, None)

@classmethod
def check_credentials(cls) -> Tuple[bool, Optional[str]]:
Expand Down
31 changes: 20 additions & 11 deletions sky/clouds/cloud.py
Original file line number Diff line number Diff line change
Expand Up @@ -341,22 +341,26 @@ def is_label_valid(cls, label_key: str,
return True, None

def get_feasible_launchable_resources(
self,
resources: 'resources_lib.Resources',
num_nodes: int = 1
) -> Tuple[List['resources_lib.Resources'], List[str]]:
"""Returns ([feasible and launchable resources], [fuzzy candidates]).
self,
resources: 'resources_lib.Resources',
num_nodes: int = 1) -> 'resources_utils.FeasibleResources':
"""Returns FeasibleResources for the given resources.
Feasible resources refer to an offering respecting the resource
requirements. Currently, this function implements "filtering" the
cloud's offerings only w.r.t. accelerators constraints.
Launchable resources require a cloud and an instance type be assigned.
Fuzzy candidates example: when the requested GPU is A100:1 but is not
available in a cloud/region, the fuzzy candidates are results of a fuzzy
search in the catalog that are offered in the location. E.g.,
['A100-80GB:1', 'A100-80GB:2', 'A100-80GB:4', 'A100:8']
The returned dataclass object FeasibleResources contains three fields:
- resources_list: a list of resources that are feasible to launch
- fuzzy_candidate_list: a list of resources that loosely match requested
resources. E.g., when A100:1 GPU is requested but is not available
in a cloud/region, the fuzzy candidates are results of a fuzzy
search in the catalog that are offered in the location. E.g.,
['A100-80GB:1', 'A100-80GB:2', 'A100-80GB:4', 'A100:8']
- hint: an optional string hint if no feasible resources are found.
"""
if resources.is_launchable():
self._check_instance_type_accelerators_combination(resources)
Expand All @@ -372,13 +376,18 @@ def get_feasible_launchable_resources(
# TODO(zhwu): The resources are now silently filtered out. We
# should have some logging telling the user why the resources
# are not considered.
return ([], [])
return resources_utils.FeasibleResources(resources_list=[],
fuzzy_candidate_list=[],
hint=None)
return self._get_feasible_launchable_resources(resources)

def _get_feasible_launchable_resources(
self, resources: 'resources_lib.Resources'
) -> Tuple[List['resources_lib.Resources'], List[str]]:
) -> 'resources_utils.FeasibleResources':
"""See get_feasible_launchable_resources()."""
# TODO: Currently only the Kubernetes implementation of this method
# returns hints when no feasible resources are found. This should be
# implemented for all clouds.
raise NotImplementedError

def get_reservations_available_resources(
Expand Down
20 changes: 13 additions & 7 deletions sky/clouds/cudo.py
Original file line number Diff line number Diff line change
Expand Up @@ -214,13 +214,16 @@ def make_deploy_resources_variables(
}

def _get_feasible_launchable_resources(
self, resources: 'resources_lib.Resources'):
self, resources: 'resources_lib.Resources'
) -> 'resources_utils.FeasibleResources':
if resources.use_spot:
return ([], [])
# TODO: Add hints to all return values in this method to help
# users understand why the resources are not launchable.
return resources_utils.FeasibleResources([], [], None)
if resources.instance_type is not None:
assert resources.is_launchable(), resources
resources = resources.copy(accelerators=None)
return ([resources], [])
return resources_utils.FeasibleResources([resources], [], None)

def _make(instance_list):
resource_list = []
Expand All @@ -243,9 +246,10 @@ def _make(instance_list):
memory=resources.memory,
disk_tier=resources.disk_tier)
if default_instance_type is None:
return ([], [])
return resources_utils.FeasibleResources([], [], None)
else:
return (_make([default_instance_type]), [])
return resources_utils.FeasibleResources(
_make([default_instance_type]), [], None)

assert len(accelerators) == 1, resources
acc, acc_count = list(accelerators.items())[0]
Expand All @@ -260,8 +264,10 @@ def _make(instance_list):
zone=resources.zone,
clouds='cudo')
if instance_list is None:
return ([], fuzzy_candidate_list)
return (_make(instance_list), fuzzy_candidate_list)
return resources_utils.FeasibleResources([], fuzzy_candidate_list,
None)
return resources_utils.FeasibleResources(_make(instance_list),
fuzzy_candidate_list, None)

@classmethod
def check_credentials(cls) -> Tuple[bool, Optional[str]]:
Expand Down
15 changes: 10 additions & 5 deletions sky/clouds/fluidstack.py
Original file line number Diff line number Diff line change
Expand Up @@ -211,7 +211,9 @@ def _get_feasible_launchable_resources(
assert resources.is_launchable(), resources
# Accelerators are part of the instance type in Fluidstack Cloud
resources = resources.copy(accelerators=None)
return ([resources], [])
# TODO: Add hints to all return values in this method to help
# users understand why the resources are not launchable.
return resources_utils.FeasibleResources([resources], [], None)

def _make(instance_list):
resource_list = []
Expand Down Expand Up @@ -239,9 +241,10 @@ def _make(instance_list):
memory=resources.memory,
disk_tier=resources.disk_tier)
if default_instance_type is None:
return ([], [])
return resources_utils.FeasibleResources([], [], None)
else:
return (_make([default_instance_type]), [])
return resources_utils.FeasibleResources(
_make([default_instance_type]), [], None)

assert len(accelerators) == 1, resources
acc, acc_count = list(accelerators.items())[0]
Expand All @@ -256,8 +259,10 @@ def _make(instance_list):
zone=resources.zone,
clouds='fluidstack')
if instance_list is None:
return ([], fuzzy_candidate_list)
return (_make(instance_list), fuzzy_candidate_list)
return resources_utils.FeasibleResources([], fuzzy_candidate_list,
None)
return resources_utils.FeasibleResources(_make(instance_list),
fuzzy_candidate_list, None)

@classmethod
def check_credentials(cls) -> Tuple[bool, Optional[str]]:
Expand Down
28 changes: 18 additions & 10 deletions sky/clouds/gcp.py
Original file line number Diff line number Diff line change
Expand Up @@ -526,10 +526,10 @@ def make_deploy_resources_variables(

def _get_feasible_launchable_resources(
self, resources: 'resources.Resources'
) -> Tuple[List['resources.Resources'], List[str]]:
) -> 'resources_utils.FeasibleResources':
if resources.instance_type is not None:
assert resources.is_launchable(), resources
return ([resources], [])
return resources_utils.FeasibleResources([resources], [], None)

if resources.accelerators is None:
# Return a default instance type with the given number of vCPUs.
Expand All @@ -538,7 +538,9 @@ def _get_feasible_launchable_resources(
memory=resources.memory,
disk_tier=resources.disk_tier)
if host_vm_type is None:
return ([], [])
# TODO: Add hints to all return values in this method to help
# users understand why the resources are not launchable.
return resources_utils.FeasibleResources([], [], None)
else:
r = resources.copy(
cloud=GCP(),
Expand All @@ -547,7 +549,7 @@ def _get_feasible_launchable_resources(
cpus=None,
memory=None,
)
return ([r], [])
return resources_utils.FeasibleResources([r], [], None)

# Find instance candidates to meet user's requirements
assert len(resources.accelerators.items()
Expand All @@ -569,7 +571,8 @@ def _get_feasible_launchable_resources(
clouds='gcp')

if instance_list is None:
return ([], fuzzy_candidate_list)
return resources_utils.FeasibleResources([], fuzzy_candidate_list,
None)
assert len(
instance_list
) == 1, f'More than one instance type matched, {instance_list}'
Expand All @@ -584,11 +587,13 @@ def _get_feasible_launchable_resources(
if resources.cpus.endswith('+'):
cpus = float(resources.cpus[:-1])
if cpus > num_cpus_in_tpu_vm:
return ([], fuzzy_candidate_list)
return resources_utils.FeasibleResources(
[], fuzzy_candidate_list, None)
else:
cpus = float(resources.cpus)
if cpus != num_cpus_in_tpu_vm:
return ([], fuzzy_candidate_list)
return resources_utils.FeasibleResources(
[], fuzzy_candidate_list, None)
# FIXME(woosuk, wei-lin): This leverages the fact that TPU VMs
# have 334 GB RAM, and 400 GB RAM for tpu-v4. We need to move
# this to service catalog, instead.
Expand All @@ -597,11 +602,13 @@ def _get_feasible_launchable_resources(
if resources.memory.endswith('+'):
memory = float(resources.memory[:-1])
if memory > memory_in_tpu_vm:
return ([], fuzzy_candidate_list)
return resources_utils.FeasibleResources(
[], fuzzy_candidate_list, None)
else:
memory = float(resources.memory)
if memory != memory_in_tpu_vm:
return ([], fuzzy_candidate_list)
return resources_utils.FeasibleResources(
[], fuzzy_candidate_list, None)
else:
host_vm_type = instance_list[0]

Expand All @@ -613,7 +620,8 @@ def _get_feasible_launchable_resources(
cpus=None,
memory=None,
)
return ([r], fuzzy_candidate_list)
return resources_utils.FeasibleResources([r], fuzzy_candidate_list,
None)

@classmethod
def get_accelerators_from_instance_type(
Expand Down
18 changes: 12 additions & 6 deletions sky/clouds/ibm.py
Original file line number Diff line number Diff line change
Expand Up @@ -266,12 +266,15 @@ def get_default_instance_type(

def _get_feasible_launchable_resources(
self, resources: 'resources_lib.Resources'
) -> Tuple[List['resources_lib.Resources'], List[str]]:
) -> 'resources_utils.FeasibleResources':
fuzzy_candidate_list: List[str] = []
if resources.instance_type is not None:
assert resources.is_launchable(), resources
resources = resources.copy(accelerators=None)
return ([resources], fuzzy_candidate_list)
# TODO: Add hints to all return values in this method to help
# users understand why the resources are not launchable.
return resources_utils.FeasibleResources([resources],
fuzzy_candidate_list, None)

def _make(instance_list):
resource_list = []
Expand All @@ -296,9 +299,10 @@ def _make(instance_list):
memory=resources.memory,
disk_tier=resources.disk_tier)
if default_instance_type is None:
return ([], [])
return resources_utils.FeasibleResources([], [], None)
else:
return (_make([default_instance_type]), [])
return resources_utils.FeasibleResources(
_make([default_instance_type]), [], None)

assert len(accelerators) == 1, resources
acc, acc_count = list(accelerators.items())[0]
Expand All @@ -312,8 +316,10 @@ def _make(instance_list):
zone=resources.zone,
clouds='ibm')
if instance_list is None:
return ([], fuzzy_candidate_list)
return (_make(instance_list), fuzzy_candidate_list)
return resources_utils.FeasibleResources([], fuzzy_candidate_list,
None)
return resources_utils.FeasibleResources(_make(instance_list),
fuzzy_candidate_list, None)

@classmethod
def get_default_image(cls, region) -> str:
Expand Down
10 changes: 6 additions & 4 deletions sky/clouds/kubernetes.py
Original file line number Diff line number Diff line change
Expand Up @@ -342,12 +342,13 @@ def make_deploy_resources_variables(

def _get_feasible_launchable_resources(
self, resources: 'resources_lib.Resources'
) -> Tuple[List['resources_lib.Resources'], List[str]]:
) -> 'resources_utils.FeasibleResources':
fuzzy_candidate_list: List[str] = []
if resources.instance_type is not None:
assert resources.is_launchable(), resources
resources = resources.copy(accelerators=None)
return ([resources], fuzzy_candidate_list)
return resources_utils.FeasibleResources([resources],
fuzzy_candidate_list, None)

def _make(instance_list):
resource_list = []
Expand Down Expand Up @@ -403,10 +404,11 @@ def _make(instance_list):
logger.debug(f'Instance type {chosen_instance_type} does '
'not fit in the Kubernetes cluster. '
f'Reason: {reason}')
return [], []
return resources_utils.FeasibleResources([], [], reason)

# No fuzzy lists for Kubernetes
return _make([chosen_instance_type]), []
return resources_utils.FeasibleResources(_make([chosen_instance_type]),
[], None)

@classmethod
def check_credentials(cls) -> Tuple[bool, Optional[str]]:
Expand Down
Loading

0 comments on commit 541efba

Please sign in to comment.