From 541efbae1ccecb92b8712732decb067b13ddf8d6 Mon Sep 17 00:00:00 2001 From: Romil Bhardwaj Date: Mon, 22 Jul 2024 13:49:47 -0700 Subject: [PATCH] [k8s] Show hints when requested resources don't fit in Kubernetes cluster (#3590) * Add hints when feasible resources are not found * lint * lint * wip * move to resources_utils * fixes * fix * add TODO * Add todos and fix comments --- sky/clouds/aws.py | 19 ++++++++++++------- sky/clouds/azure.py | 19 ++++++++++++------- sky/clouds/cloud.py | 31 ++++++++++++++++++++----------- sky/clouds/cudo.py | 20 +++++++++++++------- sky/clouds/fluidstack.py | 15 ++++++++++----- sky/clouds/gcp.py | 28 ++++++++++++++++++---------- sky/clouds/ibm.py | 18 ++++++++++++------ sky/clouds/kubernetes.py | 10 ++++++---- sky/clouds/lambda_cloud.py | 17 +++++++++++------ sky/clouds/oci.py | 17 +++++++++++------ sky/clouds/paperspace.py | 17 +++++++++++------ sky/clouds/runpod.py | 17 +++++++++++------ sky/clouds/scp.py | 19 ++++++++++++------- sky/clouds/vsphere.py | 17 +++++++++++------ sky/optimizer.py | 24 +++++++++++++----------- sky/provision/kubernetes/utils.py | 11 ++++++++--- sky/utils/resources_utils.py | 19 +++++++++++++++++++ 17 files changed, 210 insertions(+), 108 deletions(-) diff --git a/sky/clouds/aws.py b/sky/clouds/aws.py index fc001ea75c0..021f243da70 100644 --- a/sky/clouds/aws.py +++ b/sky/clouds/aws.py @@ -444,7 +444,7 @@ def make_deploy_resources_variables( def _get_feasible_launchable_resources( self, resources: 'resources_lib.Resources' - ) -> Tuple[List['resources_lib.Resources'], List[str]]: + ) -> resources_utils.FeasibleResources: if resources.instance_type is not None: assert resources.is_launchable(), resources # Check the instance type is valid in the cloud @@ -455,10 +455,12 @@ def _get_feasible_launchable_resources( region=resources.region, zone=resources.zone) if not regions: - return ([], []) + # TODO: Add hints to all return values in this method to help + # users understand why the resources are not launchable. + return resources_utils.FeasibleResources([], [], None) # Treat Resources(AWS, p3.2x, V100) as Resources(AWS, p3.2x). resources = resources.copy(accelerators=None) - return ([resources], []) + return resources_utils.FeasibleResources([resources], [], None) def _make(instance_list): resource_list = [] @@ -484,9 +486,10 @@ def _make(instance_list): memory=resources.memory, disk_tier=resources.disk_tier) if default_instance_type is None: - return ([], []) + return resources_utils.FeasibleResources([], [], None) else: - return (_make([default_instance_type]), []) + return resources_utils.FeasibleResources( + _make([default_instance_type]), [], None) assert len(accelerators) == 1, resources acc, acc_count = list(accelerators.items())[0] @@ -501,8 +504,10 @@ def _make(instance_list): zone=resources.zone, clouds='aws') if instance_list is None: - return ([], fuzzy_candidate_list) - return (_make(instance_list), fuzzy_candidate_list) + return resources_utils.FeasibleResources([], fuzzy_candidate_list, + None) + return resources_utils.FeasibleResources(_make(instance_list), + fuzzy_candidate_list, None) @classmethod @functools.lru_cache(maxsize=1) # Cache since getting identity is slow. diff --git a/sky/clouds/azure.py b/sky/clouds/azure.py index a035ff256c1..928ceb5cc52 100644 --- a/sky/clouds/azure.py +++ b/sky/clouds/azure.py @@ -378,17 +378,19 @@ def _failover_disk_tier() -> Optional[resources_utils.DiskTier]: def _get_feasible_launchable_resources( self, resources: 'resources.Resources' - ) -> Tuple[List['resources.Resources'], List[str]]: + ) -> 'resources_utils.FeasibleResources': if resources.instance_type is not None: assert resources.is_launchable(), resources ok, _ = Azure.check_disk_tier(resources.instance_type, resources.disk_tier) if not ok: - return ([], []) + # TODO: Add hints to all return values in this method to help + # users understand why the resources are not launchable. + return resources_utils.FeasibleResources([], [], None) # Treat Resources(Azure, Standard_NC4as_T4_v3, T4) as # Resources(Azure, Standard_NC4as_T4_v3). resources = resources.copy(accelerators=None) - return ([resources], []) + return resources_utils.FeasibleResources([resources], [], None) def _make(instance_list): resource_list = [] @@ -418,9 +420,10 @@ def _make(instance_list): memory=resources.memory, disk_tier=resources.disk_tier) if default_instance_type is None: - return ([], []) + return resources_utils.FeasibleResources([], [], None) else: - return (_make([default_instance_type]), []) + return resources_utils.FeasibleResources( + _make([default_instance_type]), [], None) assert len(accelerators) == 1, resources acc, acc_count = list(accelerators.items())[0] @@ -435,8 +438,10 @@ def _make(instance_list): zone=resources.zone, clouds='azure') if instance_list is None: - return ([], fuzzy_candidate_list) - return (_make(instance_list), fuzzy_candidate_list) + return resources_utils.FeasibleResources([], fuzzy_candidate_list, + None) + return resources_utils.FeasibleResources(_make(instance_list), + fuzzy_candidate_list, None) @classmethod def check_credentials(cls) -> Tuple[bool, Optional[str]]: diff --git a/sky/clouds/cloud.py b/sky/clouds/cloud.py index 93048a84e74..ce9c2ae602d 100644 --- a/sky/clouds/cloud.py +++ b/sky/clouds/cloud.py @@ -341,11 +341,10 @@ def is_label_valid(cls, label_key: str, return True, None def get_feasible_launchable_resources( - self, - resources: 'resources_lib.Resources', - num_nodes: int = 1 - ) -> Tuple[List['resources_lib.Resources'], List[str]]: - """Returns ([feasible and launchable resources], [fuzzy candidates]). + self, + resources: 'resources_lib.Resources', + num_nodes: int = 1) -> 'resources_utils.FeasibleResources': + """Returns FeasibleResources for the given resources. Feasible resources refer to an offering respecting the resource requirements. Currently, this function implements "filtering" the @@ -353,10 +352,15 @@ def get_feasible_launchable_resources( Launchable resources require a cloud and an instance type be assigned. - Fuzzy candidates example: when the requested GPU is A100:1 but is not - available in a cloud/region, the fuzzy candidates are results of a fuzzy - search in the catalog that are offered in the location. E.g., - ['A100-80GB:1', 'A100-80GB:2', 'A100-80GB:4', 'A100:8'] + The returned dataclass object FeasibleResources contains three fields: + + - resources_list: a list of resources that are feasible to launch + - fuzzy_candidate_list: a list of resources that loosely match requested + resources. E.g., when A100:1 GPU is requested but is not available + in a cloud/region, the fuzzy candidates are results of a fuzzy + search in the catalog that are offered in the location. E.g., + ['A100-80GB:1', 'A100-80GB:2', 'A100-80GB:4', 'A100:8'] + - hint: an optional string hint if no feasible resources are found. """ if resources.is_launchable(): self._check_instance_type_accelerators_combination(resources) @@ -372,13 +376,18 @@ def get_feasible_launchable_resources( # TODO(zhwu): The resources are now silently filtered out. We # should have some logging telling the user why the resources # are not considered. - return ([], []) + return resources_utils.FeasibleResources(resources_list=[], + fuzzy_candidate_list=[], + hint=None) return self._get_feasible_launchable_resources(resources) def _get_feasible_launchable_resources( self, resources: 'resources_lib.Resources' - ) -> Tuple[List['resources_lib.Resources'], List[str]]: + ) -> 'resources_utils.FeasibleResources': """See get_feasible_launchable_resources().""" + # TODO: Currently only the Kubernetes implementation of this method + # returns hints when no feasible resources are found. This should be + # implemented for all clouds. raise NotImplementedError def get_reservations_available_resources( diff --git a/sky/clouds/cudo.py b/sky/clouds/cudo.py index 8f7d4eaf923..8f100caebad 100644 --- a/sky/clouds/cudo.py +++ b/sky/clouds/cudo.py @@ -214,13 +214,16 @@ def make_deploy_resources_variables( } def _get_feasible_launchable_resources( - self, resources: 'resources_lib.Resources'): + self, resources: 'resources_lib.Resources' + ) -> 'resources_utils.FeasibleResources': if resources.use_spot: - return ([], []) + # TODO: Add hints to all return values in this method to help + # users understand why the resources are not launchable. + return resources_utils.FeasibleResources([], [], None) if resources.instance_type is not None: assert resources.is_launchable(), resources resources = resources.copy(accelerators=None) - return ([resources], []) + return resources_utils.FeasibleResources([resources], [], None) def _make(instance_list): resource_list = [] @@ -243,9 +246,10 @@ def _make(instance_list): memory=resources.memory, disk_tier=resources.disk_tier) if default_instance_type is None: - return ([], []) + return resources_utils.FeasibleResources([], [], None) else: - return (_make([default_instance_type]), []) + return resources_utils.FeasibleResources( + _make([default_instance_type]), [], None) assert len(accelerators) == 1, resources acc, acc_count = list(accelerators.items())[0] @@ -260,8 +264,10 @@ def _make(instance_list): zone=resources.zone, clouds='cudo') if instance_list is None: - return ([], fuzzy_candidate_list) - return (_make(instance_list), fuzzy_candidate_list) + return resources_utils.FeasibleResources([], fuzzy_candidate_list, + None) + return resources_utils.FeasibleResources(_make(instance_list), + fuzzy_candidate_list, None) @classmethod def check_credentials(cls) -> Tuple[bool, Optional[str]]: diff --git a/sky/clouds/fluidstack.py b/sky/clouds/fluidstack.py index c4f15a0e510..d292ace02f8 100644 --- a/sky/clouds/fluidstack.py +++ b/sky/clouds/fluidstack.py @@ -211,7 +211,9 @@ def _get_feasible_launchable_resources( assert resources.is_launchable(), resources # Accelerators are part of the instance type in Fluidstack Cloud resources = resources.copy(accelerators=None) - return ([resources], []) + # TODO: Add hints to all return values in this method to help + # users understand why the resources are not launchable. + return resources_utils.FeasibleResources([resources], [], None) def _make(instance_list): resource_list = [] @@ -239,9 +241,10 @@ def _make(instance_list): memory=resources.memory, disk_tier=resources.disk_tier) if default_instance_type is None: - return ([], []) + return resources_utils.FeasibleResources([], [], None) else: - return (_make([default_instance_type]), []) + return resources_utils.FeasibleResources( + _make([default_instance_type]), [], None) assert len(accelerators) == 1, resources acc, acc_count = list(accelerators.items())[0] @@ -256,8 +259,10 @@ def _make(instance_list): zone=resources.zone, clouds='fluidstack') if instance_list is None: - return ([], fuzzy_candidate_list) - return (_make(instance_list), fuzzy_candidate_list) + return resources_utils.FeasibleResources([], fuzzy_candidate_list, + None) + return resources_utils.FeasibleResources(_make(instance_list), + fuzzy_candidate_list, None) @classmethod def check_credentials(cls) -> Tuple[bool, Optional[str]]: diff --git a/sky/clouds/gcp.py b/sky/clouds/gcp.py index 050fda07fe4..e24e67b2486 100644 --- a/sky/clouds/gcp.py +++ b/sky/clouds/gcp.py @@ -526,10 +526,10 @@ def make_deploy_resources_variables( def _get_feasible_launchable_resources( self, resources: 'resources.Resources' - ) -> Tuple[List['resources.Resources'], List[str]]: + ) -> 'resources_utils.FeasibleResources': if resources.instance_type is not None: assert resources.is_launchable(), resources - return ([resources], []) + return resources_utils.FeasibleResources([resources], [], None) if resources.accelerators is None: # Return a default instance type with the given number of vCPUs. @@ -538,7 +538,9 @@ def _get_feasible_launchable_resources( memory=resources.memory, disk_tier=resources.disk_tier) if host_vm_type is None: - return ([], []) + # TODO: Add hints to all return values in this method to help + # users understand why the resources are not launchable. + return resources_utils.FeasibleResources([], [], None) else: r = resources.copy( cloud=GCP(), @@ -547,7 +549,7 @@ def _get_feasible_launchable_resources( cpus=None, memory=None, ) - return ([r], []) + return resources_utils.FeasibleResources([r], [], None) # Find instance candidates to meet user's requirements assert len(resources.accelerators.items() @@ -569,7 +571,8 @@ def _get_feasible_launchable_resources( clouds='gcp') if instance_list is None: - return ([], fuzzy_candidate_list) + return resources_utils.FeasibleResources([], fuzzy_candidate_list, + None) assert len( instance_list ) == 1, f'More than one instance type matched, {instance_list}' @@ -584,11 +587,13 @@ def _get_feasible_launchable_resources( if resources.cpus.endswith('+'): cpus = float(resources.cpus[:-1]) if cpus > num_cpus_in_tpu_vm: - return ([], fuzzy_candidate_list) + return resources_utils.FeasibleResources( + [], fuzzy_candidate_list, None) else: cpus = float(resources.cpus) if cpus != num_cpus_in_tpu_vm: - return ([], fuzzy_candidate_list) + return resources_utils.FeasibleResources( + [], fuzzy_candidate_list, None) # FIXME(woosuk, wei-lin): This leverages the fact that TPU VMs # have 334 GB RAM, and 400 GB RAM for tpu-v4. We need to move # this to service catalog, instead. @@ -597,11 +602,13 @@ def _get_feasible_launchable_resources( if resources.memory.endswith('+'): memory = float(resources.memory[:-1]) if memory > memory_in_tpu_vm: - return ([], fuzzy_candidate_list) + return resources_utils.FeasibleResources( + [], fuzzy_candidate_list, None) else: memory = float(resources.memory) if memory != memory_in_tpu_vm: - return ([], fuzzy_candidate_list) + return resources_utils.FeasibleResources( + [], fuzzy_candidate_list, None) else: host_vm_type = instance_list[0] @@ -613,7 +620,8 @@ def _get_feasible_launchable_resources( cpus=None, memory=None, ) - return ([r], fuzzy_candidate_list) + return resources_utils.FeasibleResources([r], fuzzy_candidate_list, + None) @classmethod def get_accelerators_from_instance_type( diff --git a/sky/clouds/ibm.py b/sky/clouds/ibm.py index e468fecf00f..b78cc4287c0 100644 --- a/sky/clouds/ibm.py +++ b/sky/clouds/ibm.py @@ -266,12 +266,15 @@ def get_default_instance_type( def _get_feasible_launchable_resources( self, resources: 'resources_lib.Resources' - ) -> Tuple[List['resources_lib.Resources'], List[str]]: + ) -> 'resources_utils.FeasibleResources': fuzzy_candidate_list: List[str] = [] if resources.instance_type is not None: assert resources.is_launchable(), resources resources = resources.copy(accelerators=None) - return ([resources], fuzzy_candidate_list) + # TODO: Add hints to all return values in this method to help + # users understand why the resources are not launchable. + return resources_utils.FeasibleResources([resources], + fuzzy_candidate_list, None) def _make(instance_list): resource_list = [] @@ -296,9 +299,10 @@ def _make(instance_list): memory=resources.memory, disk_tier=resources.disk_tier) if default_instance_type is None: - return ([], []) + return resources_utils.FeasibleResources([], [], None) else: - return (_make([default_instance_type]), []) + return resources_utils.FeasibleResources( + _make([default_instance_type]), [], None) assert len(accelerators) == 1, resources acc, acc_count = list(accelerators.items())[0] @@ -312,8 +316,10 @@ def _make(instance_list): zone=resources.zone, clouds='ibm') if instance_list is None: - return ([], fuzzy_candidate_list) - return (_make(instance_list), fuzzy_candidate_list) + return resources_utils.FeasibleResources([], fuzzy_candidate_list, + None) + return resources_utils.FeasibleResources(_make(instance_list), + fuzzy_candidate_list, None) @classmethod def get_default_image(cls, region) -> str: diff --git a/sky/clouds/kubernetes.py b/sky/clouds/kubernetes.py index 113774142c9..4dd1fe8ce75 100644 --- a/sky/clouds/kubernetes.py +++ b/sky/clouds/kubernetes.py @@ -342,12 +342,13 @@ def make_deploy_resources_variables( def _get_feasible_launchable_resources( self, resources: 'resources_lib.Resources' - ) -> Tuple[List['resources_lib.Resources'], List[str]]: + ) -> 'resources_utils.FeasibleResources': fuzzy_candidate_list: List[str] = [] if resources.instance_type is not None: assert resources.is_launchable(), resources resources = resources.copy(accelerators=None) - return ([resources], fuzzy_candidate_list) + return resources_utils.FeasibleResources([resources], + fuzzy_candidate_list, None) def _make(instance_list): resource_list = [] @@ -403,10 +404,11 @@ def _make(instance_list): logger.debug(f'Instance type {chosen_instance_type} does ' 'not fit in the Kubernetes cluster. ' f'Reason: {reason}') - return [], [] + return resources_utils.FeasibleResources([], [], reason) # No fuzzy lists for Kubernetes - return _make([chosen_instance_type]), [] + return resources_utils.FeasibleResources(_make([chosen_instance_type]), + [], None) @classmethod def check_credentials(cls) -> Tuple[bool, Optional[str]]: diff --git a/sky/clouds/lambda_cloud.py b/sky/clouds/lambda_cloud.py index 036f5a23979..ce45f087296 100644 --- a/sky/clouds/lambda_cloud.py +++ b/sky/clouds/lambda_cloud.py @@ -178,12 +178,14 @@ def make_deploy_resources_variables( def _get_feasible_launchable_resources( self, resources: 'resources_lib.Resources' - ) -> Tuple[List['resources_lib.Resources'], List[str]]: + ) -> 'resources_utils.FeasibleResources': if resources.instance_type is not None: assert resources.is_launchable(), resources # Accelerators are part of the instance type in Lambda Cloud resources = resources.copy(accelerators=None) - return ([resources], []) + # TODO: Add hints to all return values in this method to help + # users understand why the resources are not launchable. + return resources_utils.FeasibleResources([resources], [], None) def _make(instance_list): resource_list = [] @@ -209,9 +211,10 @@ def _make(instance_list): memory=resources.memory, disk_tier=resources.disk_tier) if default_instance_type is None: - return ([], []) + return resources_utils.FeasibleResources([], [], None) else: - return (_make([default_instance_type]), []) + return resources_utils.FeasibleResources( + _make([default_instance_type]), [], None) assert len(accelerators) == 1, resources acc, acc_count = list(accelerators.items())[0] @@ -226,8 +229,10 @@ def _make(instance_list): zone=resources.zone, clouds='lambda') if instance_list is None: - return ([], fuzzy_candidate_list) - return (_make(instance_list), fuzzy_candidate_list) + return resources_utils.FeasibleResources([], fuzzy_candidate_list, + None) + return resources_utils.FeasibleResources(_make(instance_list), + fuzzy_candidate_list, None) @classmethod def check_credentials(cls) -> Tuple[bool, Optional[str]]: diff --git a/sky/clouds/oci.py b/sky/clouds/oci.py index a911c3f38d0..7875e26d9cc 100644 --- a/sky/clouds/oci.py +++ b/sky/clouds/oci.py @@ -295,11 +295,13 @@ def make_deploy_resources_variables( def _get_feasible_launchable_resources( self, resources: 'resources_lib.Resources' - ) -> Tuple[List['resources_lib.Resources'], List[str]]: + ) -> 'resources_utils.FeasibleResources': if resources.instance_type is not None: assert resources.is_launchable(), resources resources = resources.copy(accelerators=None) - return ([resources], []) + # TODO: Add hints to all return values in this method to help + # users understand why the resources are not launchable. + return resources_utils.FeasibleResources([resources], [], None) def _make(instance_list): resource_list = [] @@ -326,9 +328,10 @@ def _make(instance_list): disk_tier=resources.disk_tier) if default_instance_type is None: - return ([], []) + return resources_utils.FeasibleResources([], [], None) else: - return (_make([default_instance_type]), []) + return resources_utils.FeasibleResources( + _make([default_instance_type]), [], None) assert len(accelerators) == 1, resources @@ -344,9 +347,11 @@ def _make(instance_list): zone=resources.zone, clouds='oci') if instance_list is None: - return ([], fuzzy_candidate_list) + return resources_utils.FeasibleResources([], fuzzy_candidate_list, + None) - return (_make(instance_list), fuzzy_candidate_list) + return resources_utils.FeasibleResources(_make(instance_list), + fuzzy_candidate_list, None) @classmethod def check_credentials(cls) -> Tuple[bool, Optional[str]]: diff --git a/sky/clouds/paperspace.py b/sky/clouds/paperspace.py index efa1afee781..171bcf33f16 100644 --- a/sky/clouds/paperspace.py +++ b/sky/clouds/paperspace.py @@ -196,11 +196,13 @@ def _get_feasible_launchable_resources( self, resources: 'resources_lib.Resources'): """Returns a list of feasible resources for the given resources.""" if resources.use_spot: - return ([], []) + # TODO: Add hints to all return values in this method to help + # users understand why the resources are not launchable. + return resources_utils.FeasibleResources([], [], None) if resources.instance_type is not None: assert resources.is_launchable(), resources resources = resources.copy(accelerators=None) - return ([resources], []) + return resources_utils.FeasibleResources([resources], [], None) def _make(instance_list): resource_list = [] @@ -223,9 +225,10 @@ def _make(instance_list): memory=resources.memory, disk_tier=resources.disk_tier) if default_instance_type is None: - return ([], []) + return resources_utils.FeasibleResources([], [], None) else: - return (_make([default_instance_type]), []) + return resources_utils.FeasibleResources( + _make([default_instance_type]), [], None) assert len(accelerators) == 1, resources acc, acc_count = list(accelerators.items())[0] @@ -241,8 +244,10 @@ def _make(instance_list): clouds='paperspace', )) if instance_list is None: - return ([], fuzzy_candidate_list) - return (_make(instance_list), fuzzy_candidate_list) + return resources_utils.FeasibleResources([], fuzzy_candidate_list, + None) + return resources_utils.FeasibleResources(_make(instance_list), + fuzzy_candidate_list, None) @classmethod def check_credentials(cls) -> Tuple[bool, Optional[str]]: diff --git a/sky/clouds/runpod.py b/sky/clouds/runpod.py index 3486330b8b3..4fc4bfce85b 100644 --- a/sky/clouds/runpod.py +++ b/sky/clouds/runpod.py @@ -187,12 +187,12 @@ def make_deploy_resources_variables( def _get_feasible_launchable_resources( self, resources: 'resources_lib.Resources' - ) -> Tuple[List['resources_lib.Resources'], List[str]]: + ) -> 'resources_utils.FeasibleResources': """Returns a list of feasible resources for the given resources.""" if resources.instance_type is not None: assert resources.is_launchable(), resources resources = resources.copy(accelerators=None) - return ([resources], []) + return resources_utils.FeasibleResources([resources], [], None) def _make(instance_list): resource_list = [] @@ -215,9 +215,12 @@ def _make(instance_list): memory=resources.memory, disk_tier=resources.disk_tier) if default_instance_type is None: - return ([], []) + # TODO: Add hints to all return values in this method to help + # users understand why the resources are not launchable. + return resources_utils.FeasibleResources([], [], None) else: - return (_make([default_instance_type]), []) + return resources_utils.FeasibleResources( + _make([default_instance_type]), [], None) assert len(accelerators) == 1, resources acc, acc_count = list(accelerators.items())[0] @@ -231,8 +234,10 @@ def _make(instance_list): zone=resources.zone, clouds='runpod') if instance_list is None: - return ([], fuzzy_candidate_list) - return (_make(instance_list), fuzzy_candidate_list) + return resources_utils.FeasibleResources([], fuzzy_candidate_list, + None) + return resources_utils.FeasibleResources(_make(instance_list), + fuzzy_candidate_list, None) @classmethod def check_credentials(cls) -> Tuple[bool, Optional[str]]: diff --git a/sky/clouds/scp.py b/sky/clouds/scp.py index da45a7e143e..9cfbd5129f6 100644 --- a/sky/clouds/scp.py +++ b/sky/clouds/scp.py @@ -251,16 +251,18 @@ def _get_default_ami(cls, region_name: str, instance_type: str) -> str: def _get_feasible_launchable_resources( self, resources: 'resources_lib.Resources' - ) -> Tuple[List['resources_lib.Resources'], List[str]]: + ) -> 'resources_utils.FeasibleResources': # Check if the host VM satisfies the min/max disk size limits. is_allowed = self._is_disk_size_allowed(resources) if not is_allowed: - return ([], []) + # TODO: Add hints to all return values in this method to help + # users understand why the resources are not launchable. + return resources_utils.FeasibleResources([], [], None) if resources.instance_type is not None: assert resources.is_launchable(), resources # Accelerators are part of the instance type in SCP Cloud resources = resources.copy(accelerators=None) - return ([resources], []) + return resources_utils.FeasibleResources([resources], [], None) def _make(instance_list): resource_list = [] @@ -287,9 +289,10 @@ def _make(instance_list): memory=resources.memory, disk_tier=resources.disk_tier) if default_instance_type is None: - return ([], []) + return resources_utils.FeasibleResources([], [], None) else: - return (_make([default_instance_type]), []) + return resources_utils.FeasibleResources( + _make([default_instance_type]), [], None) assert len(accelerators) == 1, resources acc, acc_count = list(accelerators.items())[0] @@ -304,8 +307,10 @@ def _make(instance_list): zone=resources.zone, clouds='scp') if instance_list is None: - return ([], fuzzy_candidate_list) - return (_make(instance_list), fuzzy_candidate_list) + return resources_utils.FeasibleResources([], fuzzy_candidate_list, + None) + return resources_utils.FeasibleResources(_make(instance_list), + fuzzy_candidate_list, None) @classmethod def check_credentials(cls) -> Tuple[bool, Optional[str]]: diff --git a/sky/clouds/vsphere.py b/sky/clouds/vsphere.py index 968368ff0aa..6e7e1abeb04 100644 --- a/sky/clouds/vsphere.py +++ b/sky/clouds/vsphere.py @@ -197,11 +197,13 @@ def make_deploy_resources_variables( def _get_feasible_launchable_resources( self, resources: 'resources_lib.Resources'): if resources.use_spot: - return ([], []) + # TODO: Add hints to all return values in this method to help + # users understand why the resources are not launchable. + return resources_utils.FeasibleResources([], [], None) if resources.instance_type is not None: assert resources.is_launchable(), resources resources = resources.copy(accelerators=None) - return ([resources], []) + return resources_utils.FeasibleResources([resources], [], None) def _make(instance_list): resource_list = [] @@ -226,9 +228,10 @@ def _make(instance_list): disk_tier=resources.disk_tier, ) if default_instance_type is None: - return ([], []) + return resources_utils.FeasibleResources([], [], None) else: - return (_make([default_instance_type]), []) + return resources_utils.FeasibleResources( + _make([default_instance_type]), [], None) assert len(accelerators) == 1, resources acc, acc_count = list(accelerators.items())[0] @@ -246,8 +249,10 @@ def _make(instance_list): clouds=_CLOUD_VSPHERE, ) if instance_list is None: - return ([], fuzzy_candidate_list) - return (_make(instance_list), fuzzy_candidate_list) + return resources_utils.FeasibleResources([], fuzzy_candidate_list, + None) + return resources_utils.FeasibleResources(_make(instance_list), + fuzzy_candidate_list, None) @classmethod def check_credentials(cls) -> Tuple[bool, Optional[str]]: diff --git a/sky/optimizer.py b/sky/optimizer.py index 9c11511a38b..7b4b29e3bce 100644 --- a/sky/optimizer.py +++ b/sky/optimizer.py @@ -348,10 +348,6 @@ def _estimate_nodes_cost_or_time( for orig_resources in node.resources): source_hint = 'kubernetes cluster' - # TODO(romilb): When `sky show-gpus` supports Kubernetes, - # add a hint to run `sky show-gpus --kubernetes` to list - # available accelerators on Kubernetes. - bold = colorama.Style.BRIGHT cyan = colorama.Fore.CYAN reset = colorama.Style.RESET_ALL @@ -1239,21 +1235,25 @@ def _fill_in_launchable_resources( continue clouds_list = ([resources.cloud] if resources.cloud is not None else enabled_clouds) + # If clouds provide hints, store them for later printing. + hints: Dict[clouds.Cloud, str] = {} for cloud in clouds_list: - (feasible_resources, - fuzzy_candidate_list) = cloud.get_feasible_launchable_resources( - resources, num_nodes=task.num_nodes) - if len(feasible_resources) > 0: + feasible_resources = cloud.get_feasible_launchable_resources( + resources, num_nodes=task.num_nodes) + if feasible_resources.hint is not None: + hints[cloud] = feasible_resources.hint + if len(feasible_resources.resources_list) > 0: # Assume feasible_resources is sorted by prices. Guaranteed by # the implementation of get_feasible_launchable_resources and # the underlying service_catalog filtering - cheapest = feasible_resources[0] + cheapest = feasible_resources.resources_list[0] # Generate region/zone-specified resources. launchable[resources].extend( _make_launchables_for_valid_region_zones(cheapest)) - cloud_candidates[cloud] = feasible_resources + cloud_candidates[cloud] = feasible_resources.resources_list else: - all_fuzzy_candidates.update(fuzzy_candidate_list) + all_fuzzy_candidates.update( + feasible_resources.fuzzy_candidate_list) if len(launchable[resources]) == 0: clouds_str = str(clouds_list) if len(clouds_list) > 1 else str( clouds_list[0]) @@ -1269,6 +1269,8 @@ def _fill_in_launchable_resources( f'{colorama.Fore.CYAN}' f'{sorted(all_fuzzy_candidates)}' f'{colorama.Style.RESET_ALL}') + for cloud, hint in hints.items(): + logger.info(f'{repr(cloud)}: {hint}') else: if resources.cpus is not None: logger.info('Try specifying a different CPU count, ' diff --git a/sky/provision/kubernetes/utils.py b/sky/provision/kubernetes/utils.py index fb400ab59f4..f042750d627 100644 --- a/sky/provision/kubernetes/utils.py +++ b/sky/provision/kubernetes/utils.py @@ -426,11 +426,16 @@ def check_cpu_mem_fits(candidate_instance_type: 'KubernetesInstanceType', ] assert len(gpu_nodes) > 0, 'GPU nodes not found' candidate_nodes = gpu_nodes - not_fit_reason_prefix = (f'GPU nodes with {acc_type} do not have ' - 'enough CPU and/or memory. ') + not_fit_reason_prefix = ( + f'GPU nodes with {acc_type} do not have ' + f'enough CPU (> {k8s_instance_type.cpus} CPUs) and/or ' + f'memory (> {k8s_instance_type.memory} G). ') else: candidate_nodes = nodes - not_fit_reason_prefix = 'No nodes found with enough CPU and/or memory. ' + not_fit_reason_prefix = (f'No nodes found with enough ' + f'CPU (> {k8s_instance_type.cpus} CPUs) ' + 'and/or memory ' + f'(> {k8s_instance_type.memory} G). ') # Check if CPU and memory requirements are met on at least one # candidate node. fits, reason = check_cpu_mem_fits(k8s_instance_type, candidate_nodes) diff --git a/sky/utils/resources_utils.py b/sky/utils/resources_utils.py index 87a62dab95b..95c784143cc 100644 --- a/sky/utils/resources_utils.py +++ b/sky/utils/resources_utils.py @@ -10,6 +10,7 @@ if typing.TYPE_CHECKING: from sky import backends + from sky import resources as resources_lib _PORT_RANGE_HINT_MSG = ('Invalid port range {}. Please use the format ' '"from-to", in which from <= to. e.g. "1-3".') @@ -157,3 +158,21 @@ def get_readable_resources_repr(handle: 'backends.CloudVmRayResourceHandle', launched_resource_str) return f'{handle.launched_nodes}x {launched_resource_str}' return _DEFAULT_MESSAGE_HANDLE_INITIALIZING + + +@dataclasses.dataclass +class FeasibleResources: + """Feasible resources returned by cloud. + + Used to represent a collection of feasible resources returned by cloud, + any fuzzy candidates, and optionally a string hint if no feasible resources + are found. + + Fuzzy candidates example: when the requested GPU is A100:1 but is not + available in a cloud/region, the fuzzy candidates are results of a fuzzy + search in the catalog that are offered in the location. E.g., + ['A100-80GB:1', 'A100-80GB:2', 'A100-80GB:4', 'A100:8'] + """ + resources_list: List['resources_lib.Resources'] + fuzzy_candidate_list: List[str] + hint: Optional[str]