From a2df6fc1722e149249f9fc5768a36118bdad16fe Mon Sep 17 00:00:00 2001 From: Romil Bhardwaj Date: Thu, 18 Jul 2024 16:42:37 -0700 Subject: [PATCH] wip --- sky/clouds/aws.py | 17 ++++++++++------- sky/clouds/azure.py | 18 ++++++++++-------- sky/clouds/cloud.py | 27 +++++++++++++++------------ sky/clouds/cudo.py | 18 +++++++++++------- sky/clouds/fluidstack.py | 13 ++++++++----- sky/clouds/gcp.py | 30 +++++++++++++++++++----------- sky/clouds/ibm.py | 16 ++++++++++------ sky/clouds/kubernetes.py | 16 +++++++--------- sky/clouds/lambda_cloud.py | 15 +++++++++------ sky/clouds/oci.py | 15 +++++++++------ sky/clouds/paperspace.py | 15 +++++++++------ sky/clouds/runpod.py | 15 +++++++++------ sky/clouds/scp.py | 17 ++++++++++------- sky/clouds/vsphere.py | 15 +++++++++------ sky/optimizer.py | 14 +++++++------- sky/resources.py | 18 ++++++++++++++++++ 16 files changed, 170 insertions(+), 109 deletions(-) diff --git a/sky/clouds/aws.py b/sky/clouds/aws.py index 6a87f55c37d..74654fdcb04 100644 --- a/sky/clouds/aws.py +++ b/sky/clouds/aws.py @@ -441,7 +441,7 @@ def make_deploy_resources_variables( def _get_feasible_launchable_resources( self, resources: 'resources_lib.Resources' - ) -> Tuple[List['resources_lib.Resources'], List[str], Optional[str]]: + ) -> 'resources_lib.FeasibleResources': if resources.instance_type is not None: assert resources.is_launchable(), resources # Check the instance type is valid in the cloud @@ -452,10 +452,10 @@ def _get_feasible_launchable_resources( region=resources.region, zone=resources.zone) if not regions: - return ([], [], None) + return resources_lib.FeasibleResources([], [], None) # Treat Resources(AWS, p3.2x, V100) as Resources(AWS, p3.2x). resources = resources.copy(accelerators=None) - return ([resources], [], None) + return resources_lib.FeasibleResources([resources], [], None) def _make(instance_list): resource_list = [] @@ -481,9 +481,10 @@ def _make(instance_list): memory=resources.memory, disk_tier=resources.disk_tier) if default_instance_type is None: - return ([], [], None) + return resources_lib.FeasibleResources([], [], None) else: - return (_make([default_instance_type]), [], None) + return resources_lib.FeasibleResources( + _make([default_instance_type]), [], None) assert len(accelerators) == 1, resources acc, acc_count = list(accelerators.items())[0] @@ -498,8 +499,10 @@ def _make(instance_list): zone=resources.zone, clouds='aws') if instance_list is None: - return ([], fuzzy_candidate_list, None) - return (_make(instance_list), fuzzy_candidate_list, None) + return resources_lib.FeasibleResources([], fuzzy_candidate_list, + None) + return resources_lib.FeasibleResources(_make(instance_list), + fuzzy_candidate_list, None) @classmethod @functools.lru_cache(maxsize=1) # Cache since getting identity is slow. diff --git a/sky/clouds/azure.py b/sky/clouds/azure.py index 20be838e303..f42d33bf8d5 100644 --- a/sky/clouds/azure.py +++ b/sky/clouds/azure.py @@ -377,18 +377,18 @@ def _failover_disk_tier() -> Optional[resources_utils.DiskTier]: } def _get_feasible_launchable_resources( - self, resources: 'resources.Resources' - ) -> Tuple[List['resources.Resources'], List[str], Optional[str]]: + self, + resources: 'resources.Resources') -> 'resources.FeasibleResources': if resources.instance_type is not None: assert resources.is_launchable(), resources ok, _ = Azure.check_disk_tier(resources.instance_type, resources.disk_tier) if not ok: - return ([], [], None) + return resources.FeasibleResources([], [], None) # Treat Resources(Azure, Standard_NC4as_T4_v3, T4) as # Resources(Azure, Standard_NC4as_T4_v3). resources = resources.copy(accelerators=None) - return ([resources], [], None) + return resources.FeasibleResources([resources], [], None) def _make(instance_list): resource_list = [] @@ -418,9 +418,10 @@ def _make(instance_list): memory=resources.memory, disk_tier=resources.disk_tier) if default_instance_type is None: - return ([], [], None) + return resources.FeasibleResources([], [], None) else: - return (_make([default_instance_type]), [], None) + return resources.FeasibleResources( + _make([default_instance_type]), [], None) assert len(accelerators) == 1, resources acc, acc_count = list(accelerators.items())[0] @@ -435,8 +436,9 @@ def _make(instance_list): zone=resources.zone, clouds='azure') if instance_list is None: - return ([], fuzzy_candidate_list, None) - return (_make(instance_list), fuzzy_candidate_list, None) + return resources.FeasibleResources([], fuzzy_candidate_list, None) + return resources.FeasibleResources(_make(instance_list), + fuzzy_candidate_list, None) @classmethod def check_credentials(cls) -> Tuple[bool, Optional[str]]: diff --git a/sky/clouds/cloud.py b/sky/clouds/cloud.py index d74b470dbf0..b3d4d532932 100644 --- a/sky/clouds/cloud.py +++ b/sky/clouds/cloud.py @@ -341,11 +341,10 @@ def is_label_valid(cls, label_key: str, return True, None def get_feasible_launchable_resources( - self, - resources: 'resources_lib.Resources', - num_nodes: int = 1 - ) -> Tuple[List['resources_lib.Resources'], List[str], Optional[str]]: - """Returns ([feasible & launchable resources], [fuzzy candidates], hint) + self, + resources: 'resources_lib.Resources', + num_nodes: int = 1) -> 'resources_lib.FeasibleResources': + """Returns FeasibleResources for the given resources. Feasible resources refer to an offering respecting the resource requirements. Currently, this function implements "filtering" the @@ -353,13 +352,15 @@ def get_feasible_launchable_resources( Launchable resources require a cloud and an instance type be assigned. - The cloud may optionally return a string hint to the user if no feasible - resources are found. + The returned dataclass object FeasibleResources contains three fields: - Fuzzy candidates example: when the requested GPU is A100:1 but is not - available in a cloud/region, the fuzzy candidates are results of a fuzzy - search in the catalog that are offered in the location. E.g., - ['A100-80GB:1', 'A100-80GB:2', 'A100-80GB:4', 'A100:8'] + - resources_list: a list of resources that are feasible to launch + - fuzzy_candidate_list: a list of resources that loosely match the requested + resources. E.g., when A100:1 GPU is requested but is not available + in a cloud/region, the fuzzy candidates are results of a fuzzy + search in the catalog that are offered in the location. E.g., + ['A100-80GB:1', 'A100-80GB:2', 'A100-80GB:4', 'A100:8'] + - hint: an optional string hint if no feasible resources are found. """ if resources.is_launchable(): self._check_instance_type_accelerators_combination(resources) @@ -375,7 +376,9 @@ def get_feasible_launchable_resources( # TODO(zhwu): The resources are now silently filtered out. We # should have some logging telling the user why the resources # are not considered. - return ([], [], None) + return resources_lib.FeasibleResources(resources_list=[], + fuzzy_candidate_list=[], + hint=None) return self._get_feasible_launchable_resources(resources) def _get_feasible_launchable_resources( diff --git a/sky/clouds/cudo.py b/sky/clouds/cudo.py index 9b8bdce1035..0658a7fcf31 100644 --- a/sky/clouds/cudo.py +++ b/sky/clouds/cudo.py @@ -214,13 +214,14 @@ def make_deploy_resources_variables( } def _get_feasible_launchable_resources( - self, resources: 'resources_lib.Resources'): + self, resources: 'resources_lib.Resources' + ) -> 'resources_lib.FeasibleResources': if resources.use_spot: - return ([], [], None) + return resources_lib.FeasibleResources([], [], None) if resources.instance_type is not None: assert resources.is_launchable(), resources resources = resources.copy(accelerators=None) - return ([resources], [], None) + return resources_lib.FeasibleResources([resources], [], None) def _make(instance_list): resource_list = [] @@ -243,9 +244,10 @@ def _make(instance_list): memory=resources.memory, disk_tier=resources.disk_tier) if default_instance_type is None: - return ([], [], None) + return resources_lib.FeasibleResources([], [], None) else: - return (_make([default_instance_type]), [], None) + return resources_lib.FeasibleResources( + _make([default_instance_type]), [], None) assert len(accelerators) == 1, resources acc, acc_count = list(accelerators.items())[0] @@ -260,8 +262,10 @@ def _make(instance_list): zone=resources.zone, clouds='cudo') if instance_list is None: - return ([], fuzzy_candidate_list, None) - return (_make(instance_list), fuzzy_candidate_list, None) + return resources_lib.FeasibleResources([], fuzzy_candidate_list, + None) + return resources_lib.FeasibleResources(_make(instance_list), + fuzzy_candidate_list, None) @classmethod def check_credentials(cls) -> Tuple[bool, Optional[str]]: diff --git a/sky/clouds/fluidstack.py b/sky/clouds/fluidstack.py index c8d9a888f56..73658696f2c 100644 --- a/sky/clouds/fluidstack.py +++ b/sky/clouds/fluidstack.py @@ -211,7 +211,7 @@ def _get_feasible_launchable_resources( assert resources.is_launchable(), resources # Accelerators are part of the instance type in Fluidstack Cloud resources = resources.copy(accelerators=None) - return ([resources], [], None) + return resources_lib.FeasibleResources([resources], [], None) def _make(instance_list): resource_list = [] @@ -239,9 +239,10 @@ def _make(instance_list): memory=resources.memory, disk_tier=resources.disk_tier) if default_instance_type is None: - return ([], [], None) + return resources_lib.FeasibleResources([], [], None) else: - return (_make([default_instance_type]), [], None) + return resources_lib.FeasibleResources( + _make([default_instance_type]), [], None) assert len(accelerators) == 1, resources acc, acc_count = list(accelerators.items())[0] @@ -256,8 +257,10 @@ def _make(instance_list): zone=resources.zone, clouds='fluidstack') if instance_list is None: - return ([], fuzzy_candidate_list, None) - return (_make(instance_list), fuzzy_candidate_list, None) + return resources_lib.FeasibleResources([], fuzzy_candidate_list, + None) + return resources_lib.FeasibleResources(_make(instance_list), + fuzzy_candidate_list, None) @classmethod def check_credentials(cls) -> Tuple[bool, Optional[str]]: diff --git a/sky/clouds/gcp.py b/sky/clouds/gcp.py index b9d8dc0cd57..241b3dc1f68 100644 --- a/sky/clouds/gcp.py +++ b/sky/clouds/gcp.py @@ -525,11 +525,11 @@ def make_deploy_resources_variables( return resources_vars def _get_feasible_launchable_resources( - self, resources: 'resources.Resources' - ) -> Tuple[List['resources.Resources'], List[str], Optional[str]]: + self, + resources: 'resources.Resources') -> 'resources.FeasibleResources': if resources.instance_type is not None: assert resources.is_launchable(), resources - return ([resources], [], None) + return resources.FeasibleResources([resources], [], None) if resources.accelerators is None: # Return a default instance type with the given number of vCPUs. @@ -538,7 +538,7 @@ def _get_feasible_launchable_resources( memory=resources.memory, disk_tier=resources.disk_tier) if host_vm_type is None: - return ([], [], None) + return resources.FeasibleResources([], [], None) else: r = resources.copy( cloud=GCP(), @@ -547,7 +547,7 @@ def _get_feasible_launchable_resources( cpus=None, memory=None, ) - return ([r], [], None) + return resources.FeasibleResources([r], [], None) # Find instance candidates to meet user's requirements assert len(resources.accelerators.items() @@ -569,7 +569,7 @@ def _get_feasible_launchable_resources( clouds='gcp') if instance_list is None: - return ([], fuzzy_candidate_list, None) + return resources.FeasibleResources([], fuzzy_candidate_list, None) assert len( instance_list ) == 1, f'More than one instance type matched, {instance_list}' @@ -584,11 +584,15 @@ def _get_feasible_launchable_resources( if resources.cpus.endswith('+'): cpus = float(resources.cpus[:-1]) if cpus > num_cpus_in_tpu_vm: - return ([], fuzzy_candidate_list, None) + return resources.FeasibleResources([], + fuzzy_candidate_list, + None) else: cpus = float(resources.cpus) if cpus != num_cpus_in_tpu_vm: - return ([], fuzzy_candidate_list, None) + return resources.FeasibleResources([], + fuzzy_candidate_list, + None) # FIXME(woosuk, wei-lin): This leverages the fact that TPU VMs # have 334 GB RAM, and 400 GB RAM for tpu-v4. We need to move # this to service catalog, instead. @@ -597,11 +601,15 @@ def _get_feasible_launchable_resources( if resources.memory.endswith('+'): memory = float(resources.memory[:-1]) if memory > memory_in_tpu_vm: - return ([], fuzzy_candidate_list, None) + return resources.FeasibleResources([], + fuzzy_candidate_list, + None) else: memory = float(resources.memory) if memory != memory_in_tpu_vm: - return ([], fuzzy_candidate_list, None) + return resources.FeasibleResources([], + fuzzy_candidate_list, + None) else: host_vm_type = instance_list[0] @@ -613,7 +621,7 @@ def _get_feasible_launchable_resources( cpus=None, memory=None, ) - return ([r], fuzzy_candidate_list, None) + return resources.FeasibleResources([r], fuzzy_candidate_list, None) @classmethod def get_accelerators_from_instance_type( diff --git a/sky/clouds/ibm.py b/sky/clouds/ibm.py index 7ff79c20547..42fe05ba46c 100644 --- a/sky/clouds/ibm.py +++ b/sky/clouds/ibm.py @@ -266,12 +266,13 @@ def get_default_instance_type( def _get_feasible_launchable_resources( self, resources: 'resources_lib.Resources' - ) -> Tuple[List['resources_lib.Resources'], List[str], Optional[str]]: + ) -> 'resources_lib.FeasibleResources': fuzzy_candidate_list: List[str] = [] if resources.instance_type is not None: assert resources.is_launchable(), resources resources = resources.copy(accelerators=None) - return ([resources], fuzzy_candidate_list, None) + return resources_lib.FeasibleResources([resources], + fuzzy_candidate_list, None) def _make(instance_list): resource_list = [] @@ -296,9 +297,10 @@ def _make(instance_list): memory=resources.memory, disk_tier=resources.disk_tier) if default_instance_type is None: - return ([], [], None) + return resources_lib.FeasibleResources([], [], None) else: - return (_make([default_instance_type]), [], None) + return resources_lib.FeasibleResources( + _make([default_instance_type]), [], None) assert len(accelerators) == 1, resources acc, acc_count = list(accelerators.items())[0] @@ -312,8 +314,10 @@ def _make(instance_list): zone=resources.zone, clouds='ibm') if instance_list is None: - return ([], fuzzy_candidate_list, None) - return (_make(instance_list), fuzzy_candidate_list, None) + return resources_lib.FeasibleResources([], fuzzy_candidate_list, + None) + return resources_lib.FeasibleResources(_make(instance_list), + fuzzy_candidate_list, None) @classmethod def get_default_image(cls, region) -> str: diff --git a/sky/clouds/kubernetes.py b/sky/clouds/kubernetes.py index 20c91a6b016..d1a401d46f2 100644 --- a/sky/clouds/kubernetes.py +++ b/sky/clouds/kubernetes.py @@ -2,10 +2,10 @@ import json import os import re -import typing from typing import Dict, Iterator, List, Optional, Tuple from sky import clouds +from sky import resources as resources_lib from sky import sky_logging from sky import skypilot_config from sky.adaptors import kubernetes @@ -16,10 +16,6 @@ from sky.utils import resources_utils from sky.utils import schemas -if typing.TYPE_CHECKING: - # Renaming to avoid shadowing variables. - from sky import resources as resources_lib - logger = sky_logging.init_logger(__name__) # Check if KUBECONFIG is set, and use it if it is. @@ -342,12 +338,13 @@ def make_deploy_resources_variables( def _get_feasible_launchable_resources( self, resources: 'resources_lib.Resources' - ) -> Tuple[List['resources_lib.Resources'], List[str], Optional[str]]: + ) -> 'resources_lib.FeasibleResources': fuzzy_candidate_list: List[str] = [] if resources.instance_type is not None: assert resources.is_launchable(), resources resources = resources.copy(accelerators=None) - return ([resources], fuzzy_candidate_list, None) + return resources_lib.FeasibleResources([resources], + fuzzy_candidate_list, None) def _make(instance_list): resource_list = [] @@ -403,10 +400,11 @@ def _make(instance_list): logger.debug(f'Instance type {chosen_instance_type} does ' 'not fit in the Kubernetes cluster. ' f'Reason: {reason}') - return [], [], reason + return resources_lib.FeasibleResources([], [], reason) # No fuzzy lists for Kubernetes - return _make([chosen_instance_type]), [], None + return resources_lib.FeasibleResources(_make([chosen_instance_type]), + [], None) @classmethod def check_credentials(cls) -> Tuple[bool, Optional[str]]: diff --git a/sky/clouds/lambda_cloud.py b/sky/clouds/lambda_cloud.py index f33cfd3a0ab..4fb92fddf72 100644 --- a/sky/clouds/lambda_cloud.py +++ b/sky/clouds/lambda_cloud.py @@ -178,12 +178,12 @@ def make_deploy_resources_variables( def _get_feasible_launchable_resources( self, resources: 'resources_lib.Resources' - ) -> Tuple[List['resources_lib.Resources'], List[str], Optional[str]]: + ) -> 'resources_lib.FeasibleResources': if resources.instance_type is not None: assert resources.is_launchable(), resources # Accelerators are part of the instance type in Lambda Cloud resources = resources.copy(accelerators=None) - return ([resources], [], None) + return resources_lib.FeasibleResources([resources], [], None) def _make(instance_list): resource_list = [] @@ -209,9 +209,10 @@ def _make(instance_list): memory=resources.memory, disk_tier=resources.disk_tier) if default_instance_type is None: - return ([], [], None) + return resources_lib.FeasibleResources([], [], None) else: - return (_make([default_instance_type]), [], None) + return resources_lib.FeasibleResources( + _make([default_instance_type]), [], None) assert len(accelerators) == 1, resources acc, acc_count = list(accelerators.items())[0] @@ -226,8 +227,10 @@ def _make(instance_list): zone=resources.zone, clouds='lambda') if instance_list is None: - return ([], fuzzy_candidate_list, None) - return (_make(instance_list), fuzzy_candidate_list, None) + return resources_lib.FeasibleResources([], fuzzy_candidate_list, + None) + return resources_lib.FeasibleResources(_make(instance_list), + fuzzy_candidate_list, None) @classmethod def check_credentials(cls) -> Tuple[bool, Optional[str]]: diff --git a/sky/clouds/oci.py b/sky/clouds/oci.py index 39f966345be..747564be0ba 100644 --- a/sky/clouds/oci.py +++ b/sky/clouds/oci.py @@ -295,11 +295,11 @@ def make_deploy_resources_variables( def _get_feasible_launchable_resources( self, resources: 'resources_lib.Resources' - ) -> Tuple[List['resources_lib.Resources'], List[str], Optional[None]]: + ) -> 'resources_lib.FeasibleResources': if resources.instance_type is not None: assert resources.is_launchable(), resources resources = resources.copy(accelerators=None) - return ([resources], [], None) + return resources_lib.FeasibleResources([resources], [], None) def _make(instance_list): resource_list = [] @@ -326,9 +326,10 @@ def _make(instance_list): disk_tier=resources.disk_tier) if default_instance_type is None: - return ([], [], None) + return resources_lib.FeasibleResources([], [], None) else: - return (_make([default_instance_type]), [], None) + return resources_lib.FeasibleResources( + _make([default_instance_type]), [], None) assert len(accelerators) == 1, resources @@ -344,9 +345,11 @@ def _make(instance_list): zone=resources.zone, clouds='oci') if instance_list is None: - return ([], fuzzy_candidate_list, None) + return resources_lib.FeasibleResources([], fuzzy_candidate_list, + None) - return (_make(instance_list), fuzzy_candidate_list, None) + return resources_lib.FeasibleResources(_make(instance_list), + fuzzy_candidate_list, None) @classmethod def check_credentials(cls) -> Tuple[bool, Optional[str]]: diff --git a/sky/clouds/paperspace.py b/sky/clouds/paperspace.py index 800413c739b..986b32e0816 100644 --- a/sky/clouds/paperspace.py +++ b/sky/clouds/paperspace.py @@ -196,11 +196,11 @@ def _get_feasible_launchable_resources( self, resources: 'resources_lib.Resources'): """Returns a list of feasible resources for the given resources.""" if resources.use_spot: - return ([], [], None) + return resources_lib.FeasibleResources([], [], None) if resources.instance_type is not None: assert resources.is_launchable(), resources resources = resources.copy(accelerators=None) - return ([resources], [], None) + return resources_lib.FeasibleResources([resources], [], None) def _make(instance_list): resource_list = [] @@ -223,9 +223,10 @@ def _make(instance_list): memory=resources.memory, disk_tier=resources.disk_tier) if default_instance_type is None: - return ([], [], None) + return resources_lib.FeasibleResources([], [], None) else: - return (_make([default_instance_type]), [], None) + return resources_lib.FeasibleResources( + _make([default_instance_type]), [], None) assert len(accelerators) == 1, resources acc, acc_count = list(accelerators.items())[0] @@ -241,8 +242,10 @@ def _make(instance_list): clouds='paperspace', )) if instance_list is None: - return ([], fuzzy_candidate_list, None) - return (_make(instance_list), fuzzy_candidate_list, None) + return resources_lib.FeasibleResources([], fuzzy_candidate_list, + None) + return resources_lib.FeasibleResources(_make(instance_list), + fuzzy_candidate_list, None) @classmethod def check_credentials(cls) -> Tuple[bool, Optional[str]]: diff --git a/sky/clouds/runpod.py b/sky/clouds/runpod.py index 42ff58081ac..075c7b46414 100644 --- a/sky/clouds/runpod.py +++ b/sky/clouds/runpod.py @@ -187,12 +187,12 @@ def make_deploy_resources_variables( def _get_feasible_launchable_resources( self, resources: 'resources_lib.Resources' - ) -> Tuple[List['resources_lib.Resources'], List[str], Optional[str]]: + ) -> 'resources_lib.FeasibleResources': """Returns a list of feasible resources for the given resources.""" if resources.instance_type is not None: assert resources.is_launchable(), resources resources = resources.copy(accelerators=None) - return ([resources], [], None) + return resources_lib.FeasibleResources([resources], [], None) def _make(instance_list): resource_list = [] @@ -215,9 +215,10 @@ def _make(instance_list): memory=resources.memory, disk_tier=resources.disk_tier) if default_instance_type is None: - return ([], [], None) + return resources_lib.FeasibleResources([], [], None) else: - return (_make([default_instance_type]), [], None) + return resources_lib.FeasibleResources( + _make([default_instance_type]), [], None) assert len(accelerators) == 1, resources acc, acc_count = list(accelerators.items())[0] @@ -231,8 +232,10 @@ def _make(instance_list): zone=resources.zone, clouds='runpod') if instance_list is None: - return ([], fuzzy_candidate_list, None) - return (_make(instance_list), fuzzy_candidate_list, None) + return resources_lib.FeasibleResources([], fuzzy_candidate_list, + None) + return resources_lib.FeasibleResources(_make(instance_list), + fuzzy_candidate_list, None) @classmethod def check_credentials(cls) -> Tuple[bool, Optional[str]]: diff --git a/sky/clouds/scp.py b/sky/clouds/scp.py index f9b462f8343..43bb10c94bd 100644 --- a/sky/clouds/scp.py +++ b/sky/clouds/scp.py @@ -251,16 +251,16 @@ def _get_default_ami(cls, region_name: str, instance_type: str) -> str: def _get_feasible_launchable_resources( self, resources: 'resources_lib.Resources' - ) -> Tuple[List['resources_lib.Resources'], List[str], Optional[str]]: + ) -> 'resources_lib.FeasibleResources': # Check if the host VM satisfies the min/max disk size limits. is_allowed = self._is_disk_size_allowed(resources) if not is_allowed: - return ([], [], None) + return resources_lib.FeasibleResources([], [], None) if resources.instance_type is not None: assert resources.is_launchable(), resources # Accelerators are part of the instance type in SCP Cloud resources = resources.copy(accelerators=None) - return ([resources], [], None) + return resources_lib.FeasibleResources([resources], [], None) def _make(instance_list): resource_list = [] @@ -287,9 +287,10 @@ def _make(instance_list): memory=resources.memory, disk_tier=resources.disk_tier) if default_instance_type is None: - return ([], [], None) + return resources_lib.FeasibleResources([], [], None) else: - return (_make([default_instance_type]), [], None) + return resources_lib.FeasibleResources( + _make([default_instance_type]), [], None) assert len(accelerators) == 1, resources acc, acc_count = list(accelerators.items())[0] @@ -304,8 +305,10 @@ def _make(instance_list): zone=resources.zone, clouds='scp') if instance_list is None: - return ([], fuzzy_candidate_list, None) - return (_make(instance_list), fuzzy_candidate_list, None) + return resources_lib.FeasibleResources([], fuzzy_candidate_list, + None) + return resources_lib.FeasibleResources(_make(instance_list), + fuzzy_candidate_list, None) @classmethod def check_credentials(cls) -> Tuple[bool, Optional[str]]: diff --git a/sky/clouds/vsphere.py b/sky/clouds/vsphere.py index 86bb2ee95a0..a9836de95b8 100644 --- a/sky/clouds/vsphere.py +++ b/sky/clouds/vsphere.py @@ -197,11 +197,11 @@ def make_deploy_resources_variables( def _get_feasible_launchable_resources( self, resources: 'resources_lib.Resources'): if resources.use_spot: - return ([], [], None) + return resources_lib.FeasibleResources([], [], None) if resources.instance_type is not None: assert resources.is_launchable(), resources resources = resources.copy(accelerators=None) - return ([resources], [], None) + return resources_lib.FeasibleResources([resources], [], None) def _make(instance_list): resource_list = [] @@ -226,9 +226,10 @@ def _make(instance_list): disk_tier=resources.disk_tier, ) if default_instance_type is None: - return ([], [], None) + return resources_lib.FeasibleResources([], [], None) else: - return (_make([default_instance_type]), [], None) + return resources_lib.FeasibleResources( + _make([default_instance_type]), [], None) assert len(accelerators) == 1, resources acc, acc_count = list(accelerators.items())[0] @@ -246,8 +247,10 @@ def _make(instance_list): clouds=_CLOUD_VSPHERE, ) if instance_list is None: - return ([], fuzzy_candidate_list, None) - return (_make(instance_list), fuzzy_candidate_list, None) + return resources_lib.FeasibleResources([], fuzzy_candidate_list, + None) + return resources_lib.FeasibleResources(_make(instance_list), + fuzzy_candidate_list, None) @classmethod def check_credentials(cls) -> Tuple[bool, Optional[str]]: diff --git a/sky/optimizer.py b/sky/optimizer.py index 7cbfc6b9bf7..bf76c45b859 100644 --- a/sky/optimizer.py +++ b/sky/optimizer.py @@ -1238,12 +1238,11 @@ def _fill_in_launchable_resources( # If clouds provide hints, store them for later printing. hints: Dict[clouds.Cloud, str] = {} for cloud in clouds_list: - (feasible_resources, fuzzy_candidate_list, - hint) = cloud.get_feasible_launchable_resources( - resources, num_nodes=task.num_nodes) - if hint: - hints[cloud] = hint - if len(feasible_resources) > 0: + feasible_resources = cloud.get_feasible_launchable_resources( + resources, num_nodes=task.num_nodes) + if feasible_resources.hint: + hints[cloud] = feasible_resources.hint + if len(feasible_resources.resources_list) > 0: # Assume feasible_resources is sorted by prices. Guaranteed by # the implementation of get_feasible_launchable_resources and # the underlying service_catalog filtering @@ -1253,7 +1252,8 @@ def _fill_in_launchable_resources( _make_launchables_for_valid_region_zones(cheapest)) cloud_candidates[cloud] = feasible_resources else: - all_fuzzy_candidates.update(fuzzy_candidate_list) + all_fuzzy_candidates.update( + feasible_resources.fuzzy_candidate_list) if len(launchable[resources]) == 0: clouds_str = str(clouds_list) if len(clouds_list) > 1 else str( clouds_list[0]) diff --git a/sky/resources.py b/sky/resources.py index f0cb1abda1e..2ee3bd81fce 100644 --- a/sky/resources.py +++ b/sky/resources.py @@ -1567,3 +1567,21 @@ def __setstate__(self, state): '_cluster_config_overrides', None) self.__dict__.update(state) + + +@dataclasses.dataclass +class FeasibleResources: + """Feasible resources returned by cloud. + + Used to represent a collection of feasible resources returned by cloud, + any fuzzy candidates, and optionally a string hint if no feasible resources + are found. + + Fuzzy candidates example: when the requested GPU is A100:1 but is not + available in a cloud/region, the fuzzy candidates are results of a fuzzy + search in the catalog that are offered in the location. E.g., + ['A100-80GB:1', 'A100-80GB:2', 'A100-80GB:4', 'A100:8'] + """ + resources_list: List[Resources] + fuzzy_candidate_list: List[str] + hint: Optional[str]