Skip to content

Commit

Permalink
wip
Browse files Browse the repository at this point in the history
  • Loading branch information
romilbhardwaj committed Jul 18, 2024
1 parent 4da7b75 commit a2df6fc
Show file tree
Hide file tree
Showing 16 changed files with 170 additions and 109 deletions.
17 changes: 10 additions & 7 deletions sky/clouds/aws.py
Original file line number Diff line number Diff line change
Expand Up @@ -441,7 +441,7 @@ def make_deploy_resources_variables(

def _get_feasible_launchable_resources(
self, resources: 'resources_lib.Resources'
) -> Tuple[List['resources_lib.Resources'], List[str], Optional[str]]:
) -> 'resources_lib.FeasibleResources':
if resources.instance_type is not None:
assert resources.is_launchable(), resources
# Check the instance type is valid in the cloud
Expand All @@ -452,10 +452,10 @@ def _get_feasible_launchable_resources(
region=resources.region,
zone=resources.zone)
if not regions:
return ([], [], None)
return resources_lib.FeasibleResources([], [], None)
# Treat Resources(AWS, p3.2x, V100) as Resources(AWS, p3.2x).
resources = resources.copy(accelerators=None)
return ([resources], [], None)
return resources_lib.FeasibleResources([resources], [], None)

def _make(instance_list):
resource_list = []
Expand All @@ -481,9 +481,10 @@ def _make(instance_list):
memory=resources.memory,
disk_tier=resources.disk_tier)
if default_instance_type is None:
return ([], [], None)
return resources_lib.FeasibleResources([], [], None)
else:
return (_make([default_instance_type]), [], None)
return resources_lib.FeasibleResources(
_make([default_instance_type]), [], None)

assert len(accelerators) == 1, resources
acc, acc_count = list(accelerators.items())[0]
Expand All @@ -498,8 +499,10 @@ def _make(instance_list):
zone=resources.zone,
clouds='aws')
if instance_list is None:
return ([], fuzzy_candidate_list, None)
return (_make(instance_list), fuzzy_candidate_list, None)
return resources_lib.FeasibleResources([], fuzzy_candidate_list,
None)
return resources_lib.FeasibleResources(_make(instance_list),
fuzzy_candidate_list, None)

@classmethod
@functools.lru_cache(maxsize=1) # Cache since getting identity is slow.
Expand Down
18 changes: 10 additions & 8 deletions sky/clouds/azure.py
Original file line number Diff line number Diff line change
Expand Up @@ -377,18 +377,18 @@ def _failover_disk_tier() -> Optional[resources_utils.DiskTier]:
}

def _get_feasible_launchable_resources(
self, resources: 'resources.Resources'
) -> Tuple[List['resources.Resources'], List[str], Optional[str]]:
self,
resources: 'resources.Resources') -> 'resources.FeasibleResources':
if resources.instance_type is not None:
assert resources.is_launchable(), resources
ok, _ = Azure.check_disk_tier(resources.instance_type,
resources.disk_tier)
if not ok:
return ([], [], None)
return resources.FeasibleResources([], [], None)
# Treat Resources(Azure, Standard_NC4as_T4_v3, T4) as
# Resources(Azure, Standard_NC4as_T4_v3).
resources = resources.copy(accelerators=None)
return ([resources], [], None)
return resources.FeasibleResources([resources], [], None)

def _make(instance_list):
resource_list = []
Expand Down Expand Up @@ -418,9 +418,10 @@ def _make(instance_list):
memory=resources.memory,
disk_tier=resources.disk_tier)
if default_instance_type is None:
return ([], [], None)
return resources.FeasibleResources([], [], None)
else:
return (_make([default_instance_type]), [], None)
return resources.FeasibleResources(
_make([default_instance_type]), [], None)

assert len(accelerators) == 1, resources
acc, acc_count = list(accelerators.items())[0]
Expand All @@ -435,8 +436,9 @@ def _make(instance_list):
zone=resources.zone,
clouds='azure')
if instance_list is None:
return ([], fuzzy_candidate_list, None)
return (_make(instance_list), fuzzy_candidate_list, None)
return resources.FeasibleResources([], fuzzy_candidate_list, None)
return resources.FeasibleResources(_make(instance_list),
fuzzy_candidate_list, None)

@classmethod
def check_credentials(cls) -> Tuple[bool, Optional[str]]:
Expand Down
27 changes: 15 additions & 12 deletions sky/clouds/cloud.py
Original file line number Diff line number Diff line change
Expand Up @@ -341,25 +341,26 @@ def is_label_valid(cls, label_key: str,
return True, None

def get_feasible_launchable_resources(
self,
resources: 'resources_lib.Resources',
num_nodes: int = 1
) -> Tuple[List['resources_lib.Resources'], List[str], Optional[str]]:
"""Returns ([feasible & launchable resources], [fuzzy candidates], hint)
self,
resources: 'resources_lib.Resources',
num_nodes: int = 1) -> 'resources_lib.FeasibleResources':
"""Returns FeasibleResources for the given resources.
Feasible resources refer to an offering respecting the resource
requirements. Currently, this function implements "filtering" the
cloud's offerings only w.r.t. accelerators constraints.
Launchable resources require a cloud and an instance type be assigned.
The cloud may optionally return a string hint to the user if no feasible
resources are found.
The returned dataclass object FeasibleResources contains three fields:
Fuzzy candidates example: when the requested GPU is A100:1 but is not
available in a cloud/region, the fuzzy candidates are results of a fuzzy
search in the catalog that are offered in the location. E.g.,
['A100-80GB:1', 'A100-80GB:2', 'A100-80GB:4', 'A100:8']
- resources_list: a list of resources that are feasible to launch
- fuzzy_candidate_list: a list of accelerator strings that loosely match
the requested accelerators. For example, when an A100:1 GPU is requested
but is not available in a cloud/region, the fuzzy candidates are the
results of a fuzzy search in the catalog that are offered in the
location, such as
['A100-80GB:1', 'A100-80GB:2', 'A100-80GB:4', 'A100:8']
- hint: an optional string hint if no feasible resources are found.
"""
if resources.is_launchable():
self._check_instance_type_accelerators_combination(resources)
Expand All @@ -375,7 +376,9 @@ def get_feasible_launchable_resources(
# TODO(zhwu): The resources are now silently filtered out. We
# should have some logging telling the user why the resources
# are not considered.
return ([], [], None)
return resources_lib.FeasibleResources(resources_list=[],
fuzzy_candidate_list=[],
hint=None)
return self._get_feasible_launchable_resources(resources)

def _get_feasible_launchable_resources(
Expand Down
18 changes: 11 additions & 7 deletions sky/clouds/cudo.py
Original file line number Diff line number Diff line change
Expand Up @@ -214,13 +214,14 @@ def make_deploy_resources_variables(
}

def _get_feasible_launchable_resources(
self, resources: 'resources_lib.Resources'):
self, resources: 'resources_lib.Resources'
) -> 'resources_lib.FeasibleResources':
if resources.use_spot:
return ([], [], None)
return resources_lib.FeasibleResources([], [], None)
if resources.instance_type is not None:
assert resources.is_launchable(), resources
resources = resources.copy(accelerators=None)
return ([resources], [], None)
return resources_lib.FeasibleResources([resources], [], None)

def _make(instance_list):
resource_list = []
Expand All @@ -243,9 +244,10 @@ def _make(instance_list):
memory=resources.memory,
disk_tier=resources.disk_tier)
if default_instance_type is None:
return ([], [], None)
return resources_lib.FeasibleResources([], [], None)
else:
return (_make([default_instance_type]), [], None)
return resources_lib.FeasibleResources(
_make([default_instance_type]), [], None)

assert len(accelerators) == 1, resources
acc, acc_count = list(accelerators.items())[0]
Expand All @@ -260,8 +262,10 @@ def _make(instance_list):
zone=resources.zone,
clouds='cudo')
if instance_list is None:
return ([], fuzzy_candidate_list, None)
return (_make(instance_list), fuzzy_candidate_list, None)
return resources_lib.FeasibleResources([], fuzzy_candidate_list,
None)
return resources_lib.FeasibleResources(_make(instance_list),
fuzzy_candidate_list, None)

@classmethod
def check_credentials(cls) -> Tuple[bool, Optional[str]]:
Expand Down
13 changes: 8 additions & 5 deletions sky/clouds/fluidstack.py
Original file line number Diff line number Diff line change
Expand Up @@ -211,7 +211,7 @@ def _get_feasible_launchable_resources(
assert resources.is_launchable(), resources
# Accelerators are part of the instance type in Fluidstack Cloud
resources = resources.copy(accelerators=None)
return ([resources], [], None)
return resources_lib.FeasibleResources([resources], [], None)

def _make(instance_list):
resource_list = []
Expand Down Expand Up @@ -239,9 +239,10 @@ def _make(instance_list):
memory=resources.memory,
disk_tier=resources.disk_tier)
if default_instance_type is None:
return ([], [], None)
return resources_lib.FeasibleResources([], [], None)
else:
return (_make([default_instance_type]), [], None)
return resources_lib.FeasibleResources(
_make([default_instance_type]), [], None)

assert len(accelerators) == 1, resources
acc, acc_count = list(accelerators.items())[0]
Expand All @@ -256,8 +257,10 @@ def _make(instance_list):
zone=resources.zone,
clouds='fluidstack')
if instance_list is None:
return ([], fuzzy_candidate_list, None)
return (_make(instance_list), fuzzy_candidate_list, None)
return resources_lib.FeasibleResources([], fuzzy_candidate_list,
None)
return resources_lib.FeasibleResources(_make(instance_list),
fuzzy_candidate_list, None)

@classmethod
def check_credentials(cls) -> Tuple[bool, Optional[str]]:
Expand Down
30 changes: 19 additions & 11 deletions sky/clouds/gcp.py
Original file line number Diff line number Diff line change
Expand Up @@ -525,11 +525,11 @@ def make_deploy_resources_variables(
return resources_vars

def _get_feasible_launchable_resources(
self, resources: 'resources.Resources'
) -> Tuple[List['resources.Resources'], List[str], Optional[str]]:
self,
resources: 'resources.Resources') -> 'resources.FeasibleResources':
if resources.instance_type is not None:
assert resources.is_launchable(), resources
return ([resources], [], None)
return resources.FeasibleResources([resources], [], None)

if resources.accelerators is None:
# Return a default instance type with the given number of vCPUs.
Expand All @@ -538,7 +538,7 @@ def _get_feasible_launchable_resources(
memory=resources.memory,
disk_tier=resources.disk_tier)
if host_vm_type is None:
return ([], [], None)
return resources.FeasibleResources([], [], None)
else:
r = resources.copy(
cloud=GCP(),
Expand All @@ -547,7 +547,7 @@ def _get_feasible_launchable_resources(
cpus=None,
memory=None,
)
return ([r], [], None)
return resources.FeasibleResources([r], [], None)

# Find instance candidates to meet user's requirements
assert len(resources.accelerators.items()
Expand All @@ -569,7 +569,7 @@ def _get_feasible_launchable_resources(
clouds='gcp')

if instance_list is None:
return ([], fuzzy_candidate_list, None)
return resources.FeasibleResources([], fuzzy_candidate_list, None)
assert len(
instance_list
) == 1, f'More than one instance type matched, {instance_list}'
Expand All @@ -584,11 +584,15 @@ def _get_feasible_launchable_resources(
if resources.cpus.endswith('+'):
cpus = float(resources.cpus[:-1])
if cpus > num_cpus_in_tpu_vm:
return ([], fuzzy_candidate_list, None)
return resources.FeasibleResources([],
fuzzy_candidate_list,
None)
else:
cpus = float(resources.cpus)
if cpus != num_cpus_in_tpu_vm:
return ([], fuzzy_candidate_list, None)
return resources.FeasibleResources([],
fuzzy_candidate_list,
None)
# FIXME(woosuk, wei-lin): This leverages the fact that TPU VMs
# have 334 GB RAM, and 400 GB RAM for tpu-v4. We need to move
# this to service catalog, instead.
Expand All @@ -597,11 +601,15 @@ def _get_feasible_launchable_resources(
if resources.memory.endswith('+'):
memory = float(resources.memory[:-1])
if memory > memory_in_tpu_vm:
return ([], fuzzy_candidate_list, None)
return resources.FeasibleResources([],
fuzzy_candidate_list,
None)
else:
memory = float(resources.memory)
if memory != memory_in_tpu_vm:
return ([], fuzzy_candidate_list, None)
return resources.FeasibleResources([],
fuzzy_candidate_list,
None)
else:
host_vm_type = instance_list[0]

Expand All @@ -613,7 +621,7 @@ def _get_feasible_launchable_resources(
cpus=None,
memory=None,
)
return ([r], fuzzy_candidate_list, None)
return resources.FeasibleResources([r], fuzzy_candidate_list, None)

@classmethod
def get_accelerators_from_instance_type(
Expand Down
16 changes: 10 additions & 6 deletions sky/clouds/ibm.py
Original file line number Diff line number Diff line change
Expand Up @@ -266,12 +266,13 @@ def get_default_instance_type(

def _get_feasible_launchable_resources(
self, resources: 'resources_lib.Resources'
) -> Tuple[List['resources_lib.Resources'], List[str], Optional[str]]:
) -> 'resources_lib.FeasibleResources':
fuzzy_candidate_list: List[str] = []
if resources.instance_type is not None:
assert resources.is_launchable(), resources
resources = resources.copy(accelerators=None)
return ([resources], fuzzy_candidate_list, None)
return resources_lib.FeasibleResources([resources],
fuzzy_candidate_list, None)

def _make(instance_list):
resource_list = []
Expand All @@ -296,9 +297,10 @@ def _make(instance_list):
memory=resources.memory,
disk_tier=resources.disk_tier)
if default_instance_type is None:
return ([], [], None)
return resources_lib.FeasibleResources([], [], None)
else:
return (_make([default_instance_type]), [], None)
return resources_lib.FeasibleResources(
_make([default_instance_type]), [], None)

assert len(accelerators) == 1, resources
acc, acc_count = list(accelerators.items())[0]
Expand All @@ -312,8 +314,10 @@ def _make(instance_list):
zone=resources.zone,
clouds='ibm')
if instance_list is None:
return ([], fuzzy_candidate_list, None)
return (_make(instance_list), fuzzy_candidate_list, None)
return resources_lib.FeasibleResources([], fuzzy_candidate_list,
None)
return resources_lib.FeasibleResources(_make(instance_list),
fuzzy_candidate_list, None)

@classmethod
def get_default_image(cls, region) -> str:
Expand Down
Loading

0 comments on commit a2df6fc

Please sign in to comment.