Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[Azure] Support fractional A10 instance types #3877

Merged
merged 29 commits into from
Oct 26, 2024
Merged
Show file tree
Hide file tree
Changes from 8 commits
Commits
Show all changes
29 commits
Select commit Hold shift + click to select a range
de35be4
fix
cblmemo Aug 26, 2024
39d6c15
change catalog to float gpu num
cblmemo Aug 27, 2024
7324504
support print float point gpu in sky launch. TODO: test if the ray de…
cblmemo Aug 27, 2024
347ad62
fix unittest
cblmemo Aug 27, 2024
71af06e
format
cblmemo Aug 27, 2024
d419442
patch ray resources to ceil value
cblmemo Aug 27, 2024
f529689
support launch from --gpus A10
cblmemo Aug 28, 2024
2031a50
only allow strictly match fractional gpu counts
cblmemo Aug 28, 2024
07e47d6
address comment
cblmemo Sep 3, 2024
639c686
Merge remote-tracking branch 'origin/master' into support-fractional-a10
cblmemo Sep 3, 2024
4c45ff7
Merge remote-tracking branch 'origin/master' into support-fractional-a10
cblmemo Sep 6, 2024
84d6d0d
change back condition
cblmemo Sep 7, 2024
eca7033
fix
cblmemo Sep 7, 2024
0055fc1
apply suggestions from code review
cblmemo Sep 11, 2024
9652119
fix
cblmemo Sep 11, 2024
a5c5b15
Update sky/backends/cloud_vm_ray_backend.py
cblmemo Sep 11, 2024
d2cff96
format
cblmemo Sep 11, 2024
e8e9954
fix display of fuzzy candidates
cblmemo Sep 11, 2024
db607fa
fix precision issue
cblmemo Sep 12, 2024
e98ecdc
fix num gpu required
cblmemo Sep 12, 2024
8ada7a2
refactor in check_resources_fit_cluster
cblmemo Oct 11, 2024
f6c9fad
change type annotation of acc_count
cblmemo Oct 11, 2024
a1f59a0
enable fuzzy fp acc count
cblmemo Oct 11, 2024
bcbf5ec
Merge remote-tracking branch 'origin/master' into support-fractional-a10
cblmemo Oct 11, 2024
3200d39
fix k8s
cblmemo Oct 11, 2024
6e41da5
Merge remote-tracking branch 'origin/master' into support-fractional-a10
cblmemo Oct 25, 2024
fb3049f
Update sky/clouds/service_catalog/common.py
cblmemo Oct 25, 2024
82d442f
fix integer gpus
cblmemo Oct 25, 2024
84d146c
format
cblmemo Oct 25, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 18 additions & 0 deletions sky/backends/cloud_vm_ray_backend.py
Original file line number Diff line number Diff line change
Expand Up @@ -2663,6 +2663,24 @@ def check_resources_fit_cluster(
'stores, but the existing cluster with '
f'{launched_resources!r} does not support FUSE '
f'mounting. Launch a new cluster to run this task.')
if (example_resource.accelerators is not None and
launched_resources.accelerators is not None):
for acc in example_resource.accelerators:
if acc not in launched_resources.accelerators:
continue
self_count = example_resource.accelerators[acc]
existing_count = launched_resources.accelerators[acc]
if (isinstance(self_count, float) and
isinstance(existing_count, float) and
not math.isclose(self_count, existing_count)):
with ux_utils.print_exception_no_traceback():
raise exceptions.ResourcesMismatchError(
'Task requested resources with fractional '
'accelerator counts. For fractional '
'counts, the required count must match the '
'existing cluster. Got required accelerator'
f' {acc}:{self_count} but the existing '
f'cluster has {acc}:{existing_count}.')
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is this error message accurate? The check compares against the accelerator count of the existing cluster, not only the task's requested resources.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Please see the above comments 🤔

requested_resource_str = ', '.join(requested_resource_list)
if isinstance(task.resources, list):
requested_resource_str = f'[{requested_resource_str}]'
Expand Down
11 changes: 4 additions & 7 deletions sky/clouds/aws.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,13 +2,12 @@
import enum
import fnmatch
import functools
import json
import os
import re
import subprocess
import time
import typing
from typing import Any, Dict, Iterator, List, Optional, Set, Tuple
from typing import Any, Dict, Iterator, List, Optional, Set, Tuple, Union

from sky import clouds
from sky import exceptions
Expand Down Expand Up @@ -366,7 +365,7 @@ def get_default_instance_type(
def get_accelerators_from_instance_type(
cls,
instance_type: str,
) -> Optional[Dict[str, int]]:
) -> Optional[Dict[str, Union[int, float]]]:
return service_catalog.get_accelerators_from_instance_type(
instance_type, clouds='aws')

Expand Down Expand Up @@ -394,10 +393,8 @@ def make_deploy_resources_variables(
r = resources
# r.accelerators is cleared but .instance_type encodes the info.
acc_dict = self.get_accelerators_from_instance_type(r.instance_type)
if acc_dict is not None:
custom_resources = json.dumps(acc_dict, separators=(',', ':'))
else:
custom_resources = None
custom_resources = resources_utils.make_ray_custom_resources_str(
acc_dict)

if r.extract_docker_image() is not None:
image_id_to_use = None
Expand Down
8 changes: 4 additions & 4 deletions sky/clouds/azure.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,11 @@
"""Azure."""
import functools
import json
import os
import re
import subprocess
import textwrap
import typing
from typing import Any, Dict, Iterator, List, Optional, Tuple
from typing import Any, Dict, Iterator, List, Optional, Tuple, Union

import colorama

Expand Down Expand Up @@ -252,7 +251,7 @@ def zones_provision_loop(
def get_accelerators_from_instance_type(
cls,
instance_type: str,
) -> Optional[Dict[str, int]]:
) -> Optional[Dict[str, Union[int, float]]]:
Michaelvll marked this conversation as resolved.
Show resolved Hide resolved
return service_catalog.get_accelerators_from_instance_type(
instance_type, clouds='azure')

Expand Down Expand Up @@ -284,7 +283,8 @@ def make_deploy_resources_variables(
acc_dict = self.get_accelerators_from_instance_type(r.instance_type)
acc_count = None
if acc_dict is not None:
custom_resources = json.dumps(acc_dict, separators=(',', ':'))
custom_resources = resources_utils.make_ray_custom_resources_str(
acc_dict)
acc_count = str(sum(acc_dict.values()))
else:
custom_resources = None
Expand Down
18 changes: 11 additions & 7 deletions sky/clouds/cloud.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,8 +9,9 @@
"""
import collections
import enum
import math
import typing
from typing import Dict, Iterable, Iterator, List, Optional, Set, Tuple
from typing import Dict, Iterable, Iterator, List, Optional, Set, Tuple, Union

from sky import exceptions
from sky import skypilot_config
Expand Down Expand Up @@ -306,7 +307,7 @@ def get_vcpus_mem_from_instance_type(
def get_accelerators_from_instance_type(
cls,
instance_type: str,
) -> Optional[Dict[str, int]]:
) -> Optional[Dict[str, Union[int, float]]]:
"""Returns {acc: acc_count} held by 'instance_type', if any."""
raise NotImplementedError

Expand Down Expand Up @@ -644,8 +645,9 @@ def _check_instance_type_accelerators_combination(
assert resources.is_launchable(), resources

def _equal_accelerators(
acc_requested: Optional[Dict[str, int]],
acc_from_instance_type: Optional[Dict[str, int]]) -> bool:
acc_requested: Optional[Dict[str, Union[int, float]]],
acc_from_instance_type: Optional[Dict[str, Union[int,
float]]]) -> bool:
"""Check the requested accelerators equals to the instance type

Check the requested accelerators equals to the accelerators
Expand All @@ -660,12 +662,14 @@ def _equal_accelerators(
for acc in acc_requested:
if acc not in acc_from_instance_type:
return False
if acc_requested[acc] != acc_from_instance_type[acc]:
# Compare with math.isclose to avoid floating-point precision issues.
if not math.isclose(acc_requested[acc],
acc_from_instance_type[acc]):
return False
return True

acc_from_instance_type = (cls.get_accelerators_from_instance_type(
resources.instance_type))
acc_from_instance_type = cls.get_accelerators_from_instance_type(
resources.instance_type)
if not _equal_accelerators(resources.accelerators,
acc_from_instance_type):
with ux_utils.print_exception_no_traceback():
Expand Down
11 changes: 4 additions & 7 deletions sky/clouds/cudo.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,7 @@
"""Cudo Compute"""
import json
import subprocess
import typing
from typing import Dict, Iterator, List, Optional, Tuple
from typing import Dict, Iterator, List, Optional, Tuple, Union

from sky import clouds
from sky.clouds import service_catalog
Expand Down Expand Up @@ -183,7 +182,7 @@ def get_default_instance_type(
def get_accelerators_from_instance_type(
cls,
instance_type: str,
) -> Optional[Dict[str, int]]:
) -> Optional[Dict[str, Union[int, float]]]:
return service_catalog.get_accelerators_from_instance_type(
instance_type, clouds='cudo')

Expand All @@ -202,10 +201,8 @@ def make_deploy_resources_variables(
del zones, cluster_name # unused
r = resources
acc_dict = self.get_accelerators_from_instance_type(r.instance_type)
if acc_dict is not None:
custom_resources = json.dumps(acc_dict, separators=(',', ':'))
else:
custom_resources = None
custom_resources = resources_utils.make_ray_custom_resources_str(
acc_dict)

return {
'instance_type': resources.instance_type,
Expand Down
11 changes: 4 additions & 7 deletions sky/clouds/fluidstack.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,7 @@
"""Fluidstack Cloud."""
import json
import os
import typing
from typing import Dict, Iterator, List, Optional, Tuple
from typing import Dict, Iterator, List, Optional, Tuple, Union

import requests

Expand Down Expand Up @@ -155,7 +154,7 @@ def get_default_instance_type(
def get_accelerators_from_instance_type(
cls,
instance_type: str,
) -> Optional[Dict[str, int]]:
) -> Optional[Dict[str, Union[int, float]]]:
return service_catalog.get_accelerators_from_instance_type(
instance_type, clouds='fluidstack')

Expand Down Expand Up @@ -184,10 +183,8 @@ def make_deploy_resources_variables(

r = resources
acc_dict = self.get_accelerators_from_instance_type(r.instance_type)
if acc_dict is not None:
custom_resources = json.dumps(acc_dict, separators=(',', ':'))
else:
custom_resources = None
custom_resources = resources_utils.make_ray_custom_resources_str(
acc_dict)

return {
'instance_type': resources.instance_type,
Expand Down
4 changes: 2 additions & 2 deletions sky/clouds/gcp.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
import subprocess
import time
import typing
from typing import Dict, Iterator, List, Optional, Set, Tuple
from typing import Dict, Iterator, List, Optional, Set, Tuple, Union

import colorama

Expand Down Expand Up @@ -631,7 +631,7 @@ def _get_feasible_launchable_resources(
def get_accelerators_from_instance_type(
cls,
instance_type: str,
) -> Optional[Dict[str, int]]:
) -> Optional[Dict[str, Union[int, float]]]:
# GCP handles accelerators separately from regular instance types,
# hence return none here.
return None
Expand Down
11 changes: 4 additions & 7 deletions sky/clouds/ibm.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,7 @@
"""IBM Cloud."""
import json
import os
import typing
from typing import Any, Dict, Iterator, List, Optional, Tuple
from typing import Any, Dict, Iterator, List, Optional, Tuple, Union

import colorama

Expand Down Expand Up @@ -206,10 +205,8 @@ def _get_profile_resources(instance_profile):
'IBM does not currently support spot instances in this framework'

acc_dict = self.get_accelerators_from_instance_type(r.instance_type)
if acc_dict is not None:
custom_resources = json.dumps(acc_dict, separators=(',', ':'))
else:
custom_resources = None
custom_resources = resources_utils.make_ray_custom_resources_str(
acc_dict)

instance_resources = _get_profile_resources(r.instance_type)

Expand Down Expand Up @@ -247,7 +244,7 @@ def get_vcpus_mem_from_instance_type(
def get_accelerators_from_instance_type(
cls,
instance_type: str,
) -> Optional[Dict[str, int]]:
) -> Optional[Dict[str, Union[int, float]]]:
"""Returns {acc: acc_count} held by 'instance_type', if any."""
return service_catalog.get_accelerators_from_instance_type(
instance_type, clouds='ibm')
Expand Down
11 changes: 4 additions & 7 deletions sky/clouds/kubernetes.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,8 @@
"""Kubernetes."""
import json
import os
import re
import typing
from typing import Dict, Iterator, List, Optional, Tuple
from typing import Dict, Iterator, List, Optional, Tuple, Union

from sky import clouds
from sky import sky_logging
Expand Down Expand Up @@ -180,7 +179,7 @@ def get_default_instance_type(
def get_accelerators_from_instance_type(
cls,
instance_type: str,
) -> Optional[Dict[str, int]]:
) -> Optional[Dict[str, Union[int, float]]]:
inst = kubernetes_utils.KubernetesInstanceType.from_instance_type(
instance_type)
return {
Expand Down Expand Up @@ -234,10 +233,8 @@ def make_deploy_resources_variables(

r = resources
acc_dict = self.get_accelerators_from_instance_type(r.instance_type)
if acc_dict is not None:
custom_resources = json.dumps(acc_dict, separators=(',', ':'))
else:
custom_resources = None
custom_resources = resources_utils.make_ray_custom_resources_str(
acc_dict)

# resources.memory and cpus are None if they are not explicitly set.
# We fetch the default values for the instance type in that case.
Expand Down
11 changes: 4 additions & 7 deletions sky/clouds/lambda_cloud.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
"""Lambda Cloud."""
import json
import typing
from typing import Dict, Iterator, List, Optional, Tuple
from typing import Dict, Iterator, List, Optional, Tuple, Union

import requests

Expand Down Expand Up @@ -137,7 +136,7 @@ def get_default_instance_type(
def get_accelerators_from_instance_type(
cls,
instance_type: str,
) -> Optional[Dict[str, int]]:
) -> Optional[Dict[str, Union[int, float]]]:
return service_catalog.get_accelerators_from_instance_type(
instance_type, clouds='lambda')

Expand Down Expand Up @@ -165,10 +164,8 @@ def make_deploy_resources_variables(

r = resources
acc_dict = self.get_accelerators_from_instance_type(r.instance_type)
if acc_dict is not None:
custom_resources = json.dumps(acc_dict, separators=(',', ':'))
else:
custom_resources = None
custom_resources = resources_utils.make_ray_custom_resources_str(
acc_dict)

return {
'instance_type': resources.instance_type,
Expand Down
11 changes: 4 additions & 7 deletions sky/clouds/oci.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,11 +5,10 @@
- Hysun He ([email protected]) @ May 4, 2023: Support using the default
image_id (configurable) if no image_id is specified in the task YAML.
"""
import json
import logging
import os
import typing
from typing import Dict, Iterator, List, Optional, Tuple
from typing import Dict, Iterator, List, Optional, Tuple, Union

from sky import clouds
from sky import exceptions
Expand Down Expand Up @@ -176,7 +175,7 @@ def get_default_instance_type(
def get_accelerators_from_instance_type(
cls,
instance_type: str,
) -> Optional[Dict[str, int]]:
) -> Optional[Dict[str, Union[int, float]]]:
return service_catalog.get_accelerators_from_instance_type(
instance_type, clouds='oci')

Expand All @@ -196,10 +195,8 @@ def make_deploy_resources_variables(

acc_dict = self.get_accelerators_from_instance_type(
resources.instance_type)
if acc_dict is not None:
custom_resources = json.dumps(acc_dict, separators=(',', ':'))
else:
custom_resources = None
custom_resources = resources_utils.make_ray_custom_resources_str(
acc_dict)

image_str = self._get_image_id(resources.image_id, region.name,
resources.instance_type)
Expand Down
11 changes: 4 additions & 7 deletions sky/clouds/paperspace.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,7 @@
""" Paperspace Cloud. """

import json
import typing
from typing import Dict, Iterator, List, Optional, Tuple
from typing import Dict, Iterator, List, Optional, Tuple, Union

import requests

Expand Down Expand Up @@ -162,7 +161,7 @@ def get_default_instance_type(

@classmethod
def get_accelerators_from_instance_type(
cls, instance_type: str) -> Optional[Dict[str, int]]:
cls, instance_type: str) -> Optional[Dict[str, Union[int, float]]]:
return service_catalog.get_accelerators_from_instance_type(
instance_type, clouds='paperspace')

Expand All @@ -181,10 +180,8 @@ def make_deploy_resources_variables(

r = resources
acc_dict = self.get_accelerators_from_instance_type(r.instance_type)
if acc_dict is not None:
custom_resources = json.dumps(acc_dict, separators=(',', ':'))
else:
custom_resources = None
custom_resources = resources_utils.make_ray_custom_resources_str(
acc_dict)

return {
'instance_type': resources.instance_type,
Expand Down
Loading
Loading