From de35be47ba3d451adc163e318f76dfadaf91c115 Mon Sep 17 00:00:00 2001 From: cblmemo Date: Mon, 26 Aug 2024 15:22:33 -0700 Subject: [PATCH 01/25] fix --- sky/clouds/service_catalog/azure_catalog.py | 16 +++++++++++++++- .../service_catalog/data_fetchers/fetch_azure.py | 13 ------------- 2 files changed, 15 insertions(+), 14 deletions(-) diff --git a/sky/clouds/service_catalog/azure_catalog.py b/sky/clouds/service_catalog/azure_catalog.py index 141b356712e..08d400890cb 100644 --- a/sky/clouds/service_catalog/azure_catalog.py +++ b/sky/clouds/service_catalog/azure_catalog.py @@ -42,6 +42,15 @@ _DEFAULT_NUM_VCPUS = 8 _DEFAULT_MEMORY_CPU_RATIO = 4 +# Some A10 instance types only contains a fractional of GPU. We temporarily +# filter them out here to avoid using it as a whole A10 GPU. +# TODO(zhwu,tian): support fractional GPUs, which can be done on +# kubernetes as well. +# Ref: https://learn.microsoft.com/en-us/azure/virtual-machines/nva10v5-series +_FILTERED_A10_INSTANCE_TYPES = [ + f'Standard_NV{vcpu}ads_A10_v5' for vcpu in [6, 12, 18] +] + def instance_type_exists(instance_type: str) -> bool: return common.instance_type_exists_impl(_df, instance_type) @@ -138,7 +147,12 @@ def get_instance_type_for_accelerator( if zone is not None: with ux_utils.print_exception_no_traceback(): raise ValueError('Azure does not support zones.') - return common.get_instance_type_for_accelerator_impl(df=_df, + + # Filter out instance types that only contain a fractional of GPU. + df_filtered = _df.loc[~_df['InstanceType'].isin(_FILTERED_A10_INSTANCE_TYPES + )] + + return common.get_instance_type_for_accelerator_impl(df=df_filtered, acc_name=acc_name, acc_count=acc_count, cpus=cpus, diff --git a/sky/clouds/service_catalog/data_fetchers/fetch_azure.py b/sky/clouds/service_catalog/data_fetchers/fetch_azure.py index bbd337e23aa..615a0de350f 100644 --- a/sky/clouds/service_catalog/data_fetchers/fetch_azure.py +++ b/sky/clouds/service_catalog/data_fetchers/fetch_azure.py @@ -93,15 +93,6 @@ def get_regions() -> List[str]: # We have to manually remove it. DEPRECATED_FAMILIES = ['standardNVSv2Family'] -# Some A10 instance types only contains a fractional of GPU. We temporarily -# filter them out here to avoid using it as a whole A10 GPU. -# TODO(zhwu,tian): support fractional GPUs, which can be done on -# kubernetes as well. -# Ref: https://learn.microsoft.com/en-us/azure/virtual-machines/nva10v5-series -FILTERED_A10_INSTANCE_TYPES = [ - f'Standard_NV{vcpu}ads_A10_v5' for vcpu in [6, 12, 18] -] - USEFUL_COLUMNS = [ 'InstanceType', 'AcceleratorName', 'AcceleratorCount', 'vCPUs', 'MemoryGiB', 'GpuInfo', 'Price', 'SpotPrice', 'Region', 'Generation' @@ -299,10 +290,6 @@ def get_additional_columns(row): after_drop_len = len(df_ret) print(f'Dropped {before_drop_len - after_drop_len} duplicated rows') - # Filter out instance types that only contain a fractional of GPU. - df_ret = df_ret.loc[~df_ret['InstanceType'].isin(FILTERED_A10_INSTANCE_TYPES - )] - # Filter out deprecated families df_ret = df_ret.loc[~df_ret['family'].isin(DEPRECATED_FAMILIES)] df_ret = df_ret[USEFUL_COLUMNS] From 39d6c15fd4d4cf4038692c9b0f62db29721bfa39 Mon Sep 17 00:00:00 2001 From: cblmemo Date: Tue, 27 Aug 2024 14:06:26 -0700 Subject: [PATCH 02/25] change catalog to float gpu num --- sky/clouds/service_catalog/azure_catalog.py | 6 +++--- sky/clouds/service_catalog/constants.py | 5 +++++ .../service_catalog/data_fetchers/fetch_azure.py | 12 ++++++++++++ 3 files changed, 20 insertions(+), 3 deletions(-) diff --git a/sky/clouds/service_catalog/azure_catalog.py b/sky/clouds/service_catalog/azure_catalog.py index 08d400890cb..52dcb7cfde7 100644 --- a/sky/clouds/service_catalog/azure_catalog.py +++ b/sky/clouds/service_catalog/azure_catalog.py @@ -9,6 +9,7 @@ from sky import clouds as cloud_lib from sky.clouds import Azure from sky.clouds.service_catalog import common +from sky.clouds.service_catalog import constants from sky.utils import resources_utils from sky.utils import ux_utils @@ -47,9 +48,8 @@ # TODO(zhwu,tian): support fractional GPUs, which can be done on # kubernetes as well. # Ref: https://learn.microsoft.com/en-us/azure/virtual-machines/nva10v5-series -_FILTERED_A10_INSTANCE_TYPES = [ - f'Standard_NV{vcpu}ads_A10_v5' for vcpu in [6, 12, 18] -] +_FILTERED_A10_INSTANCE_TYPES = list( + constants.AZURE_FRACTIONAL_A10_INS_TYPE_TO_NUM_GPUS.keys()) def instance_type_exists(instance_type: str) -> bool: diff --git a/sky/clouds/service_catalog/constants.py b/sky/clouds/service_catalog/constants.py index 1373fd86a03..9fc6e8477f1 100644 --- a/sky/clouds/service_catalog/constants.py +++ b/sky/clouds/service_catalog/constants.py @@ -5,3 +5,8 @@ ALL_CLOUDS = ('aws', 'azure', 'gcp', 'ibm', 'lambda', 'scp', 'oci', 'kubernetes', 'runpod', 'vsphere', 'cudo', 'fluidstack', 'paperspace') +# Azure has those fractional A10 instance types, which still shows has 1 A10 GPU +# in the API response. We manually changing the number of GPUs to a float here. +AZURE_FRACTIONAL_A10_INS_TYPE_TO_NUM_GPUS = { + f'Standard_NV{vcpu}ads_A10_v5': vcpu / 24 for vcpu in [6, 12, 18] +} diff --git a/sky/clouds/service_catalog/data_fetchers/fetch_azure.py b/sky/clouds/service_catalog/data_fetchers/fetch_azure.py index 615a0de350f..82c7396e3e6 100644 --- a/sky/clouds/service_catalog/data_fetchers/fetch_azure.py +++ b/sky/clouds/service_catalog/data_fetchers/fetch_azure.py @@ -15,6 +15,7 @@ import requests from sky.adaptors import common as adaptors_common +from sky.clouds.service_catalog import constants if typing.TYPE_CHECKING: import pandas as pd @@ -265,6 +266,17 @@ def get_additional_columns(row): axis='columns', ) + def _upd_a10_gpu_count(row): + new_gpu_cnt = constants.AZURE_FRACTIONAL_A10_INS_TYPE_TO_NUM_GPUS.get( + row['InstanceType']) + if new_gpu_cnt is not None: + return new_gpu_cnt + return row['AcceleratorCount'] + + # Manually update the GPU count for fractional A10 instance types. + df_ret['AcceleratorCount'] = df_ret.apply(_upd_a10_gpu_count, + axis='columns') + # As of Dec 2023, a few H100 instance types fetched from Azure APIs do not # have pricing: # From 7324504dda8fe096e695d0d6c079bcf4beb638ea Mon Sep 17 00:00:00 2001 From: cblmemo Date: Tue, 27 Aug 2024 14:29:17 -0700 Subject: [PATCH 03/25] support print float point gpu in sky launch. TODO: test if the ray deployment group works for fractional one --- sky/clouds/aws.py | 4 ++-- sky/clouds/azure.py | 4 ++-- sky/clouds/cloud.py | 18 +++++++++++------- sky/clouds/cudo.py | 4 ++-- sky/clouds/fluidstack.py | 4 ++-- sky/clouds/gcp.py | 4 ++-- sky/clouds/ibm.py | 4 ++-- sky/clouds/kubernetes.py | 4 ++-- sky/clouds/lambda_cloud.py | 4 ++-- sky/clouds/oci.py | 4 ++-- sky/clouds/paperspace.py | 4 ++-- sky/clouds/runpod.py | 4 ++-- sky/clouds/scp.py | 4 ++-- sky/clouds/service_catalog/__init__.py | 2 +- sky/clouds/service_catalog/aws_catalog.py | 4 ++-- sky/clouds/service_catalog/azure_catalog.py | 4 ++-- sky/clouds/service_catalog/common.py | 8 +++++--- sky/clouds/service_catalog/cudo_catalog.py | 4 ++-- .../service_catalog/fluidstack_catalog.py | 4 ++-- sky/clouds/service_catalog/ibm_catalog.py | 4 ++-- sky/clouds/service_catalog/lambda_catalog.py | 4 ++-- sky/clouds/service_catalog/oci_catalog.py | 4 ++-- .../service_catalog/paperspace_catalog.py | 4 ++-- sky/clouds/service_catalog/runpod_catalog.py | 4 ++-- sky/clouds/service_catalog/scp_catalog.py | 4 ++-- sky/clouds/service_catalog/vsphere_catalog.py | 4 ++-- sky/clouds/vsphere.py | 4 ++-- sky/resources.py | 2 +- 28 files changed, 66 insertions(+), 60 deletions(-) diff --git a/sky/clouds/aws.py b/sky/clouds/aws.py index 3a05223574d..9c55836707e 100644 --- a/sky/clouds/aws.py +++ b/sky/clouds/aws.py @@ -8,7 +8,7 @@ import subprocess import time import typing -from typing import Any, Dict, Iterator, List, Optional, Set, Tuple +from typing import Any, Dict, Iterator, List, Optional, Set, Tuple, Union from sky import clouds from sky import exceptions @@ -366,7 +366,7 @@ def get_default_instance_type( def get_accelerators_from_instance_type( cls, instance_type: str, - ) -> Optional[Dict[str, int]]: + ) -> Optional[Dict[str, Union[int, float]]]: return service_catalog.get_accelerators_from_instance_type( instance_type, clouds='aws') diff --git a/sky/clouds/azure.py b/sky/clouds/azure.py index 928ceb5cc52..e5c3b9dd883 100644 --- a/sky/clouds/azure.py +++ b/sky/clouds/azure.py @@ -6,7 +6,7 @@ import subprocess import textwrap import typing -from typing import Any, Dict, Iterator, List, Optional, Tuple +from typing import Any, Dict, Iterator, List, Optional, Tuple, Union import colorama @@ -252,7 +252,7 @@ def zones_provision_loop( def get_accelerators_from_instance_type( cls, instance_type: str, - ) -> Optional[Dict[str, int]]: + ) -> Optional[Dict[str, Union[int, float]]]: return service_catalog.get_accelerators_from_instance_type( instance_type, clouds='azure') diff --git a/sky/clouds/cloud.py b/sky/clouds/cloud.py index 9775109ac80..8b892fa28ca 100644 --- a/sky/clouds/cloud.py +++ b/sky/clouds/cloud.py @@ -9,8 +9,9 @@ """ import collections import enum +import math import typing -from typing import Dict, Iterable, Iterator, List, Optional, Set, Tuple +from typing import Dict, Iterable, Iterator, List, Optional, Set, Tuple, Union from sky import exceptions from sky import skypilot_config @@ -306,7 +307,7 @@ def get_vcpus_mem_from_instance_type( def get_accelerators_from_instance_type( cls, instance_type: str, - ) -> Optional[Dict[str, int]]: + ) -> Optional[Dict[str, Union[int, float]]]: """Returns {acc: acc_count} held by 'instance_type', if any.""" raise NotImplementedError @@ -644,8 +645,9 @@ def _check_instance_type_accelerators_combination( assert resources.is_launchable(), resources def _equal_accelerators( - acc_requested: Optional[Dict[str, int]], - acc_from_instance_type: Optional[Dict[str, int]]) -> bool: + acc_requested: Optional[Dict[str, Union[int, float]]], + acc_from_instance_type: Optional[Dict[str, Union[int, + float]]]) -> bool: """Check the requested accelerators equals to the instance type Check the requested accelerators equals to the accelerators @@ -660,12 +662,14 @@ def _equal_accelerators( for acc in acc_requested: if acc not in acc_from_instance_type: return False - if acc_requested[acc] != acc_from_instance_type[acc]: + # Avoid float point precision issue. + if not math.isclose(acc_requested[acc], + acc_from_instance_type[acc]): return False return True - acc_from_instance_type = (cls.get_accelerators_from_instance_type( - resources.instance_type)) + acc_from_instance_type = cls.get_accelerators_from_instance_type( + resources.instance_type) if not _equal_accelerators(resources.accelerators, acc_from_instance_type): with ux_utils.print_exception_no_traceback(): diff --git a/sky/clouds/cudo.py b/sky/clouds/cudo.py index 8f100caebad..a4ea8586640 100644 --- a/sky/clouds/cudo.py +++ b/sky/clouds/cudo.py @@ -2,7 +2,7 @@ import json import subprocess import typing -from typing import Dict, Iterator, List, Optional, Tuple +from typing import Dict, Iterator, List, Optional, Tuple, Union from sky import clouds from sky.clouds import service_catalog @@ -183,7 +183,7 @@ def get_default_instance_type( def get_accelerators_from_instance_type( cls, instance_type: str, - ) -> Optional[Dict[str, int]]: + ) -> Optional[Dict[str, Union[int, float]]]: return service_catalog.get_accelerators_from_instance_type( instance_type, clouds='cudo') diff --git a/sky/clouds/fluidstack.py b/sky/clouds/fluidstack.py index ef397d4c55e..96737211cae 100644 --- a/sky/clouds/fluidstack.py +++ b/sky/clouds/fluidstack.py @@ -2,7 +2,7 @@ import json import os import typing -from typing import Dict, Iterator, List, Optional, Tuple +from typing import Dict, Iterator, List, Optional, Tuple, Union import requests @@ -155,7 +155,7 @@ def get_default_instance_type( def get_accelerators_from_instance_type( cls, instance_type: str, - ) -> Optional[Dict[str, int]]: + ) -> Optional[Dict[str, Union[int, float]]]: return service_catalog.get_accelerators_from_instance_type( instance_type, clouds='fluidstack') diff --git a/sky/clouds/gcp.py b/sky/clouds/gcp.py index 643d55d7037..3a6ebad2c22 100644 --- a/sky/clouds/gcp.py +++ b/sky/clouds/gcp.py @@ -7,7 +7,7 @@ import subprocess import time import typing -from typing import Dict, Iterator, List, Optional, Set, Tuple +from typing import Dict, Iterator, List, Optional, Set, Tuple, Union import colorama @@ -631,7 +631,7 @@ def _get_feasible_launchable_resources( def get_accelerators_from_instance_type( cls, instance_type: str, - ) -> Optional[Dict[str, int]]: + ) -> Optional[Dict[str, Union[int, float]]]: # GCP handles accelerators separately from regular instance types, # hence return none here. return None diff --git a/sky/clouds/ibm.py b/sky/clouds/ibm.py index b78cc4287c0..aca2aa4a44e 100644 --- a/sky/clouds/ibm.py +++ b/sky/clouds/ibm.py @@ -2,7 +2,7 @@ import json import os import typing -from typing import Any, Dict, Iterator, List, Optional, Tuple +from typing import Any, Dict, Iterator, List, Optional, Tuple, Union import colorama @@ -247,7 +247,7 @@ def get_vcpus_mem_from_instance_type( def get_accelerators_from_instance_type( cls, instance_type: str, - ) -> Optional[Dict[str, int]]: + ) -> Optional[Dict[str, Union[int, float]]]: """Returns {acc: acc_count} held by 'instance_type', if any.""" return service_catalog.get_accelerators_from_instance_type( instance_type, clouds='ibm') diff --git a/sky/clouds/kubernetes.py b/sky/clouds/kubernetes.py index 4dd1fe8ce75..498653aab17 100644 --- a/sky/clouds/kubernetes.py +++ b/sky/clouds/kubernetes.py @@ -3,7 +3,7 @@ import os import re import typing -from typing import Dict, Iterator, List, Optional, Tuple +from typing import Dict, Iterator, List, Optional, Tuple, Union from sky import clouds from sky import sky_logging @@ -180,7 +180,7 @@ def get_default_instance_type( def get_accelerators_from_instance_type( cls, instance_type: str, - ) -> Optional[Dict[str, int]]: + ) -> Optional[Dict[str, Union[int, float]]]: inst = kubernetes_utils.KubernetesInstanceType.from_instance_type( instance_type) return { diff --git a/sky/clouds/lambda_cloud.py b/sky/clouds/lambda_cloud.py index ce45f087296..98ea07a275b 100644 --- a/sky/clouds/lambda_cloud.py +++ b/sky/clouds/lambda_cloud.py @@ -1,7 +1,7 @@ """Lambda Cloud.""" import json import typing -from typing import Dict, Iterator, List, Optional, Tuple +from typing import Dict, Iterator, List, Optional, Tuple, Union import requests @@ -137,7 +137,7 @@ def get_default_instance_type( def get_accelerators_from_instance_type( cls, instance_type: str, - ) -> Optional[Dict[str, int]]: + ) -> Optional[Dict[str, Union[int, float]]]: return service_catalog.get_accelerators_from_instance_type( instance_type, clouds='lambda') diff --git a/sky/clouds/oci.py b/sky/clouds/oci.py index 7875e26d9cc..800ec3f2a07 100644 --- a/sky/clouds/oci.py +++ b/sky/clouds/oci.py @@ -9,7 +9,7 @@ import logging import os import typing -from typing import Dict, Iterator, List, Optional, Tuple +from typing import Dict, Iterator, List, Optional, Tuple, Union from sky import clouds from sky import exceptions @@ -176,7 +176,7 @@ def get_default_instance_type( def get_accelerators_from_instance_type( cls, instance_type: str, - ) -> Optional[Dict[str, int]]: + ) -> Optional[Dict[str, Union[int, float]]]: return service_catalog.get_accelerators_from_instance_type( instance_type, clouds='oci') diff --git a/sky/clouds/paperspace.py b/sky/clouds/paperspace.py index 171bcf33f16..5d78bf1068b 100644 --- a/sky/clouds/paperspace.py +++ b/sky/clouds/paperspace.py @@ -2,7 +2,7 @@ import json import typing -from typing import Dict, Iterator, List, Optional, Tuple +from typing import Dict, Iterator, List, Optional, Tuple, Union import requests @@ -162,7 +162,7 @@ def get_default_instance_type( @classmethod def get_accelerators_from_instance_type( - cls, instance_type: str) -> Optional[Dict[str, int]]: + cls, instance_type: str) -> Optional[Dict[str, Union[int, float]]]: return service_catalog.get_accelerators_from_instance_type( instance_type, clouds='paperspace') diff --git a/sky/clouds/runpod.py b/sky/clouds/runpod.py index 9a6b483619a..a93dbcfc12b 100644 --- a/sky/clouds/runpod.py +++ b/sky/clouds/runpod.py @@ -2,7 +2,7 @@ import json import typing -from typing import Dict, Iterator, List, Optional, Tuple +from typing import Dict, Iterator, List, Optional, Tuple, Union from sky import clouds from sky.clouds import service_catalog @@ -147,7 +147,7 @@ def get_default_instance_type( @classmethod def get_accelerators_from_instance_type( - cls, instance_type: str) -> Optional[Dict[str, int]]: + cls, instance_type: str) -> Optional[Dict[str, Union[int, float]]]: return service_catalog.get_accelerators_from_instance_type( instance_type, clouds='runpod') diff --git a/sky/clouds/scp.py b/sky/clouds/scp.py index 9cfbd5129f6..6263e7372fd 100644 --- a/sky/clouds/scp.py +++ b/sky/clouds/scp.py @@ -6,7 +6,7 @@ import json import typing -from typing import Dict, Iterator, List, Optional, Tuple +from typing import Dict, Iterator, List, Optional, Tuple, Union from sky import clouds from sky import exceptions @@ -160,7 +160,7 @@ def get_default_instance_type( def get_accelerators_from_instance_type( cls, instance_type: str, - ) -> Optional[Dict[str, int]]: + ) -> Optional[Dict[str, Union[int, float]]]: return service_catalog.get_accelerators_from_instance_type( instance_type, clouds='scp') diff --git a/sky/clouds/service_catalog/__init__.py b/sky/clouds/service_catalog/__init__.py index acc6fa0aa8b..ca09de672fb 100644 --- a/sky/clouds/service_catalog/__init__.py +++ b/sky/clouds/service_catalog/__init__.py @@ -238,7 +238,7 @@ def get_default_instance_type(cpus: Optional[str] = None, def get_accelerators_from_instance_type( instance_type: str, - clouds: CloudFilter = None) -> Optional[Dict[str, int]]: + clouds: CloudFilter = None) -> Optional[Dict[str, Union[int, float]]]: """Returns the accelerators from a instance type.""" return _map_clouds_catalog(clouds, 'get_accelerators_from_instance_type', instance_type) diff --git a/sky/clouds/service_catalog/aws_catalog.py b/sky/clouds/service_catalog/aws_catalog.py index 6847f304ae9..94af088b093 100644 --- a/sky/clouds/service_catalog/aws_catalog.py +++ b/sky/clouds/service_catalog/aws_catalog.py @@ -8,7 +8,7 @@ import os import threading import typing -from typing import Dict, List, Optional, Tuple +from typing import Dict, List, Optional, Tuple, Union import colorama @@ -244,7 +244,7 @@ def get_default_instance_type( def get_accelerators_from_instance_type( - instance_type: str) -> Optional[Dict[str, int]]: + instance_type: str) -> Optional[Dict[str, Union[int, float]]]: return common.get_accelerators_from_instance_type_impl( _get_df(), instance_type) diff --git a/sky/clouds/service_catalog/azure_catalog.py b/sky/clouds/service_catalog/azure_catalog.py index 52dcb7cfde7..3a8f79a802c 100644 --- a/sky/clouds/service_catalog/azure_catalog.py +++ b/sky/clouds/service_catalog/azure_catalog.py @@ -4,7 +4,7 @@ instance types and pricing information for Azure. """ import re -from typing import Dict, List, Optional, Tuple +from typing import Dict, List, Optional, Tuple, Union from sky import clouds as cloud_lib from sky.clouds import Azure @@ -127,7 +127,7 @@ def _filter_disk_type(instance_type: str) -> bool: def get_accelerators_from_instance_type( - instance_type: str) -> Optional[Dict[str, int]]: + instance_type: str) -> Optional[Dict[str, Union[int, float]]]: return common.get_accelerators_from_instance_type_impl(_df, instance_type) diff --git a/sky/clouds/service_catalog/common.py b/sky/clouds/service_catalog/common.py index fbbe0fdcef1..abe305f793b 100644 --- a/sky/clouds/service_catalog/common.py +++ b/sky/clouds/service_catalog/common.py @@ -5,7 +5,7 @@ import os import time import typing -from typing import Callable, Dict, List, NamedTuple, Optional, Tuple +from typing import Callable, Dict, List, NamedTuple, Optional, Tuple, Union import filelock import requests @@ -478,7 +478,7 @@ def get_instance_type_for_cpus_mem_impl( def get_accelerators_from_instance_type_impl( df: 'pd.DataFrame', instance_type: str, -) -> Optional[Dict[str, int]]: +) -> Optional[Dict[str, Union[int, float]]]: df = _get_instance_type(df, instance_type, None) if len(df) == 0: with ux_utils.print_exception_no_traceback(): @@ -487,7 +487,9 @@ def get_accelerators_from_instance_type_impl( acc_name, acc_count = row['AcceleratorName'], row['AcceleratorCount'] if pd.isnull(acc_name): return None - return {acc_name: int(acc_count)} + # Should be guaranteed by the catalog fetcher. + assert isinstance(acc_count, (int, float)), acc_count + return {acc_name: acc_count} def get_instance_type_for_accelerator_impl( diff --git a/sky/clouds/service_catalog/cudo_catalog.py b/sky/clouds/service_catalog/cudo_catalog.py index a3ccdab88e3..30a4e409cf8 100644 --- a/sky/clouds/service_catalog/cudo_catalog.py +++ b/sky/clouds/service_catalog/cudo_catalog.py @@ -1,7 +1,7 @@ """Cudo Compute Offerings Catalog.""" import typing -from typing import Dict, List, Optional, Tuple +from typing import Dict, List, Optional, Tuple, Union from sky.clouds.service_catalog import common import sky.provision.cudo.cudo_machine_type as cudo_mt @@ -56,7 +56,7 @@ def get_default_instance_type(cpus: Optional[str] = None, def get_accelerators_from_instance_type( - instance_type: str) -> Optional[Dict[str, int]]: + instance_type: str) -> Optional[Dict[str, Union[int, float]]]: return common.get_accelerators_from_instance_type_impl(_df, instance_type) diff --git a/sky/clouds/service_catalog/fluidstack_catalog.py b/sky/clouds/service_catalog/fluidstack_catalog.py index 2f47a38df43..7a28ac8174a 100644 --- a/sky/clouds/service_catalog/fluidstack_catalog.py +++ b/sky/clouds/service_catalog/fluidstack_catalog.py @@ -4,7 +4,7 @@ instance types and pricing information for FluidStack. """ import typing -from typing import Dict, List, Optional, Tuple +from typing import Dict, List, Optional, Tuple, Union from sky.clouds.service_catalog import common from sky.utils import ux_utils @@ -65,7 +65,7 @@ def get_default_instance_type(cpus: Optional[str] = None, def get_accelerators_from_instance_type( - instance_type: str) -> Optional[Dict[str, int]]: + instance_type: str) -> Optional[Dict[str, Union[int, float]]]: return common.get_accelerators_from_instance_type_impl(_df, instance_type) diff --git a/sky/clouds/service_catalog/ibm_catalog.py b/sky/clouds/service_catalog/ibm_catalog.py index 51b4e14f569..5cec86fbb65 100644 --- a/sky/clouds/service_catalog/ibm_catalog.py +++ b/sky/clouds/service_catalog/ibm_catalog.py @@ -4,7 +4,7 @@ instance types and pricing information for IBM. """ -from typing import Dict, List, Optional, Tuple +from typing import Dict, List, Optional, Tuple, Union from sky import sky_logging from sky.adaptors import ibm @@ -43,7 +43,7 @@ def get_vcpus_mem_from_instance_type( def get_accelerators_from_instance_type( - instance_type: str) -> Optional[Dict[str, int]]: + instance_type: str) -> Optional[Dict[str, Union[int, float]]]: return common.get_accelerators_from_instance_type_impl(_df, instance_type) diff --git a/sky/clouds/service_catalog/lambda_catalog.py b/sky/clouds/service_catalog/lambda_catalog.py index e843ab72cc0..24cb4064d54 100644 --- a/sky/clouds/service_catalog/lambda_catalog.py +++ b/sky/clouds/service_catalog/lambda_catalog.py @@ -4,7 +4,7 @@ instance types and pricing information for Lambda. """ import typing -from typing import Dict, List, Optional, Tuple +from typing import Dict, List, Optional, Tuple, Union from sky.clouds.service_catalog import common from sky.utils import resources_utils @@ -72,7 +72,7 @@ def get_default_instance_type( def get_accelerators_from_instance_type( - instance_type: str) -> Optional[Dict[str, int]]: + instance_type: str) -> Optional[Dict[str, Union[int, float]]]: return common.get_accelerators_from_instance_type_impl(_df, instance_type) diff --git a/sky/clouds/service_catalog/oci_catalog.py b/sky/clouds/service_catalog/oci_catalog.py index 2561b913dcf..6f19aefcdac 100644 --- a/sky/clouds/service_catalog/oci_catalog.py +++ b/sky/clouds/service_catalog/oci_catalog.py @@ -12,7 +12,7 @@ import logging import threading import typing -from typing import Dict, List, Optional, Tuple +from typing import Dict, List, Optional, Tuple, Union from sky.adaptors import oci as oci_adaptor from sky.clouds.service_catalog import common @@ -124,7 +124,7 @@ def get_default_instance_type( def get_accelerators_from_instance_type( - instance_type: str) -> Optional[Dict[str, int]]: + instance_type: str) -> Optional[Dict[str, Union[int, float]]]: return common.get_accelerators_from_instance_type_impl( _get_df(), instance_type) diff --git a/sky/clouds/service_catalog/paperspace_catalog.py b/sky/clouds/service_catalog/paperspace_catalog.py index 1eb635c93e5..49948b219a1 100644 --- a/sky/clouds/service_catalog/paperspace_catalog.py +++ b/sky/clouds/service_catalog/paperspace_catalog.py @@ -5,7 +5,7 @@ """ import typing -from typing import Dict, List, Optional, Tuple +from typing import Dict, List, Optional, Tuple, Union from sky.clouds.service_catalog import common from sky.utils import ux_utils @@ -60,7 +60,7 @@ def get_default_instance_type( def get_accelerators_from_instance_type( - instance_type: str) -> Optional[Dict[str, int]]: + instance_type: str) -> Optional[Dict[str, Union[int, float]]]: return common.get_accelerators_from_instance_type_impl(_df, instance_type) diff --git a/sky/clouds/service_catalog/runpod_catalog.py b/sky/clouds/service_catalog/runpod_catalog.py index 2d3ed44307b..7fbc46206ed 100644 --- a/sky/clouds/service_catalog/runpod_catalog.py +++ b/sky/clouds/service_catalog/runpod_catalog.py @@ -5,7 +5,7 @@ """ import typing -from typing import Dict, List, Optional, Tuple +from typing import Dict, List, Optional, Tuple, Union from sky.clouds.service_catalog import common from sky.utils import ux_utils @@ -56,7 +56,7 @@ def get_default_instance_type(cpus: Optional[str] = None, def get_accelerators_from_instance_type( - instance_type: str) -> Optional[Dict[str, int]]: + instance_type: str) -> Optional[Dict[str, Union[int, float]]]: return common.get_accelerators_from_instance_type_impl(_df, instance_type) diff --git a/sky/clouds/service_catalog/scp_catalog.py b/sky/clouds/service_catalog/scp_catalog.py index 209bb4cf631..e4773ab3250 100644 --- a/sky/clouds/service_catalog/scp_catalog.py +++ b/sky/clouds/service_catalog/scp_catalog.py @@ -5,7 +5,7 @@ """ import typing -from typing import Dict, List, Optional, Tuple +from typing import Dict, List, Optional, Tuple, Union from sky.clouds.service_catalog import common from sky.utils import resources_utils @@ -67,7 +67,7 @@ def get_default_instance_type( def get_accelerators_from_instance_type( - instance_type: str) -> Optional[Dict[str, int]]: + instance_type: str) -> Optional[Dict[str, Union[int, float]]]: return common.get_accelerators_from_instance_type_impl(_df, instance_type) diff --git a/sky/clouds/service_catalog/vsphere_catalog.py b/sky/clouds/service_catalog/vsphere_catalog.py index e1199d3d266..74fb2fbe60d 100644 --- a/sky/clouds/service_catalog/vsphere_catalog.py +++ b/sky/clouds/service_catalog/vsphere_catalog.py @@ -2,7 +2,7 @@ import io import os import typing -from typing import Dict, List, Optional, Tuple +from typing import Dict, List, Optional, Tuple, Union from sky.adaptors import common as adaptors_common from sky.clouds.service_catalog import common @@ -85,7 +85,7 @@ def get_default_instance_type( def get_accelerators_from_instance_type( - instance_type: str) -> Optional[Dict[str, int]]: + instance_type: str) -> Optional[Dict[str, Union[int, float]]]: return common.get_accelerators_from_instance_type_impl( _get_df(), instance_type) diff --git a/sky/clouds/vsphere.py b/sky/clouds/vsphere.py index 6e7e1abeb04..af149ccf67c 100644 --- a/sky/clouds/vsphere.py +++ b/sky/clouds/vsphere.py @@ -2,7 +2,7 @@ import json import subprocess import typing -from typing import Dict, Iterator, List, Optional, Tuple +from typing import Dict, Iterator, List, Optional, Tuple, Union import requests @@ -152,7 +152,7 @@ def get_default_instance_type( def get_accelerators_from_instance_type( cls, instance_type: str, - ) -> Optional[Dict[str, int]]: + ) -> Optional[Dict[str, Union[int, float]]]: return service_catalog.get_accelerators_from_instance_type( instance_type, clouds=_CLOUD_VSPHERE) diff --git a/sky/resources.py b/sky/resources.py index 2f19cd1aa01..5b5f4f6c2fe 100644 --- a/sky/resources.py +++ b/sky/resources.py @@ -391,7 +391,7 @@ def memory(self) -> Optional[str]: @property @functools.lru_cache(maxsize=1) - def accelerators(self) -> Optional[Dict[str, int]]: + def accelerators(self) -> Optional[Dict[str, Union[int, float]]]: """Returns the accelerators field directly or by inferring. For example, Resources(AWS, 'p3.2xlarge') has its accelerators field From 347ad621210378c0532ca0688d73de2b189e2962 Mon Sep 17 00:00:00 2001 From: cblmemo Date: Tue, 27 Aug 2024 14:40:27 -0700 Subject: [PATCH 04/25] fix unittest --- sky/clouds/service_catalog/common.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/sky/clouds/service_catalog/common.py b/sky/clouds/service_catalog/common.py index abe305f793b..d52ae942d8b 100644 --- a/sky/clouds/service_catalog/common.py +++ b/sky/clouds/service_catalog/common.py @@ -487,9 +487,11 @@ def get_accelerators_from_instance_type_impl( acc_name, acc_count = row['AcceleratorName'], row['AcceleratorCount'] if pd.isnull(acc_name): return None - # Should be guaranteed by the catalog fetcher. - assert isinstance(acc_count, (int, float)), acc_count - return {acc_name: acc_count} + def _convert(value): + if int(value) == value: + return int(value) + return float(value) + return {acc_name: _convert(acc_count)} def get_instance_type_for_accelerator_impl( From 71af06e58b3968b0797374105c0eb5be91aea450 Mon Sep 17 00:00:00 2001 From: cblmemo Date: Tue, 27 Aug 2024 14:41:58 -0700 Subject: [PATCH 05/25] format --- sky/clouds/service_catalog/common.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/sky/clouds/service_catalog/common.py b/sky/clouds/service_catalog/common.py index d52ae942d8b..36800c9be18 100644 --- a/sky/clouds/service_catalog/common.py +++ b/sky/clouds/service_catalog/common.py @@ -487,10 +487,12 @@ def get_accelerators_from_instance_type_impl( acc_name, acc_count = row['AcceleratorName'], row['AcceleratorCount'] if pd.isnull(acc_name): return None + def _convert(value): if int(value) == value: return int(value) return float(value) + return {acc_name: _convert(acc_count)} From d41944212904ebbf4a6d15facb96841cad8e7537 Mon Sep 17 00:00:00 2001 From: cblmemo Date: Tue, 27 Aug 2024 15:08:00 -0700 Subject: [PATCH 06/25] patch ray resources to ceil value --- sky/clouds/aws.py | 7 ++----- sky/clouds/azure.py | 4 ++-- sky/clouds/cudo.py | 7 ++----- sky/clouds/fluidstack.py | 7 ++----- sky/clouds/ibm.py | 7 ++----- sky/clouds/kubernetes.py | 7 ++----- sky/clouds/lambda_cloud.py | 7 ++----- sky/clouds/oci.py | 7 ++----- sky/clouds/paperspace.py | 7 ++----- sky/clouds/runpod.py | 7 ++----- sky/clouds/scp.py | 7 ++----- sky/clouds/vsphere.py | 7 ++----- sky/utils/resources_utils.py | 14 +++++++++++++- 13 files changed, 37 insertions(+), 58 deletions(-) diff --git a/sky/clouds/aws.py b/sky/clouds/aws.py index 9c55836707e..155e055436e 100644 --- a/sky/clouds/aws.py +++ b/sky/clouds/aws.py @@ -2,7 +2,6 @@ import enum import fnmatch import functools -import json import os import re import subprocess @@ -394,10 +393,8 @@ def make_deploy_resources_variables( r = resources # r.accelerators is cleared but .instance_type encodes the info. acc_dict = self.get_accelerators_from_instance_type(r.instance_type) - if acc_dict is not None: - custom_resources = json.dumps(acc_dict, separators=(',', ':')) - else: - custom_resources = None + custom_resources = resources_utils.make_ray_custom_resources_str( + acc_dict) if r.extract_docker_image() is not None: image_id_to_use = None diff --git a/sky/clouds/azure.py b/sky/clouds/azure.py index e5c3b9dd883..bb95cd9e782 100644 --- a/sky/clouds/azure.py +++ b/sky/clouds/azure.py @@ -1,6 +1,5 @@ """Azure.""" import functools -import json import os import re import subprocess @@ -284,7 +283,8 @@ def make_deploy_resources_variables( acc_dict = self.get_accelerators_from_instance_type(r.instance_type) acc_count = None if acc_dict is not None: - custom_resources = json.dumps(acc_dict, separators=(',', ':')) + custom_resources = resources_utils.make_ray_custom_resources_str( + acc_dict) acc_count = str(sum(acc_dict.values())) else: custom_resources = None diff --git a/sky/clouds/cudo.py b/sky/clouds/cudo.py index a4ea8586640..523cfba2a24 100644 --- a/sky/clouds/cudo.py +++ b/sky/clouds/cudo.py @@ -1,5 +1,4 @@ """Cudo Compute""" -import json import subprocess import typing from typing import Dict, Iterator, List, Optional, Tuple, Union @@ -202,10 +201,8 @@ def make_deploy_resources_variables( del zones, cluster_name # unused r = resources acc_dict = self.get_accelerators_from_instance_type(r.instance_type) - if acc_dict is not None: - custom_resources = json.dumps(acc_dict, separators=(',', ':')) - else: - custom_resources = None + custom_resources = resources_utils.make_ray_custom_resources_str( + acc_dict) return { 'instance_type': resources.instance_type, diff --git a/sky/clouds/fluidstack.py b/sky/clouds/fluidstack.py index 96737211cae..70379f67354 100644 --- a/sky/clouds/fluidstack.py +++ b/sky/clouds/fluidstack.py @@ -1,5 +1,4 @@ """Fluidstack Cloud.""" -import json import os import typing from typing import Dict, Iterator, List, Optional, Tuple, Union @@ -184,10 +183,8 @@ def make_deploy_resources_variables( r = resources acc_dict = self.get_accelerators_from_instance_type(r.instance_type) - if acc_dict is not None: - custom_resources = json.dumps(acc_dict, separators=(',', ':')) - else: - custom_resources = None + custom_resources = resources_utils.make_ray_custom_resources_str( + acc_dict) return { 'instance_type': resources.instance_type, diff --git a/sky/clouds/ibm.py b/sky/clouds/ibm.py index aca2aa4a44e..0ac3c36cc48 100644 --- a/sky/clouds/ibm.py +++ b/sky/clouds/ibm.py @@ -1,5 +1,4 @@ """IBM Web Services.""" -import json import os import typing from typing import Any, Dict, Iterator, List, Optional, Tuple, Union @@ -206,10 +205,8 @@ def _get_profile_resources(instance_profile): 'IBM does not currently support spot instances in this framework' acc_dict = self.get_accelerators_from_instance_type(r.instance_type) - if acc_dict is not None: - custom_resources = json.dumps(acc_dict, separators=(',', ':')) - else: - custom_resources = None + custom_resources = resources_utils.make_ray_custom_resources_str( + acc_dict) instance_resources = _get_profile_resources(r.instance_type) diff --git a/sky/clouds/kubernetes.py b/sky/clouds/kubernetes.py index 498653aab17..a0107018805 100644 --- a/sky/clouds/kubernetes.py +++ b/sky/clouds/kubernetes.py @@ -1,5 +1,4 @@ """Kubernetes.""" -import json import os import re import typing @@ -234,10 +233,8 @@ def make_deploy_resources_variables( r = resources acc_dict = self.get_accelerators_from_instance_type(r.instance_type) - if acc_dict is not None: - custom_resources = json.dumps(acc_dict, separators=(',', ':')) - else: - custom_resources = None + custom_resources = resources_utils.make_ray_custom_resources_str( + acc_dict) # resources.memory and cpus are None if they are not explicitly set. # We fetch the default values for the instance type in that case. diff --git a/sky/clouds/lambda_cloud.py b/sky/clouds/lambda_cloud.py index 98ea07a275b..198ebba0236 100644 --- a/sky/clouds/lambda_cloud.py +++ b/sky/clouds/lambda_cloud.py @@ -1,5 +1,4 @@ """Lambda Cloud.""" -import json import typing from typing import Dict, Iterator, List, Optional, Tuple, Union @@ -165,10 +164,8 @@ def make_deploy_resources_variables( r = resources acc_dict = self.get_accelerators_from_instance_type(r.instance_type) - if acc_dict is not None: - custom_resources = json.dumps(acc_dict, separators=(',', ':')) - else: - custom_resources = None + custom_resources = resources_utils.make_ray_custom_resources_str( + acc_dict) return { 'instance_type': resources.instance_type, diff --git a/sky/clouds/oci.py b/sky/clouds/oci.py index 800ec3f2a07..359117f7681 100644 --- a/sky/clouds/oci.py +++ b/sky/clouds/oci.py @@ -5,7 +5,6 @@ - Hysun He (hysun.he@oracle.com) @ May 4, 2023: Support use the default image_id (configurable) if no image_id specified in the task yaml. """ -import json import logging import os import typing @@ -196,10 +195,8 @@ def make_deploy_resources_variables( acc_dict = self.get_accelerators_from_instance_type( resources.instance_type) - if acc_dict is not None: - custom_resources = json.dumps(acc_dict, separators=(',', ':')) - else: - custom_resources = None + custom_resources = resources_utils.make_ray_custom_resources_str( + acc_dict) image_str = self._get_image_id(resources.image_id, region.name, resources.instance_type) diff --git a/sky/clouds/paperspace.py b/sky/clouds/paperspace.py index 5d78bf1068b..2f1d549a928 100644 --- a/sky/clouds/paperspace.py +++ b/sky/clouds/paperspace.py @@ -1,6 +1,5 @@ """ Paperspace Cloud. """ -import json import typing from typing import Dict, Iterator, List, Optional, Tuple, Union @@ -181,10 +180,8 @@ def make_deploy_resources_variables( r = resources acc_dict = self.get_accelerators_from_instance_type(r.instance_type) - if acc_dict is not None: - custom_resources = json.dumps(acc_dict, separators=(',', ':')) - else: - custom_resources = None + custom_resources = resources_utils.make_ray_custom_resources_str( + acc_dict) return { 'instance_type': resources.instance_type, diff --git a/sky/clouds/runpod.py b/sky/clouds/runpod.py index a93dbcfc12b..3fcf7fc542b 100644 --- a/sky/clouds/runpod.py +++ b/sky/clouds/runpod.py @@ -1,6 +1,5 @@ """ RunPod Cloud. """ -import json import typing from typing import Dict, Iterator, List, Optional, Tuple, Union @@ -166,10 +165,8 @@ def make_deploy_resources_variables( r = resources acc_dict = self.get_accelerators_from_instance_type(r.instance_type) - if acc_dict is not None: - custom_resources = json.dumps(acc_dict, separators=(',', ':')) - else: - custom_resources = None + custom_resources = resources_utils.make_ray_custom_resources_str( + acc_dict) if r.image_id is None: image_id = 'runpod/base:0.0.2' diff --git a/sky/clouds/scp.py b/sky/clouds/scp.py index 6263e7372fd..9ebbfad99e2 100644 --- a/sky/clouds/scp.py +++ b/sky/clouds/scp.py @@ -4,7 +4,6 @@ to access the SCP catalog and check credentials for the SCP access. """ -import json import typing from typing import Dict, Iterator, List, Optional, Tuple, Union @@ -188,11 +187,9 @@ def make_deploy_resources_variables( r = resources acc_dict = self.get_accelerators_from_instance_type(r.instance_type) + custom_resources = resources_utils.make_ray_custom_resources_str( + acc_dict) - if acc_dict is not None: - custom_resources = json.dumps(acc_dict, separators=(',', ':')) - else: - custom_resources = None image_id = self._get_image_id(r.image_id, region.name, r.instance_type) return { 'instance_type': resources.instance_type, diff --git a/sky/clouds/vsphere.py b/sky/clouds/vsphere.py index af149ccf67c..62980fdc067 100644 --- a/sky/clouds/vsphere.py +++ b/sky/clouds/vsphere.py @@ -1,5 +1,4 @@ """Vsphere cloud implementation.""" -import json import subprocess import typing from typing import Dict, Iterator, List, Optional, Tuple, Union @@ -182,10 +181,8 @@ def make_deploy_resources_variables( zone_names = [zone.name for zone in zones] r = resources acc_dict = self.get_accelerators_from_instance_type(r.instance_type) - if acc_dict is not None: - custom_resources = json.dumps(acc_dict, separators=(',', ':')) - else: - custom_resources = None + custom_resources = resources_utils.make_ray_custom_resources_str( + acc_dict) return { 'instance_type': resources.instance_type, diff --git a/sky/utils/resources_utils.py b/sky/utils/resources_utils.py index 95c784143cc..e029dc99b75 100644 --- a/sky/utils/resources_utils.py +++ b/sky/utils/resources_utils.py @@ -2,9 +2,11 @@ import dataclasses import enum import itertools +import json +import math import re import typing -from typing import List, Optional, Set +from typing import Dict, List, Optional, Set, Union from sky.utils import ux_utils @@ -160,6 +162,16 @@ def get_readable_resources_repr(handle: 'backends.CloudVmRayResourceHandle', return _DEFAULT_MESSAGE_HANDLE_INITIALIZING +def make_ray_custom_resources_str( + resource_dict: Optional[Dict[str, Union[int, float]]]) -> Optional[str]: + """Convert resources to Ray custom resources format.""" + if resource_dict is None: + return None + # Ray does not allow fractional resources, so we need to ceil the values. + ceiled_dict = {k: math.ceil(v) for k, v in resource_dict.items()} + return json.dumps(ceiled_dict, separators=(',', ':')) + + @dataclasses.dataclass class FeasibleResources: """Feasible resources returned by cloud. From f52968922e3d7ecdb06e29681fedb3bce9b97834 Mon Sep 17 00:00:00 2001 From: cblmemo Date: Wed, 28 Aug 2024 10:14:44 -0700 Subject: [PATCH 07/25] support launch from --gpus A10 --- sky/clouds/service_catalog/azure_catalog.py | 15 +-------------- sky/clouds/service_catalog/constants.py | 5 ----- .../service_catalog/data_fetchers/fetch_azure.py | 11 +++++++++-- 3 files changed, 10 insertions(+), 21 deletions(-) diff --git a/sky/clouds/service_catalog/azure_catalog.py b/sky/clouds/service_catalog/azure_catalog.py index 3a8f79a802c..4f76938019e 100644 --- a/sky/clouds/service_catalog/azure_catalog.py +++ b/sky/clouds/service_catalog/azure_catalog.py @@ -9,7 +9,6 @@ from sky import clouds as cloud_lib from sky.clouds import Azure from sky.clouds.service_catalog import common -from sky.clouds.service_catalog import constants from sky.utils import resources_utils from sky.utils import ux_utils @@ -43,14 +42,6 @@ _DEFAULT_NUM_VCPUS = 8 _DEFAULT_MEMORY_CPU_RATIO = 4 -# Some A10 instance types only contains a fractional of GPU. We temporarily -# filter them out here to avoid using it as a whole A10 GPU. -# TODO(zhwu,tian): support fractional GPUs, which can be done on -# kubernetes as well. -# Ref: https://learn.microsoft.com/en-us/azure/virtual-machines/nva10v5-series -_FILTERED_A10_INSTANCE_TYPES = list( - constants.AZURE_FRACTIONAL_A10_INS_TYPE_TO_NUM_GPUS.keys()) - def instance_type_exists(instance_type: str) -> bool: return common.instance_type_exists_impl(_df, instance_type) @@ -148,11 +139,7 @@ def get_instance_type_for_accelerator( with ux_utils.print_exception_no_traceback(): raise ValueError('Azure does not support zones.') - # Filter out instance types that only contain a fractional of GPU. - df_filtered = _df.loc[~_df['InstanceType'].isin(_FILTERED_A10_INSTANCE_TYPES - )] - - return common.get_instance_type_for_accelerator_impl(df=df_filtered, + return common.get_instance_type_for_accelerator_impl(df=_df, acc_name=acc_name, acc_count=acc_count, cpus=cpus, diff --git a/sky/clouds/service_catalog/constants.py b/sky/clouds/service_catalog/constants.py index 9fc6e8477f1..1373fd86a03 100644 --- a/sky/clouds/service_catalog/constants.py +++ b/sky/clouds/service_catalog/constants.py @@ -5,8 +5,3 @@ ALL_CLOUDS = ('aws', 'azure', 'gcp', 'ibm', 'lambda', 'scp', 'oci', 'kubernetes', 'runpod', 'vsphere', 'cudo', 'fluidstack', 'paperspace') -# Azure has those fractional A10 instance types, which still shows has 1 A10 GPU -# in the API response. We manually changing the number of GPUs to a float here. -AZURE_FRACTIONAL_A10_INS_TYPE_TO_NUM_GPUS = { - f'Standard_NV{vcpu}ads_A10_v5': vcpu / 24 for vcpu in [6, 12, 18] -} diff --git a/sky/clouds/service_catalog/data_fetchers/fetch_azure.py b/sky/clouds/service_catalog/data_fetchers/fetch_azure.py index 82c7396e3e6..cb2f39d18fc 100644 --- a/sky/clouds/service_catalog/data_fetchers/fetch_azure.py +++ b/sky/clouds/service_catalog/data_fetchers/fetch_azure.py @@ -15,7 +15,6 @@ import requests from sky.adaptors import common as adaptors_common -from sky.clouds.service_catalog import constants if typing.TYPE_CHECKING: import pandas as pd @@ -94,6 +93,14 @@ def get_regions() -> List[str]: # We have to manually remove it. DEPRECATED_FAMILIES = ['standardNVSv2Family'] +# Azure has those fractional A10 instance types, which still shows has 1 A10 GPU +# in the API response. We manually changing the number of GPUs to a float here. +# Ref: https://learn.microsoft.com/en-us/azure/virtual-machines/nva10v5-series +# TODO(zhwu,tian): Support fractional GPUs on k8s as well. +AZURE_FRACTIONAL_A10_INS_TYPE_TO_NUM_GPUS = { + f'Standard_NV{vcpu}ads_A10_v5': vcpu / 24 for vcpu in [6, 12, 18] +} + USEFUL_COLUMNS = [ 'InstanceType', 'AcceleratorName', 'AcceleratorCount', 'vCPUs', 'MemoryGiB', 'GpuInfo', 'Price', 'SpotPrice', 'Region', 'Generation' @@ -267,7 +274,7 @@ def get_additional_columns(row): ) def _upd_a10_gpu_count(row): - new_gpu_cnt = constants.AZURE_FRACTIONAL_A10_INS_TYPE_TO_NUM_GPUS.get( + new_gpu_cnt = AZURE_FRACTIONAL_A10_INS_TYPE_TO_NUM_GPUS.get( row['InstanceType']) if new_gpu_cnt is not None: return new_gpu_cnt From 2031a5061e1b0a1b3dd292ba19fc8f01d76559ee Mon Sep 17 00:00:00 2001 From: cblmemo Date: Wed, 28 Aug 2024 11:14:16 -0700 Subject: [PATCH 08/25] only allow strictly match fractional gpu counts --- sky/backends/cloud_vm_ray_backend.py | 18 ++++++++++++++++++ sky/resources.py | 11 +++++++++++ 2 files changed, 29 insertions(+) diff --git a/sky/backends/cloud_vm_ray_backend.py b/sky/backends/cloud_vm_ray_backend.py index 9545436f05c..5500d6733b0 100644 --- a/sky/backends/cloud_vm_ray_backend.py +++ b/sky/backends/cloud_vm_ray_backend.py @@ -2663,6 +2663,24 @@ def check_resources_fit_cluster( 'stores, but the existing cluster with ' f'{launched_resources!r} does not support FUSE ' f'mounting. Launch a new cluster to run this task.') + if (example_resource.accelerators is not None and + launched_resources.accelerators is not None): + for acc in example_resource.accelerators: + if acc not in launched_resources.accelerators: + continue + self_count = example_resource.accelerators[acc] + existing_count = launched_resources.accelerators[acc] + if (isinstance(self_count, float) and + isinstance(existing_count, float) and + not math.isclose(self_count, existing_count)): + with ux_utils.print_exception_no_traceback(): + raise exceptions.ResourcesMismatchError( + 'Task requested resources with fractional ' + 'accelerator counts. For fractional ' + 'counts, the required count must match the ' + 'existing cluster. Got required accelerator' + f' {acc}:{self_count} but the existing ' + f'cluster has {acc}:{existing_count}.') requested_resource_str = ', '.join(requested_resource_list) if isinstance(task.resources, list): requested_resource_str = f'[{requested_resource_str}]' diff --git a/sky/resources.py b/sky/resources.py index 5b5f4f6c2fe..ea027ab89f4 100644 --- a/sky/resources.py +++ b/sky/resources.py @@ -1,6 +1,7 @@ """Resources: compute requirements of Tasks.""" import dataclasses import functools +import math import textwrap from typing import Any, Dict, List, Optional, Set, Tuple, Union @@ -1141,6 +1142,16 @@ def less_demanding_than( return False if self.accelerators[acc] > other_accelerators[acc]: return False + if isinstance(self.accelerators[acc], float) or isinstance( + other_accelerators[acc], float): + # If the requested accelerator count is a float, we only + # allow strictly equal counts since all of the float point + # accelerator counts are less than 1 (e.g., 0.1, 0.5), and + # we want to avoid semantic ambiguity (e.g. launching + # with --gpus A10:0.25 on a A10:0.75 cluster). + if not math.isclose(self.accelerators[acc], + other_accelerators[acc]): + return False # self.accelerators <= other.accelerators if (self.accelerator_args is not None and From 07e47d62a2d837299fce6b438fb99d2dce6d9cbb Mon Sep 17 00:00:00 2001 From: cblmemo Date: Tue, 3 Sep 2024 14:54:32 -0700 Subject: [PATCH 09/25] address comment --- sky/backends/cloud_vm_ray_backend.py | 4 ++-- sky/clouds/service_catalog/data_fetchers/fetch_azure.py | 2 ++ sky/resources.py | 4 ++-- 3 files changed, 6 insertions(+), 4 deletions(-) diff --git a/sky/backends/cloud_vm_ray_backend.py b/sky/backends/cloud_vm_ray_backend.py index 5500d6733b0..c4d57202efd 100644 --- a/sky/backends/cloud_vm_ray_backend.py +++ b/sky/backends/cloud_vm_ray_backend.py @@ -2670,8 +2670,8 @@ def check_resources_fit_cluster( continue self_count = example_resource.accelerators[acc] existing_count = launched_resources.accelerators[acc] - if (isinstance(self_count, float) and - isinstance(existing_count, float) and + if (isinstance(existing_count, float) and + not existing_count.is_integer() and not math.isclose(self_count, existing_count)): with ux_utils.print_exception_no_traceback(): raise exceptions.ResourcesMismatchError( diff --git a/sky/clouds/service_catalog/data_fetchers/fetch_azure.py b/sky/clouds/service_catalog/data_fetchers/fetch_azure.py index cb2f39d18fc..1eaaaae37dc 100644 --- a/sky/clouds/service_catalog/data_fetchers/fetch_azure.py +++ b/sky/clouds/service_catalog/data_fetchers/fetch_azure.py @@ -281,6 +281,8 @@ def _upd_a10_gpu_count(row): return row['AcceleratorCount'] # Manually update the GPU count for fractional A10 instance types. + # Those instance types have fractional GPU count, but Azure API returns + # 1 GPU count for them. We manually update the GPU count here. df_ret['AcceleratorCount'] = df_ret.apply(_upd_a10_gpu_count, axis='columns') diff --git a/sky/resources.py b/sky/resources.py index ea027ab89f4..e5c28ea87c5 100644 --- a/sky/resources.py +++ b/sky/resources.py @@ -1142,8 +1142,8 @@ def less_demanding_than( return False if self.accelerators[acc] > other_accelerators[acc]: return False - if isinstance(self.accelerators[acc], float) or isinstance( - other_accelerators[acc], float): + if (isinstance(other_accelerators[acc], float) and + not other_accelerators[acc].is_integer()): # If the requested accelerator count is a float, we only # allow strictly equal counts since all of the float point # accelerator counts are less than 1 (e.g., 0.1, 0.5), and From 84d6d0d99cfd7b56fb1d745d36d630ebf11c6980 Mon Sep 17 00:00:00 2001 From: cblmemo Date: Fri, 6 Sep 2024 17:10:32 -0700 Subject: [PATCH 10/25] change back condition --- sky/backends/cloud_vm_ray_backend.py | 4 ++-- sky/resources.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/sky/backends/cloud_vm_ray_backend.py b/sky/backends/cloud_vm_ray_backend.py index a47203256d9..48083993223 100644 --- a/sky/backends/cloud_vm_ray_backend.py +++ b/sky/backends/cloud_vm_ray_backend.py @@ -2670,8 +2670,8 @@ def check_resources_fit_cluster( continue self_count = example_resource.accelerators[acc] existing_count = launched_resources.accelerators[acc] - if (isinstance(existing_count, float) and - not existing_count.is_integer() and + if (isinstance(self_count, float) and + isinstance(existing_count, float) and not math.isclose(self_count, existing_count)): with ux_utils.print_exception_no_traceback(): raise exceptions.ResourcesMismatchError( diff --git a/sky/resources.py b/sky/resources.py index e5c28ea87c5..ea027ab89f4 100644 --- a/sky/resources.py +++ b/sky/resources.py @@ -1142,8 +1142,8 @@ def less_demanding_than( return False if self.accelerators[acc] > other_accelerators[acc]: return False - if (isinstance(other_accelerators[acc], float) and - not other_accelerators[acc].is_integer()): + if isinstance(self.accelerators[acc], float) or isinstance( + other_accelerators[acc], float): # If the requested accelerator count is a float, we only # allow strictly equal counts since all of the float point # accelerator counts are less than 1 (e.g., 0.1, 0.5), and From eca70334c27f64ba3b269c0988369f204e212444 Mon Sep 17 00:00:00 2001 From: cblmemo Date: Fri, 6 Sep 2024 19:27:03 -0700 Subject: [PATCH 11/25] fix --- sky/clouds/service_catalog/data_fetchers/fetch_azure.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sky/clouds/service_catalog/data_fetchers/fetch_azure.py b/sky/clouds/service_catalog/data_fetchers/fetch_azure.py index 1eaaaae37dc..1b98c9b7cd4 100644 --- a/sky/clouds/service_catalog/data_fetchers/fetch_azure.py +++ b/sky/clouds/service_catalog/data_fetchers/fetch_azure.py @@ -98,7 +98,7 @@ def get_regions() -> List[str]: # Ref: https://learn.microsoft.com/en-us/azure/virtual-machines/nva10v5-series # TODO(zhwu,tian): Support fractional GPUs on k8s as well. AZURE_FRACTIONAL_A10_INS_TYPE_TO_NUM_GPUS = { - f'Standard_NV{vcpu}ads_A10_v5': vcpu / 24 for vcpu in [6, 12, 18] + f'Standard_NV{vcpu}ads_A10_v5': vcpu / 36 for vcpu in [6, 12, 18] } USEFUL_COLUMNS = [ From 0055fc1e6524998f4c5bce552c4a632d2a19a43b Mon Sep 17 00:00:00 2001 From: cblmemo Date: Tue, 10 Sep 2024 23:25:57 -0700 Subject: [PATCH 12/25] apply suggestions from code review --- sky/backends/cloud_vm_ray_backend.py | 14 +++++++------- sky/clouds/azure.py | 6 ++---- sky/resources.py | 8 ++++---- 3 files changed, 13 insertions(+), 15 deletions(-) diff --git a/sky/backends/cloud_vm_ray_backend.py b/sky/backends/cloud_vm_ray_backend.py index 48083993223..d9dfa55e73c 100644 --- a/sky/backends/cloud_vm_ray_backend.py +++ b/sky/backends/cloud_vm_ray_backend.py @@ -2670,16 +2670,16 @@ def check_resources_fit_cluster( continue self_count = example_resource.accelerators[acc] existing_count = launched_resources.accelerators[acc] - if (isinstance(self_count, float) and - isinstance(existing_count, float) and + if (isinstance(existing_count, float) and + not existing_count.is_integer() and not math.isclose(self_count, existing_count)): with ux_utils.print_exception_no_traceback(): raise exceptions.ResourcesMismatchError( - 'Task requested resources with fractional ' - 'accelerator counts. For fractional ' - 'counts, the required count must match the ' - 'existing cluster. Got required accelerator' - f' {acc}:{self_count} but the existing ' + 'Cluster has a fractional accelerator ' + 'counts. For such cluster, the required ' + 'count must match exactly with the count ' + 'in the cluster. Got required accelerator ' + f'{acc}:{self_count} but the existing ' f'cluster has {acc}:{existing_count}.') requested_resource_str = ', '.join(requested_resource_list) if isinstance(task.resources, list): diff --git a/sky/clouds/azure.py b/sky/clouds/azure.py index 0236aec0884..696f332ce6f 100644 --- a/sky/clouds/azure.py +++ b/sky/clouds/azure.py @@ -284,11 +284,9 @@ def make_deploy_resources_variables( acc_dict = self.get_accelerators_from_instance_type(r.instance_type) acc_count = None if acc_dict is not None: - custom_resources = resources_utils.make_ray_custom_resources_str( - acc_dict) acc_count = str(sum(acc_dict.values())) - else: - custom_resources = None + custom_resources = resources_utils.make_ray_custom_resources_str( + acc_dict) if (resources.image_id is None or resources.extract_docker_image() is not None): diff --git a/sky/resources.py b/sky/resources.py index ea027ab89f4..73ec1136b79 100644 --- a/sky/resources.py +++ b/sky/resources.py @@ -1142,11 +1142,11 @@ def less_demanding_than( return False if self.accelerators[acc] > other_accelerators[acc]: return False - if isinstance(self.accelerators[acc], float) or isinstance( - other_accelerators[acc], float): - # If the requested accelerator count is a float, we only + if (isinstance(other_accelerators[acc], float) and + not other_accelerators[acc].is_integer()): + # If the existing accelerator count is a float, we only # allow strictly equal counts since all of the float point - # accelerator counts are less than 1 (e.g., 0.1, 0.5), and + # accelerator counts are less than 1 (e.g., 0.167, 0.5), and # we want to avoid semantic ambiguity (e.g. launching # with --gpus A10:0.25 on a A10:0.75 cluster). if not math.isclose(self.accelerators[acc], From 96521195b4be5ab7e7495a0d80abc52cf8fa7b79 Mon Sep 17 00:00:00 2001 From: cblmemo Date: Wed, 11 Sep 2024 00:01:40 -0700 Subject: [PATCH 13/25] fix --- sky/backends/cloud_vm_ray_backend.py | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/sky/backends/cloud_vm_ray_backend.py b/sky/backends/cloud_vm_ray_backend.py index d9dfa55e73c..5816523bbce 100644 --- a/sky/backends/cloud_vm_ray_backend.py +++ b/sky/backends/cloud_vm_ray_backend.py @@ -3429,6 +3429,22 @@ def _execute( valid_resource = self.check_resources_fit_cluster(handle, task, check_ports=True) + # For fractional acc count clusters, we round up the number of accs to 1 + # (see sky/utils/resources_utils.py::make_ray_custom_resources_str). + # Also, we requires to launch on such cluster, the specified acc count + # must exactly match the launched acc count. Therefore, here we set the + # required acc count to 1 to make sure there will be only one task + # running on the cluster. + launched_accs = handle.launched_resources.accelerators + if (launched_accs is not None and + valid_resource.accelerators is not None): + for _, count in launched_accs.items(): + if isinstance(count, float) and not count.is_integer(): + valid_resource = valid_resource.copy( + accelerators={ + k: math.ceil(v) + for k, v in valid_resource.accelerators.items() + }) task_copy = copy.copy(task) # Handle multiple resources exec case. task_copy.set_resources(valid_resource) From a5c5b15cfec2b5f15de405828904d018f6d2077c Mon Sep 17 00:00:00 2001 From: Tian Xia Date: Wed, 11 Sep 2024 00:04:36 -0700 Subject: [PATCH 14/25] Update sky/backends/cloud_vm_ray_backend.py Co-authored-by: Zhanghao Wu --- sky/backends/cloud_vm_ray_backend.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/sky/backends/cloud_vm_ray_backend.py b/sky/backends/cloud_vm_ray_backend.py index 5816523bbce..ce7c661e169 100644 --- a/sky/backends/cloud_vm_ray_backend.py +++ b/sky/backends/cloud_vm_ray_backend.py @@ -2676,9 +2676,8 @@ def check_resources_fit_cluster( with ux_utils.print_exception_no_traceback(): raise exceptions.ResourcesMismatchError( 'Cluster has a fractional accelerator ' - 'counts. For such cluster, the required ' - 'count must match exactly with the count ' - 'in the cluster. Got required accelerator ' + 'counts. For such cluster, a task should request exact ' + 'the same count of accelerators. Got required accelerator ' f'{acc}:{self_count} but the existing ' f'cluster has {acc}:{existing_count}.') requested_resource_str = ', '.join(requested_resource_list) From d2cff96bbf87899919d3d6c7ec2eb26357230441 Mon Sep 17 00:00:00 2001 From: cblmemo Date: Wed, 11 Sep 2024 00:05:56 -0700 Subject: [PATCH 15/25] format --- sky/backends/cloud_vm_ray_backend.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/sky/backends/cloud_vm_ray_backend.py b/sky/backends/cloud_vm_ray_backend.py index ce7c661e169..b3f4c15f78d 100644 --- a/sky/backends/cloud_vm_ray_backend.py +++ b/sky/backends/cloud_vm_ray_backend.py @@ -2676,8 +2676,9 @@ def check_resources_fit_cluster( with ux_utils.print_exception_no_traceback(): raise exceptions.ResourcesMismatchError( 'Cluster has a fractional accelerator ' - 'counts. For such cluster, a task should request exact ' - 'the same count of accelerators. Got required accelerator ' + 'counts. For such cluster, a task should ' + 'request exact the same count of ' + 'accelerators. Got required accelerator ' f'{acc}:{self_count} but the existing ' f'cluster has {acc}:{existing_count}.') requested_resource_str = ', '.join(requested_resource_list) From e8e9954870cc27202b419bf190ab7952d18cce3d Mon Sep 17 00:00:00 2001 From: cblmemo Date: Wed, 11 Sep 2024 00:34:31 -0700 Subject: [PATCH 16/25] fix display of fuzzy candidates --- sky/clouds/service_catalog/common.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/sky/clouds/service_catalog/common.py b/sky/clouds/service_catalog/common.py index 4a70676babb..0ea317cd80e 100644 --- a/sky/clouds/service_catalog/common.py +++ b/sky/clouds/service_catalog/common.py @@ -527,8 +527,11 @@ def get_instance_type_for_accelerator_impl( fuzzy_candidate_list = [] if len(fuzzy_result) > 0: for _, row in fuzzy_result.iterrows(): + acc_cnt = row['AcceleratorCount'] + acc_count_display = (int(acc_cnt) + if acc_cnt.is_integer() else acc_cnt) fuzzy_candidate_list.append(f'{row["AcceleratorName"]}:' - f'{int(row["AcceleratorCount"])}') + f'{acc_count_display}') return (None, fuzzy_candidate_list) result = _filter_with_cpus(result, cpus) From db607fac96bfe47244287fb25d532e605e5607e7 Mon Sep 17 00:00:00 2001 From: cblmemo Date: Wed, 11 Sep 2024 22:48:06 -0700 Subject: [PATCH 17/25] fix precision issue --- sky/clouds/service_catalog/data_fetchers/fetch_azure.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/sky/clouds/service_catalog/data_fetchers/fetch_azure.py b/sky/clouds/service_catalog/data_fetchers/fetch_azure.py index 1b98c9b7cd4..f646cac339a 100644 --- a/sky/clouds/service_catalog/data_fetchers/fetch_azure.py +++ b/sky/clouds/service_catalog/data_fetchers/fetch_azure.py @@ -97,8 +97,10 @@ def get_regions() -> List[str]: # in the API response. We manually changing the number of GPUs to a float here. # Ref: https://learn.microsoft.com/en-us/azure/virtual-machines/nva10v5-series # TODO(zhwu,tian): Support fractional GPUs on k8s as well. +# TODO(tian): Maybe we should support literally fractional count, i.e. A10:1/6 +# instead of float point count (A10:0.167). AZURE_FRACTIONAL_A10_INS_TYPE_TO_NUM_GPUS = { - f'Standard_NV{vcpu}ads_A10_v5': vcpu / 36 for vcpu in [6, 12, 18] + f'Standard_NV{vcpu}ads_A10_v5': round(vcpu / 36, 3) for vcpu in [6, 12, 18] } USEFUL_COLUMNS = [ From e98ecdc0475fb5551e37571faa80da2f17dea609 Mon Sep 17 00:00:00 2001 From: cblmemo Date: Wed, 11 Sep 2024 23:13:05 -0700 Subject: [PATCH 18/25] fix num gpu required --- sky/backends/cloud_vm_ray_backend.py | 26 +++----------------------- sky/resources.py | 11 ----------- 2 files changed, 3 insertions(+), 34 deletions(-) diff --git a/sky/backends/cloud_vm_ray_backend.py b/sky/backends/cloud_vm_ray_backend.py index b3f4c15f78d..582c489fc23 100644 --- a/sky/backends/cloud_vm_ray_backend.py +++ b/sky/backends/cloud_vm_ray_backend.py @@ -2663,24 +2663,6 @@ def check_resources_fit_cluster( 'stores, but the existing cluster with ' f'{launched_resources!r} does not support FUSE ' f'mounting. Launch a new cluster to run this task.') - if (example_resource.accelerators is not None and - launched_resources.accelerators is not None): - for acc in example_resource.accelerators: - if acc not in launched_resources.accelerators: - continue - self_count = example_resource.accelerators[acc] - existing_count = launched_resources.accelerators[acc] - if (isinstance(existing_count, float) and - not existing_count.is_integer() and - not math.isclose(self_count, existing_count)): - with ux_utils.print_exception_no_traceback(): - raise exceptions.ResourcesMismatchError( - 'Cluster has a fractional accelerator ' - 'counts. For such cluster, a task should ' - 'request exact the same count of ' - 'accelerators. Got required accelerator ' - f'{acc}:{self_count} but the existing ' - f'cluster has {acc}:{existing_count}.') requested_resource_str = ', '.join(requested_resource_list) if isinstance(task.resources, list): requested_resource_str = f'[{requested_resource_str}]' @@ -3431,10 +3413,8 @@ def _execute( check_ports=True) # For fractional acc count clusters, we round up the number of accs to 1 # (see sky/utils/resources_utils.py::make_ray_custom_resources_str). - # Also, we requires to launch on such cluster, the specified acc count - # must exactly match the launched acc count. Therefore, here we set the - # required acc count to 1 to make sure there will be only one task - # running on the cluster. + # Here we scale the required acc count to (required / launched) * 1 so + # the total number of accs is the same as the requested number. launched_accs = handle.launched_resources.accelerators if (launched_accs is not None and valid_resource.accelerators is not None): @@ -3442,7 +3422,7 @@ def _execute( if isinstance(count, float) and not count.is_integer(): valid_resource = valid_resource.copy( accelerators={ - k: math.ceil(v) + k: v / count for k, v in valid_resource.accelerators.items() }) task_copy = copy.copy(task) diff --git a/sky/resources.py b/sky/resources.py index 73ec1136b79..5b5f4f6c2fe 100644 --- a/sky/resources.py +++ b/sky/resources.py @@ -1,7 +1,6 @@ """Resources: compute requirements of Tasks.""" import dataclasses import functools -import math import textwrap from typing import Any, Dict, List, Optional, Set, Tuple, Union @@ -1142,16 +1141,6 @@ def less_demanding_than( return False if self.accelerators[acc] > other_accelerators[acc]: return False - if (isinstance(other_accelerators[acc], float) and - not other_accelerators[acc].is_integer()): - # If the existing accelerator count is a float, we only - # allow strictly equal counts since all of the float point - # accelerator counts are less than 1 (e.g., 0.167, 0.5), and - # we want to avoid semantic ambiguity (e.g. launching - # with --gpus A10:0.25 on a A10:0.75 cluster). - if not math.isclose(self.accelerators[acc], - other_accelerators[acc]): - return False # self.accelerators <= other.accelerators if (self.accelerator_args is not None and From 8ada7a277a5dd9c6e2000b945abeac56a5769618 Mon Sep 17 00:00:00 2001 From: cblmemo Date: Fri, 11 Oct 2024 15:22:57 -0700 Subject: [PATCH 19/25] refactor in check_resources_fit_cluster --- sky/backends/cloud_vm_ray_backend.py | 29 ++++++++++++++-------------- 1 file changed, 15 insertions(+), 14 deletions(-) diff --git a/sky/backends/cloud_vm_ray_backend.py b/sky/backends/cloud_vm_ray_backend.py index 582c489fc23..8ec3396205a 100644 --- a/sky/backends/cloud_vm_ray_backend.py +++ b/sky/backends/cloud_vm_ray_backend.py @@ -2676,6 +2676,21 @@ def check_resources_fit_cluster( f' Existing:\t{handle.launched_nodes}x ' f'{handle.launched_resources}\n' f'{mismatch_str}') + else: + # For fractional acc count clusters, we round up the number of accs + # to 1 (sky/utils/resources_utils.py::make_ray_custom_resources_str) + # Here we scale the required acc count to (required / launched) * 1 + # so the total number of accs is the same as the requested number. + launched_accs = launched_resources.accelerators + if (launched_accs is not None and + valid_resource.accelerators is not None): + for _, count in launched_accs.items(): + if isinstance(count, float) and not count.is_integer(): + valid_resource = valid_resource.copy( + accelerators={ + k: v / count + for k, v in valid_resource.accelerators.items() + }) return valid_resource def _provision( @@ -3411,20 +3426,6 @@ def _execute( valid_resource = self.check_resources_fit_cluster(handle, task, check_ports=True) - # For fractional acc count clusters, we round up the number of accs to 1 - # (see sky/utils/resources_utils.py::make_ray_custom_resources_str). - # Here we scale the required acc count to (required / launched) * 1 so - # the total number of accs is the same as the requested number. - launched_accs = handle.launched_resources.accelerators - if (launched_accs is not None and - valid_resource.accelerators is not None): - for _, count in launched_accs.items(): - if isinstance(count, float) and not count.is_integer(): - valid_resource = valid_resource.copy( - accelerators={ - k: v / count - for k, v in valid_resource.accelerators.items() - }) task_copy = copy.copy(task) # Handle multiple resources exec case. task_copy.set_resources(valid_resource) From f6c9fad202ac6f6788b2df69768c475d69a43720 Mon Sep 17 00:00:00 2001 From: cblmemo Date: Fri, 11 Oct 2024 15:26:09 -0700 Subject: [PATCH 20/25] change type annotation of acc_count --- sky/clouds/service_catalog/common.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sky/clouds/service_catalog/common.py b/sky/clouds/service_catalog/common.py index 0ea317cd80e..1ad89bfab71 100644 --- a/sky/clouds/service_catalog/common.py +++ b/sky/clouds/service_catalog/common.py @@ -501,7 +501,7 @@ def _convert(value): def get_instance_type_for_accelerator_impl( df: 'pd.DataFrame', acc_name: str, - acc_count: int, + acc_count: Union[int, float], cpus: Optional[str] = None, memory: Optional[str] = None, use_spot: bool = False, From a1f59a034a7a9c20bddbe55e353444b8ba20333d Mon Sep 17 00:00:00 2001 From: cblmemo Date: Fri, 11 Oct 2024 15:29:37 -0700 Subject: [PATCH 21/25] enable fuzzy fp acc count --- sky/clouds/service_catalog/common.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sky/clouds/service_catalog/common.py b/sky/clouds/service_catalog/common.py index 1ad89bfab71..a20da388874 100644 --- a/sky/clouds/service_catalog/common.py +++ b/sky/clouds/service_catalog/common.py @@ -514,7 +514,7 @@ def get_instance_type_for_accelerator_impl( accelerators with sorted prices and a list of candidates with fuzzy search. """ result = df[(df['AcceleratorName'].str.fullmatch(acc_name, case=False)) & - (df['AcceleratorCount'] == acc_count)] + (abs(df['AcceleratorCount'] - acc_count) <= 0.01)] result = _filter_region_zone(result, region, zone) if len(result) == 0: fuzzy_result = df[ From 3200d398b9a2c4bd30701f045a5897634754c08c Mon Sep 17 00:00:00 2001 From: cblmemo Date: Fri, 11 Oct 2024 15:31:09 -0700 Subject: [PATCH 22/25] fix k8s --- sky/clouds/kubernetes.py | 1 + 1 file changed, 1 insertion(+) diff --git a/sky/clouds/kubernetes.py b/sky/clouds/kubernetes.py index d674eca259a..c95dbd83df2 100644 --- a/sky/clouds/kubernetes.py +++ b/sky/clouds/kubernetes.py @@ -1,4 +1,5 @@ """Kubernetes.""" +import functools import os import re import typing From fb3049f42405f484b90a9dcc752f38efde685ec7 Mon Sep 17 00:00:00 2001 From: Tian Xia Date: Fri, 25 Oct 2024 13:14:43 -0700 Subject: [PATCH 23/25] Update sky/clouds/service_catalog/common.py Co-authored-by: Zhanghao Wu --- sky/clouds/service_catalog/common.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sky/clouds/service_catalog/common.py b/sky/clouds/service_catalog/common.py index f48ca3fea8c..13af48ad217 100644 --- a/sky/clouds/service_catalog/common.py +++ b/sky/clouds/service_catalog/common.py @@ -530,7 +530,7 @@ def get_instance_type_for_accelerator_impl( for _, row in fuzzy_result.iterrows(): acc_cnt = row['AcceleratorCount'] acc_count_display = (int(acc_cnt) - if acc_cnt.is_integer() else acc_cnt) + if acc_cnt.is_integer() else f'{acc_cnt:.2f}') fuzzy_candidate_list.append(f'{row["AcceleratorName"]}:' f'{acc_count_display}') return (None, fuzzy_candidate_list) From 82d442f9ee54fbf0525c3e47300d11876f6c0d6c Mon Sep 17 00:00:00 2001 From: cblmemo Date: Fri, 25 Oct 2024 13:54:29 -0700 Subject: [PATCH 24/25] fix integer gpus --- sky/clouds/service_catalog/common.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sky/clouds/service_catalog/common.py b/sky/clouds/service_catalog/common.py index 13af48ad217..48868058462 100644 --- a/sky/clouds/service_catalog/common.py +++ b/sky/clouds/service_catalog/common.py @@ -528,7 +528,7 @@ def get_instance_type_for_accelerator_impl( fuzzy_candidate_list = [] if len(fuzzy_result) > 0: for _, row in fuzzy_result.iterrows(): - acc_cnt = row['AcceleratorCount'] + acc_cnt = float(row['AcceleratorCount']) acc_count_display = (int(acc_cnt) if acc_cnt.is_integer() else f'{acc_cnt:.2f}') fuzzy_candidate_list.append(f'{row["AcceleratorName"]}:' From 84d146c4f3127db13977d6133d517de8f6d8b215 Mon Sep 17 00:00:00 2001 From: cblmemo Date: Fri, 25 Oct 2024 13:57:21 -0700 Subject: [PATCH 25/25] format --- sky/clouds/service_catalog/common.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sky/clouds/service_catalog/common.py b/sky/clouds/service_catalog/common.py index 48868058462..1082b4e9efd 100644 --- a/sky/clouds/service_catalog/common.py +++ b/sky/clouds/service_catalog/common.py @@ -529,8 +529,8 @@ def get_instance_type_for_accelerator_impl( if len(fuzzy_result) > 0: for _, row in fuzzy_result.iterrows(): acc_cnt = float(row['AcceleratorCount']) - acc_count_display = (int(acc_cnt) - if acc_cnt.is_integer() else f'{acc_cnt:.2f}') + acc_count_display = (int(acc_cnt) if acc_cnt.is_integer() else + f'{acc_cnt:.2f}') fuzzy_candidate_list.append(f'{row["AcceleratorName"]}:' f'{acc_count_display}') return (None, fuzzy_candidate_list)