From 66b86350b567938ca8f181bed12d7b5e840c633c Mon Sep 17 00:00:00 2001 From: Zongheng Yang Date: Wed, 6 Dec 2023 16:25:09 -0800 Subject: [PATCH] Azure: update fetch_azure to support two H100 families. (#2844) * Azure: update fetch_azure to support two H100 families. * format --- .../data_fetchers/fetch_azure.py | 50 +++++++++++-------- sky/utils/accelerator_registry.py | 10 +++- 2 files changed, 38 insertions(+), 22 deletions(-) diff --git a/sky/clouds/service_catalog/data_fetchers/fetch_azure.py b/sky/clouds/service_catalog/data_fetchers/fetch_azure.py index c7714784896..33a87351e45 100644 --- a/sky/clouds/service_catalog/data_fetchers/fetch_azure.py +++ b/sky/clouds/service_catalog/data_fetchers/fetch_azure.py @@ -36,6 +36,34 @@ SINGLE_THREADED = False +# Family name to SkyPilot GPU name mapping. +# +# When adding a new accelerator: +# - The instance type is typically already fetched, but we need to find the +# family name and add it to this mapping. +# - To inspect family names returned by Azure API, check the dataframes in +# get_all_regions_instance_types_df(). +FAMILY_NAME_TO_SKYPILOT_GPU_NAME = { + 'standardNCFamily': 'K80', + 'standardNCSv2Family': 'P100', + 'standardNCSv3Family': 'V100', + 'standardNCPromoFamily': 'K80', + 'StandardNCASv3_T4Family': 'T4', + 'standardNDSv2Family': 'V100-32GB', + 'StandardNCADSA100v4Family': 'A100-80GB', + 'standardNDAMSv4_A100Family': 'A100-80GB', + 'StandardNDASv4_A100Family': 'A100', + 'standardNVFamily': 'M60', + 'standardNVSv2Family': 'M60', + 'standardNVSv3Family': 'M60', + 'standardNVPromoFamily': 'M60', + 'standardNVSv4Family': 'Radeon MI25', + 'standardNDSFamily': 'P40', + 'StandardNVADSA10v5Family': 'A10', + 'StandardNCadsH100v5Family': 'H100', + 'standardNDSH100v5Family': 'H100', +} + def get_regions() -> List[str]: """Get all available regions.""" @@ -78,7 +106,7 @@ def get_pricing_url(region: Optional[str] = None) -> str: def get_pricing_df(region: Optional[str] = None) -> pd.DataFrame: all_items = [] url = get_pricing_url(region) - print(f'Getting pricing for {region}') + print(f'Getting pricing for {region}, url: {url}') page = 0 while url is not None: page += 1 @@ -125,29 +153,11 @@ def get_sku_df(region_set: Set[str]) -> pd.DataFrame: def get_gpu_name(family: str) -> Optional[str]: - gpu_data = { - 'standardNCFamily': 'K80', - 'standardNCSv2Family': 'P100', - 'standardNCSv3Family': 'V100', - 'standardNCPromoFamily': 'K80', - 'StandardNCASv3_T4Family': 'T4', - 'standardNDSv2Family': 'V100-32GB', - 'StandardNCADSA100v4Family': 'A100-80GB', - 'standardNDAMSv4_A100Family': 'A100-80GB', - 'StandardNDASv4_A100Family': 'A100', - 'standardNVFamily': 'M60', - 'standardNVSv2Family': 'M60', - 'standardNVSv3Family': 'M60', - 'standardNVPromoFamily': 'M60', - 'standardNVSv4Family': 'Radeon MI25', - 'standardNDSFamily': 'P40', - 'StandardNVADSA10v5Family': 'A10', - } # NP-series offer Xilinx U250 FPGAs which are not GPUs, # so we do not include them here. # https://docs.microsoft.com/en-us/azure/virtual-machines/np-series family = family.replace(' ', '') - return gpu_data.get(family) + return FAMILY_NAME_TO_SKYPILOT_GPU_NAME.get(family) def get_all_regions_instance_types_df(region_set: Set[str]): diff --git a/sky/utils/accelerator_registry.py b/sky/utils/accelerator_registry.py index ada328171a7..5d234d79eba 100644 --- a/sky/utils/accelerator_registry.py +++ b/sky/utils/accelerator_registry.py @@ -6,9 +6,12 @@ # NOTE: Must include accelerators supported for local clusters. # # 1. What if a name is in this list, but not in any catalog? +# # The name will be canonicalized, but the accelerator will not be supported. # Optimizer will print an error message. +# # 2. What if a name is not in this list, but in a catalog? +# # The list is simply an optimization to short-circuit the search in the catalog. # If the name is not found in the list, it will be searched in the catalog # with its case being ignored. If a match is found, the name will be @@ -16,7 +19,9 @@ # expensive operation, as it requires reading the catalog or making external # API calls (such as for Kubernetes). Thus it is desirable to keep this list # up-to-date with commonly used accelerators. + # 3. (For SkyPilot dev) What to do if I want to add a new accelerator? +# # Append its case-sensitive canonical name to this list. The name must match # `AcceleratorName` in the service catalog, or what we define in # `onprem_utils.get_local_cluster_accelerators`. @@ -42,6 +47,7 @@ 'Radeon MI25', 'P4', 'L4', + 'H100', ] @@ -72,11 +78,11 @@ def canonicalize_accelerator_name(accelerator: str) -> str: if len(names) == 1: return names[0] - # Do not print an error meessage here. Optimizer will handle it. + # Do not print an error message here. Optimizer will handle it. if len(names) == 0: return accelerator - # Currenlty unreachable. + # Currently unreachable. # This can happen if catalogs have the same accelerator with # different names (e.g., A10g and A10G). assert len(names) > 1