From bf4ef4da148fbf33b07d48a912f719fd8a2df48a Mon Sep 17 00:00:00 2001 From: mjibril Date: Mon, 18 Nov 2024 17:51:18 +0100 Subject: [PATCH] [FluidStack] Fix provisioning and add new gpu types (#4359) [FluidStack] Fix provisioning and add new gpu types * Add new `provisioning` status to fix failed deployments * Add H100 SXM5 GPU mapping --- .../data_fetchers/fetch_fluidstack.py | 23 ++++++++++++++++++- sky/provision/fluidstack/instance.py | 4 +--- 2 files changed, 23 insertions(+), 4 deletions(-) diff --git a/sky/clouds/service_catalog/data_fetchers/fetch_fluidstack.py b/sky/clouds/service_catalog/data_fetchers/fetch_fluidstack.py index cf943541e08..7a8b7e42e79 100644 --- a/sky/clouds/service_catalog/data_fetchers/fetch_fluidstack.py +++ b/sky/clouds/service_catalog/data_fetchers/fetch_fluidstack.py @@ -15,6 +15,26 @@ DEFAULT_FLUIDSTACK_API_KEY_PATH = os.path.expanduser('~/.fluidstack/api_key') plan_vcpus_memory = [{ + 'gpu_type': 'H100_SXM5_80GB', + 'gpu_count': 1, + 'min_cpu_count': 52, + 'min_memory': 450 +}, { + 'gpu_type': 'H100_SXM5_80GB', + 'gpu_count': 2, + 'min_cpu_count': 52, + 'min_memory': 450 +}, { + 'gpu_type': 'H100_SXM5_80GB', + 'gpu_count': 4, + 'min_cpu_count': 104, + 'min_memory': 900 +}, { + 'gpu_type': 'H100_SXM5_80GB', + 'gpu_count': 8, + 'min_cpu_count': 192, + 'min_memory': 1800 +}, { 'gpu_type': 'RTX_A6000_48GB', 'gpu_count': 2, 'min_cpu_count': 12, @@ -150,7 +170,8 @@ 'H100_PCIE_80GB': 'H100', 'H100_NVLINK_80GB': 'H100', 'A100_NVLINK_80GB': 'A100-80GB', - 'A100_SXM4_80GB': 'A100-80GB', + 'A100_SXM4_80GB': 'A100-80GB-SXM', + 'H100_SXM5_80GB': 'H100-SXM', 'A100_PCIE_80GB': 'A100-80GB', 'A100_SXM4_40GB': 'A100', 'A100_PCIE_40GB': 'A100', diff --git a/sky/provision/fluidstack/instance.py b/sky/provision/fluidstack/instance.py index 538aafc8887..7fa6cb0463b 100644 --- a/sky/provision/fluidstack/instance.py +++ b/sky/provision/fluidstack/instance.py @@ -79,9 +79,7 @@ def run_instances(region: str, cluster_name_on_cloud: str, config: common.ProvisionConfig) -> common.ProvisionRecord: """Runs instances for the given cluster.""" - pending_status = [ - 'pending', - ] + pending_status = ['pending', 'provisioning'] while True: instances = _filter_instances(cluster_name_on_cloud, pending_status) if len(instances) > config.count: