From 4e71e6f52694a029ebd82aa938fc73c23f81c0b0 Mon Sep 17 00:00:00 2001 From: Andrew Aikawa Date: Sat, 27 Apr 2024 02:09:05 +0000 Subject: [PATCH 01/26] GFDLabel formatter for k8s --- sky/provision/kubernetes/utils.py | 47 ++++++++++++++++++++++++++++++- 1 file changed, 46 insertions(+), 1 deletion(-) diff --git a/sky/provision/kubernetes/utils.py b/sky/provision/kubernetes/utils.py index 1cb31328d50..c460f453f82 100644 --- a/sky/provision/kubernetes/utils.py +++ b/sky/provision/kubernetes/utils.py @@ -95,6 +95,28 @@ def get_gke_accelerator_name(accelerator: str) -> str: else: return 'nvidia-tesla-{}'.format(accelerator.lower()) +def get_gfd_accelerator_from_value(value: str) -> str: + """Returns the accelerator name for GPU feature discovery (GFD) labeled nodes. + + Searches against a canonical list of NVIDIA GPUs and pattern + matches the canonical GPU name against the GFD label. Taken from + sky/utils/kubernetes/k8s_gpu_labeler_setup.yaml + """ + canonical_gpu_names = [ + 'A100-80GB', 'A100', 'A10G', 'H100', 'K80', 'M60', 'T4g', 'T4', 'V100', + 'A10', 'P100', 'P40', 'P4', 'L4' + ] + + for canonical_name in canonical_gpu_names: + if canonical_name.lower() in value.lower(): + return canonical_name + + # If we didn't find a canonical name: + # 1. remove 'NVIDIA ' if present (e.g., 'NVIDIA RTX A6000' -> 'RTX A6000') + # 2. remove 'GeForce ' if present (e.g., 'NVIDIA GeForce RTX 3070' -> 'RTX 3070') + # 3. replace 'RTX ' with 'RTX' (without spaces) (e.g., 'RTX 6000' -> 'RTX6000') + return gpu_name.lower().replace('nvidia ', '').replace('geforce ', '').replace('rtx ', 'rtx').replace(' ', '-') + class SkyPilotLabelFormatter(GPULabelFormatter): """Custom label formatter for SkyPilot @@ -178,11 +200,34 @@ def get_accelerator_from_label_value(cls, value: str) -> str: f'Invalid accelerator name in GKE cluster: {value}') +class GFDLabelFormatter(GPULabelFormatter): + """GPU Feature Discovery label formatter + + NVIDIA GPUs nodes are labeled by GPU feature discovery + e.g. nvidia.com/gpu.product=NVIDIA-H100-80GB-HBM3 + https://github.com/NVIDIA/gpu-feature-discovery + + GPU feature discovery is included as part of the + NVIDIA GPU Operator: + https://docs.nvidia.com/datacenter/cloud-native/gpu-operator/latest/overview.html + """ + + LABEL_KEY = 'nvidia.com/gpu.product' + + @classmethod + def get_label_key(cls) -> str: + return cls.LABEL_KEY + + @classmethod + def get_accelerator_from_label_value(cls, value: str) -> str: + return get_gfd_accelerator_from_value(value) + + # LABEL_FORMATTER_REGISTRY stores the label formats SkyPilot will try to # discover the accelerator type from. The order of the list is important, as # it will be used to determine the priority of the label formats. LABEL_FORMATTER_REGISTRY = [ - SkyPilotLabelFormatter, CoreWeaveLabelFormatter, GKELabelFormatter + SkyPilotLabelFormatter, CoreWeaveLabelFormatter, GKELabelFormatter, GFDLabelFormatter ] From b91a57609b20de58ada99e0b20fab995fcad5b2d Mon Sep 17 00:00:00 2001 From: Andrew Aikawa Date: Sat, 27 Apr 2024 02:12:32 +0000 Subject: [PATCH 02/26] update comment --- sky/provision/kubernetes/utils.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/sky/provision/kubernetes/utils.py b/sky/provision/kubernetes/utils.py index c460f453f82..041280b6492 100644 --- a/sky/provision/kubernetes/utils.py +++ b/sky/provision/kubernetes/utils.py @@ -108,13 +108,14 @@ def get_gfd_accelerator_from_value(value: str) -> str: ] for canonical_name in canonical_gpu_names: - if canonical_name.lower() in value.lower(): + if canonical_name in value: return canonical_name # If we didn't find a canonical name: # 1. remove 'NVIDIA ' if present (e.g., 'NVIDIA RTX A6000' -> 'RTX A6000') # 2. remove 'GeForce ' if present (e.g., 'NVIDIA GeForce RTX 3070' -> 'RTX 3070') # 3. replace 'RTX ' with 'RTX' (without spaces) (e.g., 'RTX 6000' -> 'RTX6000') + # 4. replace any other spaces with dashes (e.g. 'RTX 2080 Ti' -> 'RTX2080-Ti') return gpu_name.lower().replace('nvidia ', '').replace('geforce ', '').replace('rtx ', 'rtx').replace(' ', '-') From 95302b4bac18435d2efb5becefe002047822b984 Mon Sep 17 00:00:00 2001 From: Andrew Aikawa Date: Wed, 1 May 2024 18:31:37 +0000 Subject: [PATCH 03/26] format --- sky/provision/kubernetes/utils.py | 27 ++++++++++++++++---------- tests/kubernetes/scripts/deploy_k3s.sh | 17 +++++++++++----- 2 files changed, 29 insertions(+), 15 deletions(-) diff --git a/sky/provision/kubernetes/utils.py b/sky/provision/kubernetes/utils.py index 041280b6492..721fa64e301 100644 --- a/sky/provision/kubernetes/utils.py +++ b/sky/provision/kubernetes/utils.py @@ -95,28 +95,30 @@ def get_gke_accelerator_name(accelerator: str) -> str: else: return 'nvidia-tesla-{}'.format(accelerator.lower()) + def get_gfd_accelerator_from_value(value: str) -> str: - """Returns the accelerator name for GPU feature discovery (GFD) labeled nodes. - + """Returns the accelerator name for GPU feature discovery (GFD) + labeled nodes. + Searches against a canonical list of NVIDIA GPUs and pattern matches the canonical GPU name against the GFD label. Taken from sky/utils/kubernetes/k8s_gpu_labeler_setup.yaml """ canonical_gpu_names = [ - 'A100-80GB', 'A100', 'A10G', 'H100', 'K80', 'M60', 'T4g', 'T4', 'V100', + 'A100-80GB', 'A100', 'A10G', 'H100', 'K80', 'M60', 'T4g', 'T4', 'V100', 'A10', 'P100', 'P40', 'P4', 'L4' ] for canonical_name in canonical_gpu_names: if canonical_name in value: return canonical_name - + # If we didn't find a canonical name: - # 1. remove 'NVIDIA ' if present (e.g., 'NVIDIA RTX A6000' -> 'RTX A6000') - # 2. remove 'GeForce ' if present (e.g., 'NVIDIA GeForce RTX 3070' -> 'RTX 3070') - # 3. replace 'RTX ' with 'RTX' (without spaces) (e.g., 'RTX 6000' -> 'RTX6000') - # 4. replace any other spaces with dashes (e.g. 'RTX 2080 Ti' -> 'RTX2080-Ti') - return gpu_name.lower().replace('nvidia ', '').replace('geforce ', '').replace('rtx ', 'rtx').replace(' ', '-') + # 1. remove 'NVIDIA-' (e.g., 'NVIDIA-RTX-A6000' -> 'RTX-A6000') + # 2. remove 'GEFORCE-' (e.g., 'NVIDIA-GEFORCE-RTX-3070' -> 'RTX-3070') + # 3. replace 'RTX-' with 'RTX' (e.g., 'RTX-6000' -> 'RTX6000') + return value.replace('NVIDIA-', '').replace('GEFORCE-', + '').replace('RTX-', 'RTX') class SkyPilotLabelFormatter(GPULabelFormatter): @@ -219,6 +221,10 @@ class GFDLabelFormatter(GPULabelFormatter): def get_label_key(cls) -> str: return cls.LABEL_KEY + @classmethod + def get_label_value(cls, accelerator: str) -> str: + return accelerator.upper() + @classmethod def get_accelerator_from_label_value(cls, value: str) -> str: return get_gfd_accelerator_from_value(value) @@ -228,7 +234,8 @@ def get_accelerator_from_label_value(cls, value: str) -> str: # discover the accelerator type from. The order of the list is important, as # it will be used to determine the priority of the label formats. LABEL_FORMATTER_REGISTRY = [ - SkyPilotLabelFormatter, CoreWeaveLabelFormatter, GKELabelFormatter, GFDLabelFormatter + SkyPilotLabelFormatter, CoreWeaveLabelFormatter, GKELabelFormatter, + GFDLabelFormatter ] diff --git a/tests/kubernetes/scripts/deploy_k3s.sh b/tests/kubernetes/scripts/deploy_k3s.sh index fb202d135e9..132feec810d 100644 --- a/tests/kubernetes/scripts/deploy_k3s.sh +++ b/tests/kubernetes/scripts/deploy_k3s.sh @@ -5,6 +5,9 @@ # sky launch -c k3s --cloud gcp --gpus T4:1 # scp deploy_k3s.sh k3s:~/ # ssh k3s +# # (optional) skip the skypilot labeler job +# export SKY_SKIP_K8S_LABEL=1 +# # deploy k3s # chmod +x deploy_k3s.sh && ./deploy_k3s.sh set -e @@ -71,6 +74,7 @@ sudo chown $(id -u):$(id -g) $HOME/.kube/config # Wait for k3s to be ready echo "Waiting for k3s to be ready" +sleep 5 kubectl wait --for=condition=ready node --all --timeout=5m # =========== GPU support =========== @@ -113,11 +117,14 @@ metadata: handler: nvidia EOF -# Label nodes with GPUs -echo "Labelling nodes with GPUs..." -python -m sky.utils.kubernetes.gpu_labeler +if [ ! "$SKY_SKIP_K8S_LABEL" -eq 1 ] +then + # Label nodes with GPUs + echo "Labelling nodes with GPUs..." + python -m sky.utils.kubernetes.gpu_labeler -# Wait for all the GPU labeling jobs to complete -wait_for_gpu_labeling_jobs + # Wait for all the GPU labeling jobs to complete + wait_for_gpu_labeling_jobs +fi echo "K3s cluster ready! Run sky check to setup Kubernetes access." From 5d3b360c407ba0b9114e08ddcbe76c82022272c6 Mon Sep 17 00:00:00 2001 From: Andrew Aikawa Date: Wed, 1 May 2024 22:15:24 +0000 Subject: [PATCH 04/26] substring match against k8s labels instead of strict matching --- sky/provision/kubernetes/utils.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/sky/provision/kubernetes/utils.py b/sky/provision/kubernetes/utils.py index 721fa64e301..e45d0b530c0 100644 --- a/sky/provision/kubernetes/utils.py +++ b/sky/provision/kubernetes/utils.py @@ -67,6 +67,9 @@ def get_label_value(cls, accelerator: str) -> str: def get_accelerator_from_label_value(cls, value: str) -> str: """Given a label value, returns the GPU type""" raise NotImplementedError + + @classmethod + def @classmethod def validate_label_value(cls, value: str) -> Tuple[bool, str]: @@ -453,7 +456,7 @@ def get_gpu_label_key_value(acc_type: str, check_mode=False) -> Tuple[str, str]: for node_name, label_list in node_labels.items(): for label, value in label_list: if (label == k8s_acc_label_key and - value == k8s_acc_label_value): + k8s_acc_label_value in value): # If a node is found, we can break out of the loop # and proceed to deploy. return k8s_acc_label_key, k8s_acc_label_value From acf9968431a803a412c482f2e20ecd17c050e620 Mon Sep 17 00:00:00 2001 From: Andrew Aikawa Date: Wed, 1 May 2024 22:16:12 +0000 Subject: [PATCH 05/26] cleanup --- sky/provision/kubernetes/utils.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/sky/provision/kubernetes/utils.py b/sky/provision/kubernetes/utils.py index e45d0b530c0..48809603cbf 100644 --- a/sky/provision/kubernetes/utils.py +++ b/sky/provision/kubernetes/utils.py @@ -67,9 +67,6 @@ def get_label_value(cls, accelerator: str) -> str: def get_accelerator_from_label_value(cls, value: str) -> str: """Given a label value, returns the GPU type""" raise NotImplementedError - - @classmethod - def @classmethod def validate_label_value(cls, value: str) -> Tuple[bool, str]: From e3bbde924ed42caebc7542849368aa3aa26a184c Mon Sep 17 00:00:00 2001 From: Andrew Aikawa Date: Wed, 1 May 2024 22:44:40 +0000 Subject: [PATCH 06/26] use k8s label --- sky/provision/kubernetes/utils.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/sky/provision/kubernetes/utils.py b/sky/provision/kubernetes/utils.py index 48809603cbf..8c5614e7536 100644 --- a/sky/provision/kubernetes/utils.py +++ b/sky/provision/kubernetes/utils.py @@ -116,9 +116,7 @@ def get_gfd_accelerator_from_value(value: str) -> str: # If we didn't find a canonical name: # 1. remove 'NVIDIA-' (e.g., 'NVIDIA-RTX-A6000' -> 'RTX-A6000') # 2. remove 'GEFORCE-' (e.g., 'NVIDIA-GEFORCE-RTX-3070' -> 'RTX-3070') - # 3. replace 'RTX-' with 'RTX' (e.g., 'RTX-6000' -> 'RTX6000') - return value.replace('NVIDIA-', '').replace('GEFORCE-', - '').replace('RTX-', 'RTX') + return value.replace('NVIDIA-', '').replace('GEFORCE-', '') class SkyPilotLabelFormatter(GPULabelFormatter): @@ -456,7 +454,7 @@ def get_gpu_label_key_value(acc_type: str, check_mode=False) -> Tuple[str, str]: k8s_acc_label_value in value): # If a node is found, we can break out of the loop # and proceed to deploy. - return k8s_acc_label_key, k8s_acc_label_value + return k8s_acc_label_key, value # If no node is found with the requested acc_type, raise error with ux_utils.print_exception_no_traceback(): suffix = '' From c87a40145929400025c075d4cc737e864c1a3a0a Mon Sep 17 00:00:00 2001 From: Andrew Aikawa Date: Wed, 1 May 2024 23:39:57 +0000 Subject: [PATCH 07/26] map k8s label value to accelerator instead of accelerator to label value --- sky/provision/kubernetes/utils.py | 29 +++-------------------------- 1 file changed, 3 insertions(+), 26 deletions(-) diff --git a/sky/provision/kubernetes/utils.py b/sky/provision/kubernetes/utils.py index 8c5614e7536..c2760333e2c 100644 --- a/sky/provision/kubernetes/utils.py +++ b/sky/provision/kubernetes/utils.py @@ -58,11 +58,6 @@ def get_label_key(cls) -> str: """Returns the label key for GPU type used by the Kubernetes cluster""" raise NotImplementedError - @classmethod - def get_label_value(cls, accelerator: str) -> str: - """Given a GPU type, returns the label value to be used""" - raise NotImplementedError - @classmethod def get_accelerator_from_label_value(cls, value: str) -> str: """Given a label value, returns the GPU type""" @@ -132,12 +127,6 @@ class SkyPilotLabelFormatter(GPULabelFormatter): def get_label_key(cls) -> str: return cls.LABEL_KEY - @classmethod - def get_label_value(cls, accelerator: str) -> str: - # For SkyPilot formatter, we use the accelerator str directly. - # See sky.utils.kubernetes.gpu_labeler. - return accelerator.lower() - @classmethod def get_accelerator_from_label_value(cls, value: str) -> str: return value.upper() @@ -164,10 +153,6 @@ class CoreWeaveLabelFormatter(GPULabelFormatter): def get_label_key(cls) -> str: return cls.LABEL_KEY - @classmethod - def get_label_value(cls, accelerator: str) -> str: - return accelerator.upper() - @classmethod def get_accelerator_from_label_value(cls, value: str) -> str: return value @@ -186,10 +171,6 @@ class GKELabelFormatter(GPULabelFormatter): def get_label_key(cls) -> str: return cls.LABEL_KEY - @classmethod - def get_label_value(cls, accelerator: str) -> str: - return get_gke_accelerator_name(accelerator) - @classmethod def get_accelerator_from_label_value(cls, value: str) -> str: if value.startswith('nvidia-tesla-'): @@ -219,10 +200,6 @@ class GFDLabelFormatter(GPULabelFormatter): def get_label_key(cls) -> str: return cls.LABEL_KEY - @classmethod - def get_label_value(cls, accelerator: str) -> str: - return accelerator.upper() - @classmethod def get_accelerator_from_label_value(cls, value: str) -> str: return get_gfd_accelerator_from_value(value) @@ -441,7 +418,6 @@ def get_gpu_label_key_value(acc_type: str, check_mode=False) -> Tuple[str, str]: # conclude that the cluster is setup correctly and return. return '', '' k8s_acc_label_key = label_formatter.get_label_key() - k8s_acc_label_value = label_formatter.get_label_value(acc_type) # Search in node_labels to see if any node has the requested # GPU type. # Note - this only checks if the label is available on a @@ -451,10 +427,11 @@ def get_gpu_label_key_value(acc_type: str, check_mode=False) -> Tuple[str, str]: for node_name, label_list in node_labels.items(): for label, value in label_list: if (label == k8s_acc_label_key and - k8s_acc_label_value in value): + label_formatter.get_accelerator_from_label_value( + value) == acc_type): # If a node is found, we can break out of the loop # and proceed to deploy. - return k8s_acc_label_key, value + return label, value # If no node is found with the requested acc_type, raise error with ux_utils.print_exception_no_traceback(): suffix = '' From d1b7b4c25fc6de099cb97d3e25d96a004aed1c28 Mon Sep 17 00:00:00 2001 From: Andrew Aikawa Date: Thu, 2 May 2024 00:35:14 +0000 Subject: [PATCH 08/26] remove unused get_gke_accelerator_name --- sky/provision/kubernetes/utils.py | 13 ------------- 1 file changed, 13 deletions(-) diff --git a/sky/provision/kubernetes/utils.py b/sky/provision/kubernetes/utils.py index c2760333e2c..5259a016910 100644 --- a/sky/provision/kubernetes/utils.py +++ b/sky/provision/kubernetes/utils.py @@ -78,19 +78,6 @@ def validate_label_value(cls, value: str) -> Tuple[bool, str]: return True, '' -def get_gke_accelerator_name(accelerator: str) -> str: - """Returns the accelerator name for GKE clusters - - Uses the format - nvidia-tesla-. - A100-80GB, H100-80GB and L4 are an exception. They use nvidia-. - """ - if accelerator in ('A100-80GB', 'L4', 'H100-80GB'): - # A100-80GB, L4 and H100-80GB have a different name pattern. - return 'nvidia-{}'.format(accelerator.lower()) - else: - return 'nvidia-tesla-{}'.format(accelerator.lower()) - - def get_gfd_accelerator_from_value(value: str) -> str: """Returns the accelerator name for GPU feature discovery (GFD) labeled nodes. From db9a091e0e9f999c785cb441dd0f77684a49390f Mon Sep 17 00:00:00 2001 From: Andrew Aikawa Date: Thu, 2 May 2024 00:40:32 +0000 Subject: [PATCH 09/26] remove get acc from value func --- sky/provision/kubernetes/utils.py | 41 +++++++++++++------------------ 1 file changed, 17 insertions(+), 24 deletions(-) diff --git a/sky/provision/kubernetes/utils.py b/sky/provision/kubernetes/utils.py index 5259a016910..c06b97604f9 100644 --- a/sky/provision/kubernetes/utils.py +++ b/sky/provision/kubernetes/utils.py @@ -78,29 +78,6 @@ def validate_label_value(cls, value: str) -> Tuple[bool, str]: return True, '' -def get_gfd_accelerator_from_value(value: str) -> str: - """Returns the accelerator name for GPU feature discovery (GFD) - labeled nodes. - - Searches against a canonical list of NVIDIA GPUs and pattern - matches the canonical GPU name against the GFD label. Taken from - sky/utils/kubernetes/k8s_gpu_labeler_setup.yaml - """ - canonical_gpu_names = [ - 'A100-80GB', 'A100', 'A10G', 'H100', 'K80', 'M60', 'T4g', 'T4', 'V100', - 'A10', 'P100', 'P40', 'P4', 'L4' - ] - - for canonical_name in canonical_gpu_names: - if canonical_name in value: - return canonical_name - - # If we didn't find a canonical name: - # 1. remove 'NVIDIA-' (e.g., 'NVIDIA-RTX-A6000' -> 'RTX-A6000') - # 2. remove 'GEFORCE-' (e.g., 'NVIDIA-GEFORCE-RTX-3070' -> 'RTX-3070') - return value.replace('NVIDIA-', '').replace('GEFORCE-', '') - - class SkyPilotLabelFormatter(GPULabelFormatter): """Custom label formatter for SkyPilot @@ -189,7 +166,23 @@ def get_label_key(cls) -> str: @classmethod def get_accelerator_from_label_value(cls, value: str) -> str: - return get_gfd_accelerator_from_value(value) + """Searches against a canonical list of NVIDIA GPUs and pattern + matches the canonical GPU name against the GFD label. Taken from + sky/utils/kubernetes/k8s_gpu_labeler_setup.yaml + """ + canonical_gpu_names = [ + 'A100-80GB', 'A100', 'A10G', 'H100', 'K80', 'M60', 'T4g', 'T4', + 'V100', 'A10', 'P100', 'P40', 'P4', 'L4' + ] + + for canonical_name in canonical_gpu_names: + if canonical_name in value: + return canonical_name + + # If we didn't find a canonical name: + # 1. remove 'NVIDIA-' (e.g., 'NVIDIA-RTX-A6000' -> 'RTX-A6000') + # 2. remove 'GEFORCE-' (e.g., 'NVIDIA-GEFORCE-RTX-3070' -> 'RTX-3070') + return value.replace('NVIDIA-', '').replace('GEFORCE-', '') # LABEL_FORMATTER_REGISTRY stores the label formats SkyPilot will try to From 3da382cdafa271e60d869c5a79ac393d4a37d5dc Mon Sep 17 00:00:00 2001 From: Andrew Aikawa Date: Thu, 2 May 2024 18:31:54 +0000 Subject: [PATCH 10/26] pattern match against A100' --- sky/provision/kubernetes/utils.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/sky/provision/kubernetes/utils.py b/sky/provision/kubernetes/utils.py index c06b97604f9..bc083fc0b17 100644 --- a/sky/provision/kubernetes/utils.py +++ b/sky/provision/kubernetes/utils.py @@ -176,7 +176,10 @@ def get_accelerator_from_label_value(cls, value: str) -> str: ] for canonical_name in canonical_gpu_names: - if canonical_name in value: + # A100-80G accelerator can be labeled as A100-SXM-80GB or A100-PCIE-80GB + if canonical_name == 'A100-80G' and if re.match(r'A100.*-80G', value): + return canonical_name + elif canonical_name in value: return canonical_name # If we didn't find a canonical name: From 4be3589dda73c9dd0e71b1f9af01bbc48f859234 Mon Sep 17 00:00:00 2001 From: Andrew Aikawa Date: Thu, 2 May 2024 18:32:09 +0000 Subject: [PATCH 11/26] pattern match against A100' --- sky/provision/kubernetes/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sky/provision/kubernetes/utils.py b/sky/provision/kubernetes/utils.py index bc083fc0b17..be67ef2b929 100644 --- a/sky/provision/kubernetes/utils.py +++ b/sky/provision/kubernetes/utils.py @@ -177,7 +177,7 @@ def get_accelerator_from_label_value(cls, value: str) -> str: for canonical_name in canonical_gpu_names: # A100-80G accelerator can be labeled as A100-SXM-80GB or A100-PCIE-80GB - if canonical_name == 'A100-80G' and if re.match(r'A100.*-80G', value): + if canonical_name == 'A100-80G' and re.match(r'A100.*-80G', value): return canonical_name elif canonical_name in value: return canonical_name From 57e0f14e2253567bef8b655f14d2deb12d42d50a Mon Sep 17 00:00:00 2001 From: Andrew Aikawa Date: Thu, 2 May 2024 18:35:01 +0000 Subject: [PATCH 12/26] format --- sky/provision/kubernetes/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sky/provision/kubernetes/utils.py b/sky/provision/kubernetes/utils.py index be67ef2b929..b2c54a3d274 100644 --- a/sky/provision/kubernetes/utils.py +++ b/sky/provision/kubernetes/utils.py @@ -176,7 +176,7 @@ def get_accelerator_from_label_value(cls, value: str) -> str: ] for canonical_name in canonical_gpu_names: - # A100-80G accelerator can be labeled as A100-SXM-80GB or A100-PCIE-80GB + # A100-80G accelerator is A100-SXM-80GB or A100-PCIE-80GB if canonical_name == 'A100-80G' and re.match(r'A100.*-80G', value): return canonical_name elif canonical_name in value: From 2c0613623df7d4d410decb91fe69ffc6c3b56c83 Mon Sep 17 00:00:00 2001 From: Andrew Aikawa Date: Thu, 2 May 2024 18:40:44 +0000 Subject: [PATCH 13/26] fix typo --- sky/provision/kubernetes/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sky/provision/kubernetes/utils.py b/sky/provision/kubernetes/utils.py index b2c54a3d274..44171ed9b66 100644 --- a/sky/provision/kubernetes/utils.py +++ b/sky/provision/kubernetes/utils.py @@ -177,7 +177,7 @@ def get_accelerator_from_label_value(cls, value: str) -> str: for canonical_name in canonical_gpu_names: # A100-80G accelerator is A100-SXM-80GB or A100-PCIE-80GB - if canonical_name == 'A100-80G' and re.match(r'A100.*-80G', value): + if canonical_name == 'A100-80GB' and re.match(r'A100.*-80GB', value): return canonical_name elif canonical_name in value: return canonical_name From 4103455749bc622d240a5ccc7567467f7fa9e7e3 Mon Sep 17 00:00:00 2001 From: Andrew Aikawa Date: Thu, 2 May 2024 18:41:57 +0000 Subject: [PATCH 14/26] format --- sky/provision/kubernetes/utils.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/sky/provision/kubernetes/utils.py b/sky/provision/kubernetes/utils.py index 44171ed9b66..9f04eb19ace 100644 --- a/sky/provision/kubernetes/utils.py +++ b/sky/provision/kubernetes/utils.py @@ -177,7 +177,8 @@ def get_accelerator_from_label_value(cls, value: str) -> str: for canonical_name in canonical_gpu_names: # A100-80G accelerator is A100-SXM-80GB or A100-PCIE-80GB - if canonical_name == 'A100-80GB' and re.match(r'A100.*-80GB', value): + if canonical_name == 'A100-80GB' and re.match( + r'A100.*-80GB', value): return canonical_name elif canonical_name in value: return canonical_name From ac6a51ed4824d38033b78756f0614327ba6261b5 Mon Sep 17 00:00:00 2001 From: Andrew Aikawa Date: Fri, 3 May 2024 00:37:21 +0000 Subject: [PATCH 15/26] re.search --- sky/provision/kubernetes/utils.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/sky/provision/kubernetes/utils.py b/sky/provision/kubernetes/utils.py index 9f04eb19ace..7ab8265ec54 100644 --- a/sky/provision/kubernetes/utils.py +++ b/sky/provision/kubernetes/utils.py @@ -174,10 +174,9 @@ def get_accelerator_from_label_value(cls, value: str) -> str: 'A100-80GB', 'A100', 'A10G', 'H100', 'K80', 'M60', 'T4g', 'T4', 'V100', 'A10', 'P100', 'P40', 'P4', 'L4' ] - for canonical_name in canonical_gpu_names: # A100-80G accelerator is A100-SXM-80GB or A100-PCIE-80GB - if canonical_name == 'A100-80GB' and re.match( + if canonical_name == 'A100-80GB' and re.search( r'A100.*-80GB', value): return canonical_name elif canonical_name in value: @@ -413,8 +412,6 @@ def get_gpu_label_key_value(acc_type: str, check_mode=False) -> Tuple[str, str]: if (label == k8s_acc_label_key and label_formatter.get_accelerator_from_label_value( value) == acc_type): - # If a node is found, we can break out of the loop - # and proceed to deploy. return label, value # If no node is found with the requested acc_type, raise error with ux_utils.print_exception_no_traceback(): From d703095338898d1613520d98e11b5f8ddf4bcb37 Mon Sep 17 00:00:00 2001 From: Andrew Date: Fri, 3 May 2024 16:31:30 -0700 Subject: [PATCH 16/26] compare strings --- tests/kubernetes/scripts/deploy_k3s.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/kubernetes/scripts/deploy_k3s.sh b/tests/kubernetes/scripts/deploy_k3s.sh index 132feec810d..eef43bb6422 100644 --- a/tests/kubernetes/scripts/deploy_k3s.sh +++ b/tests/kubernetes/scripts/deploy_k3s.sh @@ -117,7 +117,7 @@ metadata: handler: nvidia EOF -if [ ! "$SKY_SKIP_K8S_LABEL" -eq 1 ] +if [ ! "$SKY_SKIP_K8S_LABEL" == "1" ] then # Label nodes with GPUs echo "Labelling nodes with GPUs..." From f82c2a178b2f54c4781021bed2485866b3980475 Mon Sep 17 00:00:00 2001 From: Andrew Date: Sun, 5 May 2024 23:38:56 -0700 Subject: [PATCH 17/26] add P4000 --- sky/provision/kubernetes/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sky/provision/kubernetes/utils.py b/sky/provision/kubernetes/utils.py index 7ab8265ec54..e435a8326e3 100644 --- a/sky/provision/kubernetes/utils.py +++ b/sky/provision/kubernetes/utils.py @@ -172,7 +172,7 @@ def get_accelerator_from_label_value(cls, value: str) -> str: """ canonical_gpu_names = [ 'A100-80GB', 'A100', 'A10G', 'H100', 'K80', 'M60', 'T4g', 'T4', - 'V100', 'A10', 'P100', 'P40', 'P4', 'L4' + 'V100', 'A10', 'P4000', 'P100', 'P40', 'P4', 'L4' ] for canonical_name in canonical_gpu_names: # A100-80G accelerator is A100-SXM-80GB or A100-PCIE-80GB From bc8fbd79b91814829a1c3113ea384cc98addd2bc Mon Sep 17 00:00:00 2001 From: Andrew Date: Fri, 10 May 2024 02:03:01 -0700 Subject: [PATCH 18/26] format --- sky/provision/kubernetes/utils.py | 37 +++++++++++++++++++++++++++++-- 1 file changed, 35 insertions(+), 2 deletions(-) diff --git a/sky/provision/kubernetes/utils.py b/sky/provision/kubernetes/utils.py index fc05436e05d..44a641cb8d7 100644 --- a/sky/provision/kubernetes/utils.py +++ b/sky/provision/kubernetes/utils.py @@ -65,6 +65,11 @@ def get_label_key(cls) -> str: """Returns the label key for GPU type used by the Kubernetes cluster""" raise NotImplementedError + @classmethod + def get_label_value(cls, accelerator: str) -> str: + """Given a GPU type, returns the label value to be used""" + raise NotImplementedError + @classmethod def get_accelerator_from_label_value(cls, value: str) -> str: """Given a label value, returns the GPU type""" @@ -85,6 +90,19 @@ def validate_label_value(cls, value: str) -> Tuple[bool, str]: return True, '' +def get_gke_accelerator_name(accelerator: str) -> str: + """Returns the accelerator name for GKE clusters + + Uses the format - nvidia-tesla-. + A100-80GB, H100-80GB and L4 are an exception. They use nvidia-. + """ + if accelerator in ('A100-80GB', 'L4', 'H100-80GB'): + # A100-80GB, L4 and H100-80GB have a different name pattern. + return 'nvidia-{}'.format(accelerator.lower()) + else: + return 'nvidia-tesla-{}'.format(accelerator.lower()) + + class SkyPilotLabelFormatter(GPULabelFormatter): """Custom label formatter for SkyPilot @@ -98,6 +116,12 @@ class SkyPilotLabelFormatter(GPULabelFormatter): def get_label_key(cls) -> str: return cls.LABEL_KEY + @classmethod + def get_label_value(cls, accelerator: str) -> str: + # For SkyPilot formatter, we use the accelerator str directly. + # See sky.utils.kubernetes.gpu_labeler. + return accelerator.lower() + @classmethod def get_accelerator_from_label_value(cls, value: str) -> str: return value.upper() @@ -124,6 +148,10 @@ class CoreWeaveLabelFormatter(GPULabelFormatter): def get_label_key(cls) -> str: return cls.LABEL_KEY + @classmethod + def get_label_value(cls, accelerator: str) -> str: + return accelerator.upper() + @classmethod def get_accelerator_from_label_value(cls, value: str) -> str: return value @@ -142,6 +170,10 @@ class GKELabelFormatter(GPULabelFormatter): def get_label_key(cls) -> str: return cls.LABEL_KEY + @classmethod + def get_label_value(cls, accelerator: str) -> str: + return get_gke_accelerator_name(accelerator) + @classmethod def get_accelerator_from_label_value(cls, value: str) -> str: if value.startswith('nvidia-tesla-'): @@ -193,7 +225,8 @@ def get_accelerator_from_label_value(cls, value: str) -> str: # 1. remove 'NVIDIA-' (e.g., 'NVIDIA-RTX-A6000' -> 'RTX-A6000') # 2. remove 'GEFORCE-' (e.g., 'NVIDIA-GEFORCE-RTX-3070' -> 'RTX-3070') return value.replace('NVIDIA-', '').replace('GEFORCE-', '') - + + class KarpenterLabelFormatter(SkyPilotLabelFormatter): """Karpeneter label formatter Karpenter uses the label `karpenter.k8s.aws/instance-gpu-name` to identify @@ -209,7 +242,7 @@ class KarpenterLabelFormatter(SkyPilotLabelFormatter): # auto-detecting the GPU label type. LABEL_FORMATTER_REGISTRY = [ SkyPilotLabelFormatter, CoreWeaveLabelFormatter, GKELabelFormatter, - GFDLabelFormatter, KarpenterLabelFormatter + KarpenterLabelFormatter, GFDLabelFormatter ] # Mapping of autoscaler type to label formatter From 031921564bf51ce8920f23ea81c912328a3b8726 Mon Sep 17 00:00:00 2001 From: Andrew Aikawa Date: Fri, 10 May 2024 12:22:20 -0700 Subject: [PATCH 19/26] lower case for check Co-authored-by: Zhanghao Wu --- sky/provision/kubernetes/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sky/provision/kubernetes/utils.py b/sky/provision/kubernetes/utils.py index 44a641cb8d7..3491487b27a 100644 --- a/sky/provision/kubernetes/utils.py +++ b/sky/provision/kubernetes/utils.py @@ -483,7 +483,7 @@ def get_gpu_label_key_value(acc_type: str, check_mode=False) -> Tuple[str, str]: for label, value in label_list: if (label == k8s_acc_label_key and label_formatter.get_accelerator_from_label_value( - value) == acc_type): + value).lower() == acc_type.lower()): return label, value # If no node is found with the requested acc_type, raise error with ux_utils.print_exception_no_traceback(): From 75e43966802912a5b8ff9ab4036e6061eb3fbe58 Mon Sep 17 00:00:00 2001 From: Andrew Date: Thu, 16 May 2024 14:03:10 -0700 Subject: [PATCH 20/26] force upper case --- sky/provision/kubernetes/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sky/provision/kubernetes/utils.py b/sky/provision/kubernetes/utils.py index 3491487b27a..47980498372 100644 --- a/sky/provision/kubernetes/utils.py +++ b/sky/provision/kubernetes/utils.py @@ -224,7 +224,7 @@ def get_accelerator_from_label_value(cls, value: str) -> str: # If we didn't find a canonical name: # 1. remove 'NVIDIA-' (e.g., 'NVIDIA-RTX-A6000' -> 'RTX-A6000') # 2. remove 'GEFORCE-' (e.g., 'NVIDIA-GEFORCE-RTX-3070' -> 'RTX-3070') - return value.replace('NVIDIA-', '').replace('GEFORCE-', '') + return value.upper().replace('NVIDIA-', '').replace('GEFORCE-', '') class KarpenterLabelFormatter(SkyPilotLabelFormatter): From 7d4d6bb3edff3412b23336e178e26f747368e061 Mon Sep 17 00:00:00 2001 From: Andrew Date: Thu, 16 May 2024 14:14:49 -0700 Subject: [PATCH 21/26] match skypilot labeler logic --- sky/provision/kubernetes/utils.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/sky/provision/kubernetes/utils.py b/sky/provision/kubernetes/utils.py index 47980498372..e3361d0de86 100644 --- a/sky/provision/kubernetes/utils.py +++ b/sky/provision/kubernetes/utils.py @@ -206,8 +206,7 @@ def get_label_key(cls) -> str: @classmethod def get_accelerator_from_label_value(cls, value: str) -> str: """Searches against a canonical list of NVIDIA GPUs and pattern - matches the canonical GPU name against the GFD label. Taken from - sky/utils/kubernetes/k8s_gpu_labeler_setup.yaml + matches the canonical GPU name against the GFD label. """ canonical_gpu_names = [ 'A100-80GB', 'A100', 'A10G', 'H100', 'K80', 'M60', 'T4g', 'T4', @@ -224,7 +223,10 @@ def get_accelerator_from_label_value(cls, value: str) -> str: # If we didn't find a canonical name: # 1. remove 'NVIDIA-' (e.g., 'NVIDIA-RTX-A6000' -> 'RTX-A6000') # 2. remove 'GEFORCE-' (e.g., 'NVIDIA-GEFORCE-RTX-3070' -> 'RTX-3070') - return value.upper().replace('NVIDIA-', '').replace('GEFORCE-', '') + # 3. remove 'RTX-' (e.g. 'RTX-6000' -> 'RTX6000') + # Same logic, but uppercased, as the Skypilot labeler job found in + # sky/utils/kubernetes/k8s_gpu_labeler_setup.yaml + return value.upper().replace('NVIDIA-', '').replace('GEFORCE-', '').replace('RTX-', 'RTX') class KarpenterLabelFormatter(SkyPilotLabelFormatter): From a85014ad27c38c7e92ed78f4bc74e2ed278eb2fb Mon Sep 17 00:00:00 2001 From: Andrew Date: Thu, 16 May 2024 14:18:01 -0700 Subject: [PATCH 22/26] format.sh --- sky/provision/kubernetes/utils.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/sky/provision/kubernetes/utils.py b/sky/provision/kubernetes/utils.py index e3361d0de86..1a131878b5c 100644 --- a/sky/provision/kubernetes/utils.py +++ b/sky/provision/kubernetes/utils.py @@ -226,7 +226,9 @@ def get_accelerator_from_label_value(cls, value: str) -> str: # 3. remove 'RTX-' (e.g. 'RTX-6000' -> 'RTX6000') # Same logic, but uppercased, as the Skypilot labeler job found in # sky/utils/kubernetes/k8s_gpu_labeler_setup.yaml - return value.upper().replace('NVIDIA-', '').replace('GEFORCE-', '').replace('RTX-', 'RTX') + return value.upper().replace('NVIDIA-', + '').replace('GEFORCE-', + '').replace('RTX-', 'RTX') class KarpenterLabelFormatter(SkyPilotLabelFormatter): @@ -485,7 +487,7 @@ def get_gpu_label_key_value(acc_type: str, check_mode=False) -> Tuple[str, str]: for label, value in label_list: if (label == k8s_acc_label_key and label_formatter.get_accelerator_from_label_value( - value).lower() == acc_type.lower()): + value) == acc_type): return label, value # If no node is found with the requested acc_type, raise error with ux_utils.print_exception_no_traceback(): From 83005e8ab9e51329717b40c90251667bff1edb16 Mon Sep 17 00:00:00 2001 From: Andrew Date: Mon, 3 Jun 2024 17:10:55 -0700 Subject: [PATCH 23/26] add docstring --- sky/provision/kubernetes/utils.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/sky/provision/kubernetes/utils.py b/sky/provision/kubernetes/utils.py index 1a131878b5c..0d85864d978 100644 --- a/sky/provision/kubernetes/utils.py +++ b/sky/provision/kubernetes/utils.py @@ -203,6 +203,13 @@ class GFDLabelFormatter(GPULabelFormatter): def get_label_key(cls) -> str: return cls.LABEL_KEY + @classmethod + def get_label_value(cls, accelerator: str) -> str: + """An accelerator can map to many Nvidia GFD labels + (e.g., A100-80GB-PCIE vs. A100-SXM4-80GB). + As a result, we do not support get_label_value for GFDLabelFormatter.""" + raise NotImplementedError + @classmethod def get_accelerator_from_label_value(cls, value: str) -> str: """Searches against a canonical list of NVIDIA GPUs and pattern From a2cae46bb841ebada73f551d72e00a5b546f6da7 Mon Sep 17 00:00:00 2001 From: Andrew Date: Mon, 3 Jun 2024 17:30:27 -0700 Subject: [PATCH 24/26] fix class docstring --- sky/provision/kubernetes/utils.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/sky/provision/kubernetes/utils.py b/sky/provision/kubernetes/utils.py index 0d85864d978..7a22972958a 100644 --- a/sky/provision/kubernetes/utils.py +++ b/sky/provision/kubernetes/utils.py @@ -195,6 +195,9 @@ class GFDLabelFormatter(GPULabelFormatter): GPU feature discovery is included as part of the NVIDIA GPU Operator: https://docs.nvidia.com/datacenter/cloud-native/gpu-operator/latest/overview.html + + This LabelFormatter cannot be used in autoscaling clusters since accelerators + map to multiple label, so we're not implementing `get_label_value` """ LABEL_KEY = 'nvidia.com/gpu.product' From a45a34d23a234d0e50cc53dce753d3962ff93611 Mon Sep 17 00:00:00 2001 From: Andrew Date: Mon, 3 Jun 2024 17:31:08 -0700 Subject: [PATCH 25/26] grammar fix --- sky/provision/kubernetes/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sky/provision/kubernetes/utils.py b/sky/provision/kubernetes/utils.py index 7a22972958a..fffa1a2a2eb 100644 --- a/sky/provision/kubernetes/utils.py +++ b/sky/provision/kubernetes/utils.py @@ -197,7 +197,7 @@ class GFDLabelFormatter(GPULabelFormatter): https://docs.nvidia.com/datacenter/cloud-native/gpu-operator/latest/overview.html This LabelFormatter cannot be used in autoscaling clusters since accelerators - map to multiple label, so we're not implementing `get_label_value` + may map to multiple label, so we're not implementing `get_label_value` """ LABEL_KEY = 'nvidia.com/gpu.product' From 98016110143433ac325bd83f32d348af408c9f58 Mon Sep 17 00:00:00 2001 From: Andrew Date: Mon, 3 Jun 2024 18:02:48 -0700 Subject: [PATCH 26/26] format --- sky/provision/kubernetes/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sky/provision/kubernetes/utils.py b/sky/provision/kubernetes/utils.py index fffa1a2a2eb..3fd053c7eed 100644 --- a/sky/provision/kubernetes/utils.py +++ b/sky/provision/kubernetes/utils.py @@ -196,7 +196,7 @@ class GFDLabelFormatter(GPULabelFormatter): NVIDIA GPU Operator: https://docs.nvidia.com/datacenter/cloud-native/gpu-operator/latest/overview.html - This LabelFormatter cannot be used in autoscaling clusters since accelerators + This LabelFormatter can't be used in autoscaling clusters since accelerators may map to multiple label, so we're not implementing `get_label_value` """