From 578b36d1ee93426589f40c3a48399efb02c0dca7 Mon Sep 17 00:00:00 2001 From: Romil Bhardwaj Date: Fri, 15 Sep 2023 17:31:33 -0700 Subject: [PATCH] [k8s] Run GPU labelling job only on nodes with gpus (#2550) * update gpu_labeler to label only nodes with gpus * updates --- sky/utils/kubernetes/gpu_labeler.py | 27 ++++++++++++++++++++------- tests/kubernetes/README.md | 8 ++++---- 2 files changed, 24 insertions(+), 11 deletions(-) diff --git a/sky/utils/kubernetes/gpu_labeler.py b/sky/utils/kubernetes/gpu_labeler.py index 03722514ca4..b6f89514204 100644 --- a/sky/utils/kubernetes/gpu_labeler.py +++ b/sky/utils/kubernetes/gpu_labeler.py @@ -89,8 +89,16 @@ def label(): # Iterate over nodes nodes = v1.list_node().items - # TODO(romilb): Run this only on nodes with GPUs. + + # Get the list of nodes with GPUs + gpu_nodes = [] for node in nodes: + if 'nvidia.com/gpu' in node.status.capacity: + gpu_nodes.append(node) + + print(f'Found {len(gpu_nodes)} GPU nodes in the cluster') + + for node in gpu_nodes: node_name = node.metadata.name # Modify the job manifest for the current node @@ -103,12 +111,17 @@ def label(): # Create the job for this node` batch_v1.create_namespaced_job(namespace, job_manifest) print(f'Created GPU labeler job for node {node_name}') - print('GPU labeling started - this may take a few minutes to complete.' - '\nTo check the status of GPU labeling jobs, run ' - '`kubectl get jobs --namespace=kube-system -l job=sky-gpu-labeler`' - '\nYou can check if nodes have been labeled by running ' - '`kubectl describe nodes` and looking for labels of the format ' - '`skypilot.co/accelerators: `. ') + if len(gpu_nodes) == 0: + print('No GPU nodes found in the cluster. If you have GPU nodes, ' + 'please ensure that they have the label ' + '`nvidia.com/gpu: `') + else: + print('GPU labeling started - this may take a few minutes to complete.' + '\nTo check the status of GPU labeling jobs, run ' + '`kubectl get jobs -n kube-system -l job=sky-gpu-labeler`' + '\nYou can check if nodes have been labeled by running ' + '`kubectl describe nodes` and looking for labels of the format ' + '`skypilot.co/accelerators: `. ') def main(): diff --git a/tests/kubernetes/README.md b/tests/kubernetes/README.md index 3bf1f9c455b..f94c9dc50fd 100644 --- a/tests/kubernetes/README.md +++ b/tests/kubernetes/README.md @@ -82,14 +82,14 @@ sky local up ```bash kubectl get jobs -n kube-system ``` - Note that some jobs may be in pending state if your cluster contains CPU nodes. To clean up these jobs after you're done, run: - ```bash - python -m sky.utils.kubernetes.gpu_labeler --cleanup - ``` After the jobs are done, you can verify the GPU labels are setup correctly by looking for `skypilot.co/accelerator` label in the output of: ```bash kubectl describe nodes ``` + In case something goes wrong, you can clean up these jobs by running: + ```bash + python -m sky.utils.kubernetes.gpu_labeler --cleanup + ``` 5. Run `sky check`. ```bash sky check