Skip to content

Commit

Permalink
update gpu_labeler to label only nodes with gpus
Browse files: browse the repository at this point in the history
  • Loading branch information
romilbhardwaj committed Sep 13, 2023
1 parent 9e115c9 commit 7039b04
Showing 1 changed file with 20 additions and 7 deletions.
27 changes: 20 additions & 7 deletions sky/utils/kubernetes/gpu_labeler.py
Original file line number Diff line number Diff line change
Expand Up @@ -89,8 +89,16 @@ def label():

# Iterate over nodes
nodes = v1.list_node().items
# TODO(romilb): Run this only on nodes with GPUs.

# Get the list of nodes with GPUs
gpu_nodes = []
for node in nodes:
if 'nvidia.com/gpu' in node.status.capacity:
gpu_nodes.append(node)

print(f'Found {len(gpu_nodes)} GPU nodes in the cluster')

for node in gpu_nodes:
node_name = node.metadata.name

# Modify the job manifest for the current node
Expand All @@ -103,12 +111,17 @@ def label():
# Create the job for this node
batch_v1.create_namespaced_job(namespace, job_manifest)
print(f'Created GPU labeler job for node {node_name}')
print('GPU labeling started - this may take a few minutes to complete.'
'\nTo check the status of GPU labeling jobs, run '
'`kubectl get jobs --namespace=kube-system -l job=sky-gpu-labeler`'
'\nYou can check if nodes have been labeled by running '
'`kubectl describe nodes` and looking for labels of the format '
'`skypilot.co/accelerators: <gpu_name>`. ')
if len(gpu_nodes) == 0:
print('No GPU nodes found in the cluster. If you have GPU nodes, '
'please ensure that they have the label '
'`nvidia.com/gpu: <number of GPUs>`')
else:
print('GPU labeling started - this may take a few minutes to complete.'
'\nTo check the status of GPU labeling jobs, run '
'`kubectl get jobs -n kube-system -l job=sky-gpu-labeler`'
'\nYou can check if nodes have been labeled by running '
'`kubectl describe nodes` and looking for labels of the format '
'`skypilot.co/accelerators: <gpu_name>`. ')


def main():
Expand Down

0 comments on commit 7039b04

Please sign in to comment.