Skip to content

Commit

Permalink
Merge branch 'master' of github.com:skypilot-org/skypilot into k8s_ze…
Browse files Browse the repository at this point in the history
…roconf_networking

# Conflicts:
#	tests/kubernetes/README.md
  • Loading branch information
romilbhardwaj committed Sep 16, 2023
2 parents 9c4e338 + 578b36d commit add29dd
Show file tree
Hide file tree
Showing 2 changed files with 25 additions and 12 deletions.
27 changes: 20 additions & 7 deletions sky/utils/kubernetes/gpu_labeler.py
Original file line number Diff line number Diff line change
Expand Up @@ -89,8 +89,16 @@ def label():

# Iterate over nodes
nodes = v1.list_node().items
# TODO(romilb): Run this only on nodes with GPUs.

# Get the list of nodes with GPUs
gpu_nodes = []
for node in nodes:
if 'nvidia.com/gpu' in node.status.capacity:
gpu_nodes.append(node)

print(f'Found {len(gpu_nodes)} GPU nodes in the cluster')

for node in gpu_nodes:
node_name = node.metadata.name

# Modify the job manifest for the current node
Expand All @@ -103,12 +111,17 @@ def label():
# Create the job for this node
batch_v1.create_namespaced_job(namespace, job_manifest)
print(f'Created GPU labeler job for node {node_name}')
print('GPU labeling started - this may take a few minutes to complete.'
'\nTo check the status of GPU labeling jobs, run '
'`kubectl get jobs --namespace=kube-system -l job=sky-gpu-labeler`'
'\nYou can check if nodes have been labeled by running '
'`kubectl describe nodes` and looking for labels of the format '
'`skypilot.co/accelerators: <gpu_name>`. ')
if len(gpu_nodes) == 0:
print('No GPU nodes found in the cluster. If you have GPU nodes, '
'please ensure that they have the label '
'`nvidia.com/gpu: <number of GPUs>`')
else:
print('GPU labeling started - this may take a few minutes to complete.'
'\nTo check the status of GPU labeling jobs, run '
'`kubectl get jobs -n kube-system -l job=sky-gpu-labeler`'
'\nYou can check if nodes have been labeled by running '
'`kubectl describe nodes` and looking for labels of the format '
'`skypilot.co/accelerators: <gpu_name>`. ')


def main():
Expand Down
10 changes: 5 additions & 5 deletions tests/kubernetes/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -81,15 +81,15 @@ NOTE - If you are using nodeport networking, make sure port 32100 is open in your no
```bash
kubectl get jobs -n kube-system
```
Note that some jobs may be in pending state if your cluster contains CPU nodes. To clean up these jobs after you're done, run:
```bash
python -m sky.utils.kubernetes.gpu_labeler --cleanup
```
After the jobs are done, you can verify that the GPU labels are set up correctly by looking for the `skypilot.co/accelerator` label in the output of:
```bash
kubectl describe nodes
```
4. Run `sky check`.
In case something goes wrong, you can clean up these jobs by running:
```bash
python -m sky.utils.kubernetes.gpu_labeler --cleanup
```
5. Run `sky check`.
```bash
sky check
```
Expand Down

0 comments on commit add29dd

Please sign in to comment.