Skip to content

Commit

Permalink
[k8s] Show per node status in sky show-gpus (#3816)
Browse files Browse the repository at this point in the history
* per node gpu availability

* per node gpu availability
  • Loading branch information
romilbhardwaj authored Aug 8, 2024
1 parent d8642fe commit 7f64d60
Show file tree
Hide file tree
Showing 3 changed files with 90 additions and 0 deletions.
20 changes: 20 additions & 0 deletions sky/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -3072,6 +3072,19 @@ def _get_kubernetes_realtime_gpu_table(
])
return realtime_gpu_table

def _get_kubernetes_node_info_table():
    """Builds the per-node GPU availability table for Kubernetes.

    One row is emitted per entry returned by
    kubernetes_utils.get_kubernetes_node_info(), with the node name, its GPU
    type and the total/free 'nvidia.com/gpu' counts.

    Returns:
        The populated table created by log_utils.create_table().
    """
    gpu_key = 'nvidia.com/gpu'
    columns = ['NODE_NAME', 'GPU_NAME', 'TOTAL_GPUS', 'FREE_GPUS']
    table = log_utils.create_table(columns)

    per_node = kubernetes_utils.get_kubernetes_node_info()
    for name, info in per_node.items():
        row = [name, info.gpu_type, info.total[gpu_key], info.free[gpu_key]]
        table.add_row(row)
    return table

def _output():
gpu_table = log_utils.create_table(
['COMMON_GPU', 'AVAILABLE_QUANTITIES'])
Expand Down Expand Up @@ -3112,6 +3125,12 @@ def _output():
yield (f'{colorama.Fore.CYAN}{colorama.Style.BRIGHT}'
f'Kubernetes GPUs{colorama.Style.RESET_ALL}\n')
yield from k8s_realtime_table.get_string()
k8s_node_table = _get_kubernetes_node_info_table()
yield '\n\n'
yield (f'{colorama.Fore.CYAN}{colorama.Style.BRIGHT}'
f'Kubernetes per node GPU availability'
f'{colorama.Style.RESET_ALL}\n')
yield from k8s_node_table.get_string()
if kubernetes_autoscaling:
k8s_messages += (
'\n' + kubernetes_utils.KUBERNETES_AUTOSCALER_NOTE)
Expand Down Expand Up @@ -3199,6 +3218,7 @@ def _output():
print_section_titles = True
yield (f'{colorama.Fore.CYAN}{colorama.Style.BRIGHT}'
f'Kubernetes GPUs{colorama.Style.RESET_ALL}\n')
# TODO(romilb): Show filtered per node GPU availability here as well
try:
k8s_realtime_table = _get_kubernetes_realtime_gpu_table(
name_filter=name, quantity_filter=quantity)
Expand Down
2 changes: 2 additions & 0 deletions sky/clouds/service_catalog/kubernetes_catalog.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,8 @@ def list_accelerators_realtime(
require_price: bool = True
) -> Tuple[Dict[str, List[common.InstanceTypeInfo]], Dict[str, int], Dict[str,
int]]:
# TODO(romilb): This should be refactored to use get_kubernetes_node_info()
# function from kubernetes_utils.
del all_regions, require_price # Unused.
k8s_cloud = Kubernetes()
if not any(
Expand Down
68 changes: 68 additions & 0 deletions sky/provision/kubernetes/utils.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
"""Kubernetes utilities for SkyPilot."""
import dataclasses
import json
import math
import os
Expand Down Expand Up @@ -1652,3 +1653,70 @@ def __init__(self, obj):

fake_kube_response = FakeKubeResponse(object_dict)
return kubernetes.api_client().deserialize(fake_kube_response, object_type)


@dataclasses.dataclass
class KubernetesNodeInfo:
    """Dataclass to store Kubernetes node information.

    Holds a node's name, its GPU type (if one was detected) and the total and
    free resource counts of the node, keyed by resource name.
    """
    # Name of the node.
    name: str
    # GPU type of the node; None when no GPU type is known for the node.
    gpu_type: Optional[str]
    # Total resources on the node, keyed by resource name.
    # E.g., {'nvidia.com/gpu': 2}
    total: Dict[str, int]
    # Resources on the node that are still free, keyed like `total`.
    # E.g., {'nvidia.com/gpu': 1}
    free: Dict[str, int]


def get_kubernetes_node_info() -> Dict[str, KubernetesNodeInfo]:
    """Gets the resource information for all the nodes in the cluster.

    Currently only GPU resources ('nvidia.com/gpu') are supported. For every
    node, the function reports the total number of allocatable GPUs and the
    number of free GPUs, i.e., the allocatable GPUs minus the GPUs requested
    by the containers of all Running or Pending pods assigned to that node.

    Returns:
        Dict[str, KubernetesNodeInfo]: Dictionary containing the node name as
            key and the KubernetesNodeInfo object as value.
    """
    gpu_resource_key = 'nvidia.com/gpu'

    nodes = get_kubernetes_nodes()
    # Get the pods to get the real-time resource usage
    pods = get_kubernetes_pods()

    label_formatter, _ = detect_gpu_label_formatter()
    label_key = label_formatter.get_label_key() if label_formatter else None

    # Sum the GPU requests per node in a single pass over the pods, instead
    # of re-scanning the full pod list once for every node.
    allocated_by_node: Dict[str, int] = {}
    for pod in pods:
        node_name = pod.spec.node_name
        # Pods that are not yet assigned to a node, or that are neither
        # Running nor Pending, do not hold GPUs on any node.
        if node_name is None or pod.status.phase not in ('Running',
                                                         'Pending'):
            continue
        for container in pod.spec.containers:
            requests = container.resources.requests
            if requests:
                allocated_by_node[node_name] = (
                    allocated_by_node.get(node_name, 0) +
                    int(requests.get(gpu_resource_key, 0)))

    node_info_dict: Dict[str, KubernetesNodeInfo] = {}

    for node in nodes:
        node_name = node.metadata.name
        # The Kubernetes client returns None (not {}) for a node that has no
        # labels, so normalize before the membership test.
        labels = node.metadata.labels or {}
        if label_key is not None and label_key in labels:
            accelerator_name = label_formatter.get_accelerator_from_label_value(
                labels.get(label_key))
        else:
            accelerator_name = None

        accelerator_count = int(
            node.status.allocatable.get(gpu_resource_key, 0))
        accelerators_available = (accelerator_count -
                                  allocated_by_node.get(node_name, 0))

        node_info_dict[node_name] = KubernetesNodeInfo(
            name=node_name,
            gpu_type=accelerator_name,
            total={gpu_resource_key: accelerator_count},
            free={gpu_resource_key: accelerators_available})

    return node_info_dict

0 comments on commit 7f64d60

Please sign in to comment.