[k8s] Show per node status in sky show-gpus #3816

Merged: 2 commits, Aug 8, 2024
20 changes: 20 additions & 0 deletions sky/cli.py
@@ -3072,6 +3072,19 @@ def _get_kubernetes_realtime_gpu_table(
])
return realtime_gpu_table

def _get_kubernetes_node_info_table():
node_table = log_utils.create_table(
['NODE_NAME', 'GPU_NAME', 'TOTAL_GPUS', 'FREE_GPUS'])

node_info_dict = kubernetes_utils.get_kubernetes_node_info()
for node_name, node_info in node_info_dict.items():
node_table.add_row([
node_name, node_info.gpu_type,
node_info.total['nvidia.com/gpu'],
node_info.free['nvidia.com/gpu']
])
return node_table

def _output():
gpu_table = log_utils.create_table(
['COMMON_GPU', 'AVAILABLE_QUANTITIES'])
@@ -3112,6 +3125,12 @@ def _output():
yield (f'{colorama.Fore.CYAN}{colorama.Style.BRIGHT}'
f'Kubernetes GPUs{colorama.Style.RESET_ALL}\n')
yield from k8s_realtime_table.get_string()
k8s_node_table = _get_kubernetes_node_info_table()
yield '\n\n'
yield (f'{colorama.Fore.CYAN}{colorama.Style.BRIGHT}'
f'Kubernetes per node GPU availability'
f'{colorama.Style.RESET_ALL}\n')
yield from k8s_node_table.get_string()
if kubernetes_autoscaling:
k8s_messages += (
'\n' + kubernetes_utils.KUBERNETES_AUTOSCALER_NOTE)
@@ -3199,6 +3218,7 @@ def _output():
print_section_titles = True
yield (f'{colorama.Fore.CYAN}{colorama.Style.BRIGHT}'
f'Kubernetes GPUs{colorama.Style.RESET_ALL}\n')
# TODO(romilb): Show filtered per node GPU availability here as well
try:
k8s_realtime_table = _get_kubernetes_realtime_gpu_table(
name_filter=name, quantity_filter=quantity)
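The TODO above mentions showing filtered per-node GPU availability in this code path as well. A minimal sketch of one way that could look, reusing the helpers introduced in this diff; the name_filter parameter and the substring match are assumptions for illustration, not part of this PR:

from typing import Optional

from sky.provision.kubernetes import utils as kubernetes_utils
from sky.utils import log_utils


def _get_kubernetes_node_info_table(name_filter: Optional[str] = None):
    """Per-node GPU table, optionally restricted to a GPU name substring."""
    node_table = log_utils.create_table(
        ['NODE_NAME', 'GPU_NAME', 'TOTAL_GPUS', 'FREE_GPUS'])
    node_info_dict = kubernetes_utils.get_kubernetes_node_info()
    for node_name, node_info in node_info_dict.items():
        # Skip nodes whose GPU type does not match the requested accelerator.
        if name_filter and (node_info.gpu_type is None or
                            name_filter.lower()
                            not in node_info.gpu_type.lower()):
            continue
        node_table.add_row([
            node_name, node_info.gpu_type,
            node_info.total['nvidia.com/gpu'],
            node_info.free['nvidia.com/gpu']
        ])
    return node_table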
2 changes: 2 additions & 0 deletions sky/clouds/service_catalog/kubernetes_catalog.py
@@ -65,6 +65,8 @@ def list_accelerators_realtime(
require_price: bool = True
) -> Tuple[Dict[str, List[common.InstanceTypeInfo]], Dict[str, int], Dict[str,
int]]:
# TODO(romilb): This should be refactored to use get_kubernetes_node_info()
# function from kubernetes_utils.
del all_regions, require_price # Unused.
k8s_cloud = Kubernetes()
if not any(
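The TODO in this hunk proposes refactoring list_accelerators_realtime() to build on get_kubernetes_node_info(). A rough sketch of what the aggregation step of such a refactor might look like; the _aggregate_gpu_counts helper and its return shape are illustrative assumptions, not code from this PR:

import collections
from typing import Dict, Tuple

from sky.provision.kubernetes import utils as kubernetes_utils


def _aggregate_gpu_counts() -> Tuple[Dict[str, int], Dict[str, int]]:
    """Returns (total GPU count per GPU type, free GPU count per GPU type)."""
    total: Dict[str, int] = collections.defaultdict(int)
    free: Dict[str, int] = collections.defaultdict(int)
    for node_info in kubernetes_utils.get_kubernetes_node_info().values():
        if node_info.gpu_type is None:
            # Node carries no recognized GPU label; skip it.
            continue
        total[node_info.gpu_type] += node_info.total['nvidia.com/gpu']
        free[node_info.gpu_type] += node_info.free['nvidia.com/gpu']
    return dict(total), dict(free)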
68 changes: 68 additions & 0 deletions sky/provision/kubernetes/utils.py
@@ -1,4 +1,5 @@
"""Kubernetes utilities for SkyPilot."""
import dataclasses
import json
import math
import os
@@ -1652,3 +1653,70 @@ def __init__(self, obj):

fake_kube_response = FakeKubeResponse(object_dict)
return kubernetes.api_client().deserialize(fake_kube_response, object_type)


@dataclasses.dataclass
class KubernetesNodeInfo:
"""Dataclass to store Kubernetes node information."""
name: str
gpu_type: Optional[str]
# Resources available on the node. E.g., {'nvidia.com/gpu': 2}
total: Dict[str, int]
free: Dict[str, int]


def get_kubernetes_node_info() -> Dict[str, KubernetesNodeInfo]:
"""Gets the resource information for all the nodes in the cluster.

Currently only GPU resources are supported. The function returns the total
number of GPUs available on each node and the number of free GPUs on each
node.

Returns:
Dict[str, KubernetesNodeInfo]: Dictionary containing the node name as
key and the KubernetesNodeInfo object as value
"""
nodes = get_kubernetes_nodes()
# Get the pods to get the real-time resource usage
pods = get_kubernetes_pods()

label_formatter, _ = detect_gpu_label_formatter()
if not label_formatter:
label_key = None
else:
label_key = label_formatter.get_label_key()

node_info_dict: Dict[str, KubernetesNodeInfo] = {}

for node in nodes:
allocated_qty = 0
if label_formatter is not None and label_key in node.metadata.labels:
accelerator_name = label_formatter.get_accelerator_from_label_value(
node.metadata.labels.get(label_key))
else:
accelerator_name = None

accelerator_count = int(node.status.allocatable.get(
'nvidia.com/gpu', 0))

for pod in pods:
# Get all the pods running on the node
if (pod.spec.node_name == node.metadata.name and
pod.status.phase in ['Running', 'Pending']):
# Iterate over all the containers in the pod and sum the
# GPU requests
for container in pod.spec.containers:
if container.resources.requests:
allocated_qty += int(
container.resources.requests.get(
'nvidia.com/gpu', 0))
Comment on lines +1702 to +1712
Collaborator:
nit: it might be interesting to show the number of running pods on each node as well, but this is minor if users don't ask for it. :)

Collaborator (Author):
Good point, though one challenge is that a node may be running many pods across different namespaces; perhaps we could show only the pods in the user's configured namespace. We can add this if users ask :) (see the sketch after this diff)


accelerators_available = accelerator_count - allocated_qty

node_info_dict[node.metadata.name] = KubernetesNodeInfo(
name=node.metadata.name,
gpu_type=accelerator_name,
total={'nvidia.com/gpu': int(accelerator_count)},
free={'nvidia.com/gpu': int(accelerators_available)})

return node_info_dict
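The review thread above discusses counting the running pods on each node, scoped to the user's configured namespace. A rough sketch of that idea, assuming the kubernetes adaptor's core_api() client that this module already uses; the count_pods_per_node helper is hypothetical and not part of this PR:

from typing import Dict

from sky.adaptors import kubernetes


def count_pods_per_node(namespace: str) -> Dict[str, int]:
    """Counts Running/Pending pods on each node within a single namespace."""
    counts: Dict[str, int] = {}
    pods = kubernetes.core_api().list_namespaced_pod(namespace).items
    for pod in pods:
        if pod.status.phase in ('Running', 'Pending') and pod.spec.node_name:
            counts[pod.spec.node_name] = counts.get(pod.spec.node_name, 0) + 1
    return counts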