diff --git a/sky/cli.py b/sky/cli.py
index 3717138f80b..e50aca011a6 100644
--- a/sky/cli.py
+++ b/sky/cli.py
@@ -3072,6 +3072,19 @@ def _get_kubernetes_realtime_gpu_table(
         ])
         return realtime_gpu_table
 
+    def _get_kubernetes_node_info_table():
+        node_table = log_utils.create_table(
+            ['NODE_NAME', 'GPU_NAME', 'TOTAL_GPUS', 'FREE_GPUS'])
+
+        node_info_dict = kubernetes_utils.get_kubernetes_node_info()
+        for node_name, node_info in node_info_dict.items():
+            node_table.add_row([
+                node_name, node_info.gpu_type,
+                node_info.total['nvidia.com/gpu'],
+                node_info.free['nvidia.com/gpu']
+            ])
+        return node_table
+
     def _output():
         gpu_table = log_utils.create_table(
             ['COMMON_GPU', 'AVAILABLE_QUANTITIES'])
@@ -3112,6 +3125,12 @@ def _output():
             yield (f'{colorama.Fore.CYAN}{colorama.Style.BRIGHT}'
                    f'Kubernetes GPUs{colorama.Style.RESET_ALL}\n')
             yield from k8s_realtime_table.get_string()
+            k8s_node_table = _get_kubernetes_node_info_table()
+            yield '\n\n'
+            yield (f'{colorama.Fore.CYAN}{colorama.Style.BRIGHT}'
+                   f'Kubernetes per-node GPU availability'
+                   f'{colorama.Style.RESET_ALL}\n')
+            yield from k8s_node_table.get_string()
             if kubernetes_autoscaling:
                 k8s_messages += (
                     '\n' + kubernetes_utils.KUBERNETES_AUTOSCALER_NOTE)
@@ -3199,6 +3218,7 @@ def _output():
             print_section_titles = True
             yield (f'{colorama.Fore.CYAN}{colorama.Style.BRIGHT}'
                    f'Kubernetes GPUs{colorama.Style.RESET_ALL}\n')
+            # TODO(romilb): Show filtered per-node GPU availability here as well.
             try:
                 k8s_realtime_table = _get_kubernetes_realtime_gpu_table(
                     name_filter=name, quantity_filter=quantity)
diff --git a/sky/clouds/service_catalog/kubernetes_catalog.py b/sky/clouds/service_catalog/kubernetes_catalog.py
index a64aa8f72e9..9365d693cbd 100644
--- a/sky/clouds/service_catalog/kubernetes_catalog.py
+++ b/sky/clouds/service_catalog/kubernetes_catalog.py
@@ -65,6 +65,8 @@ def list_accelerators_realtime(
     require_price: bool = True
 ) -> Tuple[Dict[str, List[common.InstanceTypeInfo]], Dict[str, int], Dict[str,
                                                                           int]]:
+    # TODO(romilb): This should be refactored to use the
+    # get_kubernetes_node_info() function from kubernetes_utils.
     del all_regions, require_price  # Unused.
     k8s_cloud = Kubernetes()
     if not any(
diff --git a/sky/provision/kubernetes/utils.py b/sky/provision/kubernetes/utils.py
index f042750d627..5cccc5f84ed 100644
--- a/sky/provision/kubernetes/utils.py
+++ b/sky/provision/kubernetes/utils.py
@@ -1,4 +1,5 @@
 """Kubernetes utilities for SkyPilot."""
+import dataclasses
 import json
 import math
 import os
@@ -1652,3 +1653,70 @@ def __init__(self, obj):
 
     fake_kube_response = FakeKubeResponse(object_dict)
     return kubernetes.api_client().deserialize(fake_kube_response, object_type)
+
+
+@dataclasses.dataclass
+class KubernetesNodeInfo:
+    """Dataclass to store Kubernetes node information."""
+    name: str
+    gpu_type: Optional[str]
+    # Resources on the node, keyed by resource name. E.g., {'nvidia.com/gpu': 2}
+    total: Dict[str, int]
+    free: Dict[str, int]
+
+
+def get_kubernetes_node_info() -> Dict[str, KubernetesNodeInfo]:
+    """Gets the resource information for all the nodes in the cluster.
+
+    Currently only GPU resources are supported. For each node, the function
+    returns the total number of GPUs on the node and the number of free GPUs.
+
+    Returns:
+        Dict[str, KubernetesNodeInfo]: Dictionary mapping each node name to
+            its KubernetesNodeInfo object.
+    """
+    nodes = get_kubernetes_nodes()
+    # Get the pods to compute the real-time resource usage on each node.
+    pods = get_kubernetes_pods()
+
+    label_formatter, _ = detect_gpu_label_formatter()
+    if not label_formatter:
+        label_key = None
+    else:
+        label_key = label_formatter.get_label_key()
+
+    node_info_dict: Dict[str, KubernetesNodeInfo] = {}
+
+    for node in nodes:
+        allocated_qty = 0
+        if label_formatter is not None and label_key in node.metadata.labels:
+            accelerator_name = label_formatter.get_accelerator_from_label_value(
+                node.metadata.labels.get(label_key))
+        else:
+            accelerator_name = None
+
+        accelerator_count = int(node.status.allocatable.get(
+            'nvidia.com/gpu', 0))
+
+        for pod in pods:
+            # Consider only pods scheduled on this node that are running or
+            # pending, since those hold (or will hold) GPU allocations.
+            if (pod.spec.node_name == node.metadata.name and
+                    pod.status.phase in ['Running', 'Pending']):
+                # Iterate over all the containers in the pod and sum the
+                # GPU requests.
+                for container in pod.spec.containers:
+                    if container.resources.requests:
+                        allocated_qty += int(
+                            container.resources.requests.get(
+                                'nvidia.com/gpu', 0))
+
+        accelerators_available = accelerator_count - allocated_qty
+
+        node_info_dict[node.metadata.name] = KubernetesNodeInfo(
+            name=node.metadata.name,
+            gpu_type=accelerator_name,
+            total={'nvidia.com/gpu': accelerator_count},
+            free={'nvidia.com/gpu': accelerators_available})
+
+    return node_info_dict
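
Reviewer note: below is a minimal, illustrative sketch of how the new helper added in this diff could be consumed, assuming the diff is applied and a Kubernetes cluster is reachable. The name print_node_gpu_summary is hypothetical and not part of this change.

# Illustrative sketch only, not part of the diff above.
# Assumes the sky.provision.kubernetes.utils module from this PR is
# importable and kubectl credentials are configured.
from sky.provision.kubernetes import utils as kubernetes_utils

def print_node_gpu_summary() -> None:
    # get_kubernetes_node_info() returns {node_name: KubernetesNodeInfo}.
    node_info_dict = kubernetes_utils.get_kubernetes_node_info()
    for node_name, node_info in node_info_dict.items():
        # total/free map resource names to counts, e.g. {'nvidia.com/gpu': 2}.
        total = node_info.total['nvidia.com/gpu']
        free = node_info.free['nvidia.com/gpu']
        print(f'{node_name}: {node_info.gpu_type or "-"} {free}/{total} free')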