[k8s] Show per node status in sky show-gpus #3816

Merged: 2 commits, Aug 8, 2024
20 changes: 20 additions & 0 deletions sky/cli.py
@@ -3072,6 +3072,19 @@ def _get_kubernetes_realtime_gpu_table(
])
return realtime_gpu_table

def _get_kubernetes_node_info_table():
node_table = log_utils.create_table(
['NODE_NAME', 'GPU_NAME', 'TOTAL_GPUS', 'FREE_GPUS'])

node_info_dict = kubernetes_utils.get_kubernetes_node_info()
for node_name, node_info in node_info_dict.items():
node_table.add_row([
node_name, node_info.gpu_type,
node_info.total['nvidia.com/gpu'],
node_info.free['nvidia.com/gpu']
])
return node_table

def _output():
gpu_table = log_utils.create_table(
['COMMON_GPU', 'AVAILABLE_QUANTITIES'])
@@ -3112,6 +3125,12 @@ def _output():
yield (f'{colorama.Fore.CYAN}{colorama.Style.BRIGHT}'
f'Kubernetes GPUs{colorama.Style.RESET_ALL}\n')
yield from k8s_realtime_table.get_string()
k8s_node_table = _get_kubernetes_node_info_table()
yield '\n\n'
yield (f'{colorama.Fore.CYAN}{colorama.Style.BRIGHT}'
f'Kubernetes per node GPU availability'
f'{colorama.Style.RESET_ALL}\n')
yield from k8s_node_table.get_string()
if kubernetes_autoscaling:
k8s_messages += (
'\n' + kubernetes_utils.KUBERNETES_AUTOSCALER_NOTE)
@@ -3199,6 +3218,7 @@ def _output():
print_section_titles = True
yield (f'{colorama.Fore.CYAN}{colorama.Style.BRIGHT}'
f'Kubernetes GPUs{colorama.Style.RESET_ALL}\n')
# TODO(romilb): Show filtered per node GPU availability here as well
try:
k8s_realtime_table = _get_kubernetes_realtime_gpu_table(
name_filter=name, quantity_filter=quantity)
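The TODO above mentions showing filtered per-node GPU availability in this code path as well. A minimal sketch of one way that could look, reusing the helpers introduced in this diff; the name_filter parameter and the substring match are assumptions for illustration, not part of this PR:

from typing import Optional

from sky.provision.kubernetes import utils as kubernetes_utils
from sky.utils import log_utils


def _get_kubernetes_node_info_table(name_filter: Optional[str] = None):
    """Per-node GPU table, optionally restricted to a GPU name substring."""
    node_table = log_utils.create_table(
        ['NODE_NAME', 'GPU_NAME', 'TOTAL_GPUS', 'FREE_GPUS'])
    node_info_dict = kubernetes_utils.get_kubernetes_node_info()
    for node_name, node_info in node_info_dict.items():
        # Skip nodes whose GPU type does not match the requested accelerator.
        if name_filter and (node_info.gpu_type is None or
                            name_filter.lower()
                            not in node_info.gpu_type.lower()):
            continue
        node_table.add_row([
            node_name, node_info.gpu_type,
            node_info.total['nvidia.com/gpu'],
            node_info.free['nvidia.com/gpu']
        ])
    return node_table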
2 changes: 2 additions & 0 deletions sky/clouds/service_catalog/kubernetes_catalog.py
@@ -65,6 +65,8 @@ def list_accelerators_realtime(
require_price: bool = True
) -> Tuple[Dict[str, List[common.InstanceTypeInfo]], Dict[str, int], Dict[str,
int]]:
# TODO(romilb): This should be refactored to use get_kubernetes_node_info()
# function from kubernetes_utils.
del all_regions, require_price # Unused.
k8s_cloud = Kubernetes()
if not any(
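The TODO in this hunk proposes refactoring list_accelerators_realtime() to build on get_kubernetes_node_info(). A rough sketch of what the aggregation step of such a refactor might look like; the _aggregate_gpu_counts helper and its return shape are illustrative assumptions, not code from this PR:

import collections
from typing import Dict, Tuple

from sky.provision.kubernetes import utils as kubernetes_utils


def _aggregate_gpu_counts() -> Tuple[Dict[str, int], Dict[str, int]]:
    """Returns (total GPU count per GPU type, free GPU count per GPU type)."""
    total: Dict[str, int] = collections.defaultdict(int)
    free: Dict[str, int] = collections.defaultdict(int)
    for node_info in kubernetes_utils.get_kubernetes_node_info().values():
        if node_info.gpu_type is None:
            # Node carries no recognized GPU label; skip it.
            continue
        total[node_info.gpu_type] += node_info.total['nvidia.com/gpu']
        free[node_info.gpu_type] += node_info.free['nvidia.com/gpu']
    return dict(total), dict(free)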
68 changes: 68 additions & 0 deletions sky/provision/kubernetes/utils.py
@@ -1,4 +1,5 @@
"""Kubernetes utilities for SkyPilot."""
import dataclasses
import json
import math
import os
@@ -1652,3 +1653,70 @@ def __init__(self, obj):

fake_kube_response = FakeKubeResponse(object_dict)
return kubernetes.api_client().deserialize(fake_kube_response, object_type)


@dataclasses.dataclass
class KubernetesNodeInfo:
"""Dataclass to store Kubernetes node information."""
name: str
gpu_type: Optional[str]
# Resources available on the node. E.g., {'nvidia.com/gpu': 2}
total: Dict[str, int]
free: Dict[str, int]


def get_kubernetes_node_info() -> Dict[str, KubernetesNodeInfo]:
"""Gets the resource information for all the nodes in the cluster.

Currently only GPU resources are supported. The function returns the total
number of GPUs available on each node and the number of free GPUs on each
node.

Returns:
Dict[str, KubernetesNodeInfo]: Dictionary containing the node name as
key and the KubernetesNodeInfo object as value
"""
nodes = get_kubernetes_nodes()
# Get the pods to get the real-time resource usage
pods = get_kubernetes_pods()

label_formatter, _ = detect_gpu_label_formatter()
if not label_formatter:
label_key = None
else:
label_key = label_formatter.get_label_key()

node_info_dict: Dict[str, KubernetesNodeInfo] = {}

for node in nodes:
allocated_qty = 0
if label_formatter is not None and label_key in node.metadata.labels:
accelerator_name = label_formatter.get_accelerator_from_label_value(
node.metadata.labels.get(label_key))
else:
accelerator_name = None

accelerator_count = int(node.status.allocatable.get(
'nvidia.com/gpu', 0))

for pod in pods:
# Get all the pods running on the node
if (pod.spec.node_name == node.metadata.name and
pod.status.phase in ['Running', 'Pending']):
# Iterate over all the containers in the pod and sum the
# GPU requests
for container in pod.spec.containers:
if container.resources.requests:
allocated_qty += int(
container.resources.requests.get(
'nvidia.com/gpu', 0))
Comment on lines +1702 to +1712
Collaborator:
nit: it might be interesting to show the number of running pods on each node as well, but this is minor if users don't ask for it. :)

Collaborator (Author):
Good point, though one challenge is that a node may be running many pods across different namespaces; perhaps we could show only the pods in the user's configured namespace. We can add this if users ask :) (see the sketch after this diff)


accelerators_available = accelerator_count - allocated_qty

node_info_dict[node.metadata.name] = KubernetesNodeInfo(
name=node.metadata.name,
gpu_type=accelerator_name,
total={'nvidia.com/gpu': int(accelerator_count)},
free={'nvidia.com/gpu': int(accelerators_available)})

return node_info_dict
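The review thread above discusses counting the running pods on each node, scoped to the user's configured namespace. A rough sketch of that idea, assuming the kubernetes adaptor's core_api() client that this module already uses; the count_pods_per_node helper is hypothetical and not part of this PR:

from typing import Dict

from sky.adaptors import kubernetes


def count_pods_per_node(namespace: str) -> Dict[str, int]:
    """Counts Running/Pending pods on each node within a single namespace."""
    counts: Dict[str, int] = {}
    pods = kubernetes.core_api().list_namespaced_pod(namespace).items
    for pod in pods:
        if pod.status.phase in ('Running', 'Pending') and pod.spec.node_name:
            counts[pod.spec.node_name] = counts.get(pod.spec.node_name, 0) + 1
    return counts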