Skip to content

Commit

Permalink
[k8s] support to use custom gpu resource name if it's not nvidia.com/gpu
Browse files Browse the repository at this point in the history
Signed-off-by: nkwangleiGIT <[email protected]>
  • Loading branch information
nkwangleiGIT committed Nov 13, 2024
1 parent 140125e commit 8538f07
Show file tree
Hide file tree
Showing 7 changed files with 47 additions and 18 deletions.
8 changes: 5 additions & 3 deletions sky/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -3161,11 +3161,13 @@ def _get_kubernetes_node_info_table(context: Optional[str]):

node_info_dict = kubernetes_utils.get_kubernetes_node_info(context)
for node_name, node_info in node_info_dict.items():
available = node_info.free['nvidia.com/gpu'] if node_info.free[
'nvidia.com/gpu'] != -1 else no_permissions_str
available = node_info.free[kubernetes_utils.get_gpu_resource_name(
)] if node_info.free[kubernetes_utils.get_gpu_resource_name(
)] != -1 else no_permissions_str
node_table.add_row([
node_name, node_info.gpu_type,
node_info.total['nvidia.com/gpu'], available
node_info.total[kubernetes_utils.get_gpu_resource_name()],
available
])
return node_table

Expand Down
1 change: 1 addition & 0 deletions sky/clouds/kubernetes.py
Original file line number Diff line number Diff line change
Expand Up @@ -412,6 +412,7 @@ def make_deploy_resources_variables(
'custom_resources': custom_resources,
'cpus': str(cpus),
'memory': str(mem),
'gpu_resource_name': kubernetes_utils.get_gpu_resource_name(),
'accelerator_count': str(acc_count),
'timeout': str(timeout),
'k8s_port_mode': port_mode.value,
Expand Down
6 changes: 4 additions & 2 deletions sky/clouds/service_catalog/kubernetes_catalog.py
Original file line number Diff line number Diff line change
Expand Up @@ -146,7 +146,8 @@ def list_accelerators_realtime(
continue

accelerator_count = int(
node.status.allocatable.get('nvidia.com/gpu', 0))
node.status.allocatable.get(
kubernetes_utils.get_gpu_resource_name(), 0))

# Generate the GPU quantities for the accelerators
if accelerator_name and accelerator_count > 0:
Expand Down Expand Up @@ -184,7 +185,8 @@ def list_accelerators_realtime(
if container.resources.requests:
allocated_qty += int(
container.resources.requests.get(
'nvidia.com/gpu', 0))
kubernetes_utils.get_gpu_resource_name(),
0))

accelerators_available = accelerator_count - allocated_qty

Expand Down
2 changes: 1 addition & 1 deletion sky/provision/kubernetes/instance.py
Original file line number Diff line number Diff line change
Expand Up @@ -634,7 +634,7 @@ def _create_pods(region: str, cluster_name_on_cloud: str,
'For more details, refer to https://skypilot.readthedocs.io/en/latest/reference/config.html') # pylint: disable=line-too-long

needs_gpus = (pod_spec['spec']['containers'][0].get('resources', {}).get(
'limits', {}).get('nvidia.com/gpu', 0) > 0)
'limits', {}).get(kubernetes_utils.get_gpu_resource_name(), 0) > 0)
if nvidia_runtime_exists and needs_gpus:
pod_spec['spec']['runtimeClassName'] = 'nvidia'

Expand Down
42 changes: 33 additions & 9 deletions sky/provision/kubernetes/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,8 @@
DEFAULT_NAMESPACE = 'default'
IN_CLUSTER_REGION = 'in-cluster'

DEFAULT_GPU_RESOURCE_NAME = 'nvidia.com/gpu'

DEFAULT_SERVICE_ACCOUNT_NAME = 'skypilot-service-account'

MEMORY_SIZE_UNITS = {
Expand Down Expand Up @@ -361,8 +363,7 @@ def detect_gpu_resource(context: Optional[str]) -> Tuple[bool, Set[str]]:
nodes = get_kubernetes_nodes(context)
for node in nodes:
cluster_resources.update(node.status.allocatable.keys())
has_gpu = 'nvidia.com/gpu' in cluster_resources

has_gpu = get_gpu_resource_name() in cluster_resources
return has_gpu, cluster_resources


Expand Down Expand Up @@ -1825,6 +1826,7 @@ def get_kubernetes_node_info(
label_key = label_formatter.get_label_key()

node_info_dict: Dict[str, KubernetesNodeInfo] = {}
gpu_resource_name = get_gpu_resource_name()

for node in nodes:
allocated_qty = 0
Expand All @@ -1834,8 +1836,8 @@ def get_kubernetes_node_info(
else:
accelerator_name = None

accelerator_count = int(node.status.allocatable.get(
'nvidia.com/gpu', 0))
accelerator_count = int(
node.status.allocatable.get(gpu_resource_name, 0))

if pods is None:
accelerators_available = -1
Expand All @@ -1851,15 +1853,14 @@ def get_kubernetes_node_info(
if container.resources.requests:
allocated_qty += int(
container.resources.requests.get(
'nvidia.com/gpu', 0))
gpu_resource_name, 0))
accelerators_available = accelerator_count - allocated_qty

node_info_dict[node.metadata.name] = KubernetesNodeInfo(
name=node.metadata.name,
gpu_type=accelerator_name,
total={'nvidia.com/gpu': int(accelerator_count)},
free={'nvidia.com/gpu': int(accelerators_available)})

total={gpu_resource_name: int(accelerator_count)},
free={gpu_resource_name: int(accelerators_available)})
return node_info_dict


Expand Down Expand Up @@ -2095,7 +2096,7 @@ def process_skypilot_pods(
unit='G')
gpu_count = parse_cpu_or_gpu_resource(
pod.spec.containers[0].resources.requests.get(
'nvidia.com/gpu', '0'))
get_gpu_resource_name(), '0'))
gpu_name = None
if gpu_count > 0:
label_formatter, _ = (detect_gpu_label_formatter(context))
Expand Down Expand Up @@ -2148,3 +2149,26 @@ def process_skypilot_pods(
num_pods = len(cluster.pods)
cluster.resources_str = f'{num_pods}x {cluster.resources}'
return list(clusters.values()), jobs_controllers, serve_controllers


def get_gpu_resource_name() -> str:
    """Return the Kubernetes GPU resource name to use for pod requests/limits.

    The name defaults to DEFAULT_GPU_RESOURCE_NAME ('nvidia.com/gpu') and can
    be overridden via the CUSTOM_GPU_RESOURCE_NAME environment variable, e.g.
    for clusters exposing 'amd.com/gpu' or vendor-specific names such as
    'nvidia.com/gpu-h100'.

    Returns:
        str: The GPU resource name — the CUSTOM_GPU_RESOURCE_NAME environment
            variable's value if set, otherwise DEFAULT_GPU_RESOURCE_NAME.
    """
    # Environment override; e.g. nvidia.com/gpu-h100, amd.com/gpu, etc.
    custom_name = os.getenv('CUSTOM_GPU_RESOURCE_NAME')
    if custom_name is not None:
        return custom_name
    # No override defined: fall back to the standard NVIDIA resource name.
    return DEFAULT_GPU_RESOURCE_NAME
4 changes: 2 additions & 2 deletions sky/templates/kubernetes-ray.yml.j2
Original file line number Diff line number Diff line change
Expand Up @@ -409,14 +409,14 @@ available_node_types:
requests:
cpu: {{cpus}}
memory: {{memory}}G
nvidia.com/gpu: {{accelerator_count}}
{{gpu_resource_name}}: {{accelerator_count}}
{% if k8s_fuse_device_required %}
# Kubernetes resource exposed by the fuse device manager
# https://gitlab.com/arm-research/smarter/smarter-device-manager
smarter-devices/fuse: "1"
{% endif %}
limits:
nvidia.com/gpu: {{accelerator_count}} # Limits need to be defined for GPU requests
{{gpu_resource_name}}: {{accelerator_count}} # Limits need to be defined for GPU requests
{% if k8s_fuse_device_required %}
smarter-devices/fuse: "1"
{% endif %}
Expand Down
2 changes: 1 addition & 1 deletion sky/utils/kubernetes/gpu_labeler.py
Original file line number Diff line number Diff line change
Expand Up @@ -101,7 +101,7 @@ def label():
# Get the list of nodes with GPUs
gpu_nodes = []
for node in nodes:
if 'nvidia.com/gpu' in node.status.capacity:
if kubernetes_utils.get_gpu_resource_name() in node.status.capacity:
gpu_nodes.append(node)

print(f'Found {len(gpu_nodes)} GPU nodes in the cluster')
Expand Down

0 comments on commit 8538f07

Please sign in to comment.