Skip to content

Commit

Permalink
[k8s] support to use custom gpu resource name if it's not nvidia.com/gpu
Browse files Browse the repository at this point in the history
Signed-off-by: nkwangleiGIT <[email protected]>
  • Loading branch information
nkwangleiGIT committed Nov 13, 2024
1 parent 140125e commit 8538f07
Show file tree
Hide file tree
Showing 7 changed files with 47 additions and 18 deletions.
8 changes: 5 additions & 3 deletions sky/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -3161,11 +3161,13 @@ def _get_kubernetes_node_info_table(context: Optional[str]):

node_info_dict = kubernetes_utils.get_kubernetes_node_info(context)
for node_name, node_info in node_info_dict.items():
available = node_info.free['nvidia.com/gpu'] if node_info.free[
'nvidia.com/gpu'] != -1 else no_permissions_str
available = node_info.free[kubernetes_utils.get_gpu_resource_name(
)] if node_info.free[kubernetes_utils.get_gpu_resource_name(
)] != -1 else no_permissions_str
node_table.add_row([
node_name, node_info.gpu_type,
node_info.total['nvidia.com/gpu'], available
node_info.total[kubernetes_utils.get_gpu_resource_name()],
available
])
return node_table

Expand Down
1 change: 1 addition & 0 deletions sky/clouds/kubernetes.py
Original file line number Diff line number Diff line change
Expand Up @@ -412,6 +412,7 @@ def make_deploy_resources_variables(
'custom_resources': custom_resources,
'cpus': str(cpus),
'memory': str(mem),
'gpu_resource_name': kubernetes_utils.get_gpu_resource_name(),
'accelerator_count': str(acc_count),
'timeout': str(timeout),
'k8s_port_mode': port_mode.value,
Expand Down
6 changes: 4 additions & 2 deletions sky/clouds/service_catalog/kubernetes_catalog.py
Original file line number Diff line number Diff line change
Expand Up @@ -146,7 +146,8 @@ def list_accelerators_realtime(
continue

accelerator_count = int(
node.status.allocatable.get('nvidia.com/gpu', 0))
node.status.allocatable.get(
kubernetes_utils.get_gpu_resource_name(), 0))

# Generate the GPU quantities for the accelerators
if accelerator_name and accelerator_count > 0:
Expand Down Expand Up @@ -184,7 +185,8 @@ def list_accelerators_realtime(
if container.resources.requests:
allocated_qty += int(
container.resources.requests.get(
'nvidia.com/gpu', 0))
kubernetes_utils.get_gpu_resource_name(),
0))

accelerators_available = accelerator_count - allocated_qty

Expand Down
2 changes: 1 addition & 1 deletion sky/provision/kubernetes/instance.py
Original file line number Diff line number Diff line change
Expand Up @@ -634,7 +634,7 @@ def _create_pods(region: str, cluster_name_on_cloud: str,
'For more details, refer to https://skypilot.readthedocs.io/en/latest/reference/config.html') # pylint: disable=line-too-long

needs_gpus = (pod_spec['spec']['containers'][0].get('resources', {}).get(
'limits', {}).get('nvidia.com/gpu', 0) > 0)
'limits', {}).get(kubernetes_utils.get_gpu_resource_name(), 0) > 0)
if nvidia_runtime_exists and needs_gpus:
pod_spec['spec']['runtimeClassName'] = 'nvidia'

Expand Down
42 changes: 33 additions & 9 deletions sky/provision/kubernetes/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,8 @@
DEFAULT_NAMESPACE = 'default'
IN_CLUSTER_REGION = 'in-cluster'

DEFAULT_GPU_RESOURCE_NAME = 'nvidia.com/gpu'

DEFAULT_SERVICE_ACCOUNT_NAME = 'skypilot-service-account'

MEMORY_SIZE_UNITS = {
Expand Down Expand Up @@ -361,8 +363,7 @@ def detect_gpu_resource(context: Optional[str]) -> Tuple[bool, Set[str]]:
nodes = get_kubernetes_nodes(context)
for node in nodes:
cluster_resources.update(node.status.allocatable.keys())
has_gpu = 'nvidia.com/gpu' in cluster_resources

has_gpu = get_gpu_resource_name() in cluster_resources
return has_gpu, cluster_resources


Expand Down Expand Up @@ -1825,6 +1826,7 @@ def get_kubernetes_node_info(
label_key = label_formatter.get_label_key()

node_info_dict: Dict[str, KubernetesNodeInfo] = {}
gpu_resource_name = get_gpu_resource_name()

for node in nodes:
allocated_qty = 0
Expand All @@ -1834,8 +1836,8 @@ def get_kubernetes_node_info(
else:
accelerator_name = None

accelerator_count = int(node.status.allocatable.get(
'nvidia.com/gpu', 0))
accelerator_count = int(
node.status.allocatable.get(gpu_resource_name, 0))

if pods is None:
accelerators_available = -1
Expand All @@ -1851,15 +1853,14 @@ def get_kubernetes_node_info(
if container.resources.requests:
allocated_qty += int(
container.resources.requests.get(
'nvidia.com/gpu', 0))
gpu_resource_name, 0))
accelerators_available = accelerator_count - allocated_qty

node_info_dict[node.metadata.name] = KubernetesNodeInfo(
name=node.metadata.name,
gpu_type=accelerator_name,
total={'nvidia.com/gpu': int(accelerator_count)},
free={'nvidia.com/gpu': int(accelerators_available)})

total={gpu_resource_name: int(accelerator_count)},
free={gpu_resource_name: int(accelerators_available)})
return node_info_dict


Expand Down Expand Up @@ -2095,7 +2096,7 @@ def process_skypilot_pods(
unit='G')
gpu_count = parse_cpu_or_gpu_resource(
pod.spec.containers[0].resources.requests.get(
'nvidia.com/gpu', '0'))
get_gpu_resource_name(), '0'))
gpu_name = None
if gpu_count > 0:
label_formatter, _ = (detect_gpu_label_formatter(context))
Expand Down Expand Up @@ -2148,3 +2149,26 @@ def process_skypilot_pods(
num_pods = len(cluster.pods)
cluster.resources_str = f'{num_pods}x {cluster.resources}'
return list(clusters.values()), jobs_controllers, serve_controllers


def get_gpu_resource_name() -> str:
    """Return the Kubernetes GPU resource name to use for pod requests/limits.

    The name defaults to DEFAULT_GPU_RESOURCE_NAME ('nvidia.com/gpu') and can
    be overridden via the CUSTOM_GPU_RESOURCE_NAME environment variable, e.g.
    for clusters exposing 'amd.com/gpu' or vendor-specific names such as
    'nvidia.com/gpu-h100'.

    Returns:
        str: The GPU resource name — the CUSTOM_GPU_RESOURCE_NAME environment
            variable's value if set, otherwise DEFAULT_GPU_RESOURCE_NAME.
    """
    # Environment override; e.g. nvidia.com/gpu-h100, amd.com/gpu, etc.
    custom_name = os.getenv('CUSTOM_GPU_RESOURCE_NAME')
    if custom_name is not None:
        return custom_name
    # No override defined: fall back to the standard NVIDIA resource name.
    return DEFAULT_GPU_RESOURCE_NAME
4 changes: 2 additions & 2 deletions sky/templates/kubernetes-ray.yml.j2
Original file line number Diff line number Diff line change
Expand Up @@ -409,14 +409,14 @@ available_node_types:
requests:
cpu: {{cpus}}
memory: {{memory}}G
nvidia.com/gpu: {{accelerator_count}}
{{gpu_resource_name}}: {{accelerator_count}}
{% if k8s_fuse_device_required %}
# Kubernetes resource exposed by the fuse device manager
# https://gitlab.com/arm-research/smarter/smarter-device-manager
smarter-devices/fuse: "1"
{% endif %}
limits:
nvidia.com/gpu: {{accelerator_count}} # Limits need to be defined for GPU requests
{{gpu_resource_name}}: {{accelerator_count}} # Limits need to be defined for GPU requests
{% if k8s_fuse_device_required %}
smarter-devices/fuse: "1"
{% endif %}
Expand Down
2 changes: 1 addition & 1 deletion sky/utils/kubernetes/gpu_labeler.py
Original file line number Diff line number Diff line change
Expand Up @@ -101,7 +101,7 @@ def label():
# Get the list of nodes with GPUs
gpu_nodes = []
for node in nodes:
if 'nvidia.com/gpu' in node.status.capacity:
if kubernetes_utils.get_gpu_resource_name() in node.status.capacity:
gpu_nodes.append(node)

print(f'Found {len(gpu_nodes)} GPU nodes in the cluster')
Expand Down

0 comments on commit 8538f07

Please sign in to comment.