Skip to content

Commit

Permalink
per node gpu availability
Browse files Browse the repository at this point in the history
  • Loading branch information
romilbhardwaj committed Aug 8, 2024
1 parent 67cb9b4 commit 7ee556c
Show file tree
Hide file tree
Showing 2 changed files with 9 additions and 7 deletions.
6 changes: 4 additions & 2 deletions sky/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -3079,7 +3079,8 @@ def _get_kubernetes_node_info_table():
node_info_dict = kubernetes_utils.get_kubernetes_node_info()
for node_name, node_info in node_info_dict.items():
node_table.add_row([
node_name, node_info.gpu_type, node_info.total['nvidia.com/gpu'],
node_name, node_info.gpu_type,
node_info.total['nvidia.com/gpu'],
node_info.free['nvidia.com/gpu']
])
return node_table
Expand Down Expand Up @@ -3127,7 +3128,8 @@ def _output():
k8s_node_table = _get_kubernetes_node_info_table()
yield '\n\n'
yield (f'{colorama.Fore.CYAN}{colorama.Style.BRIGHT}'
f'Kubernetes per node GPU availability{colorama.Style.RESET_ALL}\n')
f'Kubernetes per node GPU availability'
f'{colorama.Style.RESET_ALL}\n')
yield from k8s_node_table.get_string()
if kubernetes_autoscaling:
k8s_messages += (
Expand Down
10 changes: 5 additions & 5 deletions sky/provision/kubernetes/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -1661,8 +1661,8 @@ class KubernetesNodeInfo:
name: str
gpu_type: Optional[str]
# Resources available on the node. E.g., {'nvidia.com/gpu': 2}
total: Dict[str, str]
free: Dict[str, str]
total: Dict[str, int]
free: Dict[str, int]


def get_kubernetes_node_info() -> Dict[str, KubernetesNodeInfo]:
Expand Down Expand Up @@ -1690,14 +1690,14 @@ def get_kubernetes_node_info() -> Dict[str, KubernetesNodeInfo]:

for node in nodes:
allocated_qty = 0
if label_key is not None and label_key in node.metadata.labels:
if label_formatter is not None and label_key in node.metadata.labels:
accelerator_name = label_formatter.get_accelerator_from_label_value(
node.metadata.labels.get(label_key))
else:
accelerator_name = None

accelerator_count = int(
node.status.allocatable.get('nvidia.com/gpu', 0))
accelerator_count = int(node.status.allocatable.get(
'nvidia.com/gpu', 0))

for pod in pods:
# Get all the pods running on the node
Expand Down

0 comments on commit 7ee556c

Please sign in to comment.