Skip to content

Commit

Permalink
handle network error from pulling docker image
Browse files: browse the repository at this point in the history
  • Loading branch information
landscapepainter committed Sep 13, 2023
1 parent dd9c5d2 commit 3d5e97b
Showing 1 changed file with 6 additions and 3 deletions.
9 changes: 6 additions & 3 deletions sky/skylet/providers/kubernetes/node_provider.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -229,14 +229,17 @@ def create_node(self, node_config, tags, count):
for node in new_nodes:
pod = kubernetes.core_api().read_namespaced_pod(
node.metadata.name, self.namespace)
if pod.status.phase == 'Pending':
if pod.status.phase == 'Running':
continue
elif pod.status.phase == 'Pending':
# Iterate over each pod to check their status
if pod.status.container_statuses is not None:
for container_status in pod.status.container_statuses:
# Continue if container status is ContainerCreating
# This indicates this pod has been scheduled.
if container_status.state.waiting is not None and container_status.state.waiting.reason == 'ContainerCreating':
continue
if container_status.state.waiting is not None and container_status.state.waiting.reason == 'ErrImagePull':
if 'rpc error: code = Unknown' in container_status.state.waiting.message:
raise config.KubernetesError(f'Failed to pull docker image while launching the node. Please check your network connection. Error:{container_status.state.waiting.message}')
else:
# If the container wasn't in creating state,
# then we know pod wasn't scheduled or had some
Expand Down

0 comments on commit 3d5e97b

Please sign in to comment.