Skip to content

Commit

Permalink
separate scheduling and post-scheduling
Browse files Browse the repository at this point in the history
  • Loading branch information
landscapepainter committed Sep 16, 2023
1 parent 2bf6968 commit 88fe005
Show file tree
Hide file tree
Showing 2 changed files with 39 additions and 19 deletions.
2 changes: 1 addition & 1 deletion sky/clouds/kubernetes.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ class Kubernetes(clouds.Cloud):
# itself, which can be up to 2-3 seconds.
# For non-autoscaling clusters, we conservatively set this to 10s.
# TODO(romilb): Make the timeout configurable.
TIMEOUT = 30
TIMEOUT = 10

_DEFAULT_NUM_VCPUS = 2
_DEFAULT_MEMORY_CPU_RATIO = 1
Expand Down
56 changes: 38 additions & 18 deletions sky/skylet/providers/kubernetes/node_provider.py
Original file line number Diff line number Diff line change
Expand Up @@ -229,40 +229,60 @@ def create_node(self, node_config, tags, count):
for node in new_nodes:
pod = kubernetes.core_api().read_namespaced_pod(
node.metadata.name, self.namespace)
# Continue if pod phase is Running
# This indicates all the containers are created within the pod
if pod.status.phase == 'Running':
continue
elif pod.status.phase == 'Pending':
if pod.status.phase == 'Pending':
# Iterate over each pod to check their status
if pod.status.container_statuses is not None:
for container_status in pod.status.container_statuses:
# Continue if container status is ContainerCreating
# This indicates this pod has been scheduled.
if container_status.state.waiting is not None:
if container_status.state.waiting.reason == 'ErrImagePull':
if 'rpc error: code = Unknown' in container_status.state.waiting.message:
raise config.KubernetesError(
'Failed to pull docker image while '
'launching the node. Please check '
'your network connection. Error: '
f'{container_status.state.waiting.message}.'
)
if container_status.state.waiting is not None and container_status.state.waiting.reason == 'ContainerCreating':
continue
else:
# If the container wasn't in creating state,
# then we know pod wasn't scheduled or had some
# other error, such as image pull error.
# See list of possible reasons for waiting here:
# https://stackoverflow.com/a/57886025
all_ready = False
else:
# If container_statuses is None, then the pod hasn't
# been scheduled yet.
all_ready = False
else:
# If container_statuses is None, then the pod hasn't
# been scheduled yet.
all_ready = False
if all_ready:
break
time.sleep(1)

# After pods are successfully scheduled, we wait for all the containers
# within pods to be up and running.
while True:
all_pods_running = True
# Iterate over each pod to check their status
for node in new_nodes:
pod = kubernetes.core_api().read_namespaced_pod(
node.metadata.name, self.namespace)
# Continue if pod phase is Running. This indicates all the
# containers are successfully created within the pod.
if pod.status.phase == 'Running':
continue
else:
all_pods_running = False
if pod.status.phase == 'Pending':
# Iterate over each container in pod to check their status
for container_status in pod.status.container_statuses:
if container_status.state.waiting is not None:
if container_status.state.waiting.reason == 'ErrImagePull':
if 'rpc error: code = Unknown' in container_status.state.waiting.message:
raise config.KubernetesError(
'Failed to pull docker image while '
'launching the node. Please check '
'your network connection. Error details: '
f'{container_status.state.waiting.message}.'
)

if all_pods_running:
break
time.sleep(1)

def terminate_node(self, node_id):
logger.info(config.log_prefix + 'calling delete_namespaced_pod')
try:
Expand Down

0 comments on commit 88fe005

Please sign in to comment.