From 3d5e97b743ad8fad58cdd1a6b40c6c3af65eac64 Mon Sep 17 00:00:00 2001 From: Doyoung Kim Date: Wed, 13 Sep 2023 06:13:38 +0000 Subject: [PATCH 01/24] handle network error from pulling docker image --- sky/skylet/providers/kubernetes/node_provider.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/sky/skylet/providers/kubernetes/node_provider.py b/sky/skylet/providers/kubernetes/node_provider.py index 77222e72ab5..2c87409941c 100644 --- a/sky/skylet/providers/kubernetes/node_provider.py +++ b/sky/skylet/providers/kubernetes/node_provider.py @@ -229,14 +229,17 @@ def create_node(self, node_config, tags, count): for node in new_nodes: pod = kubernetes.core_api().read_namespaced_pod( node.metadata.name, self.namespace) - if pod.status.phase == 'Pending': + if pod.status.phase == 'Running': + continue + elif pod.status.phase == 'Pending': # Iterate over each pod to check their status if pod.status.container_statuses is not None: for container_status in pod.status.container_statuses: # Continue if container status is ContainerCreating # This indicates this pod has been scheduled. - if container_status.state.waiting is not None and container_status.state.waiting.reason == 'ContainerCreating': - continue + if container_status.state.waiting is not None and container_status.state.waiting.reason == 'ErrImagePull': + if 'rpc error: code = Unknown' in container_status.state.waiting.message: + raise config.KubernetesError(f'Failed to pull docker image while launching the node. Please check your network connection. Error:{container_status.state.waiting.message}') else: # If the container wasn't in creating state, # then we know pod wasn't scheduled or had some From f6f93bea45bd538fd0bd71e3eb9dc69193342835 Mon Sep 17 00:00:00 2001 From: Doyoung Kim Date: Wed, 13 Sep 2023 06:17:37 +0000 Subject: [PATCH 02/24] update timeout --- sky/clouds/kubernetes.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sky/clouds/kubernetes.py b/sky/clouds/kubernetes.py index 4bea49c155c..ce1defbed7e 100644 --- a/sky/clouds/kubernetes.py +++ b/sky/clouds/kubernetes.py @@ -36,7 +36,7 @@ class Kubernetes(clouds.Cloud): # itself, which can be upto 2-3 seconds. # For non-autoscaling clusters, we conservatively set this to 10s. # TODO(romilb): Make the timeout configurable. 
- TIMEOUT = 10 + TIMEOUT = 30 _DEFAULT_NUM_VCPUS = 2 _DEFAULT_MEMORY_CPU_RATIO = 1 From 7e84a1b8326dd6eea4ab89df5b35270619993802 Mon Sep 17 00:00:00 2001 From: Doyoung Kim Date: Thu, 14 Sep 2023 05:06:20 +0000 Subject: [PATCH 03/24] nit --- sky/skylet/providers/kubernetes/node_provider.py | 1 + 1 file changed, 1 insertion(+) diff --git a/sky/skylet/providers/kubernetes/node_provider.py b/sky/skylet/providers/kubernetes/node_provider.py index 2c87409941c..1bb5989a47c 100644 --- a/sky/skylet/providers/kubernetes/node_provider.py +++ b/sky/skylet/providers/kubernetes/node_provider.py @@ -229,6 +229,7 @@ def create_node(self, node_config, tags, count): for node in new_nodes: pod = kubernetes.core_api().read_namespaced_pod( node.metadata.name, self.namespace) + # phase is set to Running when all the containers are created if pod.status.phase == 'Running': continue elif pod.status.phase == 'Pending': From 2bf6968fe7c011de9ef82d020a87c0a5ca25f78b Mon Sep 17 00:00:00 2001 From: Doyoung Kim Date: Thu, 14 Sep 2023 09:21:29 +0000 Subject: [PATCH 04/24] nit --- .../providers/kubernetes/node_provider.py | 23 ++++++++++++------- 1 file changed, 15 insertions(+), 8 deletions(-) diff --git a/sky/skylet/providers/kubernetes/node_provider.py b/sky/skylet/providers/kubernetes/node_provider.py index 1bb5989a47c..7a254458e4a 100644 --- a/sky/skylet/providers/kubernetes/node_provider.py +++ b/sky/skylet/providers/kubernetes/node_provider.py @@ -229,7 +229,8 @@ def create_node(self, node_config, tags, count): for node in new_nodes: pod = kubernetes.core_api().read_namespaced_pod( node.metadata.name, self.namespace) - # phase is set to Running when all the containers are created + # Continue if pod phase is Running + # This indicates all the containers are created within the pod if pod.status.phase == 'Running': continue elif pod.status.phase == 'Pending': @@ -238,9 +239,15 @@ def create_node(self, node_config, tags, count): for container_status in pod.status.container_statuses: # Continue if container status is ContainerCreating # This indicates this pod has been scheduled. - if container_status.state.waiting is not None and container_status.state.waiting.reason == 'ErrImagePull': - if 'rpc error: code = Unknown' in container_status.state.waiting.message: - raise config.KubernetesError(f'Failed to pull docker image while launching the node. Please check your network connection. Error:{container_status.state.waiting.message}') + if container_status.state.waiting is not None: + if container_status.state.waiting.reason == 'ErrImagePull': + if 'rpc error: code = Unknown' in container_status.state.waiting.message: + raise config.KubernetesError( + 'Failed to pull docker image while ' + 'launching the node. Please check ' + 'your network connection. Error: ' + f'{container_status.state.waiting.message}.' + ) else: # If the container wasn't in creating state, # then we know pod wasn't scheduled or had some @@ -248,10 +255,10 @@ def create_node(self, node_config, tags, count): # See list of possible reasons for waiting here: # https://stackoverflow.com/a/57886025 all_ready = False - else: - # If container_statuses is None, then the pod hasn't - # been scheduled yet. - all_ready = False + else: + # If container_statuses is None, then the pod hasn't + # been scheduled yet. 
+ all_ready = False if all_ready: break time.sleep(1) From 88fe0057b985cbf2ecb4c30ed66b88b85c27a469 Mon Sep 17 00:00:00 2001 From: Doyoung Kim Date: Sat, 16 Sep 2023 01:33:39 +0000 Subject: [PATCH 05/24] separate scheduing and post-scheduling --- sky/clouds/kubernetes.py | 2 +- .../providers/kubernetes/node_provider.py | 56 +++++++++++++------ 2 files changed, 39 insertions(+), 19 deletions(-) diff --git a/sky/clouds/kubernetes.py b/sky/clouds/kubernetes.py index ce1defbed7e..4bea49c155c 100644 --- a/sky/clouds/kubernetes.py +++ b/sky/clouds/kubernetes.py @@ -36,7 +36,7 @@ class Kubernetes(clouds.Cloud): # itself, which can be upto 2-3 seconds. # For non-autoscaling clusters, we conservatively set this to 10s. # TODO(romilb): Make the timeout configurable. - TIMEOUT = 30 + TIMEOUT = 10 _DEFAULT_NUM_VCPUS = 2 _DEFAULT_MEMORY_CPU_RATIO = 1 diff --git a/sky/skylet/providers/kubernetes/node_provider.py b/sky/skylet/providers/kubernetes/node_provider.py index 7a254458e4a..ad9f06c21b7 100644 --- a/sky/skylet/providers/kubernetes/node_provider.py +++ b/sky/skylet/providers/kubernetes/node_provider.py @@ -229,25 +229,14 @@ def create_node(self, node_config, tags, count): for node in new_nodes: pod = kubernetes.core_api().read_namespaced_pod( node.metadata.name, self.namespace) - # Continue if pod phase is Running - # This indicates all the containers are created within the pod - if pod.status.phase == 'Running': - continue - elif pod.status.phase == 'Pending': + if pod.status.phase == 'Pending': # Iterate over each pod to check their status if pod.status.container_statuses is not None: for container_status in pod.status.container_statuses: # Continue if container status is ContainerCreating # This indicates this pod has been scheduled. - if container_status.state.waiting is not None: - if container_status.state.waiting.reason == 'ErrImagePull': - if 'rpc error: code = Unknown' in container_status.state.waiting.message: - raise config.KubernetesError( - 'Failed to pull docker image while ' - 'launching the node. Please check ' - 'your network connection. Error: ' - f'{container_status.state.waiting.message}.' - ) + if container_status.state.waiting is not None and container_status.state.waiting.reason == 'ContainerCreating': + continue else: # If the container wasn't in creating state, # then we know pod wasn't scheduled or had some @@ -255,14 +244,45 @@ def create_node(self, node_config, tags, count): # See list of possible reasons for waiting here: # https://stackoverflow.com/a/57886025 all_ready = False - else: - # If container_statuses is None, then the pod hasn't - # been scheduled yet. - all_ready = False + else: + # If container_statuses is None, then the pod hasn't + # been scheduled yet. + all_ready = False if all_ready: break time.sleep(1) + # After pods are successfully scheduled, we wait for all the containers + # within pods to be up and running. + while True: + all_pods_running = True + # Iterate over each pod to check their status + for node in new_nodes: + pod = kubernetes.core_api().read_namespaced_pod( + node.metadata.name, self.namespace) + # Continue if pod phase is Running. This indicates all the + # containers are successfully created within the pod. 
+ if pod.status.phase == 'Running': + continue + else: + all_pods_running = False + if pod.status.phase == 'Pending': + # Iterate over each container in pod to check their status + for container_status in pod.status.container_statuses: + if container_status.state.waiting is not None: + if container_status.state.waiting.reason == 'ErrImagePull': + if 'rpc error: code = Unknown' in container_status.state.waiting.message: + raise config.KubernetesError( + 'Failed to pull docker image while ' + 'launching the node. Please check ' + 'your network connection. Error details: ' + f'{container_status.state.waiting.message}.' + ) + + if all_pods_running: + break + time.sleep(1) + def terminate_node(self, node_id): logger.info(config.log_prefix + 'calling delete_namespaced_pod') try: From 565765fddd1771fa04e17e7236a687cdf971f080 Mon Sep 17 00:00:00 2001 From: Doyoung Kim Date: Tue, 19 Sep 2023 06:28:58 +0000 Subject: [PATCH 06/24] nit --- sky/skylet/providers/kubernetes/node_provider.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/sky/skylet/providers/kubernetes/node_provider.py b/sky/skylet/providers/kubernetes/node_provider.py index b6405e932d5..3f3f50bdfd4 100644 --- a/sky/skylet/providers/kubernetes/node_provider.py +++ b/sky/skylet/providers/kubernetes/node_provider.py @@ -300,8 +300,8 @@ def create_node(self, node_config, tags, count): break time.sleep(1) - # Wait for pod containers to be ready - they may be pulling images or - # may be in the process of container creation. + # Wait for pods and their containers to be ready - they may be + # pulling images or may be in the process of container creation. while True: all_pods_running = True # Iterate over each pod to check their status @@ -312,7 +312,7 @@ def create_node(self, node_config, tags, count): # pod are succesfully created and running. if pod.status.phase == 'Running' \ and all( - [container.state.running + [container.state.running for container in pod.status.container_statuses]): continue else: @@ -329,6 +329,7 @@ def create_node(self, node_config, tags, count): 'your network connection. Error details: ' f'{container_status.state.waiting.message}.' 
) + break if all_pods_running: break @@ -363,7 +364,6 @@ def create_node(self, node_config, tags, count): tty=False, _request_timeout=kubernetes.API_TIMEOUT) - def terminate_node(self, node_id): logger.info(config.log_prefix + 'calling delete_namespaced_pod') try: From 5a7e8436c0dc5bb6b03ba55287b41d4e07ea3262 Mon Sep 17 00:00:00 2001 From: Doyoung Kim Date: Tue, 19 Sep 2023 07:59:39 +0000 Subject: [PATCH 07/24] nit --- sky/skylet/providers/kubernetes/node_provider.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/sky/skylet/providers/kubernetes/node_provider.py b/sky/skylet/providers/kubernetes/node_provider.py index 3f3f50bdfd4..16a3676f6ed 100644 --- a/sky/skylet/providers/kubernetes/node_provider.py +++ b/sky/skylet/providers/kubernetes/node_provider.py @@ -177,6 +177,8 @@ def _raise_pod_scheduling_errors(self, new_nodes): sorted(events.items, key=lambda e: e.metadata.creation_timestamp, reverse=True) + + event_message = None for event in events_desc_by_time: if event.reason == 'FailedScheduling': event_message = event.message @@ -216,6 +218,7 @@ def _raise_pod_scheduling_errors(self, new_nodes): f'Pod status: {pod_status}' f'Details: \'{event_message}\' ') raise config.KubernetesError(f'{timeout_err_msg}') + def create_node(self, node_config, tags, count): conf = copy.deepcopy(node_config) From 980446f69986941903de5dee92592c19ee855e84 Mon Sep 17 00:00:00 2001 From: Doyoung Kim Date: Thu, 21 Sep 2023 03:41:11 +0000 Subject: [PATCH 08/24] nit --- sky/skylet/providers/kubernetes/node_provider.py | 1 - 1 file changed, 1 deletion(-) diff --git a/sky/skylet/providers/kubernetes/node_provider.py b/sky/skylet/providers/kubernetes/node_provider.py index 16a3676f6ed..c09098bfb6a 100644 --- a/sky/skylet/providers/kubernetes/node_provider.py +++ b/sky/skylet/providers/kubernetes/node_provider.py @@ -218,7 +218,6 @@ def _raise_pod_scheduling_errors(self, new_nodes): f'Pod status: {pod_status}' f'Details: \'{event_message}\' ') raise config.KubernetesError(f'{timeout_err_msg}') - def create_node(self, node_config, tags, count): conf = copy.deepcopy(node_config) From e808b8679eff0431ed30a7ec781946bb19add524 Mon Sep 17 00:00:00 2001 From: Doyoung Kim Date: Tue, 26 Sep 2023 07:27:16 +0000 Subject: [PATCH 09/24] refactor pod scheduling check --- .../providers/kubernetes/node_provider.py | 80 ++++++++++--------- 1 file changed, 44 insertions(+), 36 deletions(-) diff --git a/sky/skylet/providers/kubernetes/node_provider.py b/sky/skylet/providers/kubernetes/node_provider.py index c09098bfb6a..6fd39fb53cb 100644 --- a/sky/skylet/providers/kubernetes/node_provider.py +++ b/sky/skylet/providers/kubernetes/node_provider.py @@ -220,6 +220,36 @@ def _raise_pod_scheduling_errors(self, new_nodes): raise config.KubernetesError(f'{timeout_err_msg}') def create_node(self, node_config, tags, count): + + def is_pod_scheduled(pod) -> bool: + """Check if a pod is scheduled.""" + if pod.status.phase != 'Pending': + return True + # If container_statuses is None, then the pod hasn't + # been scheduled yet. + if not pod.status.container_statuses: + return False + for container_status in pod.status.container_statuses: + waiting = container_status.state.waiting + # Continue if container status is ContainerCreating + # This indicates this pod has been scheduled. + if waiting and waiting.reason == 'ContainerCreating': + continue + # If the container wasn't in creating state, + # then we know pod wasn't scheduled or had some + # other error, such as image pull error. 
+ # See list of possible reasons for waiting here: + # https://stackoverflow.com/a/57886025 + return False + return True + + def is_pod_running(pod) -> bool: + """Check if a pod and its containers are running.""" + if pod.status.phase != 'Running': + return False + return all(container.state.running + for container in pod.status.container_statuses) + conf = copy.deepcopy(node_config) pod_spec = conf.get('pod', conf) service_spec = conf.get('service') @@ -275,30 +305,13 @@ def create_node(self, node_config, tags, count): 'for pod scheduling failure. ' f'Error: {common_utils.format_exception(e)}') from None - all_ready = True for node in new_nodes: pod = kubernetes.core_api().read_namespaced_pod( node.metadata.name, self.namespace) - if pod.status.phase == 'Pending': - # Iterate over each pod to check their status - if pod.status.container_statuses is not None: - for container_status in pod.status.container_statuses: - # Continue if container status is ContainerCreating - # This indicates this pod has been scheduled. - if container_status.state.waiting is not None and container_status.state.waiting.reason == 'ContainerCreating': - continue - else: - # If the container wasn't in creating state, - # then we know pod wasn't scheduled or had some - # other error, such as image pull error. - # See list of possible reasons for waiting here: - # https://stackoverflow.com/a/57886025 - all_ready = False - else: - # If container_statuses is None, then the pod hasn't - # been scheduled yet. - all_ready = False - if all_ready: + if not is_pod_scheduled(pod): + break + else: + # All pods are scheduled break time.sleep(1) @@ -312,25 +325,20 @@ def create_node(self, node_config, tags, count): node.metadata.name, self.namespace) # Continue if pod and all the containers within the # pod are succesfully created and running. - if pod.status.phase == 'Running' \ - and all( - [container.state.running - for container in pod.status.container_statuses]): - continue - else: + if not is_pod_running(pod): all_pods_running = False if pod.status.phase == 'Pending': # Iterate over each container in pod to check their status for container_status in pod.status.container_statuses: - if container_status.state.waiting is not None: - if container_status.state.waiting.reason == 'ErrImagePull': - if 'rpc error: code = Unknown' in container_status.state.waiting.message: - raise config.KubernetesError( - 'Failed to pull docker image while ' - 'launching the node. Please check ' - 'your network connection. Error details: ' - f'{container_status.state.waiting.message}.' - ) + waiting = container_status.state.waiting + if waiting and container_status.state.waiting.reason == 'ErrImagePull': + if 'rpc error: code = Unknown' in container_status.state.waiting.message: + raise config.KubernetesError( + 'Failed to pull docker image while ' + 'launching the node. Please check ' + 'your network connection. Error details: ' + f'{container_status.state.waiting.message}.' 
+ ) break if all_pods_running: From 30d1220e2b25223977cbc68e6c07ef89d5de2dbe Mon Sep 17 00:00:00 2001 From: Doyoung Kim Date: Wed, 27 Sep 2023 05:50:52 +0000 Subject: [PATCH 10/24] refactor create_node --- .../providers/kubernetes/node_provider.py | 234 +++++++++--------- 1 file changed, 122 insertions(+), 112 deletions(-) diff --git a/sky/skylet/providers/kubernetes/node_provider.py b/sky/skylet/providers/kubernetes/node_provider.py index 6fd39fb53cb..5263c726f91 100644 --- a/sky/skylet/providers/kubernetes/node_provider.py +++ b/sky/skylet/providers/kubernetes/node_provider.py @@ -219,37 +219,128 @@ def _raise_pod_scheduling_errors(self, new_nodes): f'Details: \'{event_message}\' ') raise config.KubernetesError(f'{timeout_err_msg}') - def create_node(self, node_config, tags, count): + def wait_for_pods_to_schedule(self, new_nodes): + """Wait for all pods to be scheduled. + + Failure to schedule before timeout would cause to raise an error. + """ + start_time = time.time() + while time.time() - start_time <= self.timeout: + all_pods_scheduled = True + for node in new_nodes: + # Iterate over each pod to check their status + pod = kubernetes.core_api().read_namespaced_pod( + node.metadata.name, self.namespace) + if pod.status.phase == 'Pending': + # If container_statuses is None, then the pod hasn't + # been scheduled yet. + if not pod.status.container_statuses: + all_pods_scheduled = False + break + + for container_status in pod.status.container_statuses: + # If the container wasn't in 'ContainerCreating' + # state, then we know pod wasn't scheduled or + # had some other error, such as image pull error. + # See list of possible reasons for waiting here: + # https://stackoverflow.com/a/57886025 + waiting = container_status.state.waiting + if waiting and waiting.reason != 'ContainerCreating': + all_pods_scheduled = False + break + + if all_pods_scheduled: + return + time.sleep(1) - def is_pod_scheduled(pod) -> bool: - """Check if a pod is scheduled.""" - if pod.status.phase != 'Pending': - return True - # If container_statuses is None, then the pod hasn't - # been scheduled yet. - if not pod.status.container_statuses: - return False - for container_status in pod.status.container_statuses: - waiting = container_status.state.waiting - # Continue if container status is ContainerCreating - # This indicates this pod has been scheduled. - if waiting and waiting.reason == 'ContainerCreating': + # Handle pod scheduling errors + try: + self._raise_pod_scheduling_errors(new_nodes) + except config.KubernetesError: + raise + except Exception as e: + raise config.KubernetesError( + 'An error occurred while trying to fetch the reason ' + 'for pod scheduling failure. ' + f'Error: {common_utils.format_exception(e)}') from None + + def wait_for_pods_to_run(self, new_nodes): + """Wait for pods and their containers to be ready. + + Pods may be pulling images or may be in the process of container + creation. + """ + while True: + all_pods_running = True + # Iterate over each pod to check their status + for node in new_nodes: + pod = kubernetes.core_api().read_namespaced_pod( + node.metadata.name, self.namespace) + + # Continue if pod and all the containers within the + # pod are succesfully created and running. + if pod.status.phase == 'Running' \ + and all([container.state.running + for container in pod.status.container_statuses]): continue - # If the container wasn't in creating state, - # then we know pod wasn't scheduled or had some - # other error, such as image pull error. 
- # See list of possible reasons for waiting here: - # https://stackoverflow.com/a/57886025 - return False - return True - - def is_pod_running(pod) -> bool: - """Check if a pod and its containers are running.""" - if pod.status.phase != 'Running': - return False - return all(container.state.running - for container in pod.status.container_statuses) + all_pods_running = False + if pod.status.phase == 'Pending': + # Iterate over each container in pod to check their status + for container_status in pod.status.container_statuses: + waiting = container_status.state.waiting + if waiting and waiting.reason == 'ErrImagePull': + if 'rpc error: code = Unknown' in waiting.message: + raise config.KubernetesError( + 'Failed to pull docker image while ' + 'launching the node. Please check ' + 'your network connection. Error details: ' + f'{container_status.state.waiting.message}.' + ) + # If we reached here, one of the pods had an issue, + # so break out of the loop + break + + if all_pods_running: + break + time.sleep(1) + + def set_env_vars_in_pods(self, new_nodes): + """Setting environment variables in pods. + + Once all containers are ready, we can exec into them and set env vars. + Kubernetes automatically populates containers with critical + environment variables, such as those for discovering services running + in the cluster and CUDA/nvidia environment variables. We need to + make sure these env vars are available in every task and ssh session. + This is needed for GPU support and service discovery. + See https://github.com/skypilot-org/skypilot/issues/2287 for + more details. + + To do so, we capture env vars from the pod's runtime and write them to + /etc/profile.d/, making them available for all users in future + shell sessions. + """ + set_k8s_env_var_cmd = [ + '/bin/sh', '-c', + ('printenv | awk -F "=" \'{print "export " $1 "=\\047" $2 "\\047"}\' > ~/k8s_env_var.sh && ' + 'mv ~/k8s_env_var.sh /etc/profile.d/k8s_env_var.sh || ' + 'sudo mv ~/k8s_env_var.sh /etc/profile.d/k8s_env_var.sh') + ] + + for new_node in new_nodes: + kubernetes.stream()( + kubernetes.core_api().connect_get_namespaced_pod_exec, + new_node.metadata.name, + self.namespace, + command=set_k8s_env_var_cmd, + stderr=True, + stdin=False, + stdout=True, + tty=False, + _request_timeout=kubernetes.API_TIMEOUT) + + def create_node(self, node_config, tags, count): conf = copy.deepcopy(node_config) pod_spec = conf.get('pod', conf) service_spec = conf.get('service') @@ -289,90 +380,9 @@ def is_pod_running(pod) -> bool: self.namespace, service_spec) new_svcs.append(svc) - # Wait for all pods to be scheduled, and if it exceeds the timeout, raise an - # exception. If pod's container is ContainerCreating, then we can assume - # that resources have been allocated and we can exit. - start = time.time() - while True: - if time.time() - start > self.timeout: - try: - self._raise_pod_scheduling_errors(new_nodes) - except config.KubernetesError: - raise - except Exception as e: - raise config.KubernetesError( - 'An error occurred while trying to fetch the reason ' - 'for pod scheduling failure. ' - f'Error: {common_utils.format_exception(e)}') from None - - for node in new_nodes: - pod = kubernetes.core_api().read_namespaced_pod( - node.metadata.name, self.namespace) - if not is_pod_scheduled(pod): - break - else: - # All pods are scheduled - break - time.sleep(1) - - # Wait for pods and their containers to be ready - they may be - # pulling images or may be in the process of container creation. 
- while True: - all_pods_running = True - # Iterate over each pod to check their status - for node in new_nodes: - pod = kubernetes.core_api().read_namespaced_pod( - node.metadata.name, self.namespace) - # Continue if pod and all the containers within the - # pod are succesfully created and running. - if not is_pod_running(pod): - all_pods_running = False - if pod.status.phase == 'Pending': - # Iterate over each container in pod to check their status - for container_status in pod.status.container_statuses: - waiting = container_status.state.waiting - if waiting and container_status.state.waiting.reason == 'ErrImagePull': - if 'rpc error: code = Unknown' in container_status.state.waiting.message: - raise config.KubernetesError( - 'Failed to pull docker image while ' - 'launching the node. Please check ' - 'your network connection. Error details: ' - f'{container_status.state.waiting.message}.' - ) - break - - if all_pods_running: - break - time.sleep(1) - - # Once all containers are ready, we can exec into them and set env vars. - # Kubernetes automatically populates containers with critical - # environment variables, such as those for discovering services running - # in the cluster and CUDA/nvidia environment variables. We need to - # make sure these env vars are available in every task and ssh session. - # This is needed for GPU support and service discovery. - # See https://github.com/skypilot-org/skypilot/issues/2287 for - # more details. - # To do so, we capture env vars from the pod's runtime and write them to - # /etc/profile.d/, making them available for all users in future - # shell sessions. - set_k8s_env_var_cmd = [ - '/bin/sh', '-c', - ('printenv | awk -F "=" \'{print "export " $1 "=\\047" $2 "\\047"}\' > ~/k8s_env_var.sh && ' - 'mv ~/k8s_env_var.sh /etc/profile.d/k8s_env_var.sh || ' - 'sudo mv ~/k8s_env_var.sh /etc/profile.d/k8s_env_var.sh') - ] - for new_node in new_nodes: - kubernetes.stream()( - kubernetes.core_api().connect_get_namespaced_pod_exec, - new_node.metadata.name, - self.namespace, - command=set_k8s_env_var_cmd, - stderr=True, - stdin=False, - stdout=True, - tty=False, - _request_timeout=kubernetes.API_TIMEOUT) + self.wait_for_pods_to_schedule(new_nodes) + self.wait_for_pods_to_run(new_nodes) + self.set_env_vars_in_pods(new_nodes) def terminate_node(self, node_id): logger.info(config.log_prefix + 'calling delete_namespaced_pod') From fc08d50ddb592fec3214d838d08cc8235e627dce Mon Sep 17 00:00:00 2001 From: Doyoung Kim Date: Wed, 27 Sep 2023 05:56:48 +0000 Subject: [PATCH 11/24] nit --- sky/skylet/providers/kubernetes/node_provider.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/sky/skylet/providers/kubernetes/node_provider.py b/sky/skylet/providers/kubernetes/node_provider.py index 5263c726f91..1494d79b17d 100644 --- a/sky/skylet/providers/kubernetes/node_provider.py +++ b/sky/skylet/providers/kubernetes/node_provider.py @@ -380,7 +380,11 @@ def create_node(self, node_config, tags, count): self.namespace, service_spec) new_svcs.append(svc) + # Wait until the pods are scheduled and surface cause for error + # if there is one self.wait_for_pods_to_schedule(new_nodes) + # Wait until the pods and their containers are up and running, and + # fail early if there is an error self.wait_for_pods_to_run(new_nodes) self.set_env_vars_in_pods(new_nodes) From 4039a51471a6ebd67b4da37e58dfe4a651e4830d Mon Sep 17 00:00:00 2001 From: Doyoung Kim Date: Wed, 27 Sep 2023 06:43:37 +0000 Subject: [PATCH 12/24] nit --- .../providers/kubernetes/node_provider.py | 22 
+++++++++++-------- 1 file changed, 13 insertions(+), 9 deletions(-) diff --git a/sky/skylet/providers/kubernetes/node_provider.py b/sky/skylet/providers/kubernetes/node_provider.py index 1494d79b17d..df928a666d0 100644 --- a/sky/skylet/providers/kubernetes/node_provider.py +++ b/sky/skylet/providers/kubernetes/node_provider.py @@ -163,6 +163,11 @@ def _set_node_tags(self, node_id, tags): kubernetes.core_api().patch_namespaced_pod(node_id, self.namespace, pod) def _raise_pod_scheduling_errors(self, new_nodes): + """Raise pod scheduling failure reason. + + The reason for failure to schedule appears as events. Here, we read + the reason and surface it by raising an error. + """ for new_node in new_nodes: pod_status = new_node.status.phase pod_name = new_node._metadata._name @@ -225,7 +230,7 @@ def wait_for_pods_to_schedule(self, new_nodes): Failure to schedule before timeout would cause to raise an error. """ start_time = time.time() - while time.time() - start_time <= self.timeout: + while time.time() - start_time < self.timeout: all_pods_scheduled = True for node in new_nodes: # Iterate over each pod to check their status @@ -289,14 +294,13 @@ def wait_for_pods_to_run(self, new_nodes): # Iterate over each container in pod to check their status for container_status in pod.status.container_statuses: waiting = container_status.state.waiting - if waiting and waiting.reason == 'ErrImagePull': - if 'rpc error: code = Unknown' in waiting.message: - raise config.KubernetesError( - 'Failed to pull docker image while ' - 'launching the node. Please check ' - 'your network connection. Error details: ' - f'{container_status.state.waiting.message}.' - ) + if waiting and (waiting.reason == 'ErrImagePull' or \ + waiting.reason == 'ImagePullBackOff'): + raise config.KubernetesError( + 'Failed to pull docker image while ' + 'launching the node. Please check ' + 'your network connection. Error details: ' + f'{container_status.state.waiting.message}.') # If we reached here, one of the pods had an issue, # so break out of the loop break From 0d7fce554854aaa5611fc99cfae0960976c5dc25 Mon Sep 17 00:00:00 2001 From: Doyoung Kim Date: Wed, 27 Sep 2023 06:47:25 +0000 Subject: [PATCH 13/24] nit --- sky/skylet/providers/kubernetes/node_provider.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/sky/skylet/providers/kubernetes/node_provider.py b/sky/skylet/providers/kubernetes/node_provider.py index df928a666d0..31b6684129d 100644 --- a/sky/skylet/providers/kubernetes/node_provider.py +++ b/sky/skylet/providers/kubernetes/node_provider.py @@ -165,8 +165,9 @@ def _set_node_tags(self, node_id, tags): def _raise_pod_scheduling_errors(self, new_nodes): """Raise pod scheduling failure reason. - The reason for failure to schedule appears as events. Here, we read - the reason and surface it by raising an error. + When a pod fails to schedule in Kubernetes, the reasons for the failure + are recorded as events. This function retrieves those events and raises + descriptive errors for better debugging and user feedback. 
""" for new_node in new_nodes: pod_status = new_node.status.phase From 079caff4c95ef091b1d570e10f2e59bcc2fadaff Mon Sep 17 00:00:00 2001 From: Doyoung Kim Date: Sun, 8 Oct 2023 23:45:59 +0000 Subject: [PATCH 14/24] testing images --- sky/clouds/kubernetes.py | 12 +++++++----- sky/utils/kubernetes/create_cluster.sh | 6 ++++-- 2 files changed, 11 insertions(+), 7 deletions(-) diff --git a/sky/clouds/kubernetes.py b/sky/clouds/kubernetes.py index fede03c9044..d99d46101b9 100644 --- a/sky/clouds/kubernetes.py +++ b/sky/clouds/kubernetes.py @@ -211,12 +211,14 @@ def make_deploy_resources_variables( # so the image_id should start with 'skypilot:'. # In the future we may want to get image_id from the resources object. assert image_id.startswith('skypilot:') - image_id = service_catalog.get_image_id_from_tag(image_id, - clouds='kubernetes') + #image_id = service_catalog.get_image_id_from_tag(image_id, + # clouds='kubernetes') # TODO(romilb): Create a lightweight image for SSH jump host - ssh_jump_image = service_catalog.get_image_id_from_tag( - self.IMAGE_CPU, clouds='kubernetes') - + #ssh_jump_image = service_catalog.get_image_id_from_tag( + # self.IMAGE_CPU, clouds='kubernetes') + image_id = 'us-central1-docker.pkg.dev/skypilot-375900/skypilotk8s-test-doyoung/no-conda:latest' + ssh_jump_image = 'us-central1-docker.pkg.dev/skypilot-375900/skypilotk8s-test-doyoung/no-conda:latest' + k8s_acc_label_key = None k8s_acc_label_value = None diff --git a/sky/utils/kubernetes/create_cluster.sh b/sky/utils/kubernetes/create_cluster.sh index c5b74f6819d..670a9885a44 100755 --- a/sky/utils/kubernetes/create_cluster.sh +++ b/sky/utils/kubernetes/create_cluster.sh @@ -36,8 +36,10 @@ kind create cluster --config /tmp/skypilot-kind.yaml --name skypilot # Load local skypilot image on to the cluster for faster startup echo "Loading local skypilot image on to the cluster" -docker pull us-central1-docker.pkg.dev/skypilot-375900/skypilotk8s/skypilot:latest -kind load docker-image --name skypilot us-central1-docker.pkg.dev/skypilot-375900/skypilotk8s/skypilot:latest +#docker pull us-central1-docker.pkg.dev/skypilot-375900/skypilotk8s/skypilot:latest +docker pull us-central1-docker.pkg.dev/skypilot-375900/skypilotk8s-test-doyoung/no-conda:latest +#kind load docker-image --name skypilot us-central1-docker.pkg.dev/skypilot-375900/skypilotk8s/skypilot:latest +kind load docker-image --name us-central1-docker.pkg.dev/skypilot-375900/skypilotk8s-test-doyoung/no-conda:latest # Print CPUs available on the local cluster NUM_CPUS=$(kubectl get nodes -o jsonpath='{.items[0].status.capacity.cpu}') From a98a6b43d34999714fa06f5a9e60f77f41e090e6 Mon Sep 17 00:00:00 2001 From: Doyoung Kim Date: Wed, 18 Oct 2023 04:40:06 +0000 Subject: [PATCH 15/24] back --- sky/clouds/kubernetes.py | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/sky/clouds/kubernetes.py b/sky/clouds/kubernetes.py index d99d46101b9..8dd1c36897c 100644 --- a/sky/clouds/kubernetes.py +++ b/sky/clouds/kubernetes.py @@ -211,13 +211,11 @@ def make_deploy_resources_variables( # so the image_id should start with 'skypilot:'. # In the future we may want to get image_id from the resources object. 
assert image_id.startswith('skypilot:') - #image_id = service_catalog.get_image_id_from_tag(image_id, - # clouds='kubernetes') + image_id = service_catalog.get_image_id_from_tag(image_id, + clouds='kubernetes') # TODO(romilb): Create a lightweight image for SSH jump host - #ssh_jump_image = service_catalog.get_image_id_from_tag( - # self.IMAGE_CPU, clouds='kubernetes') - image_id = 'us-central1-docker.pkg.dev/skypilot-375900/skypilotk8s-test-doyoung/no-conda:latest' - ssh_jump_image = 'us-central1-docker.pkg.dev/skypilot-375900/skypilotk8s-test-doyoung/no-conda:latest' + ssh_jump_image = service_catalog.get_image_id_from_tag( + self.IMAGE_CPU, clouds='kubernetes') k8s_acc_label_key = None k8s_acc_label_value = None From 324c6e1d4f55b569a995ebcf4ee9c8ff3ef9fc99 Mon Sep 17 00:00:00 2001 From: Doyoung Kim Date: Tue, 7 Nov 2023 01:42:28 +0000 Subject: [PATCH 16/24] format --- sky/clouds/kubernetes.py | 2 +- sky/skylet/providers/kubernetes/node_provider.py | 3 +-- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/sky/clouds/kubernetes.py b/sky/clouds/kubernetes.py index 8dd1c36897c..fede03c9044 100644 --- a/sky/clouds/kubernetes.py +++ b/sky/clouds/kubernetes.py @@ -216,7 +216,7 @@ def make_deploy_resources_variables( # TODO(romilb): Create a lightweight image for SSH jump host ssh_jump_image = service_catalog.get_image_id_from_tag( self.IMAGE_CPU, clouds='kubernetes') - + k8s_acc_label_key = None k8s_acc_label_value = None diff --git a/sky/skylet/providers/kubernetes/node_provider.py b/sky/skylet/providers/kubernetes/node_provider.py index 7490f1e8710..e4122aa4d6a 100644 --- a/sky/skylet/providers/kubernetes/node_provider.py +++ b/sky/skylet/providers/kubernetes/node_provider.py @@ -174,7 +174,7 @@ def _raise_pod_scheduling_errors(self, new_nodes): new_node.metadata.name, self.namespace) pod_status = pod.status.phase # When there are multiple pods involved while launching instance, - # there may be a single pod causing issue while others are + # there may be a single pod causing issue while others successfully # scheduled. In this case, we make sure to not surface the error # message from the pod that is already scheduled. 
if pod_status != 'Pending': @@ -411,7 +411,6 @@ def create_node(self, node_config, tags, count): self._wait_for_pods_to_run(new_nodes) self._set_env_vars_in_pods(new_nodes) - def terminate_node(self, node_id): logger.info(config.log_prefix + 'calling delete_namespaced_pod') try: From 7cd299d801fa0fe346cb5f082cf544064dc8ad09 Mon Sep 17 00:00:00 2001 From: Doyoung Kim Date: Tue, 7 Nov 2023 01:43:28 +0000 Subject: [PATCH 17/24] nit --- sky/skylet/providers/kubernetes/node_provider.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sky/skylet/providers/kubernetes/node_provider.py b/sky/skylet/providers/kubernetes/node_provider.py index e4122aa4d6a..9b1dd5a31c4 100644 --- a/sky/skylet/providers/kubernetes/node_provider.py +++ b/sky/skylet/providers/kubernetes/node_provider.py @@ -306,7 +306,7 @@ def _wait_for_pods_to_run(self, new_nodes): # Iterate over each container in pod to check their status for container_status in pod.status.container_statuses: waiting = container_status.state.waiting - if waiting and (waiting.reason == 'ErrImagePull' or \ + if waiting and (waiting.reason == 'ErrImagePull' or waiting.reason == 'ImagePullBackOff'): raise config.KubernetesError( 'Failed to pull docker image while ' From 67e02eb0a2b3de8e6e977f7100b889447888daac Mon Sep 17 00:00:00 2001 From: Doyoung Kim Date: Tue, 7 Nov 2023 05:08:40 +0000 Subject: [PATCH 18/24] nit --- sky/skylet/providers/kubernetes/node_provider.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sky/skylet/providers/kubernetes/node_provider.py b/sky/skylet/providers/kubernetes/node_provider.py index 9b1dd5a31c4..e61e5b87c78 100644 --- a/sky/skylet/providers/kubernetes/node_provider.py +++ b/sky/skylet/providers/kubernetes/node_provider.py @@ -307,7 +307,7 @@ def _wait_for_pods_to_run(self, new_nodes): for container_status in pod.status.container_statuses: waiting = container_status.state.waiting if waiting and (waiting.reason == 'ErrImagePull' or - waiting.reason == 'ImagePullBackOff'): + waiting.reason == 'ImagePullBackOff'): raise config.KubernetesError( 'Failed to pull docker image while ' 'launching the node. 
Please check ' From 5e92f8a96ca6f543dd66a3b2fc29fa5fb762e31b Mon Sep 17 00:00:00 2001 From: Doyoung Kim Date: Thu, 9 Nov 2023 07:04:33 +0000 Subject: [PATCH 19/24] nit --- sky/utils/kubernetes/create_cluster.sh | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/sky/utils/kubernetes/create_cluster.sh b/sky/utils/kubernetes/create_cluster.sh index 670a9885a44..c5b74f6819d 100755 --- a/sky/utils/kubernetes/create_cluster.sh +++ b/sky/utils/kubernetes/create_cluster.sh @@ -36,10 +36,8 @@ kind create cluster --config /tmp/skypilot-kind.yaml --name skypilot # Load local skypilot image on to the cluster for faster startup echo "Loading local skypilot image on to the cluster" -#docker pull us-central1-docker.pkg.dev/skypilot-375900/skypilotk8s/skypilot:latest -docker pull us-central1-docker.pkg.dev/skypilot-375900/skypilotk8s-test-doyoung/no-conda:latest -#kind load docker-image --name skypilot us-central1-docker.pkg.dev/skypilot-375900/skypilotk8s/skypilot:latest -kind load docker-image --name us-central1-docker.pkg.dev/skypilot-375900/skypilotk8s-test-doyoung/no-conda:latest +docker pull us-central1-docker.pkg.dev/skypilot-375900/skypilotk8s/skypilot:latest +kind load docker-image --name skypilot us-central1-docker.pkg.dev/skypilot-375900/skypilotk8s/skypilot:latest # Print CPUs available on the local cluster NUM_CPUS=$(kubectl get nodes -o jsonpath='{.items[0].status.capacity.cpu}') From 5ec4a648285b51f8c2989a99cedd61e160e75202 Mon Sep 17 00:00:00 2001 From: Doyoung Kim <34902420+landscapepainter@users.noreply.github.com> Date: Wed, 8 Nov 2023 23:05:11 -0800 Subject: [PATCH 20/24] Update sky/skylet/providers/kubernetes/node_provider.py Co-authored-by: Romil Bhardwaj --- sky/skylet/providers/kubernetes/node_provider.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sky/skylet/providers/kubernetes/node_provider.py b/sky/skylet/providers/kubernetes/node_provider.py index e61e5b87c78..8d82dc3677d 100644 --- a/sky/skylet/providers/kubernetes/node_provider.py +++ b/sky/skylet/providers/kubernetes/node_provider.py @@ -262,7 +262,7 @@ def _wait_for_pods_to_schedule(self, new_nodes): # See list of possible reasons for waiting here: # https://stackoverflow.com/a/57886025 waiting = container_status.state.waiting - if waiting and waiting.reason != 'ContainerCreating': + if waiting is not None and waiting.reason != 'ContainerCreating': all_pods_scheduled = False break From a364581b9d0db01e5bcc935c17a60a1b351857fb Mon Sep 17 00:00:00 2001 From: Doyoung Kim Date: Thu, 9 Nov 2023 07:07:47 +0000 Subject: [PATCH 21/24] nit --- sky/skylet/providers/kubernetes/node_provider.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/sky/skylet/providers/kubernetes/node_provider.py b/sky/skylet/providers/kubernetes/node_provider.py index 8d82dc3677d..95fd57a4928 100644 --- a/sky/skylet/providers/kubernetes/node_provider.py +++ b/sky/skylet/providers/kubernetes/node_provider.py @@ -174,9 +174,9 @@ def _raise_pod_scheduling_errors(self, new_nodes): new_node.metadata.name, self.namespace) pod_status = pod.status.phase # When there are multiple pods involved while launching instance, - # there may be a single pod causing issue while others successfully - # scheduled. In this case, we make sure to not surface the error - # message from the pod that is already scheduled. + # there may be a single pod causing issue while others are + # successfully scheduled. In this case, we make sure to not surface + # the error message from the pod that is already scheduled. 
if pod_status != 'Pending': continue pod_name = pod._metadata._name @@ -251,7 +251,7 @@ def _wait_for_pods_to_schedule(self, new_nodes): if pod.status.phase == 'Pending': # If container_statuses is None, then the pod hasn't # been scheduled yet. - if not pod.status.container_statuses: + if pod.status.container_statuses is None: all_pods_scheduled = False break @@ -309,7 +309,7 @@ def _wait_for_pods_to_run(self, new_nodes): if waiting and (waiting.reason == 'ErrImagePull' or waiting.reason == 'ImagePullBackOff'): raise config.KubernetesError( - 'Failed to pull docker image while ' + 'Failed to pull container image while ' 'launching the node. Please check ' 'your network connection. Error details: ' f'{container_status.state.waiting.message}.') From f590ace291ca7ba2dbef4b4aa10bf7222f74280a Mon Sep 17 00:00:00 2001 From: Doyoung Kim Date: Sun, 12 Nov 2023 05:45:37 +0000 Subject: [PATCH 22/24] update with more waiting.reason list --- .../providers/kubernetes/node_provider.py | 26 ++++++++++++------- 1 file changed, 17 insertions(+), 9 deletions(-) diff --git a/sky/skylet/providers/kubernetes/node_provider.py b/sky/skylet/providers/kubernetes/node_provider.py index 95fd57a4928..ca164955bd9 100644 --- a/sky/skylet/providers/kubernetes/node_provider.py +++ b/sky/skylet/providers/kubernetes/node_provider.py @@ -222,8 +222,10 @@ def _raise_pod_scheduling_errors(self, new_nodes): # TODO(romilb): We may have additional node # affinity selectors in the future - in that # case we will need to update this logic. - if 'Insufficient nvidia.com/gpu' in event_message or \ - 'didn\'t match Pod\'s node affinity/selector' in event_message: + if ('Insufficient nvidia.com/gpu' + in event_message or + 'didn\'t match Pod\'s node affinity/selector' + in event_message): raise config.KubernetesError( f'{lack_resource_msg.format(resource="GPU")} ' f'Verify if {pod.spec.node_selector[label_key]}' @@ -236,7 +238,7 @@ def _raise_pod_scheduling_errors(self, new_nodes): def _wait_for_pods_to_schedule(self, new_nodes): """Wait for all pods to be scheduled. - Wait for all pods including jump pod to be ready, and if it + Wait for all pods including jump pod to be scheduled, and if it exceeds the timeout, raise an exception. If pod's container is ContainerCreating, then we can assume that resources have been allocated and we can exit. @@ -296,9 +298,10 @@ def _wait_for_pods_to_run(self, new_nodes): # Continue if pod and all the containers within the # pod are succesfully created and running. - if pod.status.phase == 'Running' \ - and all([container.state.running - for container in pod.status.container_statuses]): + if pod.status.phase == 'Running' and all([ + container.state.running + for container in pod.status.container_statuses + ]): continue all_pods_running = False @@ -306,14 +309,19 @@ def _wait_for_pods_to_run(self, new_nodes): # Iterate over each container in pod to check their status for container_status in pod.status.container_statuses: waiting = container_status.state.waiting - if waiting and (waiting.reason == 'ErrImagePull' or - waiting.reason == 'ImagePullBackOff'): + if waiting and ( + waiting.reason == 'ErrImagePull' or + waiting.reason == 'ImagePullBackOff' or + waiting.reason == 'CrashLoopBackOff' or + waiting.reason == 'CreateContainerConfigError' + or waiting.reason == 'InvalidImageName' or + waiting.reason == 'CreateContainerError'): raise config.KubernetesError( 'Failed to pull container image while ' 'launching the node. Please check ' 'your network connection. 
Error details: ' f'{container_status.state.waiting.message}.') - # If we reached here, one of the pods had an issue, + # Reaching this point means that one of the pods had an issue, # so break out of the loop break From 87d41cf3c883565a4898271dc4125578f084073d Mon Sep 17 00:00:00 2001 From: Doyoung Kim Date: Sun, 12 Nov 2023 06:52:23 +0000 Subject: [PATCH 23/24] update waiting check locaion --- .../providers/kubernetes/node_provider.py | 29 +++++-------------- 1 file changed, 8 insertions(+), 21 deletions(-) diff --git a/sky/skylet/providers/kubernetes/node_provider.py b/sky/skylet/providers/kubernetes/node_provider.py index ca164955bd9..7e8096a4795 100644 --- a/sky/skylet/providers/kubernetes/node_provider.py +++ b/sky/skylet/providers/kubernetes/node_provider.py @@ -257,17 +257,6 @@ def _wait_for_pods_to_schedule(self, new_nodes): all_pods_scheduled = False break - for container_status in pod.status.container_statuses: - # If the container wasn't in 'ContainerCreating' - # state, then we know pod wasn't scheduled or - # had some other error, such as image pull error. - # See list of possible reasons for waiting here: - # https://stackoverflow.com/a/57886025 - waiting = container_status.state.waiting - if waiting is not None and waiting.reason != 'ContainerCreating': - all_pods_scheduled = False - break - if all_pods_scheduled: return time.sleep(1) @@ -308,18 +297,16 @@ def _wait_for_pods_to_run(self, new_nodes): if pod.status.phase == 'Pending': # Iterate over each container in pod to check their status for container_status in pod.status.container_statuses: + # If the container wasn't in 'ContainerCreating' + # state, then we know pod wasn't scheduled or + # had some other error, such as image pull error. + # See list of possible reasons for waiting here: + # https://stackoverflow.com/a/57886025 waiting = container_status.state.waiting - if waiting and ( - waiting.reason == 'ErrImagePull' or - waiting.reason == 'ImagePullBackOff' or - waiting.reason == 'CrashLoopBackOff' or - waiting.reason == 'CreateContainerConfigError' - or waiting.reason == 'InvalidImageName' or - waiting.reason == 'CreateContainerError'): + if waiting is not None and waiting.reason != 'ContainerCreating': raise config.KubernetesError( - 'Failed to pull container image while ' - 'launching the node. Please check ' - 'your network connection. Error details: ' + 'Failed to create container while launcing ' + 'the node. Error details: ' f'{container_status.state.waiting.message}.') # Reaching this point means that one of the pods had an issue, # so break out of the loop From c13199015f6497fc75505d00159571d59fe21235 Mon Sep 17 00:00:00 2001 From: Doyoung Kim Date: Thu, 16 Nov 2023 17:04:49 +0000 Subject: [PATCH 24/24] nit --- sky/skylet/providers/kubernetes/node_provider.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sky/skylet/providers/kubernetes/node_provider.py b/sky/skylet/providers/kubernetes/node_provider.py index 7e8096a4795..e2174100041 100644 --- a/sky/skylet/providers/kubernetes/node_provider.py +++ b/sky/skylet/providers/kubernetes/node_provider.py @@ -305,7 +305,7 @@ def _wait_for_pods_to_run(self, new_nodes): waiting = container_status.state.waiting if waiting is not None and waiting.reason != 'ContainerCreating': raise config.KubernetesError( - 'Failed to create container while launcing ' + 'Failed to create container while launching ' 'the node. Error details: ' f'{container_status.state.waiting.message}.') # Reaching this point means that one of the pods had an issue,
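
Taken together, patches 22-24 leave create_node with a two-phase wait: block until every pod is scheduled (raising the recorded FailedScheduling events on timeout), then block until every container is Running, treating any waiting reason other than ContainerCreating (ErrImagePull, ImagePullBackOff, CreateContainerConfigError, and so on) as a launch failure instead of hanging. What follows is a minimal standalone sketch of that pattern, written against the official kubernetes Python client rather than the provider's internals; PodLaunchError, wait_for_pods, and the pod names, namespace, and timeout below are illustrative placeholders, not the provider's actual API.

import time

from kubernetes import client as k8s_client, config as k8s_config


class PodLaunchError(Exception):
    """Stand-in for the provider's internal config.KubernetesError."""


def _is_scheduled(pod) -> bool:
    # A pod counts as scheduled once it has left Pending, or once the API
    # starts reporting container statuses (they stay None until the pod is
    # assigned to a node).
    if pod.status.phase != 'Pending':
        return True
    return pod.status.container_statuses is not None


def wait_for_pods(pod_names, namespace='default', timeout=30):
    k8s_config.load_kube_config()
    core_api = k8s_client.CoreV1Api()

    # Phase 1: wait for scheduling, raising on timeout. (The real provider
    # additionally reads the pod's FailedScheduling events to explain why.)
    start = time.time()
    while time.time() - start < timeout:
        pods = [core_api.read_namespaced_pod(n, namespace) for n in pod_names]
        if all(_is_scheduled(p) for p in pods):
            break
        time.sleep(1)
    else:
        raise PodLaunchError(f'Timed out waiting for pods to schedule: {pod_names}')

    # Phase 2: wait until every container is Running, surfacing any waiting
    # reason other than ContainerCreating (ErrImagePull, ImagePullBackOff,
    # CreateContainerConfigError, ...) as an error instead of hanging.
    while True:
        all_running = True
        for name in pod_names:
            pod = core_api.read_namespaced_pod(name, namespace)
            statuses = pod.status.container_statuses or []
            if pod.status.phase == 'Running' and all(
                    s.state.running for s in statuses):
                continue
            all_running = False
            for s in statuses:
                waiting = s.state.waiting
                if waiting is not None and waiting.reason != 'ContainerCreating':
                    raise PodLaunchError(
                        f'Container {s.name!r} in pod {name!r} failed to '
                        f'start: {waiting.reason}: {waiting.message}')
        if all_running:
            return
        time.sleep(1)

Checking the waiting reason in the post-scheduling loop, rather than in the scheduling loop as earlier revisions in this series did, is what lets image-pull and container-config failures fail fast with a readable message instead of running out the timeout.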