From 7048b0db368c82374e01008dbd90879569495dc2 Mon Sep 17 00:00:00 2001 From: Romil Bhardwaj Date: Thu, 18 Jul 2024 09:13:42 -0700 Subject: [PATCH 1/5] minor fix --- llm/axolotl/axolotl-spot.yaml | 2 -- 1 file changed, 2 deletions(-) diff --git a/llm/axolotl/axolotl-spot.yaml b/llm/axolotl/axolotl-spot.yaml index 4832fa72c04..b22a8ae3fce 100644 --- a/llm/axolotl/axolotl-spot.yaml +++ b/llm/axolotl/axolotl-spot.yaml @@ -29,5 +29,3 @@ run: | envs: HF_TOKEN: # TODO: Fill with your own huggingface token, or use --env to pass. BUCKET: # TODO: Fill with your unique bucket name, or use --env to pass. - -4 From d1b275e5c3dd3dbe7236e0d1a03ac4470da6eba3 Mon Sep 17 00:00:00 2001 From: Romil Bhardwaj Date: Thu, 18 Jul 2024 11:51:17 -0700 Subject: [PATCH 2/5] init container fix --- sky/provision/kubernetes/instance.py | 34 ++++++++++++++++++++++++---- 1 file changed, 30 insertions(+), 4 deletions(-) diff --git a/sky/provision/kubernetes/instance.py b/sky/provision/kubernetes/instance.py index 7668c7348aa..15f2162db71 100644 --- a/sky/provision/kubernetes/instance.py +++ b/sky/provision/kubernetes/instance.py @@ -222,6 +222,28 @@ def _wait_for_pods_to_run(namespace, new_nodes): Pods may be pulling images or may be in the process of container creation. """ + + def _check_init_containers(pod: 'kubernetes.client.V1Pod'): + # Check if any of the init containers failed + # to start. Could be because the init container + # command failed or failed to pull image etc. + for init_status in pod.status.init_container_statuses: + init_terminated = init_status.state.terminated + if init_terminated and init_terminated.exit_code != 0: + msg = init_terminated.message if ( + init_terminated.message) else str(init_terminated) + raise config_lib.KubernetesError( + f'Failed to run init container for pod {pod.metadata.name}.' + f' Error details: {msg}.') + init_waiting = init_status.state.waiting + if (init_waiting is not None and init_waiting.reason + not in ['ContainerCreating', 'PodInitializing']): + msg = init_waiting.message if ( + init_waiting.message) else str(init_waiting) + raise config_lib.KubernetesError( + 'Failed to create init container for pod ' + f'{pod.metadata.name}. Error details: {msg}.') + while True: all_pods_running = True # Iterate over each pod to check their status @@ -248,10 +270,14 @@ def _wait_for_pods_to_run(namespace, new_nodes): waiting = container_status.state.waiting if (waiting is not None and waiting.reason != 'ContainerCreating'): - raise config_lib.KubernetesError( - 'Failed to create container while launching ' - 'the node. Error details: ' - f'{container_status.state.waiting.message}.') + if waiting.reason == 'PodInitializing': + _check_init_containers(pod) + else: + msg = waiting.message if waiting.message else str( + waiting) + raise config_lib.KubernetesError( + 'Failed to create container while launching ' + f'the node. Error details: {msg}.') # Reaching this point means that one of the pods had an issue, # so break out of the loop, and wait until next second. break From 727a3cb7d10d2a4e43ab7cb8869e4d4376592bef Mon Sep 17 00:00:00 2001 From: Romil Bhardwaj Date: Thu, 18 Jul 2024 12:02:44 -0700 Subject: [PATCH 3/5] init container fix --- sky/provision/kubernetes/instance.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sky/provision/kubernetes/instance.py b/sky/provision/kubernetes/instance.py index 15f2162db71..9fc233c3153 100644 --- a/sky/provision/kubernetes/instance.py +++ b/sky/provision/kubernetes/instance.py @@ -223,7 +223,7 @@ def _wait_for_pods_to_run(namespace, new_nodes): creation. """ - def _check_init_containers(pod: 'kubernetes.client.V1Pod'): + def _check_init_containers(pod): # Check if any of the init containers failed # to start. Could be because the init container # command failed or failed to pull image etc. From fd66d0325a560d20d95f438c8b244d5f04801f05 Mon Sep 17 00:00:00 2001 From: Romil Bhardwaj Date: Wed, 24 Jul 2024 15:08:41 -0700 Subject: [PATCH 4/5] fix comments --- sky/provision/kubernetes/instance.py | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/sky/provision/kubernetes/instance.py b/sky/provision/kubernetes/instance.py index 9fc233c3153..93494761984 100644 --- a/sky/provision/kubernetes/instance.py +++ b/sky/provision/kubernetes/instance.py @@ -229,12 +229,14 @@ def _check_init_containers(pod): # command failed or failed to pull image etc. for init_status in pod.status.init_container_statuses: init_terminated = init_status.state.terminated - if init_terminated and init_terminated.exit_code != 0: - msg = init_terminated.message if ( - init_terminated.message) else str(init_terminated) - raise config_lib.KubernetesError( - f'Failed to run init container for pod {pod.metadata.name}.' - f' Error details: {msg}.') + if init_terminated: + if init_terminated.exit_code != 0: + msg = init_terminated.message if ( + init_terminated.message) else str(init_terminated) + raise config_lib.KubernetesError( + 'Failed to run init container for pod ' + f'{pod.metadata.name}. Error details: {msg}.') + return init_waiting = init_status.state.waiting if (init_waiting is not None and init_waiting.reason not in ['ContainerCreating', 'PodInitializing']): @@ -268,11 +270,10 @@ def _check_init_containers(pod): # See list of possible reasons for waiting here: # https://stackoverflow.com/a/57886025 waiting = container_status.state.waiting - if (waiting is not None and - waiting.reason != 'ContainerCreating'): + if waiting is not None: if waiting.reason == 'PodInitializing': _check_init_containers(pod) - else: + elif waiting.reason != 'ContainerCreating': msg = waiting.message if waiting.message else str( waiting) raise config_lib.KubernetesError( From 0d45a112a60312b1dd498b21b1c0511802417443 Mon Sep 17 00:00:00 2001 From: Romil Bhardwaj Date: Wed, 24 Jul 2024 15:42:29 -0700 Subject: [PATCH 5/5] fix comments --- sky/provision/kubernetes/instance.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/sky/provision/kubernetes/instance.py b/sky/provision/kubernetes/instance.py index 93494761984..a5996abe028 100644 --- a/sky/provision/kubernetes/instance.py +++ b/sky/provision/kubernetes/instance.py @@ -236,10 +236,12 @@ def _check_init_containers(pod): raise config_lib.KubernetesError( 'Failed to run init container for pod ' f'{pod.metadata.name}. Error details: {msg}.') - return + continue init_waiting = init_status.state.waiting if (init_waiting is not None and init_waiting.reason not in ['ContainerCreating', 'PodInitializing']): + # TODO(romilb): There may be more states to check for. Add + # them as needed. msg = init_waiting.message if ( init_waiting.message) else str(init_waiting) raise config_lib.KubernetesError(