From 4ece0273f1504f804c479bdf2d55ca84dc87456d Mon Sep 17 00:00:00 2001 From: Romil Bhardwaj Date: Thu, 5 Dec 2024 02:06:13 +0530 Subject: [PATCH 1/6] Add limits only if they exist --- sky/templates/kubernetes-ray.yml.j2 | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/sky/templates/kubernetes-ray.yml.j2 b/sky/templates/kubernetes-ray.yml.j2 index e572b263924..7fcaf64a69a 100644 --- a/sky/templates/kubernetes-ray.yml.j2 +++ b/sky/templates/kubernetes-ray.yml.j2 @@ -560,6 +560,7 @@ available_node_types: # https://gitlab.com/arm-research/smarter/smarter-device-manager smarter-devices/fuse: "1" {% endif %} + {% if k8s_resource_key is not none or k8s_fuse_device_required %} limits: # Limits need to be defined for GPU/TPU requests {% if k8s_resource_key is not none %} @@ -568,7 +569,8 @@ available_node_types: {% if k8s_fuse_device_required %} smarter-devices/fuse: "1" {% endif %} - + {% endif %} + setup_commands: # Disable `unattended-upgrades` to prevent apt-get from hanging. It should be called at the beginning before the process started to avoid being blocked. (This is a temporary fix.) # Create ~/.ssh/config file in case the file does not exist in the image. From c9e607afb24cb60b25a161f3ffdee286b16b7ac5 Mon Sep 17 00:00:00 2001 From: Romil Bhardwaj Date: Thu, 12 Dec 2024 16:00:23 -0800 Subject: [PATCH 2/6] retry deletion --- sky/provision/kubernetes/instance.py | 70 ++++++++++++++++++++-------- 1 file changed, 50 insertions(+), 20 deletions(-) diff --git a/sky/provision/kubernetes/instance.py b/sky/provision/kubernetes/instance.py index 2b13e78fdf8..4b4a6bbf0d6 100644 --- a/sky/provision/kubernetes/instance.py +++ b/sky/provision/kubernetes/instance.py @@ -875,31 +875,61 @@ def stop_instances( raise NotImplementedError() -def _terminate_node(namespace: str, context: Optional[str], - pod_name: str) -> None: +def _terminate_node(namespace: str, context: Optional[str], pod_name: str) -> None: """Terminate a pod.""" logger.debug('terminate_instances: calling delete_namespaced_pod') - try: - kubernetes.core_api(context).delete_namespaced_service( - pod_name, namespace, _request_timeout=config_lib.DELETION_TIMEOUT) - kubernetes.core_api(context).delete_namespaced_service( - f'{pod_name}-ssh', - namespace, - _request_timeout=config_lib.DELETION_TIMEOUT) - except kubernetes.api_exception(): - pass + + def _delete_k8s_resource(delete_func: Callable, resource_type: str, + resource_name: str) -> None: + """Helper to delete Kubernetes resources with 404 handling and retries. + + Args: + delete_func: Function to call to delete the resource + resource_type: Type of resource being deleted (e.g. 'service'), + used in logging + resource_name: Name of the resource being deleted, used in logging + """ + max_retries = 3 + retry_delay = 5 # seconds + + for attempt in range(max_retries): + try: + delete_func() + return + except kubernetes.api_exception() as e: + if e.status == 404: + logger.warning( + f'terminate_instances: Tried to delete {resource_type} ' + f'{resource_name}, but the {resource_type} was not ' + 'found (404).') + return + elif attempt < max_retries - 1: + logger.warning( + f'terminate_instances: Failed to delete ' + f'{resource_type} {resource_name} (attempt ' + f'{attempt + 1}/{max_retries}). Error: {e}. ' + f'Retrying in {retry_delay} seconds...') + time.sleep(retry_delay) + else: + raise + + # Delete services for the pod + for service_name in [pod_name, f'{pod_name}-ssh']: + _delete_k8s_resource( + lambda: kubernetes.core_api(context).delete_namespaced_service( + service_name, namespace, + _request_timeout=config_lib.DELETION_TIMEOUT), + 'service', + service_name) + # Note - delete pod after all other resources are deleted. # This is to ensure there are no leftover resources if this down is run # from within the pod, e.g., for autodown. - try: - kubernetes.core_api(context).delete_namespaced_pod( - pod_name, namespace, _request_timeout=config_lib.DELETION_TIMEOUT) - except kubernetes.api_exception() as e: - if e.status == 404: - logger.warning('terminate_instances: Tried to delete pod ' - f'{pod_name}, but the pod was not found (404).') - else: - raise + _delete_k8s_resource( + lambda: kubernetes.core_api(context).delete_namespaced_pod( + pod_name, namespace, _request_timeout=config_lib.DELETION_TIMEOUT), + 'pod', + pod_name) def terminate_instances( From b2ec39ea9ced844af6b7f78798b13a9a0260f243 Mon Sep 17 00:00:00 2001 From: Romil Bhardwaj Date: Thu, 12 Dec 2024 16:05:12 -0800 Subject: [PATCH 3/6] lint --- sky/provision/kubernetes/instance.py | 32 ++++++++++++++-------------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/sky/provision/kubernetes/instance.py b/sky/provision/kubernetes/instance.py index 4b4a6bbf0d6..6a77807b95c 100644 --- a/sky/provision/kubernetes/instance.py +++ b/sky/provision/kubernetes/instance.py @@ -875,14 +875,15 @@ def stop_instances( raise NotImplementedError() -def _terminate_node(namespace: str, context: Optional[str], pod_name: str) -> None: +def _terminate_node(namespace: str, context: Optional[str], + pod_name: str) -> None: """Terminate a pod.""" logger.debug('terminate_instances: calling delete_namespaced_pod') - + def _delete_k8s_resource(delete_func: Callable, resource_type: str, - resource_name: str) -> None: + resource_name: str) -> None: """Helper to delete Kubernetes resources with 404 handling and retries. - + Args: delete_func: Function to call to delete the resource resource_type: Type of resource being deleted (e.g. 'service'), @@ -891,7 +892,7 @@ def _delete_k8s_resource(delete_func: Callable, resource_type: str, """ max_retries = 3 retry_delay = 5 # seconds - + for attempt in range(max_retries): try: delete_func() @@ -904,11 +905,10 @@ def _delete_k8s_resource(delete_func: Callable, resource_type: str, 'found (404).') return elif attempt < max_retries - 1: - logger.warning( - f'terminate_instances: Failed to delete ' - f'{resource_type} {resource_name} (attempt ' - f'{attempt + 1}/{max_retries}). Error: {e}. ' - f'Retrying in {retry_delay} seconds...') + logger.warning(f'terminate_instances: Failed to delete ' + f'{resource_type} {resource_name} (attempt ' + f'{attempt + 1}/{max_retries}). Error: {e}. ' + f'Retrying in {retry_delay} seconds...') time.sleep(retry_delay) else: raise @@ -916,10 +916,11 @@ def _delete_k8s_resource(delete_func: Callable, resource_type: str, # Delete services for the pod for service_name in [pod_name, f'{pod_name}-ssh']: _delete_k8s_resource( - lambda: kubernetes.core_api(context).delete_namespaced_service( - service_name, namespace, - _request_timeout=config_lib.DELETION_TIMEOUT), - 'service', + lambda name=service_name: kubernetes.core_api(context). + delete_namespaced_service( + name, + namespace, + _request_timeout=config_lib.DELETION_TIMEOUT), 'service', service_name) # Note - delete pod after all other resources are deleted. @@ -928,8 +929,7 @@ def _delete_k8s_resource(delete_func: Callable, resource_type: str, _delete_k8s_resource( lambda: kubernetes.core_api(context).delete_namespaced_pod( pod_name, namespace, _request_timeout=config_lib.DELETION_TIMEOUT), - 'pod', - pod_name) + 'pod', pod_name) def terminate_instances( From d3b5247db7a21c025684390aa16288f285035347 Mon Sep 17 00:00:00 2001 From: Romil Bhardwaj Date: Thu, 12 Dec 2024 16:05:37 -0800 Subject: [PATCH 4/6] lint --- sky/provision/kubernetes/instance.py | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/sky/provision/kubernetes/instance.py b/sky/provision/kubernetes/instance.py index 6a77807b95c..53a9bb3f99e 100644 --- a/sky/provision/kubernetes/instance.py +++ b/sky/provision/kubernetes/instance.py @@ -886,7 +886,7 @@ def _delete_k8s_resource(delete_func: Callable, resource_type: str, Args: delete_func: Function to call to delete the resource - resource_type: Type of resource being deleted (e.g. 'service'), + resource_type: Type of resource being deleted (e.g. 'service'), used in logging resource_name: Name of the resource being deleted, used in logging """ @@ -915,13 +915,11 @@ def _delete_k8s_resource(delete_func: Callable, resource_type: str, # Delete services for the pod for service_name in [pod_name, f'{pod_name}-ssh']: - _delete_k8s_resource( - lambda name=service_name: kubernetes.core_api(context). - delete_namespaced_service( - name, - namespace, - _request_timeout=config_lib.DELETION_TIMEOUT), 'service', - service_name) + _delete_k8s_resource(lambda name=service_name: kubernetes.core_api( + context).delete_namespaced_service( + name, namespace, _request_timeout=config_lib.DELETION_TIMEOUT), + 'service', + service_name) # Note - delete pod after all other resources are deleted. # This is to ensure there are no leftover resources if this down is run From 69f9256907cba76806198093b9abdcdcb2d690ff Mon Sep 17 00:00:00 2001 From: Romil Bhardwaj Date: Thu, 12 Dec 2024 18:16:40 -0800 Subject: [PATCH 5/6] comments --- sky/provision/kubernetes/instance.py | 29 +++++++++++++++++----------- 1 file changed, 18 insertions(+), 11 deletions(-) diff --git a/sky/provision/kubernetes/instance.py b/sky/provision/kubernetes/instance.py index 53a9bb3f99e..b6d1c02fc61 100644 --- a/sky/provision/kubernetes/instance.py +++ b/sky/provision/kubernetes/instance.py @@ -880,8 +880,9 @@ def _terminate_node(namespace: str, context: Optional[str], """Terminate a pod.""" logger.debug('terminate_instances: calling delete_namespaced_pod') - def _delete_k8s_resource(delete_func: Callable, resource_type: str, - resource_name: str) -> None: + def _delete_k8s_resource_with_retry(delete_func: Callable, + resource_type: str, + resource_name: str) -> None: """Helper to delete Kubernetes resources with 404 handling and retries. Args: @@ -915,19 +916,25 @@ def _delete_k8s_resource(delete_func: Callable, resource_type: str, # Delete services for the pod for service_name in [pod_name, f'{pod_name}-ssh']: - _delete_k8s_resource(lambda name=service_name: kubernetes.core_api( - context).delete_namespaced_service( - name, namespace, _request_timeout=config_lib.DELETION_TIMEOUT), - 'service', - service_name) + _delete_k8s_resource_with_retry( + delete_func=lambda name=service_name: kubernetes.core_api( + context).delete_namespaced_service( + name=name, + namespace=namespace, + _request_timeout=config_lib.DELETION_TIMEOUT), + resource_type='service', + resource_name=service_name) # Note - delete pod after all other resources are deleted. # This is to ensure there are no leftover resources if this down is run # from within the pod, e.g., for autodown. - _delete_k8s_resource( - lambda: kubernetes.core_api(context).delete_namespaced_pod( - pod_name, namespace, _request_timeout=config_lib.DELETION_TIMEOUT), - 'pod', pod_name) + _delete_k8s_resource_with_retry( + delete_func=lambda: kubernetes.core_api(context).delete_namespaced_pod( + name=pod_name, + namespace=namespace, + _request_timeout=config_lib.DELETION_TIMEOUT), + resource_type='pod', + resource_name=pod_name) def terminate_instances( From 4d07e46a51bf501a0e2c18e962b3450224d77163 Mon Sep 17 00:00:00 2001 From: Romil Bhardwaj Date: Thu, 12 Dec 2024 18:20:45 -0800 Subject: [PATCH 6/6] lint --- sky/provision/kubernetes/instance.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/sky/provision/kubernetes/instance.py b/sky/provision/kubernetes/instance.py index b6d1c02fc61..11e3d2d80ad 100644 --- a/sky/provision/kubernetes/instance.py +++ b/sky/provision/kubernetes/instance.py @@ -918,10 +918,10 @@ def _delete_k8s_resource_with_retry(delete_func: Callable, for service_name in [pod_name, f'{pod_name}-ssh']: _delete_k8s_resource_with_retry( delete_func=lambda name=service_name: kubernetes.core_api( - context).delete_namespaced_service( - name=name, - namespace=namespace, - _request_timeout=config_lib.DELETION_TIMEOUT), + context).delete_namespaced_service(name=name, + namespace=namespace, + _request_timeout=config_lib. + DELETION_TIMEOUT), resource_type='service', resource_name=service_name)