From f74c9dfa55d31438ae657b3a2ecc599734e0e9f9 Mon Sep 17 00:00:00 2001 From: Romil Bhardwaj Date: Fri, 15 Sep 2023 14:47:11 -0700 Subject: [PATCH] ssh jump refactor --- sky/authentication.py | 2 +- sky/clouds/kubernetes.py | 8 +- sky/skylet/providers/kubernetes/config.py | 8 +- .../providers/kubernetes/node_provider.py | 2 +- sky/templates/kubernetes-ray.yml.j2 | 4 +- ...jump.yml.j2 => kubernetes-ssh-jump.yml.j2} | 180 +++++++------- ...p_lcm.py => ssh_jump_lifecycle_manager.py} | 222 +++++++++--------- sky/utils/kubernetes_utils.py | 66 +++--- 8 files changed, 246 insertions(+), 246 deletions(-) rename sky/templates/{kubernetes-sshjump.yml.j2 => kubernetes-ssh-jump.yml.j2} (88%) rename sky/utils/kubernetes/{sshjump_lcm.py => ssh_jump_lifecycle_manager.py} (90%) diff --git a/sky/authentication.py b/sky/authentication.py index c8523fdb0d4..88930cbf688 100644 --- a/sky/authentication.py +++ b/sky/authentication.py @@ -435,7 +435,7 @@ def setup_kubernetes_authentication(config: Dict[str, Any]) -> Dict[str, Any]: # because we need to know the service IP address and port to set the # ssh_proxy_command in the autoscaler config. namespace = kubernetes_utils.get_current_kube_config_context_namespace() - kubernetes_utils.setup_sshjump_svc(ssh_jump_name, namespace, service_type) + kubernetes_utils.setup_ssh_jump_svc(ssh_jump_name, namespace, service_type) ssh_proxy_cmd = kubernetes_utils.get_ssh_proxy_command( PRIVATE_SSH_KEY_PATH, ssh_jump_name, network_mode, namespace, diff --git a/sky/clouds/kubernetes.py b/sky/clouds/kubernetes.py index d308d7c4c3d..3651d309282 100644 --- a/sky/clouds/kubernetes.py +++ b/sky/clouds/kubernetes.py @@ -28,7 +28,7 @@ class Kubernetes(clouds.Cloud): """Kubernetes.""" SKY_SSH_KEY_SECRET_NAME = f'sky-ssh-{common_utils.get_user_hash()}' - SKY_SSH_JUMP_NAME = f'sky-sshjump-{common_utils.get_user_hash()}' + SKY_SSH_JUMP_NAME = f'sky-ssh-jump-{common_utils.get_user_hash()}' PORT_FORWARD_PROXY_CMD_TEMPLATE = \ 'kubernetes-port-forward-proxy-command.sh.j2' PORT_FORWARD_PROXY_CMD_PATH = '~/.sky/port-forward-proxy-cmd.sh' @@ -213,7 +213,7 @@ def make_deploy_resources_variables( image_id = service_catalog.get_image_id_from_tag(image_id, clouds='kubernetes') # TODO(romilb): Create a lightweight image for SSH jump host - sshjump_image = service_catalog.get_image_id_from_tag(self.IMAGE_CPU, + ssh_jump_image = service_catalog.get_image_id_from_tag(self.IMAGE_CPU, clouds='kubernetes') k8s_acc_label_key = None @@ -235,8 +235,8 @@ def make_deploy_resources_variables( 'k8s_ssh_key_secret_name': self.SKY_SSH_KEY_SECRET_NAME, 'k8s_acc_label_key': k8s_acc_label_key, 'k8s_acc_label_value': k8s_acc_label_value, - 'k8s_sshjump_name': self.SKY_SSH_JUMP_NAME, - 'k8s_sshjump_image': sshjump_image, + 'k8s_ssh_jump_name': self.SKY_SSH_JUMP_NAME, + 'k8s_ssh_jump_image': ssh_jump_image, # TODO(romilb): Allow user to specify custom images 'image_id': image_id, } diff --git a/sky/skylet/providers/kubernetes/config.py b/sky/skylet/providers/kubernetes/config.py index 92ca5d0c13c..2ab466f03ed 100644 --- a/sky/skylet/providers/kubernetes/config.py +++ b/sky/skylet/providers/kubernetes/config.py @@ -267,8 +267,8 @@ def _configure_ssh_jump(namespace, config): """ pod_cfg = config['available_node_types']['ray_head_default']['node_config'] - sshjump_name = pod_cfg['metadata']['labels']['skypilot-sshjump'] - sshjump_image = config['provider']['sshjump_image'] + ssh_jump_name = pod_cfg['metadata']['labels']['skypilot-ssh-jump'] + ssh_jump_image = config['provider']['ssh_jump_image'] volumes = 
pod_cfg['spec']['volumes'] # find 'secret-volume' and get the secret name @@ -288,8 +288,8 @@ def _configure_ssh_jump(namespace, config): # and available before we create the SSH jump pod. If for any reason the # service is missing, we should raise an error. - kubernetes_utils.setup_sshjump_pod(sshjump_name, sshjump_image, - ssh_key_secret_name, namespace) + kubernetes_utils.setup_ssh_jump_pod(ssh_jump_name, ssh_jump_image, + ssh_key_secret_name, namespace) return config diff --git a/sky/skylet/providers/kubernetes/node_provider.py b/sky/skylet/providers/kubernetes/node_provider.py index b4496855006..8963225cc3f 100644 --- a/sky/skylet/providers/kubernetes/node_provider.py +++ b/sky/skylet/providers/kubernetes/node_provider.py @@ -347,7 +347,7 @@ def create_node(self, node_config, tags, count): def terminate_node(self, node_id): logger.info(config.log_prefix + 'calling delete_namespaced_pod') try: - kubernetes_utils.clean_zombie_sshjump_pod(self.namespace, node_id) + kubernetes_utils.clean_zombie_ssh_jump_pod(self.namespace, node_id) except Exception as e: logger.warning(config.log_prefix + f'Error occurred when analyzing SSH Jump pod: {e}') diff --git a/sky/templates/kubernetes-ray.yml.j2 b/sky/templates/kubernetes-ray.yml.j2 index 79a5e4ccbe1..da8e4253290 100644 --- a/sky/templates/kubernetes-ray.yml.j2 +++ b/sky/templates/kubernetes-ray.yml.j2 @@ -23,7 +23,7 @@ provider: timeout: {{timeout}} - sshjump_image: {{k8s_sshjump_image}} + ssh_jump_image: {{k8s_ssh_jump_image}} # ServiceAccount created by the autoscaler for the head node pod that it # runs in. If this field isn't provided, the head pod config below must @@ -130,7 +130,7 @@ available_node_types: component: {{cluster_name_on_cloud}}-ray-head skypilot-cluster: {{cluster_name_on_cloud}} # Identifies the SSH jump pod used by this pod. Used in life cycle management of the ssh jump pod. - skypilot-sshjump: {{k8s_sshjump_name}} + skypilot-ssh-jump: {{k8s_ssh_jump_name}} spec: # Change this if you altered the autoscaler_service_account above # or want to provide your own. 
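Note on the label rename above: the `skypilot-ssh-jump` label set on the head pod in kubernetes-ray.yml.j2 is the same key that the node provider's clean_zombie_ssh_jump_pod and the lifecycle manager (renamed below) select on, so all three must stay in sync. A minimal sketch of that lookup, mirroring the label selector used later in this patch; the namespace and jump-pod name here are placeholders, not values taken from this change:

# Find SkyPilot pods that reference a given SSH jump pod via the
# `skypilot-ssh-jump` label (assumes in-cluster credentials are available).
from kubernetes import client, config

config.load_incluster_config()
core_api = client.CoreV1Api()

ssh_jump_name = 'sky-ssh-jump-example'  # hypothetical; real names include a user hash
pods = core_api.list_namespaced_pod(
    'default',  # placeholder namespace
    label_selector=f'skypilot-ssh-jump={ssh_jump_name}')
if not pods.items:
    print('No SkyPilot pods reference this jump pod; it is safe to reclaim.')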
diff --git a/sky/templates/kubernetes-sshjump.yml.j2 b/sky/templates/kubernetes-ssh-jump.yml.j2 similarity index 88% rename from sky/templates/kubernetes-sshjump.yml.j2 rename to sky/templates/kubernetes-ssh-jump.yml.j2 index 7c7c3fbc877..a4c9929fe1e 100644 --- a/sky/templates/kubernetes-sshjump.yml.j2 +++ b/sky/templates/kubernetes-ssh-jump.yml.j2 @@ -1,90 +1,90 @@ -pod_spec: - apiVersion: v1 - kind: Pod - metadata: - name: {{ name }} - labels: - component: {{ name }} - parent: skypilot - spec: - serviceAccountName: sky-sshjump-sa - volumes: - - name: secret-volume - secret: - secretName: {{ secret }} - containers: - - name: {{ name }} - imagePullPolicy: Always - image: {{ image }} - command: ["python3", "-u", "/skypilot/sky/utils/kubernetes/sshjump_lcm.py"] - ports: - - containerPort: 22 - volumeMounts: - - name: secret-volume - readOnly: true - mountPath: /etc/secret-volume - lifecycle: - postStart: - exec: - command: ["/bin/bash", "-c", "mkdir -p ~/.ssh && cp /etc/secret-volume/ssh-publickey ~/.ssh/authorized_keys && sudo service ssh restart"] - env: - - name: MY_POD_NAME - valueFrom: - fieldRef: - fieldPath: metadata.name - - name: MY_POD_NAMESPACE - valueFrom: - fieldRef: - fieldPath: metadata.namespace - - name: ALERT_THRESHOLD - # seconds - value: "600" - - name: RETRY_INTERVAL - # seconds - value: "60" - terminationGracePeriodSeconds: 0 -service_spec: - apiVersion: v1 - kind: Service - metadata: - name: {{ name }} - labels: - parent: skypilot - spec: - type: {{ service_type }} - selector: - component: {{ name }} - ports: - - protocol: TCP - port: 22 - targetPort: 22 -# The following ServiceAccount/Role/RoleBinding sets up an RBAC for life cycle -# management of the jump pod/service -service_account: - apiVersion: v1 - kind: ServiceAccount - metadata: - name: sky-sshjump-sa - parent: skypilot -role: - kind: Role - apiVersion: rbac.authorization.k8s.io/v1 - metadata: - name: sky-sshjump-role - rules: - - apiGroups: [""] - resources: ["pods", "pods/status", "pods/exec", "services"] - verbs: ["get", "list", "create", "delete"] -role_binding: - apiVersion: rbac.authorization.k8s.io/v1 - kind: RoleBinding - metadata: - name: sky-sshjump-rb - parent: skypilot - subjects: - - kind: ServiceAccount - name: sky-sshjump-sa - roleRef: - kind: Role - name: sky-sshjump-role - apiGroup: rbac.authorization.k8s.io +pod_spec: + apiVersion: v1 + kind: Pod + metadata: + name: {{ name }} + labels: + component: {{ name }} + parent: skypilot + spec: + serviceAccountName: sky-ssh-jump-sa + volumes: + - name: secret-volume + secret: + secretName: {{ secret }} + containers: + - name: {{ name }} + imagePullPolicy: Always + image: {{ image }} + command: ["python3", "-u", "/skypilot/sky/utils/kubernetes/ssh_jump_lifecycle_manager.py"] + ports: + - containerPort: 22 + volumeMounts: + - name: secret-volume + readOnly: true + mountPath: /etc/secret-volume + lifecycle: + postStart: + exec: + command: ["/bin/bash", "-c", "mkdir -p ~/.ssh && cp /etc/secret-volume/ssh-publickey ~/.ssh/authorized_keys && sudo service ssh restart"] + env: + - name: MY_POD_NAME + valueFrom: + fieldRef: + fieldPath: metadata.name + - name: MY_POD_NAMESPACE + valueFrom: + fieldRef: + fieldPath: metadata.namespace + - name: ALERT_THRESHOLD + # seconds + value: "600" + - name: RETRY_INTERVAL + # seconds + value: "60" + terminationGracePeriodSeconds: 0 +service_spec: + apiVersion: v1 + kind: Service + metadata: + name: {{ name }} + labels: + parent: skypilot + spec: + type: {{ service_type }} + selector: + component: {{ name }} + ports: + - 
protocol: TCP + port: 22 + targetPort: 22 +# The following ServiceAccount/Role/RoleBinding sets up an RBAC for life cycle +# management of the jump pod/service +service_account: + apiVersion: v1 + kind: ServiceAccount + metadata: + name: sky-ssh-jump-sa + parent: skypilot +role: + kind: Role + apiVersion: rbac.authorization.k8s.io/v1 + metadata: + name: sky-ssh-jump-role + rules: + - apiGroups: [""] + resources: ["pods", "pods/status", "pods/exec", "services"] + verbs: ["get", "list", "create", "delete"] +role_binding: + apiVersion: rbac.authorization.k8s.io/v1 + kind: RoleBinding + metadata: + name: sky-ssh-jump-rb + parent: skypilot + subjects: + - kind: ServiceAccount + name: sky-ssh-jump-sa + roleRef: + kind: Role + name: sky-ssh-jump-role + apiGroup: rbac.authorization.k8s.io diff --git a/sky/utils/kubernetes/sshjump_lcm.py b/sky/utils/kubernetes/ssh_jump_lifecycle_manager.py similarity index 90% rename from sky/utils/kubernetes/sshjump_lcm.py rename to sky/utils/kubernetes/ssh_jump_lifecycle_manager.py index 491d507a62d..05f6a8d7a42 100644 --- a/sky/utils/kubernetes/sshjump_lcm.py +++ b/sky/utils/kubernetes/ssh_jump_lifecycle_manager.py @@ -1,111 +1,111 @@ -"""Manages lifecycle of ssh jump pod. - -This script runs inside ssh jump pod as the main process (PID 1). - -It terminates itself (by removing sshjump service and pod via a call to -kubeapi), if it does not see ray pods in the duration of 10 minutes. If the -user re-launches a task before the duration is over, then ssh jump pod is being -reused and will terminate itself when it sees that no ray cluster exist in that -duration. -""" -import datetime -import os -import sys -import time - -from kubernetes import client -from kubernetes import config - -# Load kube config -config.load_incluster_config() - -v1 = client.CoreV1Api() - -current_name = os.getenv('MY_POD_NAME') -current_namespace = os.getenv('MY_POD_NAMESPACE') - -# The amount of time in seconds where no Ray pods exist in which after that time -# sshjump pod terminates itself -alert_threshold = int(os.getenv('ALERT_THRESHOLD', '600')) -# The amount of time in seconds to wait between Ray pods existence checks -retry_interval = int(os.getenv('RETRY_INTERVAL', '60')) - -# Ray pods are labeled with this value i.e., sshjump name which is unique per -# user (based on user hash) -label_selector = f'skypilot-sshjump={current_name}' - - -def poll(): - sys.stdout.write('Starting polling.\n') - - alert_delta = datetime.timedelta(seconds=alert_threshold) - - # Set delay for each retry - retry_interval_delta = datetime.timedelta(seconds=retry_interval) - - # Accumulated time of where no SkyPilot cluster exists. Used to compare - # against alert_threshold - nocluster_delta = datetime.timedelta() - - while True: - sys.stdout.write(f'Sleeping {retry_interval} seconds..\n') - time.sleep(retry_interval) - - # List the pods in the current namespace - try: - ret = v1.list_namespaced_pod(current_namespace, - label_selector=label_selector) - except Exception as e: - sys.stdout.write(f'Error: listing pods failed with error: {e}\n') - raise - - if len(ret.items) == 0: - sys.stdout.write(f'Did not find pods with label "{label_selector}" ' - f'in namespace {current_namespace}\n') - nocluster_delta = nocluster_delta + retry_interval_delta - sys.stdout.write( - f'Time since no pods found: {nocluster_delta}, alert ' - f'threshold: {alert_delta}\n') - else: - sys.stdout.write( - f'Found pods with label "{label_selector}" in namespace ' - f'{current_namespace}\n') - # reset .. 
- nocluster_delta = datetime.timedelta() - sys.stdout.write(f'noray_delta is reset: {nocluster_delta}\n') - - if nocluster_delta >= alert_delta: - sys.stdout.write( - f'nocluster_delta: {nocluster_delta} crossed alert threshold: ' - f'{alert_delta}. Time to terminate myself and my service.\n') - try: - # sshjump resources created under same name - v1.delete_namespaced_service(current_name, current_namespace) - v1.delete_namespaced_pod(current_name, current_namespace) - except Exception as e: - sys.stdout.write('[ERROR] Deletion failed. Exiting ' - f'poll() with error: {e}\n') - raise - - break - - sys.stdout.write('Done polling.\n') - - -def main(): - sys.stdout.write('SkyPilot SSH Jump Pod Lifecycle Manager\n') - sys.stdout.write(f'current_name: {current_name}\n') - sys.stdout.write(f'current_namespace: {current_namespace}\n') - sys.stdout.write(f'alert_threshold time: {alert_threshold}\n') - sys.stdout.write(f'retry_interval time: {retry_interval}\n') - sys.stdout.write(f'label_selector: {label_selector}\n') - - if not current_name or not current_namespace: - # Raise Exception with message to terminate pod - raise Exception('Missing environment variables MY_POD_NAME or ' - 'MY_POD_NAMESPACE') - poll() - - -if __name__ == '__main__': - main() +"""Manages lifecycle of ssh jump pod. + +This script runs inside ssh jump pod as the main process (PID 1). + +It terminates itself (by removing ssh jump service and pod via a call to +kubeapi), if it does not see ray pods in the duration of 10 minutes. If the +user re-launches a task before the duration is over, then ssh jump pod is being +reused and will terminate itself when it sees that no ray cluster exist in that +duration. +""" +import datetime +import os +import sys +import time + +from kubernetes import client +from kubernetes import config + +# Load kube config +config.load_incluster_config() + +v1 = client.CoreV1Api() + +current_name = os.getenv('MY_POD_NAME') +current_namespace = os.getenv('MY_POD_NAMESPACE') + +# The amount of time in seconds where no Ray pods exist in which after that time +# ssh jump pod terminates itself +alert_threshold = int(os.getenv('ALERT_THRESHOLD', '600')) +# The amount of time in seconds to wait between Ray pods existence checks +retry_interval = int(os.getenv('RETRY_INTERVAL', '60')) + +# Ray pods are labeled with this value i.e., ssh jump name which is unique per +# user (based on user hash) +label_selector = f'skypilot-ssh-jump={current_name}' + + +def poll(): + sys.stdout.write('Starting polling.\n') + + alert_delta = datetime.timedelta(seconds=alert_threshold) + + # Set delay for each retry + retry_interval_delta = datetime.timedelta(seconds=retry_interval) + + # Accumulated time of where no SkyPilot cluster exists. 
Used to compare + # against alert_threshold + nocluster_delta = datetime.timedelta() + + while True: + sys.stdout.write(f'Sleeping {retry_interval} seconds..\n') + time.sleep(retry_interval) + + # List the pods in the current namespace + try: + ret = v1.list_namespaced_pod(current_namespace, + label_selector=label_selector) + except Exception as e: + sys.stdout.write(f'Error: listing pods failed with error: {e}\n') + raise + + if len(ret.items) == 0: + sys.stdout.write(f'Did not find pods with label "{label_selector}" ' + f'in namespace {current_namespace}\n') + nocluster_delta = nocluster_delta + retry_interval_delta + sys.stdout.write( + f'Time since no pods found: {nocluster_delta}, alert ' + f'threshold: {alert_delta}\n') + else: + sys.stdout.write( + f'Found pods with label "{label_selector}" in namespace ' + f'{current_namespace}\n') + # reset .. + nocluster_delta = datetime.timedelta() + sys.stdout.write(f'noray_delta is reset: {nocluster_delta}\n') + + if nocluster_delta >= alert_delta: + sys.stdout.write( + f'nocluster_delta: {nocluster_delta} crossed alert threshold: ' + f'{alert_delta}. Time to terminate myself and my service.\n') + try: + # ssh jump resources created under same name + v1.delete_namespaced_service(current_name, current_namespace) + v1.delete_namespaced_pod(current_name, current_namespace) + except Exception as e: + sys.stdout.write('[ERROR] Deletion failed. Exiting ' + f'poll() with error: {e}\n') + raise + + break + + sys.stdout.write('Done polling.\n') + + +def main(): + sys.stdout.write('SkyPilot SSH Jump Pod Lifecycle Manager\n') + sys.stdout.write(f'current_name: {current_name}\n') + sys.stdout.write(f'current_namespace: {current_namespace}\n') + sys.stdout.write(f'alert_threshold time: {alert_threshold}\n') + sys.stdout.write(f'retry_interval time: {retry_interval}\n') + sys.stdout.write(f'label_selector: {label_selector}\n') + + if not current_name or not current_namespace: + # Raise Exception with message to terminate pod + raise Exception('Missing environment variables MY_POD_NAME or ' + 'MY_POD_NAMESPACE') + poll() + + +if __name__ == '__main__': + main() diff --git a/sky/utils/kubernetes_utils.py b/sky/utils/kubernetes_utils.py index b753ccb37dd..255012cb474 100644 --- a/sky/utils/kubernetes_utils.py +++ b/sky/utils/kubernetes_utils.py @@ -726,22 +726,22 @@ def get_ssh_proxy_command(private_key_path: str, ssh_jump_name: str, return ssh_jump_proxy_command -def setup_sshjump_svc(ssh_jump_name: str, namespace: str, - service_type: KubernetesServiceType): +def setup_ssh_jump_svc(ssh_jump_name: str, namespace: str, + service_type: KubernetesServiceType): """Sets up Kubernetes service resource to access for SSH jump pod. This method acts as a necessary complement to be run along with - setup_sshjump_pod(...) method. This service ensures the pod is accessible. + setup_ssh_jump_pod(...) method. This service ensures the pod is accessible. Args: - sshjump_name: Name to use for the SSH jump service + ssh_jump_name: Name to use for the SSH jump service namespace: Namespace to create the SSH jump service in service_type: Networking configuration on either to use NodePort or ClusterIP service to ssh in """ - # Fill in template - ssh_key_secret and sshjump_image are not required for + # Fill in template - ssh_key_secret and ssh_jump_image are not required for # the service spec, so we pass in empty strs. 
- content = fill_sshjump_template('', '', ssh_jump_name, service_type.value) + content = fill_ssh_jump_template('', '', ssh_jump_name, service_type.value) # Create service try: kubernetes.core_api().create_namespaced_service(namespace, @@ -788,8 +788,8 @@ def setup_sshjump_svc(ssh_jump_name: str, namespace: str, logger.info(f'Created SSH Jump Service {ssh_jump_name}.') -def setup_sshjump_pod(sshjump_name: str, sshjump_image: str, - ssh_key_secret: str, namespace: str): +def setup_ssh_jump_pod(ssh_jump_name: str, ssh_jump_image: str, + ssh_key_secret: str, namespace: str): """Sets up Kubernetes RBAC and pod for SSH jump host. Our Kubernetes implementation uses a SSH jump pod to reach SkyPilot clusters @@ -798,18 +798,18 @@ def setup_sshjump_pod(sshjump_name: str, sshjump_image: str, permission to watch for other SkyPilot pods and terminate itself if there are no SkyPilot pods running. - setup_sshjump_service must also be run to ensure that the SSH jump pod is + setup_ssh_jump_service must also be run to ensure that the SSH jump pod is reachable. Args: - sshjump_image: Container image to use for the SSH jump pod - sshjump_name: Name to use for the SSH jump pod + ssh_jump_image: Container image to use for the SSH jump pod + ssh_jump_name: Name to use for the SSH jump pod ssh_key_secret: Secret name for the SSH key stored in the cluster namespace: Namespace to create the SSH jump pod in """ # Fill in template - service is created separately so service_type is not # required, so we pass in empty str. - content = fill_sshjump_template(ssh_key_secret, sshjump_image, sshjump_name, + content = fill_ssh_jump_template(ssh_key_secret, ssh_jump_image, ssh_jump_name, '') # ServiceAccount try: @@ -855,20 +855,20 @@ def setup_sshjump_pod(sshjump_name: str, sshjump_image: str, except kubernetes.api_exception() as e: if e.status == 409: logger.info( - f'SSH Jump Host {sshjump_name} already exists in the cluster, ' + f'SSH Jump Host {ssh_jump_name} already exists in the cluster, ' 'using it.') else: raise else: - logger.info(f'Created SSH Jump Host {sshjump_name}.') + logger.info(f'Created SSH Jump Host {ssh_jump_name}.') -def clean_zombie_sshjump_pod(namespace: str, node_id: str): +def clean_zombie_ssh_jump_pod(namespace: str, node_id: str): """Analyzes SSH jump pod and removes if it is in a bad state Prevents the existence of a dangling SSH jump pod. This could happen in case the pod main container did not start properly (or failed). In that - case, jump pod lifecycle management (LCM) will not functioning properly to + case, jump pod lifecycle manager will not function properly to remove the pod and service automatically, and must be done manually. Args: @@ -890,48 +890,48 @@ def find(l, predicate): ' but the pod was not found (404).') raise else: - sshjump_name = pod.metadata.labels.get('skypilot-sshjump') + ssh_jump_name = pod.metadata.labels.get('skypilot-ssh-jump') try: - sshjump_pod = kubernetes.core_api().read_namespaced_pod( - sshjump_name, namespace) - cont_ready_cond = find(sshjump_pod.status.conditions, + ssh_jump_pod = kubernetes.core_api().read_namespaced_pod( + ssh_jump_name, namespace) + cont_ready_cond = find(ssh_jump_pod.status.conditions, lambda c: c.type == 'ContainersReady') if cont_ready_cond and \ cont_ready_cond.status == 'False': # The main container is not ready. To be on the safe side - # and prevent a dangling sshjump pod, lets remove it and + # and prevent a dangling ssh jump pod, lets remove it and # the service. 
Otherwise main container is ready and its lifecycle # management script takes care of the cleaning. - kubernetes.core_api().delete_namespaced_pod(sshjump_name, namespace) + kubernetes.core_api().delete_namespaced_pod(ssh_jump_name, namespace) kubernetes.core_api().delete_namespaced_service( - sshjump_name, namespace) + ssh_jump_name, namespace) # only warn and proceed as usual except kubernetes.api_exception() as e: - logger.warning(f'Tried to check sshjump pod {sshjump_name},' + logger.warning(f'Tried to check ssh jump pod {ssh_jump_name},' f' but got error {e}\n. Consider running `kubectl ' - f'delete pod {sshjump_name} -n {namespace}` to manually ' + f'delete pod {ssh_jump_name} -n {namespace}` to manually ' 'remove the pod if it has crashed.') - # We encountered an issue while checking sshjump pod. To be on + # We encountered an issue while checking ssh jump pod. To be on # the safe side, lets remove its service so the port is freed try: kubernetes.core_api().delete_namespaced_service( - sshjump_name, namespace) + ssh_jump_name, namespace) except kubernetes.api_exception(): pass -def fill_sshjump_template(ssh_key_secret: str, sshjump_image: str, - sshjump_name: str, service_type: str) -> Dict: +def fill_ssh_jump_template(ssh_key_secret: str, ssh_jump_image: str, + ssh_jump_name: str, service_type: str) -> Dict: template_path = os.path.join(sky.__root_dir__, 'templates', - 'kubernetes-sshjump.yml.j2') + 'kubernetes-ssh-jump.yml.j2') if not os.path.exists(template_path): raise FileNotFoundError( - 'Template "kubernetes-sshjump.j2" does not exist.') + 'Template "kubernetes-ssh-jump.j2" does not exist.') with open(template_path) as fin: template = fin.read() j2_template = jinja2.Template(template) - cont = j2_template.render(name=sshjump_name, - image=sshjump_image, + cont = j2_template.render(name=ssh_jump_name, + image=ssh_jump_image, secret=ssh_key_secret, service_type=service_type) content = yaml.safe_load(cont)
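For context, fill_ssh_jump_template above follows a render-then-parse pattern: the Jinja2 template is rendered with the pod name, image, secret, and service type, and the result is loaded with yaml.safe_load into a dict of specs. A trimmed sketch of that pattern, using an inline stand-in template rather than the real kubernetes-ssh-jump.yml.j2:

# Minimal sketch of the Jinja2 render + yaml.safe_load pattern used by
# fill_ssh_jump_template. The inline template and the name value are
# stand-ins for illustration only.
import jinja2
import yaml

template = jinja2.Template('pod_spec:\n  metadata:\n    name: {{ name }}\n')
rendered = template.render(name='sky-ssh-jump-example')
content = yaml.safe_load(rendered)
assert content['pod_spec']['metadata']['name'] == 'sky-ssh-jump-example'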