Skip to content

Commit

Permalink
ssh jump refactor
Browse files Browse the repository at this point in the history
  • Loading branch information
romilbhardwaj committed Sep 15, 2023
1 parent 9827bbb commit f74c9df
Show file tree
Hide file tree
Showing 8 changed files with 246 additions and 246 deletions.
2 changes: 1 addition & 1 deletion sky/authentication.py
Original file line number Diff line number Diff line change
Expand Up @@ -435,7 +435,7 @@ def setup_kubernetes_authentication(config: Dict[str, Any]) -> Dict[str, Any]:
# because we need to know the service IP address and port to set the
# ssh_proxy_command in the autoscaler config.
namespace = kubernetes_utils.get_current_kube_config_context_namespace()
kubernetes_utils.setup_sshjump_svc(ssh_jump_name, namespace, service_type)
kubernetes_utils.setup_ssh_jump_svc(ssh_jump_name, namespace, service_type)

ssh_proxy_cmd = kubernetes_utils.get_ssh_proxy_command(
PRIVATE_SSH_KEY_PATH, ssh_jump_name, network_mode, namespace,
Expand Down
8 changes: 4 additions & 4 deletions sky/clouds/kubernetes.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ class Kubernetes(clouds.Cloud):
"""Kubernetes."""

SKY_SSH_KEY_SECRET_NAME = f'sky-ssh-{common_utils.get_user_hash()}'
SKY_SSH_JUMP_NAME = f'sky-sshjump-{common_utils.get_user_hash()}'
SKY_SSH_JUMP_NAME = f'sky-ssh-jump-{common_utils.get_user_hash()}'
PORT_FORWARD_PROXY_CMD_TEMPLATE = \
'kubernetes-port-forward-proxy-command.sh.j2'
PORT_FORWARD_PROXY_CMD_PATH = '~/.sky/port-forward-proxy-cmd.sh'
Expand Down Expand Up @@ -213,7 +213,7 @@ def make_deploy_resources_variables(
image_id = service_catalog.get_image_id_from_tag(image_id,
clouds='kubernetes')
# TODO(romilb): Create a lightweight image for SSH jump host
sshjump_image = service_catalog.get_image_id_from_tag(self.IMAGE_CPU,
ssh_jump_image = service_catalog.get_image_id_from_tag(self.IMAGE_CPU,
clouds='kubernetes')

k8s_acc_label_key = None
Expand All @@ -235,8 +235,8 @@ def make_deploy_resources_variables(
'k8s_ssh_key_secret_name': self.SKY_SSH_KEY_SECRET_NAME,
'k8s_acc_label_key': k8s_acc_label_key,
'k8s_acc_label_value': k8s_acc_label_value,
'k8s_sshjump_name': self.SKY_SSH_JUMP_NAME,
'k8s_sshjump_image': sshjump_image,
'k8s_ssh_jump_name': self.SKY_SSH_JUMP_NAME,
'k8s_ssh_jump_image': ssh_jump_image,
# TODO(romilb): Allow user to specify custom images
'image_id': image_id,
}
Expand Down
8 changes: 4 additions & 4 deletions sky/skylet/providers/kubernetes/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -267,8 +267,8 @@ def _configure_ssh_jump(namespace, config):
"""
pod_cfg = config['available_node_types']['ray_head_default']['node_config']

sshjump_name = pod_cfg['metadata']['labels']['skypilot-sshjump']
sshjump_image = config['provider']['sshjump_image']
ssh_jump_name = pod_cfg['metadata']['labels']['skypilot-ssh-jump']
ssh_jump_image = config['provider']['ssh_jump_image']

volumes = pod_cfg['spec']['volumes']
# find 'secret-volume' and get the secret name
Expand All @@ -288,8 +288,8 @@ def _configure_ssh_jump(namespace, config):
# and available before we create the SSH jump pod. If for any reason the
# service is missing, we should raise an error.

kubernetes_utils.setup_sshjump_pod(sshjump_name, sshjump_image,
ssh_key_secret_name, namespace)
kubernetes_utils.setup_ssh_jump_pod(ssh_jump_name, ssh_jump_image,
ssh_key_secret_name, namespace)
return config


Expand Down
2 changes: 1 addition & 1 deletion sky/skylet/providers/kubernetes/node_provider.py
Original file line number Diff line number Diff line change
Expand Up @@ -347,7 +347,7 @@ def create_node(self, node_config, tags, count):
def terminate_node(self, node_id):
logger.info(config.log_prefix + 'calling delete_namespaced_pod')
try:
kubernetes_utils.clean_zombie_sshjump_pod(self.namespace, node_id)
kubernetes_utils.clean_zombie_ssh_jump_pod(self.namespace, node_id)
except Exception as e:
logger.warning(config.log_prefix +
f'Error occurred when analyzing SSH Jump pod: {e}')
Expand Down
4 changes: 2 additions & 2 deletions sky/templates/kubernetes-ray.yml.j2
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ provider:

timeout: {{timeout}}

sshjump_image: {{k8s_sshjump_image}}
ssh_jump_image: {{k8s_ssh_jump_image}}

# ServiceAccount created by the autoscaler for the head node pod that it
# runs in. If this field isn't provided, the head pod config below must
Expand Down Expand Up @@ -130,7 +130,7 @@ available_node_types:
component: {{cluster_name_on_cloud}}-ray-head
skypilot-cluster: {{cluster_name_on_cloud}}
# Identifies the SSH jump pod used by this pod. Used in life cycle management of the ssh jump pod.
skypilot-sshjump: {{k8s_sshjump_name}}
skypilot-ssh-jump: {{k8s_ssh_jump_name}}
spec:
# Change this if you altered the autoscaler_service_account above
# or want to provide your own.
Expand Down
Original file line number Diff line number Diff line change
@@ -1,90 +1,90 @@
pod_spec:
apiVersion: v1
kind: Pod
metadata:
name: {{ name }}
labels:
component: {{ name }}
parent: skypilot
spec:
serviceAccountName: sky-sshjump-sa
volumes:
- name: secret-volume
secret:
secretName: {{ secret }}
containers:
- name: {{ name }}
imagePullPolicy: Always
image: {{ image }}
command: ["python3", "-u", "/skypilot/sky/utils/kubernetes/sshjump_lcm.py"]
ports:
- containerPort: 22
volumeMounts:
- name: secret-volume
readOnly: true
mountPath: /etc/secret-volume
lifecycle:
postStart:
exec:
command: ["/bin/bash", "-c", "mkdir -p ~/.ssh && cp /etc/secret-volume/ssh-publickey ~/.ssh/authorized_keys && sudo service ssh restart"]
env:
- name: MY_POD_NAME
valueFrom:
fieldRef:
fieldPath: metadata.name
- name: MY_POD_NAMESPACE
valueFrom:
fieldRef:
fieldPath: metadata.namespace
- name: ALERT_THRESHOLD
# seconds
value: "600"
- name: RETRY_INTERVAL
# seconds
value: "60"
terminationGracePeriodSeconds: 0
service_spec:
apiVersion: v1
kind: Service
metadata:
name: {{ name }}
labels:
parent: skypilot
spec:
type: {{ service_type }}
selector:
component: {{ name }}
ports:
- protocol: TCP
port: 22
targetPort: 22
# The following ServiceAccount/Role/RoleBinding sets up an RBAC for life cycle
# management of the jump pod/service
service_account:
apiVersion: v1
kind: ServiceAccount
metadata:
name: sky-sshjump-sa
parent: skypilot
role:
kind: Role
apiVersion: rbac.authorization.k8s.io/v1
metadata:
name: sky-sshjump-role
rules:
- apiGroups: [""]
resources: ["pods", "pods/status", "pods/exec", "services"]
verbs: ["get", "list", "create", "delete"]
role_binding:
apiVersion: rbac.authorization.k8s.io/v1
kind: RoleBinding
metadata:
name: sky-sshjump-rb
parent: skypilot
subjects:
- kind: ServiceAccount
name: sky-sshjump-sa
roleRef:
kind: Role
name: sky-sshjump-role
apiGroup: rbac.authorization.k8s.io
# Manifests for the SSH jump host, one per top-level key: the jump pod, the
# Service that exposes it, and the ServiceAccount/Role/RoleBinding the pod
# runs under. The name, secret, image and service_type placeholders are
# substituted when this template is rendered.
pod_spec:
  apiVersion: v1
  kind: Pod
  metadata:
    name: {{ name }}
    labels:
      # service_spec.spec.selector matches on this label to route to the pod.
      component: {{ name }}
      parent: skypilot
  spec:
    # Must match the ServiceAccount declared under service_account below.
    serviceAccountName: sky-ssh-jump-sa
    volumes:
      - name: secret-volume
        secret:
          secretName: {{ secret }}
    containers:
      - name: {{ name }}
        imagePullPolicy: Always
        image: {{ image }}
        # Main process of the container is the life cycle manager script.
        command: ["python3", "-u", "/skypilot/sky/utils/kubernetes/ssh_jump_lifecycle_manager.py"]
        ports:
          - containerPort: 22
        volumeMounts:
          - name: secret-volume
            readOnly: true
            mountPath: /etc/secret-volume
        lifecycle:
          postStart:
            exec:
              # Installs the public key from the mounted secret as the
              # authorized_keys file, then restarts the ssh service.
              command: ["/bin/bash", "-c", "mkdir -p ~/.ssh && cp /etc/secret-volume/ssh-publickey ~/.ssh/authorized_keys && sudo service ssh restart"]
        env:
          # Pod name and namespace are exposed to the container through the
          # downward API.
          - name: MY_POD_NAME
            valueFrom:
              fieldRef:
                fieldPath: metadata.name
          - name: MY_POD_NAMESPACE
            valueFrom:
              fieldRef:
                fieldPath: metadata.namespace
          # NOTE(review): ALERT_THRESHOLD and RETRY_INTERVAL are presumably
          # read by the life cycle manager script; confirm their exact
          # meaning there.
          - name: ALERT_THRESHOLD
            # seconds
            value: "600"
          - name: RETRY_INTERVAL
            # seconds
            value: "60"
    terminationGracePeriodSeconds: 0
service_spec:
  apiVersion: v1
  kind: Service
  metadata:
    name: {{ name }}
    labels:
      parent: skypilot
  spec:
    type: {{ service_type }}
    # Selects the jump pod via the component label set in pod_spec.
    selector:
      component: {{ name }}
    ports:
      - protocol: TCP
        port: 22
        targetPort: 22
# The following ServiceAccount/Role/RoleBinding sets up an RBAC for life cycle
# management of the jump pod/service
service_account:
  apiVersion: v1
  kind: ServiceAccount
  metadata:
    name: sky-ssh-jump-sa
    # "parent" is a label, not a metadata field; it has to live under labels
    # (as in pod_spec and service_spec) for the API server to keep it.
    labels:
      parent: skypilot
role:
  kind: Role
  apiVersion: rbac.authorization.k8s.io/v1
  metadata:
    # NOTE(review): unlike the other manifests here, the Role carries no
    # parent label; confirm whether that is intentional.
    name: sky-ssh-jump-role
  rules:
    # The empty string denotes the core API group.
    - apiGroups: [""]
      resources: ["pods", "pods/status", "pods/exec", "services"]
      verbs: ["get", "list", "create", "delete"]
role_binding:
  apiVersion: rbac.authorization.k8s.io/v1
  kind: RoleBinding
  metadata:
    name: sky-ssh-jump-rb
    # Same fix as service_account: the label belongs under labels.
    labels:
      parent: skypilot
  # Grants sky-ssh-jump-role to the sky-ssh-jump-sa ServiceAccount.
  subjects:
    - kind: ServiceAccount
      name: sky-ssh-jump-sa
  roleRef:
    kind: Role
    name: sky-ssh-jump-role
    apiGroup: rbac.authorization.k8s.io
Loading

0 comments on commit f74c9df

Please sign in to comment.