Skip to content

Commit

Permalink
address comments
Browse files Browse the repository at this point in the history
  • Loading branch information
Michaelvll committed Jun 3, 2024
1 parent 6b22124 commit 8a672d6
Show file tree
Hide file tree
Showing 3 changed files with 21 additions and 11 deletions.
5 changes: 3 additions & 2 deletions sky/provision/kubernetes/instance.py
Original file line number Diff line number Diff line change
Expand Up @@ -351,6 +351,7 @@ def _setup_ssh_in_pods(namespace: str, new_nodes: List) -> None:
# See https://www.educative.io/answers/error-mesg-ttyname-failed-inappropriate-ioctl-for-device # pylint: disable=line-too-long
'$(prefix_cmd) sed -i "s/mesg n/tty -s \\&\\& mesg n/" ~/.profile;')

# TODO(romilb): Parallelize the setup of SSH in pods for multi-node clusters
for new_node in new_nodes:
pod_name = new_node.metadata.name
runner = command_runner.KubernetesCommandRunner((namespace, pod_name))
Expand Down Expand Up @@ -761,7 +762,7 @@ def query_instances(

def get_command_runners(
cluster_info: common.ClusterInfo,
**crednetials: Dict[str, Any],
**credentials: Dict[str, Any],
) -> List[command_runner.CommandRunner]:
"""Get a command runner for the given cluster."""
assert cluster_info.provider_config is not None, cluster_info
Expand All @@ -774,4 +775,4 @@ def get_command_runners(
for pod_name in instances.keys()
if pod_name != cluster_info.head_instance_id)
return command_runner.KubernetesCommandRunner.make_runner_list(
node_list=node_list, **crednetials)
node_list=node_list, **credentials)
20 changes: 12 additions & 8 deletions sky/utils/command_runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@
RSYNC_EXCLUDE_OPTION = '--exclude-from={}'

_HASH_MAX_LENGTH = 10
_DEFAULT_CONNECT_TIMEOUT = 30


def _ssh_control_path(ssh_control_filename: Optional[str]) -> Optional[str]:
Expand Down Expand Up @@ -60,7 +61,7 @@ def ssh_options_list(
) -> List[str]:
"""Returns a list of sane options for 'ssh'."""
if connect_timeout is None:
connect_timeout = 30
connect_timeout = _DEFAULT_CONNECT_TIMEOUT
# Forked from Ray SSHOptions:
# https://github.com/ray-project/ray/blob/master/python/ray/autoscaler/_private/command_runner.py
arg_dict = {
Expand Down Expand Up @@ -657,7 +658,7 @@ def run(
assert port_forward is None, ('port_forward is not supported for k8s '
f'for now, but got: {port_forward}')
if connect_timeout is None:
connect_timeout = 30
connect_timeout = _DEFAULT_CONNECT_TIMEOUT
kubectl_args = [
'--pod-running-timeout', f'{connect_timeout}s', '-n',
self.namespace, self.pod_name
Expand All @@ -669,9 +670,6 @@ def run(
proc = subprocess_utils.run(command, shell=False, check=False)
return proc.returncode, '', ''

# Ignore ssh_mode for k8s.
assert port_forward is None, ('port_forward is not supported for k8s, '
f'but got: {port_forward}')
kubectl_base_command = ['kubectl', 'exec']

if ssh_mode == SshMode.INTERACTIVE:
Expand All @@ -686,7 +684,12 @@ def run(
skip_lines=skip_lines + 1,
source_bashrc=source_bashrc)
command = kubectl_base_command + [
'bash', '-c', shlex.quote(command_str)
# It is important to use /bin/bash -c here to make sure we quote the
# command to be run properly. Otherwise, directly appending commands
# after '--' will not work for some commands, such as '&&', '>' etc.
'/bin/bash',
'-c',
shlex.quote(command_str)
]

log_dir = os.path.expanduser(os.path.dirname(log_path))
Expand Down Expand Up @@ -810,8 +813,9 @@ def get_remote_home_dir() -> str:
logger.debug(f'Running rsync command: {command}')

backoff = common_utils.Backoff(initial_backoff=5, max_backoff_factor=5)
assert max_retry > 0, f'max_retry {max_retry} must be positive.'
while max_retry >= 0:
returncode, _, stderr = log_lib.run_with_log(
returncode, stdout, stderr = log_lib.run_with_log(
command,
log_path=log_path,
stream_logs=stream_logs,
Expand All @@ -828,5 +832,5 @@ def get_remote_home_dir() -> str:
subprocess_utils.handle_returncode(returncode,
command,
error_msg,
stderr=stderr,
stderr=stdout + stderr,
stream_logs=stream_logs)
7 changes: 6 additions & 1 deletion sky/utils/kubernetes/rsync_helper.sh
Original file line number Diff line number Diff line change
@@ -1,2 +1,7 @@
# When using pod@namespace, rsync passes args as: {us} -l pod namespace
shift;pod=$1;shift;namespace=$1;shift;kubectl exec -i $pod -n $namespace -- "$@"
shift
pod=$1
shift
namespace=$1
shift
kubectl exec -i $pod -n $namespace -- "$@"

0 comments on commit 8a672d6

Please sign in to comment.