Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Optimize docker initialization #3823

Open
wants to merge 3 commits into
base: master
Choose a base branch
from
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
295 changes: 165 additions & 130 deletions sky/provision/docker_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,25 +55,6 @@ def from_env_vars(cls, d: Dict[str, str]) -> 'DockerLoginConfig':
DEFAULT_OBJECT_STORE_MEMORY_PROPORTION = 0.3


def _check_helper(cname, template, docker_cmd):
return ' '.join([
docker_cmd, 'inspect', '-f', '"{{' + template + '}}"', cname, '||',
'true'
])


def check_docker_running_cmd(cname, docker_cmd):
return _check_helper(cname, '.State.Running', docker_cmd)


def check_bind_mounts_cmd(cname, docker_cmd):
return _check_helper(cname, 'json .Mounts', docker_cmd)


def check_docker_image(cname, docker_cmd):
return _check_helper(cname, '.Config.Image', docker_cmd)


def docker_start_cmds(
image,
container_name,
Expand Down Expand Up @@ -192,32 +173,108 @@ def _run(self,
stream_logs=log_err_when_fail)
return stdout.strip()

def _check_host_status(self) -> dict:
result = self._run(f"""
# passing initializer state
_self_initialized={"1" if self.initialized else "0"}
_docker_socket_wait_timeout_seconds={_DOCKER_SOCKET_WAIT_TIMEOUT_SECONDS}

# get mem
cat /proc/meminfo | grep MemAvailable || true

# check if docker is installed
docker_command_exist=$(command -v {self.docker_cmd})
if [[ -n $docker_command_exist && $docker_command_exist == *"docker"* ]]
then
docker_installed=1
echo "docker_installed:" "Y"
else
docker_installed=0
echo "docker_installed:" "N"
fi

if [[ $docker_installed -eq 1 ]]
then
# wait for docker daemon to be ready
docker_socket_ready=0
end_time=$((SECONDS + _docker_socket_wait_timeout_seconds))
while [ $SECONDS -lt $end_time ]; do
if {self.docker_cmd} info >/dev/null 2>&1; then
echo "docker_socket_ready:" "Y"
docker_socket_ready=1
break
else
exec su -l $USER
sleep 5
fi
done
if [[ $docker_socket_ready -eq 0 ]]
then
echo "docker_socket_ready:" "N"
fi

# check runtime info
echo "docker_runtime:" $({self.docker_cmd} info -f "{{{{.Runtimes}}}}")

if [[ $_self_initialized -eq 0 ]]
then
# check container status
container_status=$({self.docker_cmd} ps -a --filter name="{self.container_name}" --format "{{{{.Status}}}}")
echo "container_status:" $container_status
# if container is up, check its image
if [[ $container_status == *"Up"* ]]
then
container_image=$({self.docker_cmd} inspect --format='{{{{.Config.Image}}}}' {self.container_name})
echo "container_image:" $container_image
fi
fi
fi
echo "status_checking_completed:" "Y"
""")
ret = {}

for l in result.split('\n'):
if ':' in l:
x = l.split(':')
k = x[0].strip()
v = ':'.join(x[1:]).strip()
Michaelvll marked this conversation as resolved.
Show resolved Hide resolved
if v in ['Y', 'N']:
v = True if v == 'Y' else False # type: ignore
if k == 'MemAvailable':
k = 'mem_available_in_kb'
v = int(v.split()[0]) # type: ignore
if k in [
'docker_installed', 'docker_socket_ready',
'docker_runtime', 'container_status', 'container_image',
'mem_available_in_kb', 'status_checking_completed'
]: # robust parsing
ret[k] = v
return ret

def initialize(self) -> str:
specific_image = self.docker_config['image']

self._check_docker_installed()
status = self._check_host_status()
logger.info(f'Host status: {str(status)}')

# SkyPilot: Check if the container is exited but not removed.
# If true, then we can start the container directly.
# Notice that we will skip all setup commands, so we need to
# manually start the ssh service.
if self._check_container_exited():
self.initialized = True
self._run(f'{self.docker_cmd} start {self.container_name}')
self._run('sudo service ssh start', run_env='docker')
return self._run('whoami', run_env='docker')
if 'container_status' in status:
if 'Exited' in status['container_status']:
logger.info('Container is exited but not removed.')
self.initialized = True
self._run(f'{self.docker_cmd} start {self.container_name}')
self._run('sudo service ssh start', run_env='docker')
return self._run('whoami', run_env='docker')
Comment on lines +260 to +266
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can this be integrated into the check_host_status command as well?

Copy link
Author

@camelop camelop Sep 8, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This one is not on the happy path though, but also follow the above design pattern (the action part in step 3).

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Agree that we may avoid having it in _check_host_status. This code path could happen if sky stop cluster-name and sky start cluster-name are called. We should probably merge these three continuous self._run into a single call?


# SkyPilot: Docker login if user specified a private docker registry.
cmd_login = ''
if 'docker_login_config' in self.docker_config:
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can we combine the following into the same call as the _check_host_status?

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Also, I am wondering as we may get a longer commands to run, will it be more readable if we explicitly create a enum as a state machine to indicate the stage we are in, and we can do post processing based on the state.

Copy link
Author

@camelop camelop Sep 8, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It's combined to the next "action" call.
Currently it is following the structure of

  1. remote read-only status checking with sh
  2. local operation decision making in Python
  3. remote action

To further save SSH calls, I would recommend completely refactor the execution structure into the following:

  1. gather all local Python execution demands and render an execution script with it (e.g. with templating from Jinja2, similar to https://docs.ansible.com/ansible/latest/playbook_guide/playbooks_templating.html ). The script should potentially execute different branches depending on status on the target machine.
  2. once the script is rendered ready, send it to remote for execution
  3. gather the result in one round to summarize whether the change has been successfully replied

I would expect such change to be a global coding style / design pattern change, so not sure if it is a good fit for a minor opt PR like this.

# TODO(tian): Maybe support a command to get the login password?
docker_login_config = DockerLoginConfig(
**self.docker_config['docker_login_config'])
self._run(
f'{self.docker_cmd} login --username '
f'{docker_login_config.username} '
f'--password {docker_login_config.password} '
f'{docker_login_config.server}',
wait_for_docker_daemon=True)
cmd_login = f'{self.docker_cmd} login --username '\
f'{docker_login_config.username} '\
f'--password {docker_login_config.password} '\
f'{docker_login_config.server};'
Comment on lines +274 to +277
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

style nit: avoid using backslash

Suggested change
cmd_login = f'{self.docker_cmd} login --username '\
f'{docker_login_config.username} '\
f'--password {docker_login_config.password} '\
f'{docker_login_config.server};'
cmd_login = (f'{self.docker_cmd} login --username '
f'{docker_login_config.username} '
f'--password {docker_login_config.password} '
f'{docker_login_config.server};')

# We automatically add the server prefix to the image name if
# the user did not add it.
server_prefix = f'{docker_login_config.server}/'
Expand All @@ -227,21 +284,19 @@ def initialize(self) -> str:
if self.docker_config.get('pull_before_run', True):
assert specific_image, ('Image must be included in config if ' +
'pull_before_run is specified')
self._run(f'{self.docker_cmd} pull {specific_image}',
wait_for_docker_daemon=True)
cmd_pull = f'{self.docker_cmd} pull {specific_image};'
else:
self._run(
f'{self.docker_cmd} image inspect {specific_image} '
'1> /dev/null 2>&1 || '
f'{self.docker_cmd} pull {specific_image}',
wait_for_docker_daemon=True)
cmd_pull = f'{self.docker_cmd} image inspect {specific_image} '\
'1> /dev/null 2>&1 || '\
f'{self.docker_cmd} pull {specific_image};'
Comment on lines +289 to +291
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

style nit: avoid using backslash

Suggested change
cmd_pull = f'{self.docker_cmd} image inspect {specific_image} '\
'1> /dev/null 2>&1 || '\
f'{self.docker_cmd} pull {specific_image};'
cmd_pull = (f'{self.docker_cmd} image inspect {specific_image} '
'1> /dev/null 2>&1 || '
f'{self.docker_cmd} pull {specific_image};')


logger.info(f'Starting container {self.container_name} with image '
f'{specific_image}')
container_running = self._check_container_status()

cmd_run = ''
container_running = 'Up' in status.get('container_status', False)
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What does this mean by 'Up' in False mean here? Should we change the default value to a string?

if container_running:
running_image = self._run(
check_docker_image(self.container_name, self.docker_cmd))
running_image = status.get('container_image', None)
if running_image != specific_image:
logger.error(
f'A container with name {self.container_name} is running '
Expand All @@ -252,22 +307,43 @@ def initialize(self) -> str:
# from GPUs when a systemctl command is called. This is a known
# issue with nvidia container toolkit:
# https://github.com/NVIDIA/nvidia-container-toolkit/issues/48
self._run(
'[ -f /etc/docker/daemon.json ] || '
'echo "{}" | sudo tee /etc/docker/daemon.json;'
'sudo jq \'.["exec-opts"] = ["native.cgroupdriver=cgroupfs"]\' '
'/etc/docker/daemon.json > /tmp/daemon.json;'
'sudo mv /tmp/daemon.json /etc/docker/daemon.json;'
'sudo systemctl restart docker')
cmd_run ='[ -f /etc/docker/daemon.json ] || '\
'echo "{}" | sudo tee /etc/docker/daemon.json;'\
'sudo jq \'.["exec-opts"] ='\
'["native.cgroupdriver=cgroupfs"]\' '\
'/etc/docker/daemon.json > /tmp/daemon.json;'\
'sudo mv /tmp/daemon.json /etc/docker/daemon.json;'\
'sudo systemctl restart docker;'
Comment on lines +310 to +316
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

style nit: avoid using backslash

user_docker_run_options = self.docker_config.get('run_options', [])
start_command = docker_start_cmds(
specific_image,
self.container_name,
self._configure_runtime(
self._auto_configure_shm(user_docker_run_options)),
self._configure_runtime(self._auto_configure_shm(
user_docker_run_options,
available_memory=status.get('mem_available_in_kb', None)),
runtime_output=status.get(
'docker_runtime', None)),
self.docker_cmd,
)
Comment on lines 318 to 327
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

For readability, let's keep the key/value fetching in a separate assignment?

self._run(start_command)
cmd_run += '\n' + start_command + ';'

# Copy local authorized_keys to docker container.
# Stop and disable jupyter service. This is to avoid port conflict on
# 8080 if we use default deep learning image in GCP, and 8888 if we use
# default deep learning image in Azure.
# Azure also has a jupyterhub service running on 8081, so we stop and
# disable that too.
container_name = constants.DEFAULT_DOCKER_CONTAINER_NAME
cmd_copy = f'{self.docker_cmd} cp ~/.ssh/authorized_keys '\
f'{container_name}:/tmp/host_ssh_authorized_keys;'\
'sudo systemctl stop jupyter > /dev/null 2>&1 || true;'\
'sudo systemctl disable jupyter > /dev/null 2>&1 || true;'\
'sudo systemctl stop jupyterhub > /dev/null 2>&1 || true;'\
'sudo systemctl disable jupyterhub > /dev/null 2>&1 || true;'
Comment on lines +337 to +342
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

style: backslash


cmd_action = f'{cmd_login} {cmd_pull} {cmd_run} {cmd_copy}'
cmd_result = self._run(cmd_action) # pylint: disable=unused-variable
# logger.debug(f"Command (host) result: {cmd_result}")

# SkyPilot: Setup Commands.
# TODO(zhwu): the following setups should be aligned with the kubernetes
Expand All @@ -279,36 +355,17 @@ def initialize(self) -> str:
# commands won't fail.
# Disable apt-get from asking user input during installation.
# see https://askubuntu.com/questions/909277/avoiding-user-interaction-with-tzdata-when-installing-certbot-in-a-docker-contai # pylint: disable=line-too-long
self._run(
f'echo \'{command_runner.ALIAS_SUDO_TO_EMPTY_FOR_ROOT_CMD}\' '
'>> ~/.bashrc;'
'echo "export DEBIAN_FRONTEND=noninteractive" >> ~/.bashrc;',
run_env='docker')
cmd_d_shell = f'echo \'{command_runner.ALIAS_SUDO_TO_EMPTY_FOR_ROOT_CMD}\' '\
'>> ~/.bashrc;'\
'echo "export DEBIAN_FRONTEND=noninteractive" >> ~/.bashrc;'\
'source ~/.bashrc;'
# Install dependencies.
self._run(
'sudo apt-get update; '
# Our mount script will install gcsfuse without fuse package.
# We need to install fuse package first to enable storage mount.
# The dpkg option is to suppress the prompt for fuse installation.
'sudo apt-get -o DPkg::Options::="--force-confnew" install -y '
'rsync curl wget patch openssh-server python3-pip fuse;',
run_env='docker')

# Copy local authorized_keys to docker container.
# Stop and disable jupyter service. This is to avoid port conflict on
# 8080 if we use default deep learning image in GCP, and 8888 if we use
# default deep learning image in Azure.
# Azure also has a jupyterhub service running on 8081, so we stop and
# disable that too.
container_name = constants.DEFAULT_DOCKER_CONTAINER_NAME
self._run(
f'rsync -e "{self.docker_cmd} exec -i" -avz ~/.ssh/authorized_keys '
f'{container_name}:/tmp/host_ssh_authorized_keys;'
'sudo systemctl stop jupyter > /dev/null 2>&1 || true;'
'sudo systemctl disable jupyter > /dev/null 2>&1 || true;'
'sudo systemctl stop jupyterhub > /dev/null 2>&1 || true;'
'sudo systemctl disable jupyterhub > /dev/null 2>&1 || true;',
run_env='host')
cmd_d_dep = 'sudo apt-get update; '\
'sudo apt-get -o DPkg::Options::="--force-confnew" install -y '\
'rsync curl wget patch openssh-server python3-pip fuse;'
Comment on lines +358 to +365
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

style: backslash

# Our mount script will install gcsfuse without fuse package.
# We need to install fuse package first to enable storage mount.
# The dpkg option is to suppress the prompt for fuse installation.

# Change the default port of sshd from 22 to DEFAULT_DOCKER_PORT.
# Append the host VM's authorized_keys to the container's authorized_keys.
Expand All @@ -319,38 +376,22 @@ def initialize(self) -> str:
# see https://www.educative.io/answers/error-mesg-ttyname-failed-inappropriate-ioctl-for-device # pylint: disable=line-too-long
port = constants.DEFAULT_DOCKER_PORT
# pylint: disable=anomalous-backslash-in-string
self._run(
f'sudo sed -i "s/#Port 22/Port {port}/" /etc/ssh/sshd_config;'
'mkdir -p ~/.ssh;'
'cat /tmp/host_ssh_authorized_keys >> ~/.ssh/authorized_keys;'
'sudo service ssh start;'
'sudo sed -i "s/mesg n/tty -s \&\& mesg n/" ~/.profile;'
f'{SETUP_ENV_VARS_CMD}',
run_env='docker')
cmd_d_port = f'sudo sed -i "s/#Port 22/Port {port}/" /etc/ssh/sshd_config;'\
'mkdir -p ~/.ssh;'\
'cat /tmp/host_ssh_authorized_keys >> ~/.ssh/authorized_keys;'\
'sudo service ssh start;'\
'sudo sed -i "s/mesg n/tty -s \&\& mesg n/" ~/.profile;'\
f'{SETUP_ENV_VARS_CMD};'
Comment on lines +379 to +384
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

style: backslash


cmd_d_action = f'{cmd_d_shell} {cmd_d_dep} {cmd_d_port}'
cmd_d_result = self._run(cmd_d_action, run_env='docker') # pylint: disable=unused-variable
# logger.debug(f"Command (docker) result: {cmd_d_result}")

# SkyPilot: End of Setup Commands.
docker_user = self._run('whoami', run_env='docker')
self.initialized = True
return docker_user

def _check_docker_installed(self):
no_exist = 'NoExist'
cleaned_output = self._run(
f'command -v {self.docker_cmd} || echo {no_exist!r}')
if no_exist in cleaned_output or 'docker' not in cleaned_output:
logger.error(
f'{self.docker_cmd.capitalize()} not installed. Please use an '
f'image with {self.docker_cmd.capitalize()} installed.')

def _check_container_status(self):
if self.initialized:
return True
output = (self._run(
check_docker_running_cmd(self.container_name, self.docker_cmd)))
# Checks for the false positive where 'true' is in the container name
return 'true' in output.lower(
) and 'no such object' not in output.lower()

def _docker_expand_user(self, string, any_char=False):
user_pos = string.find('~')
if user_pos > -1:
Expand All @@ -372,12 +413,15 @@ def _docker_expand_user(self, string, any_char=False):

return string

def _configure_runtime(self, run_options: List[str]) -> List[str]:
def _configure_runtime(self,
run_options: List[str],
runtime_output=None) -> List[str]:
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
runtime_output=None) -> List[str]:
runtime_output: Optional[str]=None) -> List[str]:

if self.docker_config.get('disable_automatic_runtime_detection'):
return run_options

runtime_output = (self._run(f'{self.docker_cmd} ' +
'info -f "{{.Runtimes}}"'))
if runtime_output is None:
runtime_output = (self._run(f'{self.docker_cmd} ' +
'info -f "{{.Runtimes}}"'))
if 'nvidia-container-runtime' in runtime_output:
try:
self._run('nvidia-smi', log_err_when_fail=False)
Expand All @@ -392,7 +436,9 @@ def _configure_runtime(self, run_options: List[str]) -> List[str]:

return run_options

def _auto_configure_shm(self, run_options: List[str]) -> List[str]:
def _auto_configure_shm(self,
run_options: List[str],
available_memory=None) -> List[str]:
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
available_memory=None) -> List[str]:
available_memory: Optional[int]=None) -> List[str]:

if self.docker_config.get('disable_shm_size_detection'):
return run_options
for run_opt in run_options:
Expand All @@ -401,10 +447,11 @@ def _auto_configure_shm(self, run_options: List[str]) -> List[str]:
f'`run_option`: {run_opt}')
return run_options
try:
shm_output = self._run('cat /proc/meminfo || true')
available_memory = int([
ln for ln in shm_output.split('\n') if 'MemAvailable' in ln
][0].split()[1])
if available_memory is None:
shm_output = self._run('cat /proc/meminfo || true')
available_memory = int([
ln for ln in shm_output.split('\n') if 'MemAvailable' in ln
][0].split()[1])
available_memory_bytes = available_memory * 1024
# Overestimate SHM size by 10%
shm_size = min(
Expand All @@ -417,15 +464,3 @@ def _auto_configure_shm(self, run_options: List[str]) -> List[str]:
logger.warning(
f'Received error while trying to auto-compute SHM size {e}')
return run_options

# SkyPilot: New function to check whether a container is exited
# (but not removed). This is due to previous `sky stop` command,
# which will stop the container but not remove it.
def _check_container_exited(self) -> bool:
if self.initialized:
return True
output = (self._run(check_docker_running_cmd(self.container_name,
self.docker_cmd),
wait_for_docker_daemon=True))
return 'false' in output.lower(
) and 'no such object' not in output.lower()
Loading