From 4dbe9a1ba6e24b892cad3b26891dcbd84c965786 Mon Sep 17 00:00:00 2001 From: littleround <23360163+camelop@users.noreply.github.com> Date: Fri, 9 Aug 2024 14:46:51 -0400 Subject: [PATCH 1/3] merge runs (reduce happy path round num from 14 to 5) --- sky/provision/docker_utils.py | 289 +++++++++++++++++++--------------- 1 file changed, 159 insertions(+), 130 deletions(-) diff --git a/sky/provision/docker_utils.py b/sky/provision/docker_utils.py index aa29a3666a3..0693585ede0 100644 --- a/sky/provision/docker_utils.py +++ b/sky/provision/docker_utils.py @@ -55,25 +55,6 @@ def from_env_vars(cls, d: Dict[str, str]) -> 'DockerLoginConfig': DEFAULT_OBJECT_STORE_MEMORY_PROPORTION = 0.3 -def _check_helper(cname, template, docker_cmd): - return ' '.join([ - docker_cmd, 'inspect', '-f', '"{{' + template + '}}"', cname, '||', - 'true' - ]) - - -def check_docker_running_cmd(cname, docker_cmd): - return _check_helper(cname, '.State.Running', docker_cmd) - - -def check_bind_mounts_cmd(cname, docker_cmd): - return _check_helper(cname, 'json .Mounts', docker_cmd) - - -def check_docker_image(cname, docker_cmd): - return _check_helper(cname, '.Config.Image', docker_cmd) - - def docker_start_cmds( image, container_name, @@ -192,32 +173,103 @@ def _run(self, stream_logs=log_err_when_fail) return stdout.strip() + def _check_host_status(self) -> dict: + result = self._run(f""" + # passing initializer state + _self_initialized={"1" if self.initialized else "0"} + _docker_socket_wait_timeout_seconds={_DOCKER_SOCKET_WAIT_TIMEOUT_SECONDS} + + # get mem + cat /proc/meminfo | grep MemAvailable || true + + # check if docker is installed + docker_command_exist=$(command -v {self.docker_cmd}) + if [[ -n $docker_command_exist && $docker_command_exist == *"docker"* ]] + then + docker_installed=1 + echo "docker_installed:" "Y" + else + docker_installed=0 + echo "docker_installed:" "N" + fi + + if [[ $docker_installed -eq 1 ]] + then + # wait for docker daemon to be ready + docker_ready=0 + end_time=$((SECONDS + _docker_socket_wait_timeout_seconds)) + while [ $SECONDS -lt $end_time ]; do + if {self.docker_cmd} info >/dev/null 2>&1; then + echo "docker_ready:" "Y" + docker_ready=1 + break + else + exec su -l $USER + sleep 5 + fi + done + if [[ $docker_ready -eq 0 ]] + then + echo "docker_ready:" "N" + fi + + # check runtime info + echo "docker_runtime:" $({self.docker_cmd} info -f "{{{{.Runtimes}}}}") + + if [[ $_self_initialized -eq 0 ]] + then + # check container status + container_status=$({self.docker_cmd} ps -a --filter name="{self.container_name}" --format "{{{{.Status}}}}") + echo "container_status:" $container_status + # if container is up, check its image + if [[ $container_status == *"Up"* ]] + then + container_image=$({self.docker_cmd} inspect --format='{{{{.Config.Image}}}}' {self.container_name}) + echo "container_image:" $container_image + fi + fi + fi + echo "status_checking_completed:" "Y" + """) + ret = {} + + for l in result.split("\n"): + if ":" in l: + x = l.split(":") + k = x[0].strip() + v = ":".join(x[1:]).strip() + if v in ["Y", "N"]: + v = True if v == "Y" else False + if k == "MemAvailable": + k = "mem_available_in_kb" + v = int(v.split()[0]) + ret[k] = v + return ret + def initialize(self) -> str: specific_image = self.docker_config['image'] - self._check_docker_installed() + status = self._check_host_status() + logger.info(f"Host status: {str(status)}") - # SkyPilot: Check if the container is exited but not removed. - # If true, then we can start the container directly. - # Notice that we will skip all setup commands, so we need to - # manually start the ssh service. - if self._check_container_exited(): - self.initialized = True - self._run(f'{self.docker_cmd} start {self.container_name}') - self._run('sudo service ssh start', run_env='docker') - return self._run('whoami', run_env='docker') + if "container_status" in status: + if "Exited" in status["container_status"]: + logger.info("Container is exited but not removed.") + self.initialized = True + self._run(f'{self.docker_cmd} start {self.container_name}') + self._run('sudo service ssh start', run_env='docker') + return self._run('whoami', run_env='docker') # SkyPilot: Docker login if user specified a private docker registry. + cmd_login = "" if 'docker_login_config' in self.docker_config: # TODO(tian): Maybe support a command to get the login password? docker_login_config = DockerLoginConfig( **self.docker_config['docker_login_config']) - self._run( - f'{self.docker_cmd} login --username ' - f'{docker_login_config.username} ' - f'--password {docker_login_config.password} ' - f'{docker_login_config.server}', - wait_for_docker_daemon=True) + cmd_login = f'{self.docker_cmd} login --username '\ + f'{docker_login_config.username} '\ + f'--password {docker_login_config.password} '\ + f'{docker_login_config.server};' # We automatically add the server prefix to the image name if # the user did not add it. server_prefix = f'{docker_login_config.server}/' @@ -227,21 +279,19 @@ def initialize(self) -> str: if self.docker_config.get('pull_before_run', True): assert specific_image, ('Image must be included in config if ' + 'pull_before_run is specified') - self._run(f'{self.docker_cmd} pull {specific_image}', - wait_for_docker_daemon=True) + cmd_pull = f'{self.docker_cmd} pull {specific_image};' else: - self._run( - f'{self.docker_cmd} image inspect {specific_image} ' - '1> /dev/null 2>&1 || ' - f'{self.docker_cmd} pull {specific_image}', - wait_for_docker_daemon=True) + cmd_pull = f'{self.docker_cmd} image inspect {specific_image} '\ + '1> /dev/null 2>&1 || '\ + f'{self.docker_cmd} pull {specific_image};' logger.info(f'Starting container {self.container_name} with image ' f'{specific_image}') - container_running = self._check_container_status() + + cmd_run = "" + container_running = 'Up' in status.get('container_status', False) if container_running: - running_image = self._run( - check_docker_image(self.container_name, self.docker_cmd)) + running_image = status.get('container_image', None) if running_image != specific_image: logger.error( f'A container with name {self.container_name} is running ' @@ -252,22 +302,42 @@ def initialize(self) -> str: # from GPUs when a systemctl command is called. This is a known # issue with nvidia container toolkit: # https://github.com/NVIDIA/nvidia-container-toolkit/issues/48 - self._run( - '[ -f /etc/docker/daemon.json ] || ' - 'echo "{}" | sudo tee /etc/docker/daemon.json;' - 'sudo jq \'.["exec-opts"] = ["native.cgroupdriver=cgroupfs"]\' ' - '/etc/docker/daemon.json > /tmp/daemon.json;' - 'sudo mv /tmp/daemon.json /etc/docker/daemon.json;' - 'sudo systemctl restart docker') + cmd_run ='[ -f /etc/docker/daemon.json ] || '\ + 'echo "{}" | sudo tee /etc/docker/daemon.json;'\ + 'sudo jq \'.["exec-opts"] = ["native.cgroupdriver=cgroupfs"]\' '\ + '/etc/docker/daemon.json > /tmp/daemon.json;'\ + 'sudo mv /tmp/daemon.json /etc/docker/daemon.json;'\ + 'sudo systemctl restart docker;' user_docker_run_options = self.docker_config.get('run_options', []) start_command = docker_start_cmds( specific_image, self.container_name, - self._configure_runtime( - self._auto_configure_shm(user_docker_run_options)), + self._configure_runtime(self._auto_configure_shm( + user_docker_run_options, + available_memory=status.get("mem_available_in_kb", None)), + runtime_output=status.get( + "docker_runtime", None)), self.docker_cmd, ) - self._run(start_command) + cmd_run += "\n" + start_command + ";" + + # Copy local authorized_keys to docker container. + # Stop and disable jupyter service. This is to avoid port conflict on + # 8080 if we use default deep learning image in GCP, and 8888 if we use + # default deep learning image in Azure. + # Azure also has a jupyterhub service running on 8081, so we stop and + # disable that too. + container_name = constants.DEFAULT_DOCKER_CONTAINER_NAME + cmd_copy = f'{self.docker_cmd} cp ~/.ssh/authorized_keys '\ + f'{container_name}:/tmp/host_ssh_authorized_keys;'\ + 'sudo systemctl stop jupyter > /dev/null 2>&1 || true;'\ + 'sudo systemctl disable jupyter > /dev/null 2>&1 || true;'\ + 'sudo systemctl stop jupyterhub > /dev/null 2>&1 || true;'\ + 'sudo systemctl disable jupyterhub > /dev/null 2>&1 || true;' + + cmd_action = f"{cmd_login} {cmd_pull} {cmd_run} {cmd_copy}" + cmd_result = self._run(cmd_action) + # logger.debug(f"Command (host) result: {cmd_result}") # SkyPilot: Setup Commands. # TODO(zhwu): the following setups should be aligned with the kubernetes @@ -279,36 +349,17 @@ def initialize(self) -> str: # commands won't fail. # Disable apt-get from asking user input during installation. # see https://askubuntu.com/questions/909277/avoiding-user-interaction-with-tzdata-when-installing-certbot-in-a-docker-contai # pylint: disable=line-too-long - self._run( - f'echo \'{command_runner.ALIAS_SUDO_TO_EMPTY_FOR_ROOT_CMD}\' ' - '>> ~/.bashrc;' - 'echo "export DEBIAN_FRONTEND=noninteractive" >> ~/.bashrc;', - run_env='docker') + cmd_d_shell = f'echo \'{command_runner.ALIAS_SUDO_TO_EMPTY_FOR_ROOT_CMD}\' '\ + '>> ~/.bashrc;'\ + 'echo "export DEBIAN_FRONTEND=noninteractive" >> ~/.bashrc;'\ + 'source ~/.bashrc;' # Install dependencies. - self._run( - 'sudo apt-get update; ' - # Our mount script will install gcsfuse without fuse package. - # We need to install fuse package first to enable storage mount. - # The dpkg option is to suppress the prompt for fuse installation. - 'sudo apt-get -o DPkg::Options::="--force-confnew" install -y ' - 'rsync curl wget patch openssh-server python3-pip fuse;', - run_env='docker') - - # Copy local authorized_keys to docker container. - # Stop and disable jupyter service. This is to avoid port conflict on - # 8080 if we use default deep learning image in GCP, and 8888 if we use - # default deep learning image in Azure. - # Azure also has a jupyterhub service running on 8081, so we stop and - # disable that too. - container_name = constants.DEFAULT_DOCKER_CONTAINER_NAME - self._run( - f'rsync -e "{self.docker_cmd} exec -i" -avz ~/.ssh/authorized_keys ' - f'{container_name}:/tmp/host_ssh_authorized_keys;' - 'sudo systemctl stop jupyter > /dev/null 2>&1 || true;' - 'sudo systemctl disable jupyter > /dev/null 2>&1 || true;' - 'sudo systemctl stop jupyterhub > /dev/null 2>&1 || true;' - 'sudo systemctl disable jupyterhub > /dev/null 2>&1 || true;', - run_env='host') + cmd_d_dep = f'sudo apt-get update; '\ + 'sudo apt-get -o DPkg::Options::="--force-confnew" install -y '\ + 'rsync curl wget patch openssh-server python3-pip fuse;' + # Our mount script will install gcsfuse without fuse package. + # We need to install fuse package first to enable storage mount. + # The dpkg option is to suppress the prompt for fuse installation. # Change the default port of sshd from 22 to DEFAULT_DOCKER_PORT. # Append the host VM's authorized_keys to the container's authorized_keys. @@ -319,38 +370,22 @@ def initialize(self) -> str: # see https://www.educative.io/answers/error-mesg-ttyname-failed-inappropriate-ioctl-for-device # pylint: disable=line-too-long port = constants.DEFAULT_DOCKER_PORT # pylint: disable=anomalous-backslash-in-string - self._run( - f'sudo sed -i "s/#Port 22/Port {port}/" /etc/ssh/sshd_config;' - 'mkdir -p ~/.ssh;' - 'cat /tmp/host_ssh_authorized_keys >> ~/.ssh/authorized_keys;' - 'sudo service ssh start;' - 'sudo sed -i "s/mesg n/tty -s \&\& mesg n/" ~/.profile;' - f'{SETUP_ENV_VARS_CMD}', - run_env='docker') + cmd_d_port = f'sudo sed -i "s/#Port 22/Port {port}/" /etc/ssh/sshd_config;'\ + 'mkdir -p ~/.ssh;'\ + 'cat /tmp/host_ssh_authorized_keys >> ~/.ssh/authorized_keys;'\ + 'sudo service ssh start;'\ + 'sudo sed -i "s/mesg n/tty -s \&\& mesg n/" ~/.profile;'\ + f'{SETUP_ENV_VARS_CMD};' + + cmd_d_action = f"{cmd_d_shell} {cmd_d_dep} {cmd_d_port}" + cmd_d_result = self._run(cmd_d_action, run_env='docker') + # logger.debug(f"Command (docker) result: {cmd_d_result}") # SkyPilot: End of Setup Commands. docker_user = self._run('whoami', run_env='docker') self.initialized = True return docker_user - def _check_docker_installed(self): - no_exist = 'NoExist' - cleaned_output = self._run( - f'command -v {self.docker_cmd} || echo {no_exist!r}') - if no_exist in cleaned_output or 'docker' not in cleaned_output: - logger.error( - f'{self.docker_cmd.capitalize()} not installed. Please use an ' - f'image with {self.docker_cmd.capitalize()} installed.') - - def _check_container_status(self): - if self.initialized: - return True - output = (self._run( - check_docker_running_cmd(self.container_name, self.docker_cmd))) - # Checks for the false positive where 'true' is in the container name - return 'true' in output.lower( - ) and 'no such object' not in output.lower() - def _docker_expand_user(self, string, any_char=False): user_pos = string.find('~') if user_pos > -1: @@ -372,12 +407,15 @@ def _docker_expand_user(self, string, any_char=False): return string - def _configure_runtime(self, run_options: List[str]) -> List[str]: + def _configure_runtime(self, + run_options: List[str], + runtime_output=None) -> List[str]: if self.docker_config.get('disable_automatic_runtime_detection'): return run_options - runtime_output = (self._run(f'{self.docker_cmd} ' + - 'info -f "{{.Runtimes}}"')) + if runtime_output is None: + runtime_output = (self._run(f'{self.docker_cmd} ' + + 'info -f "{{.Runtimes}}"')) if 'nvidia-container-runtime' in runtime_output: try: self._run('nvidia-smi', log_err_when_fail=False) @@ -392,7 +430,9 @@ def _configure_runtime(self, run_options: List[str]) -> List[str]: return run_options - def _auto_configure_shm(self, run_options: List[str]) -> List[str]: + def _auto_configure_shm(self, + run_options: List[str], + available_memory=None) -> List[str]: if self.docker_config.get('disable_shm_size_detection'): return run_options for run_opt in run_options: @@ -401,10 +441,11 @@ def _auto_configure_shm(self, run_options: List[str]) -> List[str]: f'`run_option`: {run_opt}') return run_options try: - shm_output = self._run('cat /proc/meminfo || true') - available_memory = int([ - ln for ln in shm_output.split('\n') if 'MemAvailable' in ln - ][0].split()[1]) + if available_memory is None: + shm_output = self._run('cat /proc/meminfo || true') + available_memory = int([ + ln for ln in shm_output.split('\n') if 'MemAvailable' in ln + ][0].split()[1]) available_memory_bytes = available_memory * 1024 # Overestimate SHM size by 10% shm_size = min( @@ -417,15 +458,3 @@ def _auto_configure_shm(self, run_options: List[str]) -> List[str]: logger.warning( f'Received error while trying to auto-compute SHM size {e}') return run_options - - # SkyPilot: New function to check whether a container is exited - # (but not removed). This is due to previous `sky stop` command, - # which will stop the container but not remove it. - def _check_container_exited(self) -> bool: - if self.initialized: - return True - output = (self._run(check_docker_running_cmd(self.container_name, - self.docker_cmd), - wait_for_docker_daemon=True)) - return 'false' in output.lower( - ) and 'no such object' not in output.lower() From 416adbd3f689eaf8b6b79a7a84a2d1e28c58c35f Mon Sep 17 00:00:00 2001 From: littleround <23360163+camelop@users.noreply.github.com> Date: Fri, 9 Aug 2024 16:17:04 -0400 Subject: [PATCH 2/3] formatter --- sky/provision/docker_utils.py | 49 ++++++++++++++++++----------------- 1 file changed, 25 insertions(+), 24 deletions(-) diff --git a/sky/provision/docker_utils.py b/sky/provision/docker_utils.py index 0693585ede0..2d3bb1f1871 100644 --- a/sky/provision/docker_utils.py +++ b/sky/provision/docker_utils.py @@ -233,16 +233,16 @@ def _check_host_status(self) -> dict: """) ret = {} - for l in result.split("\n"): - if ":" in l: - x = l.split(":") + for l in result.split('\n'): + if ':' in l: + x = l.split(':') k = x[0].strip() - v = ":".join(x[1:]).strip() - if v in ["Y", "N"]: - v = True if v == "Y" else False - if k == "MemAvailable": - k = "mem_available_in_kb" - v = int(v.split()[0]) + v = ':'.join(x[1:]).strip() + if v in ['Y', 'N']: + v = True if v == 'Y' else False # type: ignore + if k == 'MemAvailable': + k = 'mem_available_in_kb' + v = int(v.split()[0]) # type: ignore ret[k] = v return ret @@ -250,18 +250,18 @@ def initialize(self) -> str: specific_image = self.docker_config['image'] status = self._check_host_status() - logger.info(f"Host status: {str(status)}") + logger.info(f'Host status: {str(status)}') - if "container_status" in status: - if "Exited" in status["container_status"]: - logger.info("Container is exited but not removed.") + if 'container_status' in status: + if 'Exited' in status['container_status']: + logger.info('Container is exited but not removed.') self.initialized = True self._run(f'{self.docker_cmd} start {self.container_name}') self._run('sudo service ssh start', run_env='docker') return self._run('whoami', run_env='docker') # SkyPilot: Docker login if user specified a private docker registry. - cmd_login = "" + cmd_login = '' if 'docker_login_config' in self.docker_config: # TODO(tian): Maybe support a command to get the login password? docker_login_config = DockerLoginConfig( @@ -288,7 +288,7 @@ def initialize(self) -> str: logger.info(f'Starting container {self.container_name} with image ' f'{specific_image}') - cmd_run = "" + cmd_run = '' container_running = 'Up' in status.get('container_status', False) if container_running: running_image = status.get('container_image', None) @@ -304,7 +304,8 @@ def initialize(self) -> str: # https://github.com/NVIDIA/nvidia-container-toolkit/issues/48 cmd_run ='[ -f /etc/docker/daemon.json ] || '\ 'echo "{}" | sudo tee /etc/docker/daemon.json;'\ - 'sudo jq \'.["exec-opts"] = ["native.cgroupdriver=cgroupfs"]\' '\ + 'sudo jq \'.["exec-opts"] ='\ + '["native.cgroupdriver=cgroupfs"]\' '\ '/etc/docker/daemon.json > /tmp/daemon.json;'\ 'sudo mv /tmp/daemon.json /etc/docker/daemon.json;'\ 'sudo systemctl restart docker;' @@ -314,12 +315,12 @@ def initialize(self) -> str: self.container_name, self._configure_runtime(self._auto_configure_shm( user_docker_run_options, - available_memory=status.get("mem_available_in_kb", None)), + available_memory=status.get('mem_available_in_kb', None)), runtime_output=status.get( - "docker_runtime", None)), + 'docker_runtime', None)), self.docker_cmd, ) - cmd_run += "\n" + start_command + ";" + cmd_run += '\n' + start_command + ';' # Copy local authorized_keys to docker container. # Stop and disable jupyter service. This is to avoid port conflict on @@ -335,8 +336,8 @@ def initialize(self) -> str: 'sudo systemctl stop jupyterhub > /dev/null 2>&1 || true;'\ 'sudo systemctl disable jupyterhub > /dev/null 2>&1 || true;' - cmd_action = f"{cmd_login} {cmd_pull} {cmd_run} {cmd_copy}" - cmd_result = self._run(cmd_action) + cmd_action = f'{cmd_login} {cmd_pull} {cmd_run} {cmd_copy}' + cmd_result = self._run(cmd_action) # pylint: disable=unused-variable # logger.debug(f"Command (host) result: {cmd_result}") # SkyPilot: Setup Commands. @@ -354,7 +355,7 @@ def initialize(self) -> str: 'echo "export DEBIAN_FRONTEND=noninteractive" >> ~/.bashrc;'\ 'source ~/.bashrc;' # Install dependencies. - cmd_d_dep = f'sudo apt-get update; '\ + cmd_d_dep = 'sudo apt-get update; '\ 'sudo apt-get -o DPkg::Options::="--force-confnew" install -y '\ 'rsync curl wget patch openssh-server python3-pip fuse;' # Our mount script will install gcsfuse without fuse package. @@ -377,8 +378,8 @@ def initialize(self) -> str: 'sudo sed -i "s/mesg n/tty -s \&\& mesg n/" ~/.profile;'\ f'{SETUP_ENV_VARS_CMD};' - cmd_d_action = f"{cmd_d_shell} {cmd_d_dep} {cmd_d_port}" - cmd_d_result = self._run(cmd_d_action, run_env='docker') + cmd_d_action = f'{cmd_d_shell} {cmd_d_dep} {cmd_d_port}' + cmd_d_result = self._run(cmd_d_action, run_env='docker') # pylint: disable=unused-variable # logger.debug(f"Command (docker) result: {cmd_d_result}") # SkyPilot: End of Setup Commands. From 1cadf8441950eb03f1a58dd66d8538257be8af84 Mon Sep 17 00:00:00 2001 From: sixbigsquare <129671892+sixbigsquare@users.noreply.github.com> Date: Sun, 8 Sep 2024 06:21:18 -0700 Subject: [PATCH 3/3] robust status key parsing; "docker_ready" rename --- sky/provision/docker_utils.py | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/sky/provision/docker_utils.py b/sky/provision/docker_utils.py index 2d3bb1f1871..cddc06134cb 100644 --- a/sky/provision/docker_utils.py +++ b/sky/provision/docker_utils.py @@ -196,21 +196,21 @@ def _check_host_status(self) -> dict: if [[ $docker_installed -eq 1 ]] then # wait for docker daemon to be ready - docker_ready=0 + docker_socket_ready=0 end_time=$((SECONDS + _docker_socket_wait_timeout_seconds)) while [ $SECONDS -lt $end_time ]; do if {self.docker_cmd} info >/dev/null 2>&1; then - echo "docker_ready:" "Y" - docker_ready=1 + echo "docker_socket_ready:" "Y" + docker_socket_ready=1 break else exec su -l $USER sleep 5 fi done - if [[ $docker_ready -eq 0 ]] + if [[ $docker_socket_ready -eq 0 ]] then - echo "docker_ready:" "N" + echo "docker_socket_ready:" "N" fi # check runtime info @@ -243,7 +243,12 @@ def _check_host_status(self) -> dict: if k == 'MemAvailable': k = 'mem_available_in_kb' v = int(v.split()[0]) # type: ignore - ret[k] = v + if k in [ + 'docker_installed', 'docker_socket_ready', + 'docker_runtime', 'container_status', 'container_image', + 'mem_available_in_kb', 'status_checking_completed' + ]: # robust parsing + ret[k] = v return ret def initialize(self) -> str: