diff --git a/llm/axolotl/axolotl-docker.yaml b/llm/axolotl/axolotl-docker.yaml new file mode 100644 index 00000000000..b883ebdde46 --- /dev/null +++ b/llm/axolotl/axolotl-docker.yaml @@ -0,0 +1,29 @@ +# Usage: +# HF_TOKEN=abc sky launch -c axolotl axolotl-docker.yaml --env HF_TOKEN -y -i30 --down + +name: axolotl + +resources: + accelerators: L4:1 + cloud: gcp # optional + +workdir: mistral + +setup: | + docker pull winglian/axolotl:main-py3.10-cu118-2.0.1 + +run: | + docker run --gpus all \ + -v ~/sky_workdir:/sky_workdir \ + -v /root/.cache:/root/.cache \ + winglian/axolotl:main-py3.10-cu118-2.0.1 \ + huggingface-cli login --token ${HF_TOKEN} + + docker run --gpus all \ + -v ~/sky_workdir:/sky_workdir \ + -v /root/.cache:/root/.cache \ + winglian/axolotl:main-py3.10-cu118-2.0.1 \ + accelerate launch -m axolotl.cli.train /sky_workdir/qlora.yaml + +envs: + HF_TOKEN: # TODO: Fill with your own huggingface token, or use --env to pass. diff --git a/llm/axolotl/axolotl-spot.yaml b/llm/axolotl/axolotl-spot.yaml index 942f4ccc4ba..8970737483d 100644 --- a/llm/axolotl/axolotl-spot.yaml +++ b/llm/axolotl/axolotl-spot.yaml @@ -12,6 +12,7 @@ resources: accelerators: A100:1 cloud: gcp # optional use_spot: True + image_id: docker:winglian/axolotl:main-py3.10-cu118-2.0.1 workdir: mistral @@ -20,29 +21,12 @@ file_mounts: name: ${BUCKET} mode: MOUNT -setup: | - docker pull winglian/axolotl:main-py3.10-cu118-2.0.1 - run: | - docker run --gpus all \ - -v ~/sky_workdir:/sky_workdir \ - -v /root/.cache:/root/.cache \ - winglian/axolotl:main-py3.10-cu118-2.0.1 \ - huggingface-cli login --token ${HF_TOKEN} + huggingface-cli login --token ${HF_TOKEN} - docker run --gpus all \ - -v ~/sky_workdir:/sky_workdir \ - -v /root/.cache:/root/.cache \ - -v /sky-notebook:/sky-notebook \ - winglian/axolotl:main-py3.10-cu118-2.0.1 \ - accelerate launch -m axolotl.cli.train /sky_workdir/qlora-checkpoint.yaml + accelerate launch -m axolotl.cli.train qlora-checkpoint.yaml envs: HF_TOKEN: # TODO: Fill with 
your own huggingface token, or use --env to pass. BUCKET: # TODO: Fill with your unique bucket name, or use --env to pass. - - - - - diff --git a/llm/axolotl/axolotl.yaml b/llm/axolotl/axolotl.yaml index 9cec1d1f331..f46588e9aae 100644 --- a/llm/axolotl/axolotl.yaml +++ b/llm/axolotl/axolotl.yaml @@ -5,25 +5,14 @@ name: axolotl resources: accelerators: L4:1 - cloud: gcp # optional + image_id: docker:winglian/axolotl:main-py3.10-cu118-2.0.1 workdir: mistral -setup: | - docker pull winglian/axolotl:main-py3.10-cu118-2.0.1 - run: | - docker run --gpus all \ - -v ~/sky_workdir:/sky_workdir \ - -v /root/.cache:/root/.cache \ - winglian/axolotl:main-py3.10-cu118-2.0.1 \ - huggingface-cli login --token ${HF_TOKEN} - - docker run --gpus all \ - -v ~/sky_workdir:/sky_workdir \ - -v /root/.cache:/root/.cache \ - winglian/axolotl:main-py3.10-cu118-2.0.1 \ - accelerate launch -m axolotl.cli.train /sky_workdir/qlora.yaml + huggingface-cli login --token ${HF_TOKEN} + + accelerate launch -m axolotl.cli.train qlora.yaml envs: HF_TOKEN: # TODO: Fill with your own huggingface token, or use --env to pass. 
diff --git a/llm/axolotl/mistral/qlora-checkpoint.yaml b/llm/axolotl/mistral/qlora-checkpoint.yaml index 278a5d72b9a..1f1cc67446c 100644 --- a/llm/axolotl/mistral/qlora-checkpoint.yaml +++ b/llm/axolotl/mistral/qlora-checkpoint.yaml @@ -71,6 +71,7 @@ warmup_steps: 10 eval_steps: 0.05 eval_table_size: eval_table_max_new_tokens: 128 +eval_sample_packing: false save_steps: 2 ## increase based on your dataset save_strategy: steps debug: @@ -81,4 +82,4 @@ fsdp_config: special_tokens: bos_token: "" eos_token: "" - unk_token: "" \ No newline at end of file + unk_token: "" diff --git a/llm/axolotl/mistral/qlora.yaml b/llm/axolotl/mistral/qlora.yaml index 42c3742b52d..39b2c55b1ce 100644 --- a/llm/axolotl/mistral/qlora.yaml +++ b/llm/axolotl/mistral/qlora.yaml @@ -69,6 +69,7 @@ warmup_steps: 10 eval_steps: 0.05 eval_table_size: eval_table_max_new_tokens: 128 +eval_sample_packing: false save_steps: debug: deepspeed: @@ -78,4 +79,4 @@ fsdp_config: special_tokens: bos_token: "" eos_token: "" - unk_token: "" \ No newline at end of file + unk_token: "" diff --git a/sky/backends/backend_utils.py b/sky/backends/backend_utils.py index cf43cfdf2ed..b1598c7c039 100644 --- a/sky/backends/backend_utils.py +++ b/sky/backends/backend_utils.py @@ -925,7 +925,14 @@ def write_cluster_config( 'dump_port_command': dump_port_command, # Sky-internal constants. 'sky_ray_cmd': constants.SKY_RAY_CMD, - 'sky_pip_cmd': constants.SKY_PIP_CMD, + # pip install needs to have python env activated to make sure + # installed packages are within the env path. + 'sky_pip_cmd': f'{constants.SKY_PIP_CMD}', + # Activate the SkyPilot runtime environment when starting ray + # cluster, so that ray autoscaler can access cloud SDK and CLIs + # on remote + 'sky_activate_python_env': + constants.ACTIVATE_SKY_REMOTE_PYTHON_ENV, 'ray_version': constants.SKY_REMOTE_RAY_VERSION, # Command for waiting ray cluster to be ready on head. 
'ray_head_wait_initialized_command': diff --git a/sky/jobs/core.py b/sky/jobs/core.py index ff9953489d5..7f9e0d757ea 100644 --- a/sky/jobs/core.py +++ b/sky/jobs/core.py @@ -98,7 +98,6 @@ def launch( 'dag_name': dag.name, 'retry_until_up': retry_until_up, 'remote_user_config_path': remote_user_config_path, - 'sky_python_cmd': skylet_constants.SKY_PYTHON_CMD, 'modified_catalogs': service_catalog_common.get_modified_catalog_file_mounts(), **controller_utils.shared_controller_vars_to_fill( diff --git a/sky/provision/docker_utils.py b/sky/provision/docker_utils.py index 10ae5dafc07..b9ed689fdaf 100644 --- a/sky/provision/docker_utils.py +++ b/sky/provision/docker_utils.py @@ -15,6 +15,17 @@ DOCKER_PERMISSION_DENIED_STR = ('permission denied while trying to connect to ' 'the Docker daemon socket') +# Configure environment variables. A docker image can have environment variables +# set in the Dockerfile with `ENV`. We need to export these variables to the +# shell environment, so that our ssh session can access them. +SETUP_ENV_VARS_CMD = ( + 'prefix_cmd() ' + '{ if [ $(id -u) -ne 0 ]; then echo "sudo"; else echo ""; fi; } && ' + 'printenv | while IFS=\'=\' read -r key value; do echo "export $key=\\\"$value\\\""; done > ' # pylint: disable=line-too-long + '~/container_env_var.sh && ' + '$(prefix_cmd) mv ~/container_env_var.sh /etc/profile.d/container_env_var.sh' +) + @dataclasses.dataclass class DockerLoginConfig: @@ -244,6 +255,8 @@ def initialize(self) -> str: self._run(start_command) # SkyPilot: Setup Commands. + # TODO(zhwu): the following setups should be aligned with the kubernetes + # pod setup, like provision.kubernetes.instance::_set_env_vars_in_pods # TODO(tian): These setup commands assumed that the container is # debian-based. We should make it more general. 
# Most of docker images are using root as default user, so we set an @@ -296,7 +309,8 @@ def initialize(self) -> str: 'mkdir -p ~/.ssh;' 'cat /tmp/host_ssh_authorized_keys >> ~/.ssh/authorized_keys;' 'sudo service ssh start;' - 'sudo sed -i "s/mesg n/tty -s \&\& mesg n/" ~/.profile;', + 'sudo sed -i "s/mesg n/tty -s \&\& mesg n/" ~/.profile;' + f'{SETUP_ENV_VARS_CMD}', run_env='docker') # SkyPilot: End of Setup Commands. diff --git a/sky/provision/instance_setup.py b/sky/provision/instance_setup.py index 2e07f026616..c81ecd78db4 100644 --- a/sky/provision/instance_setup.py +++ b/sky/provision/instance_setup.py @@ -61,7 +61,10 @@ 'done;') # Restart skylet when the version does not match to keep the skylet up-to-date. -MAYBE_SKYLET_RESTART_CMD = (f'{constants.SKY_PYTHON_CMD} -m ' +# We need to activate the python environment to make sure autostop in skylet +# can find the cloud SDK/CLI in PATH. +MAYBE_SKYLET_RESTART_CMD = (f'{constants.ACTIVATE_SKY_REMOTE_PYTHON_ENV}; ' + f'{constants.SKY_PYTHON_CMD} -m ' 'sky.skylet.attempt_skylet;') diff --git a/sky/provision/kubernetes/instance.py b/sky/provision/kubernetes/instance.py index 9068079701f..4f88293525f 100644 --- a/sky/provision/kubernetes/instance.py +++ b/sky/provision/kubernetes/instance.py @@ -10,6 +10,7 @@ from sky import status_lib from sky.adaptors import kubernetes from sky.provision import common +from sky.provision import docker_utils from sky.provision.kubernetes import config as config_lib from sky.provision.kubernetes import utils as kubernetes_utils from sky.utils import common_utils @@ -241,7 +242,7 @@ def _wait_for_pods_to_run(namespace, new_nodes): 'the node. Error details: ' f'{container_status.state.waiting.message}.') # Reaching this point means that one of the pods had an issue, - # so break out of the loop + # so break out of the loop, and wait until next second. 
break if all_pods_running: @@ -301,13 +302,7 @@ def _set_env_vars_in_pods(namespace: str, new_pods: List): set_k8s_env_var_cmd = [ '/bin/sh', '-c', - ( - 'prefix_cmd() ' - '{ if [ $(id -u) -ne 0 ]; then echo "sudo"; else echo ""; fi; } && ' - 'printenv | while IFS=\'=\' read -r key value; do echo "export $key=\\\"$value\\\""; done > ' # pylint: disable=line-too-long - '~/k8s_env_var.sh && ' - 'mv ~/k8s_env_var.sh /etc/profile.d/k8s_env_var.sh || ' - '$(prefix_cmd) mv ~/k8s_env_var.sh /etc/profile.d/k8s_env_var.sh') + docker_utils.SETUP_ENV_VARS_CMD, ] for new_pod in new_pods: @@ -540,6 +535,8 @@ def _create_pods(region: str, cluster_name_on_cloud: str, _wait_for_pods_to_schedule(namespace, wait_pods, provision_timeout) # Wait until the pods and their containers are up and running, and # fail early if there is an error + logger.debug(f'run_instances: waiting for pods to be running (pulling ' + f'images): {list(wait_pods_dict.keys())}') _wait_for_pods_to_run(namespace, wait_pods) logger.debug(f'run_instances: all pods are scheduled and running: ' f'{list(wait_pods_dict.keys())}') diff --git a/sky/skylet/attempt_skylet.py b/sky/skylet/attempt_skylet.py index 609cfa09141..54df4986080 100644 --- a/sky/skylet/attempt_skylet.py +++ b/sky/skylet/attempt_skylet.py @@ -21,6 +21,9 @@ def restart_skylet(): shell=True, check=False) subprocess.run( + # We have made sure that `attempt_skylet.py` is executed with the + # skypilot runtime env activated, so that skylet can access the cloud + # CLI tools. 
f'nohup {constants.SKY_PYTHON_CMD} -m sky.skylet.skylet' ' >> ~/.sky/skylet.log 2>&1 &', shell=True, diff --git a/sky/skylet/constants.py b/sky/skylet/constants.py index 0f2d7540007..0c68fd7f6e6 100644 --- a/sky/skylet/constants.py +++ b/sky/skylet/constants.py @@ -37,8 +37,18 @@ SKY_PYTHON_CMD = f'$({SKY_GET_PYTHON_PATH_CMD})' SKY_PIP_CMD = f'{SKY_PYTHON_CMD} -m pip' # Ray executable, e.g., /opt/conda/bin/ray -SKY_RAY_CMD = (f'$([ -s {SKY_RAY_PATH_FILE} ] && ' +# We need to add SKY_PYTHON_CMD before ray executable because: +# The ray executable is a python script with a header like: +# #!/opt/conda/bin/python3 +# When we create the skypilot-runtime venv, the previously installed ray +# executable will be reused (due to --system-site-packages), and that will cause +# running ray CLI commands to use the wrong python executable. +SKY_RAY_CMD = (f'{SKY_PYTHON_CMD} $([ -s {SKY_RAY_PATH_FILE} ] && ' f'cat {SKY_RAY_PATH_FILE} 2> /dev/null || which ray)') +# Separate env for SkyPilot runtime dependencies. +SKY_REMOTE_PYTHON_ENV_NAME = 'skypilot-runtime' +SKY_REMOTE_PYTHON_ENV = f'~/{SKY_REMOTE_PYTHON_ENV_NAME}' +ACTIVATE_SKY_REMOTE_PYTHON_ENV = f'source {SKY_REMOTE_PYTHON_ENV}/bin/activate' # The name for the environment variable that stores the unique ID of the # current task. This will stay the same across multiple recoveries of the @@ -91,20 +101,27 @@ # AWS's Deep Learning AMI's default conda environment. 
CONDA_INSTALLATION_COMMANDS = ( 'which conda > /dev/null 2>&1 || ' - '{ wget -nc https://repo.anaconda.com/miniconda/Miniconda3-py310_23.11.0-2-Linux-x86_64.sh -O Miniconda3-Linux-x86_64.sh && ' # pylint: disable=line-too-long + '{ curl https://repo.anaconda.com/miniconda/Miniconda3-py310_23.11.0-2-Linux-x86_64.sh -o Miniconda3-Linux-x86_64.sh && ' # pylint: disable=line-too-long 'bash Miniconda3-Linux-x86_64.sh -b && ' 'eval "$(~/miniconda3/bin/conda shell.bash hook)" && conda init && ' 'conda config --set auto_activate_base true && ' - # Use $(echo ~) instead of ~ to avoid the error "no such file or directory". - # Also, not using $HOME to avoid the error HOME variable not set. - f'echo "$(echo ~)/miniconda3/bin/python" > {SKY_PYTHON_PATH_FILE}; }}; ' + f'conda activate base; }}; ' 'grep "# >>> conda initialize >>>" ~/.bashrc || ' '{ conda init && source ~/.bashrc; };' - '(type -a python | grep -q python3) || ' - 'echo \'alias python=python3\' >> ~/.bashrc;' - '(type -a pip | grep -q pip3) || echo \'alias pip=pip3\' >> ~/.bashrc;' - # Writes Python path to file if it does not exist or the file is empty. - f'[ -s {SKY_PYTHON_PATH_FILE} ] || which python3 > {SKY_PYTHON_PATH_FILE};') + # If Python version is greater than or equal to 3.12, create a new conda env + # with Python 3.10. + # We don't use a separate conda env for SkyPilot dependencies because it is + # costly to create a new conda env, and venv should be a lightweight and + # faster alternative when the python version satisfies the requirement. + '[[ $(python3 --version | cut -d " " -f 2 | cut -d "." -f 2) -ge 12 ]] && ' + f'echo "Creating conda env with Python 3.10" && ' + f'conda create -y -n {SKY_REMOTE_PYTHON_ENV_NAME} python=3.10 && ' + f'conda activate {SKY_REMOTE_PYTHON_ENV_NAME};' + # Create a separate venv (virtual environment) for SkyPilot dependencies. 
+ f'[ -d {SKY_REMOTE_PYTHON_ENV} ] || ' + f'{{ {SKY_PYTHON_CMD} -m venv {SKY_REMOTE_PYTHON_ENV} --system-site-packages && ' + f'echo "$(echo {SKY_REMOTE_PYTHON_ENV})/bin/python" > {SKY_PYTHON_PATH_FILE}; }};' +) _sky_version = str(version.parse(sky.__version__)) RAY_STATUS = f'RAY_ADDRESS=127.0.0.1:{SKY_REMOTE_RAY_PORT} {SKY_RAY_CMD} status' @@ -142,7 +159,9 @@ # mentioned above are resolved. 'export PATH=$PATH:$HOME/.local/bin; ' # Writes ray path to file if it does not exist or the file is empty. - f'[ -s {SKY_RAY_PATH_FILE} ] || which ray > {SKY_RAY_PATH_FILE}; ' + f'[ -s {SKY_RAY_PATH_FILE} ] || ' + f'{{ {ACTIVATE_SKY_REMOTE_PYTHON_ENV} && ' + f'which ray > {SKY_RAY_PATH_FILE} || exit 1; }}; ' # END ray package check and installation f'{{ {SKY_PIP_CMD} list | grep "skypilot " && ' '[ "$(cat ~/.sky/wheels/current_sky_wheel_hash)" == "{sky_wheel_hash}" ]; } || ' # pylint: disable=line-too-long diff --git a/sky/skylet/events.py b/sky/skylet/events.py index c63b42cc438..b6e99707dab 100644 --- a/sky/skylet/events.py +++ b/sky/skylet/events.py @@ -3,7 +3,6 @@ import os import re import subprocess -import sys import time import traceback @@ -193,7 +192,10 @@ def _stop_cluster(self, autostop_config): # Passing env inherited from os.environ is technically not # needed, because we call `python