diff --git a/llm/axolotl/axolotl-docker.yaml b/llm/axolotl/axolotl-docker.yaml
new file mode 100644
index 00000000000..b883ebdde46
--- /dev/null
+++ b/llm/axolotl/axolotl-docker.yaml
@@ -0,0 +1,29 @@
+# Usage:
+# HF_TOKEN=abc sky launch -c axolotl axolotl-docker.yaml --env HF_TOKEN -y -i30 --down
+
+name: axolotl
+
+resources:
+ accelerators: L4:1
+ cloud: gcp # optional
+
+workdir: mistral
+
+setup: |
+ docker pull winglian/axolotl:main-py3.10-cu118-2.0.1
+
+run: |
+ docker run --gpus all \
+ -v ~/sky_workdir:/sky_workdir \
+ -v /root/.cache:/root/.cache \
+ winglian/axolotl:main-py3.10-cu118-2.0.1 \
+ huggingface-cli login --token ${HF_TOKEN}
+
+ docker run --gpus all \
+ -v ~/sky_workdir:/sky_workdir \
+ -v /root/.cache:/root/.cache \
+ winglian/axolotl:main-py3.10-cu118-2.0.1 \
+ accelerate launch -m axolotl.cli.train /sky_workdir/qlora.yaml
+
+envs:
+ HF_TOKEN: # TODO: Fill with your own huggingface token, or use --env to pass.
diff --git a/llm/axolotl/axolotl-spot.yaml b/llm/axolotl/axolotl-spot.yaml
index 942f4ccc4ba..8970737483d 100644
--- a/llm/axolotl/axolotl-spot.yaml
+++ b/llm/axolotl/axolotl-spot.yaml
@@ -12,6 +12,7 @@ resources:
accelerators: A100:1
cloud: gcp # optional
use_spot: True
+ image_id: docker:winglian/axolotl:main-py3.10-cu118-2.0.1
workdir: mistral
@@ -20,29 +21,12 @@ file_mounts:
name: ${BUCKET}
mode: MOUNT
-setup: |
- docker pull winglian/axolotl:main-py3.10-cu118-2.0.1
-
run: |
- docker run --gpus all \
- -v ~/sky_workdir:/sky_workdir \
- -v /root/.cache:/root/.cache \
- winglian/axolotl:main-py3.10-cu118-2.0.1 \
- huggingface-cli login --token ${HF_TOKEN}
+ huggingface-cli login --token ${HF_TOKEN}
- docker run --gpus all \
- -v ~/sky_workdir:/sky_workdir \
- -v /root/.cache:/root/.cache \
- -v /sky-notebook:/sky-notebook \
- winglian/axolotl:main-py3.10-cu118-2.0.1 \
- accelerate launch -m axolotl.cli.train /sky_workdir/qlora-checkpoint.yaml
+ accelerate launch -m axolotl.cli.train qlora-checkpoint.yaml
envs:
HF_TOKEN: # TODO: Fill with your own huggingface token, or use --env to pass.
BUCKET: # TODO: Fill with your unique bucket name, or use --env to pass.
-
-
-
-
-
diff --git a/llm/axolotl/axolotl.yaml b/llm/axolotl/axolotl.yaml
index 9cec1d1f331..f46588e9aae 100644
--- a/llm/axolotl/axolotl.yaml
+++ b/llm/axolotl/axolotl.yaml
@@ -5,25 +5,14 @@ name: axolotl
resources:
accelerators: L4:1
- cloud: gcp # optional
+ image_id: docker:winglian/axolotl:main-py3.10-cu118-2.0.1
workdir: mistral
-setup: |
- docker pull winglian/axolotl:main-py3.10-cu118-2.0.1
-
run: |
- docker run --gpus all \
- -v ~/sky_workdir:/sky_workdir \
- -v /root/.cache:/root/.cache \
- winglian/axolotl:main-py3.10-cu118-2.0.1 \
- huggingface-cli login --token ${HF_TOKEN}
-
- docker run --gpus all \
- -v ~/sky_workdir:/sky_workdir \
- -v /root/.cache:/root/.cache \
- winglian/axolotl:main-py3.10-cu118-2.0.1 \
- accelerate launch -m axolotl.cli.train /sky_workdir/qlora.yaml
+ huggingface-cli login --token ${HF_TOKEN}
+
+ accelerate launch -m axolotl.cli.train qlora.yaml
envs:
HF_TOKEN: # TODO: Fill with your own huggingface token, or use --env to pass.
diff --git a/llm/axolotl/mistral/qlora-checkpoint.yaml b/llm/axolotl/mistral/qlora-checkpoint.yaml
index 278a5d72b9a..1f1cc67446c 100644
--- a/llm/axolotl/mistral/qlora-checkpoint.yaml
+++ b/llm/axolotl/mistral/qlora-checkpoint.yaml
@@ -71,6 +71,7 @@ warmup_steps: 10
eval_steps: 0.05
eval_table_size:
eval_table_max_new_tokens: 128
+eval_sample_packing: false
save_steps: 2 ## increase based on your dataset
save_strategy: steps
debug:
@@ -81,4 +82,4 @@ fsdp_config:
special_tokens:
bos_token: ""
eos_token: ""
- unk_token: ""
\ No newline at end of file
+ unk_token: ""
diff --git a/llm/axolotl/mistral/qlora.yaml b/llm/axolotl/mistral/qlora.yaml
index 42c3742b52d..39b2c55b1ce 100644
--- a/llm/axolotl/mistral/qlora.yaml
+++ b/llm/axolotl/mistral/qlora.yaml
@@ -69,6 +69,7 @@ warmup_steps: 10
eval_steps: 0.05
eval_table_size:
eval_table_max_new_tokens: 128
+eval_sample_packing: false
save_steps:
debug:
deepspeed:
@@ -78,4 +79,4 @@ fsdp_config:
special_tokens:
bos_token: ""
eos_token: ""
- unk_token: ""
\ No newline at end of file
+ unk_token: ""
diff --git a/sky/backends/backend_utils.py b/sky/backends/backend_utils.py
index cf43cfdf2ed..b1598c7c039 100644
--- a/sky/backends/backend_utils.py
+++ b/sky/backends/backend_utils.py
@@ -925,7 +925,14 @@ def write_cluster_config(
'dump_port_command': dump_port_command,
# Sky-internal constants.
'sky_ray_cmd': constants.SKY_RAY_CMD,
- 'sky_pip_cmd': constants.SKY_PIP_CMD,
+ # pip install needs to have python env activated to make sure
+ # installed packages are within the env path.
+ 'sky_pip_cmd': f'{constants.SKY_PIP_CMD}',
+ # Activate the SkyPilot runtime environment when starting ray
+ # cluster, so that ray autoscaler can access cloud SDK and CLIs
+ # on remote
+ 'sky_activate_python_env':
+ constants.ACTIVATE_SKY_REMOTE_PYTHON_ENV,
'ray_version': constants.SKY_REMOTE_RAY_VERSION,
# Command for waiting ray cluster to be ready on head.
'ray_head_wait_initialized_command':
diff --git a/sky/jobs/core.py b/sky/jobs/core.py
index ff9953489d5..7f9e0d757ea 100644
--- a/sky/jobs/core.py
+++ b/sky/jobs/core.py
@@ -98,7 +98,6 @@ def launch(
'dag_name': dag.name,
'retry_until_up': retry_until_up,
'remote_user_config_path': remote_user_config_path,
- 'sky_python_cmd': skylet_constants.SKY_PYTHON_CMD,
'modified_catalogs':
service_catalog_common.get_modified_catalog_file_mounts(),
**controller_utils.shared_controller_vars_to_fill(
diff --git a/sky/provision/docker_utils.py b/sky/provision/docker_utils.py
index 10ae5dafc07..b9ed689fdaf 100644
--- a/sky/provision/docker_utils.py
+++ b/sky/provision/docker_utils.py
@@ -15,6 +15,17 @@
DOCKER_PERMISSION_DENIED_STR = ('permission denied while trying to connect to '
'the Docker daemon socket')
+# Configure environment variables. A docker image can have environment variables
+# set in the Dockerfile with `ENV`. We need to export these variables to the
+# shell environment, so that our ssh session can access them.
+SETUP_ENV_VARS_CMD = (
+ 'prefix_cmd() '
+ '{ if [ $(id -u) -ne 0 ]; then echo "sudo"; else echo ""; fi; } && '
+ 'printenv | while IFS=\'=\' read -r key value; do echo "export $key=\\\"$value\\\""; done > ' # pylint: disable=line-too-long
+ '~/container_env_var.sh && '
+ '$(prefix_cmd) mv ~/container_env_var.sh /etc/profile.d/container_env_var.sh'
+)
+
@dataclasses.dataclass
class DockerLoginConfig:
@@ -244,6 +255,8 @@ def initialize(self) -> str:
self._run(start_command)
# SkyPilot: Setup Commands.
+ # TODO(zhwu): the following setups should be aligned with the kubernetes
+ # pod setup, like provision.kubernetes.instance::_set_env_vars_in_pods
# TODO(tian): These setup commands assumed that the container is
# debian-based. We should make it more general.
# Most of docker images are using root as default user, so we set an
@@ -296,7 +309,8 @@ def initialize(self) -> str:
'mkdir -p ~/.ssh;'
'cat /tmp/host_ssh_authorized_keys >> ~/.ssh/authorized_keys;'
'sudo service ssh start;'
- 'sudo sed -i "s/mesg n/tty -s \&\& mesg n/" ~/.profile;',
+ 'sudo sed -i "s/mesg n/tty -s \&\& mesg n/" ~/.profile;'
+ f'{SETUP_ENV_VARS_CMD}',
run_env='docker')
# SkyPilot: End of Setup Commands.
diff --git a/sky/provision/instance_setup.py b/sky/provision/instance_setup.py
index 2e07f026616..c81ecd78db4 100644
--- a/sky/provision/instance_setup.py
+++ b/sky/provision/instance_setup.py
@@ -61,7 +61,10 @@
'done;')
# Restart skylet when the version does not match to keep the skylet up-to-date.
-MAYBE_SKYLET_RESTART_CMD = (f'{constants.SKY_PYTHON_CMD} -m '
+# We need to activate the python environment to make sure autostop in skylet
+# can find the cloud SDK/CLI in PATH.
+MAYBE_SKYLET_RESTART_CMD = (f'{constants.ACTIVATE_SKY_REMOTE_PYTHON_ENV}; '
+ f'{constants.SKY_PYTHON_CMD} -m '
'sky.skylet.attempt_skylet;')
diff --git a/sky/provision/kubernetes/instance.py b/sky/provision/kubernetes/instance.py
index 9068079701f..4f88293525f 100644
--- a/sky/provision/kubernetes/instance.py
+++ b/sky/provision/kubernetes/instance.py
@@ -10,6 +10,7 @@
from sky import status_lib
from sky.adaptors import kubernetes
from sky.provision import common
+from sky.provision import docker_utils
from sky.provision.kubernetes import config as config_lib
from sky.provision.kubernetes import utils as kubernetes_utils
from sky.utils import common_utils
@@ -241,7 +242,7 @@ def _wait_for_pods_to_run(namespace, new_nodes):
'the node. Error details: '
f'{container_status.state.waiting.message}.')
# Reaching this point means that one of the pods had an issue,
- # so break out of the loop
+ # so break out of the loop, and wait until next second.
break
if all_pods_running:
@@ -301,13 +302,7 @@ def _set_env_vars_in_pods(namespace: str, new_pods: List):
set_k8s_env_var_cmd = [
'/bin/sh',
'-c',
- (
- 'prefix_cmd() '
- '{ if [ $(id -u) -ne 0 ]; then echo "sudo"; else echo ""; fi; } && '
- 'printenv | while IFS=\'=\' read -r key value; do echo "export $key=\\\"$value\\\""; done > ' # pylint: disable=line-too-long
- '~/k8s_env_var.sh && '
- 'mv ~/k8s_env_var.sh /etc/profile.d/k8s_env_var.sh || '
- '$(prefix_cmd) mv ~/k8s_env_var.sh /etc/profile.d/k8s_env_var.sh')
+ docker_utils.SETUP_ENV_VARS_CMD,
]
for new_pod in new_pods:
@@ -540,6 +535,8 @@ def _create_pods(region: str, cluster_name_on_cloud: str,
_wait_for_pods_to_schedule(namespace, wait_pods, provision_timeout)
# Wait until the pods and their containers are up and running, and
# fail early if there is an error
+ logger.debug(f'run_instances: waiting for pods to be running (pulling '
+ f'images): {list(wait_pods_dict.keys())}')
_wait_for_pods_to_run(namespace, wait_pods)
logger.debug(f'run_instances: all pods are scheduled and running: '
f'{list(wait_pods_dict.keys())}')
diff --git a/sky/skylet/attempt_skylet.py b/sky/skylet/attempt_skylet.py
index 609cfa09141..54df4986080 100644
--- a/sky/skylet/attempt_skylet.py
+++ b/sky/skylet/attempt_skylet.py
@@ -21,6 +21,9 @@ def restart_skylet():
shell=True,
check=False)
subprocess.run(
+ # We have made sure that `attempt_skylet.py` is executed with the
+ # skypilot runtime env activated, so that skylet can access the cloud
+ # CLI tools.
f'nohup {constants.SKY_PYTHON_CMD} -m sky.skylet.skylet'
' >> ~/.sky/skylet.log 2>&1 &',
shell=True,
diff --git a/sky/skylet/constants.py b/sky/skylet/constants.py
index 0f2d7540007..0c68fd7f6e6 100644
--- a/sky/skylet/constants.py
+++ b/sky/skylet/constants.py
@@ -37,8 +37,18 @@
SKY_PYTHON_CMD = f'$({SKY_GET_PYTHON_PATH_CMD})'
SKY_PIP_CMD = f'{SKY_PYTHON_CMD} -m pip'
# Ray executable, e.g., /opt/conda/bin/ray
-SKY_RAY_CMD = (f'$([ -s {SKY_RAY_PATH_FILE} ] && '
+# We need to add SKY_PYTHON_CMD before ray executable because:
+# The ray executable is a python script with a header like:
+# #!/opt/conda/bin/python3
+# When we create the skypilot-runtime venv, the previously installed ray
+# executable will be reused (due to --system-site-packages), and that will cause
+# running ray CLI commands to use the wrong python executable.
+SKY_RAY_CMD = (f'{SKY_PYTHON_CMD} $([ -s {SKY_RAY_PATH_FILE} ] && '
f'cat {SKY_RAY_PATH_FILE} 2> /dev/null || which ray)')
+# Separate env for SkyPilot runtime dependencies.
+SKY_REMOTE_PYTHON_ENV_NAME = 'skypilot-runtime'
+SKY_REMOTE_PYTHON_ENV = f'~/{SKY_REMOTE_PYTHON_ENV_NAME}'
+ACTIVATE_SKY_REMOTE_PYTHON_ENV = f'source {SKY_REMOTE_PYTHON_ENV}/bin/activate'
# The name for the environment variable that stores the unique ID of the
# current task. This will stay the same across multiple recoveries of the
@@ -91,20 +101,27 @@
# AWS's Deep Learning AMI's default conda environment.
CONDA_INSTALLATION_COMMANDS = (
'which conda > /dev/null 2>&1 || '
- '{ wget -nc https://repo.anaconda.com/miniconda/Miniconda3-py310_23.11.0-2-Linux-x86_64.sh -O Miniconda3-Linux-x86_64.sh && ' # pylint: disable=line-too-long
+ '{ curl https://repo.anaconda.com/miniconda/Miniconda3-py310_23.11.0-2-Linux-x86_64.sh -o Miniconda3-Linux-x86_64.sh && ' # pylint: disable=line-too-long
'bash Miniconda3-Linux-x86_64.sh -b && '
'eval "$(~/miniconda3/bin/conda shell.bash hook)" && conda init && '
'conda config --set auto_activate_base true && '
- # Use $(echo ~) instead of ~ to avoid the error "no such file or directory".
- # Also, not using $HOME to avoid the error HOME variable not set.
- f'echo "$(echo ~)/miniconda3/bin/python" > {SKY_PYTHON_PATH_FILE}; }}; '
+ f'conda activate base; }}; '
'grep "# >>> conda initialize >>>" ~/.bashrc || '
'{ conda init && source ~/.bashrc; };'
- '(type -a python | grep -q python3) || '
- 'echo \'alias python=python3\' >> ~/.bashrc;'
- '(type -a pip | grep -q pip3) || echo \'alias pip=pip3\' >> ~/.bashrc;'
- # Writes Python path to file if it does not exist or the file is empty.
- f'[ -s {SKY_PYTHON_PATH_FILE} ] || which python3 > {SKY_PYTHON_PATH_FILE};')
+ # If the Python version is 3.12 or newer, create a new conda env
+ # with Python 3.10.
+ # We don't use a separate conda env for SkyPilot dependencies because it is
+ # costly to create a new conda env, and venv should be a lightweight and
+ # faster alternative when the python version satisfies the requirement.
+ '[[ $(python3 --version | cut -d " " -f 2 | cut -d "." -f 2) -ge 12 ]] && '
+ f'echo "Creating conda env with Python 3.10" && '
+ f'conda create -y -n {SKY_REMOTE_PYTHON_ENV_NAME} python=3.10 && '
+ f'conda activate {SKY_REMOTE_PYTHON_ENV_NAME};'
+ # Create a separate Python venv for SkyPilot dependencies.
+ f'[ -d {SKY_REMOTE_PYTHON_ENV} ] || '
+ f'{{ {SKY_PYTHON_CMD} -m venv {SKY_REMOTE_PYTHON_ENV} --system-site-packages && '
+ f'echo "$(echo {SKY_REMOTE_PYTHON_ENV})/bin/python" > {SKY_PYTHON_PATH_FILE}; }};'
+)
_sky_version = str(version.parse(sky.__version__))
RAY_STATUS = f'RAY_ADDRESS=127.0.0.1:{SKY_REMOTE_RAY_PORT} {SKY_RAY_CMD} status'
@@ -142,7 +159,9 @@
# mentioned above are resolved.
'export PATH=$PATH:$HOME/.local/bin; '
# Writes ray path to file if it does not exist or the file is empty.
- f'[ -s {SKY_RAY_PATH_FILE} ] || which ray > {SKY_RAY_PATH_FILE}; '
+ f'[ -s {SKY_RAY_PATH_FILE} ] || '
+ f'{{ {ACTIVATE_SKY_REMOTE_PYTHON_ENV} && '
+ f'which ray > {SKY_RAY_PATH_FILE} || exit 1; }}; '
# END ray package check and installation
f'{{ {SKY_PIP_CMD} list | grep "skypilot " && '
'[ "$(cat ~/.sky/wheels/current_sky_wheel_hash)" == "{sky_wheel_hash}" ]; } || ' # pylint: disable=line-too-long
diff --git a/sky/skylet/events.py b/sky/skylet/events.py
index c63b42cc438..b6e99707dab 100644
--- a/sky/skylet/events.py
+++ b/sky/skylet/events.py
@@ -3,7 +3,6 @@
import os
import re
import subprocess
-import sys
import time
import traceback
@@ -193,7 +192,10 @@ def _stop_cluster(self, autostop_config):
# Passing env inherited from os.environ is technically not
# needed, because we call `python