From 5b82fd7d379067f59414efce2140c2219ce80adb Mon Sep 17 00:00:00 2001 From: Zhanghao Wu Date: Tue, 21 May 2024 08:11:56 +0000 Subject: [PATCH 01/38] Quote the command correctly when source_bashrc is not set --- sky/utils/command_runner.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/sky/utils/command_runner.py b/sky/utils/command_runner.py index 3aa87eda138..2f71e4a6592 100644 --- a/sky/utils/command_runner.py +++ b/sky/utils/command_runner.py @@ -184,8 +184,9 @@ def _get_command_to_run( # cluster by 1 second. # sourcing ~/.bashrc is not required for internal executions command += [ - 'true && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore' - f' && ({cmd})' + shlex.quote( + 'true && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore' + f' && ({cmd})') ] if not separate_stderr: command.append('2>&1') From 9822e4465a2f9f31cb5e7ddfb6ce1a1adc89a5a9 Mon Sep 17 00:00:00 2001 From: Zhanghao Wu Date: Tue, 21 May 2024 08:44:33 +0000 Subject: [PATCH 02/38] Remove unnecessary source bashrc --- sky/backends/cloud_vm_ray_backend.py | 5 ++--- sky/provision/instance_setup.py | 11 +++++------ 2 files changed, 7 insertions(+), 9 deletions(-) diff --git a/sky/backends/cloud_vm_ray_backend.py b/sky/backends/cloud_vm_ray_backend.py index 6d2447fe89b..6ff9731033e 100644 --- a/sky/backends/cloud_vm_ray_backend.py +++ b/sky/backends/cloud_vm_ray_backend.py @@ -3175,7 +3175,8 @@ def _setup_node(node_id: int) -> None: f'{create_script_code} && {setup_cmd}', log_path=setup_log_path, process_stream=False, - source_bashrc=True, + # We do not source bashrc for setup, since bashrc is sourced + # in the script already. 
) def error_message() -> str: @@ -3724,7 +3725,6 @@ def tail_managed_job_logs(self, process_stream=False, ssh_mode=command_runner.SshMode.INTERACTIVE, stdin=subprocess.DEVNULL, - source_bashrc=True, ) def tail_serve_logs(self, handle: CloudVmRayResourceHandle, @@ -3762,7 +3762,6 @@ def tail_serve_logs(self, handle: CloudVmRayResourceHandle, process_stream=False, ssh_mode=command_runner.SshMode.INTERACTIVE, stdin=subprocess.DEVNULL, - source_bashrc=True, ) def teardown_no_lock(self, diff --git a/sky/provision/instance_setup.py b/sky/provision/instance_setup.py index 1e5e6285fef..6de916b2d1c 100644 --- a/sky/provision/instance_setup.py +++ b/sky/provision/instance_setup.py @@ -197,10 +197,10 @@ def _setup_node(runner: command_runner.CommandRunner, log_path: str): cmd, stream_logs=False, log_path=log_path, - require_outputs=True, - # Installing depencies requires source bashrc to access the PATH - # in bashrc. - source_bashrc=True) + require_outputs=True + # No need to source bashrc, as it should be sourced in the setup + # commands already. + ) retry_cnt = 0 while returncode == 255 and retry_cnt < _MAX_RETRY: # Got network connection issue occur during setup. 
This could @@ -214,8 +214,7 @@ def _setup_node(runner: command_runner.CommandRunner, log_path: str): returncode, stdout, stderr = runner.run(cmd, stream_logs=False, log_path=log_path, - require_outputs=True, - source_bashrc=True) + require_outputs=True) if not returncode: break From 4b045ea4c92dbb38bad552400985b1c9d5ae7957 Mon Sep 17 00:00:00 2001 From: Zhanghao Wu Date: Tue, 21 May 2024 08:50:48 +0000 Subject: [PATCH 03/38] format --- sky/provision/instance_setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sky/provision/instance_setup.py b/sky/provision/instance_setup.py index 6de916b2d1c..150ce9c3846 100644 --- a/sky/provision/instance_setup.py +++ b/sky/provision/instance_setup.py @@ -200,7 +200,7 @@ def _setup_node(runner: command_runner.CommandRunner, log_path: str): require_outputs=True # No need to source bashrc, as it should be sourced in the setup # commands already. - ) + ) retry_cnt = 0 while returncode == 255 and retry_cnt < _MAX_RETRY: # Got network connection issue occur during setup. This could From 1e2576eeec079d961c665c30054cc83e8437d0a3 Mon Sep 17 00:00:00 2001 From: Zhanghao Wu Date: Tue, 21 May 2024 09:23:51 +0000 Subject: [PATCH 04/38] Fix setup script for conda --- sky/provision/instance_setup.py | 11 ++++++----- sky/skylet/constants.py | 11 +++++++---- 2 files changed, 13 insertions(+), 9 deletions(-) diff --git a/sky/provision/instance_setup.py b/sky/provision/instance_setup.py index 150ce9c3846..2e07f026616 100644 --- a/sky/provision/instance_setup.py +++ b/sky/provision/instance_setup.py @@ -197,10 +197,10 @@ def _setup_node(runner: command_runner.CommandRunner, log_path: str): cmd, stream_logs=False, log_path=log_path, - require_outputs=True - # No need to source bashrc, as it should be sourced in the setup - # commands already. - ) + require_outputs=True, + # Installing dependencies requires source bashrc to access + # conda. 
+ source_bashrc=True) retry_cnt = 0 while returncode == 255 and retry_cnt < _MAX_RETRY: # Got network connection issue occur during setup. This could @@ -214,7 +214,8 @@ def _setup_node(runner: command_runner.CommandRunner, log_path: str): returncode, stdout, stderr = runner.run(cmd, stream_logs=False, log_path=log_path, - require_outputs=True) + require_outputs=True, + source_bashrc=True) if not returncode: break diff --git a/sky/skylet/constants.py b/sky/skylet/constants.py index 578629ea3e2..0f2d7540007 100644 --- a/sky/skylet/constants.py +++ b/sky/skylet/constants.py @@ -91,15 +91,18 @@ # AWS's Deep Learning AMI's default conda environment. CONDA_INSTALLATION_COMMANDS = ( 'which conda > /dev/null 2>&1 || ' - '(wget -nc https://repo.anaconda.com/miniconda/Miniconda3-py310_23.11.0-2-Linux-x86_64.sh -O Miniconda3-Linux-x86_64.sh && ' # pylint: disable=line-too-long + '{ wget -nc https://repo.anaconda.com/miniconda/Miniconda3-py310_23.11.0-2-Linux-x86_64.sh -O Miniconda3-Linux-x86_64.sh && ' # pylint: disable=line-too-long 'bash Miniconda3-Linux-x86_64.sh -b && ' 'eval "$(~/miniconda3/bin/conda shell.bash hook)" && conda init && ' - 'conda config --set auto_activate_base true); ' - 'grep "# >>> conda initialize >>>" ~/.bashrc || conda init;' + 'conda config --set auto_activate_base true && ' + # Use $(echo ~) instead of ~ to avoid the error "no such file or directory". + # Also, not using $HOME to avoid the error HOME variable not set. + f'echo "$(echo ~)/miniconda3/bin/python" > {SKY_PYTHON_PATH_FILE}; }}; ' + 'grep "# >>> conda initialize >>>" ~/.bashrc || ' + '{ conda init && source ~/.bashrc; };' '(type -a python | grep -q python3) || ' 'echo \'alias python=python3\' >> ~/.bashrc;' '(type -a pip | grep -q pip3) || echo \'alias pip=pip3\' >> ~/.bashrc;' - 'source ~/.bashrc;' # Writes Python path to file if it does not exist or the file is empty. 
f'[ -s {SKY_PYTHON_PATH_FILE} ] || which python3 > {SKY_PYTHON_PATH_FILE};') From f25f623548350a88d7d21ca94b6361506d53b131 Mon Sep 17 00:00:00 2001 From: Zhanghao Wu Date: Tue, 21 May 2024 09:29:46 +0000 Subject: [PATCH 05/38] Add comment --- sky/utils/command_runner.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/sky/utils/command_runner.py b/sky/utils/command_runner.py index 2f71e4a6592..ee9e75860ec 100644 --- a/sky/utils/command_runner.py +++ b/sky/utils/command_runner.py @@ -432,10 +432,12 @@ def run( cmd, process_stream, separate_stderr, - # A hack to remove the following bash warnings (twice): + # A hack to remove the following SSH warning+bash warnings (twice): + # Warning: Permanently added 'xx.xx.xx.xx' to the list of known hosts. # bash: cannot set terminal process group # bash: no job control in this shell - skip_lines=5 if source_bashrc else 0, + # When not source_bashrc, the bash warning will only show once. + skip_lines=5 if source_bashrc else 3, source_bashrc=source_bashrc) command = base_ssh_command + [shlex.quote(command_str)] From 68f7ebd95e38f823d0152e3e1881d05e69dcfb4c Mon Sep 17 00:00:00 2001 From: Zhanghao Wu Date: Tue, 21 May 2024 17:22:11 +0000 Subject: [PATCH 06/38] format --- sky/utils/command_runner.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/sky/utils/command_runner.py b/sky/utils/command_runner.py index ee9e75860ec..0b8ffc74171 100644 --- a/sky/utils/command_runner.py +++ b/sky/utils/command_runner.py @@ -433,7 +433,8 @@ def run( process_stream, separate_stderr, # A hack to remove the following SSH warning+bash warnings (twice): - # Warning: Permanently added 'xx.xx.xx.xx' to the list of known hosts. + # Warning: Permanently added 'xx.xx.xx.xx' to the list of known + # hosts. # bash: cannot set terminal process group # bash: no job control in this shell # When not source_bashrc, the bash warning will only show once. 
From f2dd3d84fa9465f8a20c1bd54a3a29fce870b756 Mon Sep 17 00:00:00 2001 From: Zhanghao Wu Date: Tue, 21 May 2024 18:23:06 +0000 Subject: [PATCH 07/38] Separate env for skypilot --- sky/skylet/constants.py | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/sky/skylet/constants.py b/sky/skylet/constants.py index 0f2d7540007..467cff4c432 100644 --- a/sky/skylet/constants.py +++ b/sky/skylet/constants.py @@ -39,6 +39,9 @@ # Ray executable, e.g., /opt/conda/bin/ray SKY_RAY_CMD = (f'$([ -s {SKY_RAY_PATH_FILE} ] && ' f'cat {SKY_RAY_PATH_FILE} 2> /dev/null || which ray)') +# Separate env for SkyPilot runtime dependencies. +SKY_REMOTE_PYTHON_ENV_NAME = 'skypilot-runtime' +SKY_REMOTE_PYTHON_ENV = f'~/{SKY_REMOTE_PYTHON_ENV_NAME}' # The name for the environment variable that stores the unique ID of the # current task. This will stay the same across multiple recoveries of the @@ -100,11 +103,11 @@ f'echo "$(echo ~)/miniconda3/bin/python" > {SKY_PYTHON_PATH_FILE}; }}; ' 'grep "# >>> conda initialize >>>" ~/.bashrc || ' '{ conda init && source ~/.bashrc; };' - '(type -a python | grep -q python3) || ' - 'echo \'alias python=python3\' >> ~/.bashrc;' - '(type -a pip | grep -q pip3) || echo \'alias pip=pip3\' >> ~/.bashrc;' - # Writes Python path to file if it does not exist or the file is empty. - f'[ -s {SKY_PYTHON_PATH_FILE} ] || which python3 > {SKY_PYTHON_PATH_FILE};') + # Create a separate conda environment for SkyPilot dependencies. + f'[ -d {SKY_REMOTE_PYTHON_ENV} ] || ' + f'{{ {SKY_PYTHON_CMD} -m venv {SKY_REMOTE_PYTHON_ENV} && ' + f'echo $(echo {SKY_REMOTE_PYTHON_ENV})/bin/python > {SKY_PYTHON_PATH_FILE}; }};' +) _sky_version = str(version.parse(sky.__version__)) RAY_STATUS = f'RAY_ADDRESS=127.0.0.1:{SKY_REMOTE_RAY_PORT} {SKY_RAY_CMD} status' @@ -142,7 +145,9 @@ # mentioned above are resolved. 'export PATH=$PATH:$HOME/.local/bin; ' # Writes ray path to file if it does not exist or the file is empty. 
- f'[ -s {SKY_RAY_PATH_FILE} ] || which ray > {SKY_RAY_PATH_FILE}; ' + f'[ -s {SKY_RAY_PATH_FILE} ] || ' + f'{{ source {SKY_REMOTE_PYTHON_ENV}/bin/activate && ' + f'which ray > {SKY_RAY_PATH_FILE}; }}' # END ray package check and installation f'{{ {SKY_PIP_CMD} list | grep "skypilot " && ' '[ "$(cat ~/.sky/wheels/current_sky_wheel_hash)" == "{sky_wheel_hash}" ]; } || ' # pylint: disable=line-too-long From 01216cddd732cdcc80ac0091c2ed998ff1f3ff06 Mon Sep 17 00:00:00 2001 From: Zhanghao Wu Date: Tue, 21 May 2024 18:27:26 +0000 Subject: [PATCH 08/38] add test smoke --- tests/test_smoke.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_smoke.py b/tests/test_smoke.py index 73ad2c0f46a..47bf9a93e1c 100644 --- a/tests/test_smoke.py +++ b/tests/test_smoke.py @@ -3777,7 +3777,7 @@ def test_user_ray_cluster(generic_cloud: str): test = Test( 'user-ray-cluster', [ - f'sky launch -y -c {name} --cloud {generic_cloud} "ray start --head"', + f'sky launch -y -c {name} --cloud {generic_cloud} "pip install ray>2.11; ray start --head"', f'sky exec {name} "echo hi"', f'sky logs {name} 1 --status', f'sky status -r {name} | grep UP', From a6f69968e446d6fc1fa3e0e4d07733d2e0a667e0 Mon Sep 17 00:00:00 2001 From: Zhanghao Wu Date: Tue, 21 May 2024 21:47:19 +0000 Subject: [PATCH 09/38] add system site-packages --- sky/skylet/constants.py | 2 +- tests/kubernetes/README.md | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/sky/skylet/constants.py b/sky/skylet/constants.py index 467cff4c432..1b1636907b3 100644 --- a/sky/skylet/constants.py +++ b/sky/skylet/constants.py @@ -105,7 +105,7 @@ '{ conda init && source ~/.bashrc; };' # Create a separate conda environment for SkyPilot dependencies. 
f'[ -d {SKY_REMOTE_PYTHON_ENV} ] || ' - f'{{ {SKY_PYTHON_CMD} -m venv {SKY_REMOTE_PYTHON_ENV} && ' + f'{{ {SKY_PYTHON_CMD} -m venv {SKY_REMOTE_PYTHON_ENV} --system-site-packages && ' f'echo $(echo {SKY_REMOTE_PYTHON_ENV})/bin/python > {SKY_PYTHON_PATH_FILE}; }};' ) diff --git a/tests/kubernetes/README.md b/tests/kubernetes/README.md index 4a882352703..655daf55b55 100644 --- a/tests/kubernetes/README.md +++ b/tests/kubernetes/README.md @@ -32,13 +32,13 @@ sky local up ```bash PROJECT_ID=$(gcloud config get-value project) CLUSTER_NAME=testclusterromil - gcloud beta container --project "${PROJECT_ID}" clusters create "${CLUSTER_NAME}" --zone "us-central1-c" --no-enable-basic-auth --cluster-version "1.29.0-gke.1381000" --release-channel "regular" --machine-type "n1-standard-8" --accelerator "type=nvidia-tesla-t4,count=1" --image-type "COS_CONTAINERD" --disk-type "pd-balanced" --disk-size "100" --metadata disable-legacy-endpoints=true --scopes "https://www.googleapis.com/auth/devstorage.read_only","https://www.googleapis.com/auth/logging.write","https://www.googleapis.com/auth/monitoring","https://www.googleapis.com/auth/servicecontrol","https://www.googleapis.com/auth/service.management.readonly","https://www.googleapis.com/auth/trace.append" --num-nodes "2" --logging=SYSTEM,WORKLOAD --monitoring=SYSTEM --enable-ip-alias --network "projects/${PROJECT_ID}/global/networks/default" --subnetwork "projects/${PROJECT_ID}/regions/us-central1/subnetworks/default" --no-enable-intra-node-visibility --default-max-pods-per-node "110" --security-posture=standard --workload-vulnerability-scanning=disabled --no-enable-master-authorized-networks --addons HorizontalPodAutoscaling,HttpLoadBalancing,GcePersistentDiskCsiDriver --enable-autoupgrade --enable-autorepair --max-surge-upgrade 1 --max-unavailable-upgrade 0 --enable-managed-prometheus --enable-shielded-nodes --node-locations "us-central1-c" && gcloud beta container --project "${PROJECT_ID}" node-pools create "v100" --cluster 
"${CLUSTER_NAME}" --zone "us-central1-c" --machine-type "n1-standard-8" --accelerator "type=nvidia-tesla-v100,count=1" --image-type "COS_CONTAINERD" --disk-type "pd-balanced" --disk-size "100" --metadata disable-legacy-endpoints=true --scopes "https://www.googleapis.com/auth/devstorage.read_only","https://www.googleapis.com/auth/logging.write","https://www.googleapis.com/auth/monitoring","https://www.googleapis.com/auth/servicecontrol","https://www.googleapis.com/auth/service.management.readonly","https://www.googleapis.com/auth/trace.append" --num-nodes "2" --enable-autoupgrade --enable-autorepair --max-surge-upgrade 1 --max-unavailable-upgrade 0 --node-locations "us-central1-c" && gcloud beta container --project "${PROJECT_ID}" node-pools create "largecpu" --cluster "${CLUSTER_NAME}" --zone "us-central1-c" --machine-type "n1-standard-16" --image-type "COS_CONTAINERD" --disk-type "pd-balanced" --disk-size "100" --metadata disable-legacy-endpoints=true --scopes "https://www.googleapis.com/auth/devstorage.read_only","https://www.googleapis.com/auth/logging.write","https://www.googleapis.com/auth/monitoring","https://www.googleapis.com/auth/servicecontrol","https://www.googleapis.com/auth/service.management.readonly","https://www.googleapis.com/auth/trace.append" --num-nodes "2" --enable-autoupgrade --enable-autorepair --max-surge-upgrade 1 --max-unavailable-upgrade 0 --node-locations "us-central1-c" + gcloud beta container --project "${PROJECT_ID}" clusters create "${CLUSTER_NAME}" --zone "us-central1-c" --no-enable-basic-auth --cluster-version "1.29.1-gke.1589020" --release-channel "regular" --machine-type "n1-standard-8" --accelerator "type=nvidia-tesla-t4,count=1" --image-type "COS_CONTAINERD" --disk-type "pd-balanced" --disk-size "100" --metadata disable-legacy-endpoints=true --scopes 
"https://www.googleapis.com/auth/devstorage.read_only","https://www.googleapis.com/auth/logging.write","https://www.googleapis.com/auth/monitoring","https://www.googleapis.com/auth/servicecontrol","https://www.googleapis.com/auth/service.management.readonly","https://www.googleapis.com/auth/trace.append" --num-nodes "2" --logging=SYSTEM,WORKLOAD --monitoring=SYSTEM --enable-ip-alias --network "projects/${PROJECT_ID}/global/networks/default" --subnetwork "projects/${PROJECT_ID}/regions/us-central1/subnetworks/default" --no-enable-intra-node-visibility --default-max-pods-per-node "110" --security-posture=standard --workload-vulnerability-scanning=disabled --no-enable-master-authorized-networks --addons HorizontalPodAutoscaling,HttpLoadBalancing,GcePersistentDiskCsiDriver --enable-autoupgrade --enable-autorepair --max-surge-upgrade 1 --max-unavailable-upgrade 0 --enable-managed-prometheus --enable-shielded-nodes --node-locations "us-central1-c" && gcloud beta container --project "${PROJECT_ID}" node-pools create "v100" --cluster "${CLUSTER_NAME}" --zone "us-central1-c" --machine-type "n1-standard-8" --accelerator "type=nvidia-tesla-v100,count=1" --image-type "COS_CONTAINERD" --disk-type "pd-balanced" --disk-size "100" --metadata disable-legacy-endpoints=true --scopes "https://www.googleapis.com/auth/devstorage.read_only","https://www.googleapis.com/auth/logging.write","https://www.googleapis.com/auth/monitoring","https://www.googleapis.com/auth/servicecontrol","https://www.googleapis.com/auth/service.management.readonly","https://www.googleapis.com/auth/trace.append" --num-nodes "2" --enable-autoupgrade --enable-autorepair --max-surge-upgrade 1 --max-unavailable-upgrade 0 --node-locations "us-central1-c" && gcloud beta container --project "${PROJECT_ID}" node-pools create "largecpu" --cluster "${CLUSTER_NAME}" --zone "us-central1-c" --machine-type "n1-standard-16" --image-type "COS_CONTAINERD" --disk-type "pd-balanced" --disk-size "100" --metadata 
disable-legacy-endpoints=true --scopes "https://www.googleapis.com/auth/devstorage.read_only","https://www.googleapis.com/auth/logging.write","https://www.googleapis.com/auth/monitoring","https://www.googleapis.com/auth/servicecontrol","https://www.googleapis.com/auth/service.management.readonly","https://www.googleapis.com/auth/trace.append" --num-nodes "2" --enable-autoupgrade --enable-autorepair --max-surge-upgrade 1 --max-unavailable-upgrade 0 --node-locations "us-central1-c" ``` 2. Get the kubeconfig for your cluster and place it in `~/.kube/config`: ```bash gcloud container clusters get-credentials --region # Example: - # gcloud container clusters get-credentials testcluster --region us-central1-c + # gcloud container clusters get-credentials $CLUSTER_NAME --region us-central1-c ``` 3. Verify by running `kubectl get nodes`. You should see your nodes. 4. **If you want GPU support**, make sure you install GPU drivers by running: From bc893969f5a61f3d635527288a76f5a8be52f9bd Mon Sep 17 00:00:00 2001 From: Zhanghao Wu Date: Tue, 21 May 2024 23:04:29 +0000 Subject: [PATCH 10/38] add test for default to non-base conda env --- tests/test_smoke.py | 10 ++++++++-- tests/test_yamls/different_default_conda_env.yaml | 11 +++++++++++ 2 files changed, 19 insertions(+), 2 deletions(-) create mode 100644 tests/test_yamls/different_default_conda_env.yaml diff --git a/tests/test_smoke.py b/tests/test_smoke.py index 47bf9a93e1c..1da2682412e 100644 --- a/tests/test_smoke.py +++ b/tests/test_smoke.py @@ -3771,8 +3771,8 @@ def test_skyserve_failures(generic_cloud: str): # TODO(Ziming, Tian): Add tests for autoscaling. 
-# ------- Testing user ray cluster -------- -def test_user_ray_cluster(generic_cloud: str): +# ------- Testing user dependencies -------- +def test_user_dependencies(generic_cloud: str): name = _get_cluster_name() test = Test( 'user-ray-cluster', @@ -3783,6 +3783,12 @@ def test_user_ray_cluster(generic_cloud: str): f'sky status -r {name} | grep UP', f'sky exec {name} "echo bye"', f'sky logs {name} 2 --status', + f'sky launch -c {name} tests/test_yamls/different_default_conda_env.yaml', + f'sky logs {name} 3 --status', + # Launch again to test the default env does not affect SkyPilot + # runtime setup + f'sky launch -c {name} "python --version | grep \"Python 3.6\""', + f'sky logs {name} 4 --status', ], f'sky down -y {name}', ) diff --git a/tests/test_yamls/different_default_conda_env.yaml b/tests/test_yamls/different_default_conda_env.yaml new file mode 100644 index 00000000000..88d10d0f7be --- /dev/null +++ b/tests/test_yamls/different_default_conda_env.yaml @@ -0,0 +1,11 @@ +resources: + cpus: 2+ + + +setup: | + conda create -n testenv python=3.6 + + echo "conda activate testenv" >> ~/.bashrc + +run: | + python --version | grep "Python 3.6" || exit 1 From 713fed7568d8587f0a3929cba9b426dc6e8dda15 Mon Sep 17 00:00:00 2001 From: Zhanghao Wu Date: Wed, 22 May 2024 01:43:44 +0000 Subject: [PATCH 11/38] Fix controllers and ray node providers --- sky/backends/backend_utils.py | 2 ++ sky/jobs/core.py | 1 - sky/skylet/attempt_skylet.py | 3 +++ sky/skylet/constants.py | 2 ++ sky/templates/azure-ray.yml.j2 | 4 ++-- sky/templates/jobs-controller.yaml.j2 | 3 +++ sky/templates/sky-serve-controller.yaml.j2 | 5 +++++ sky/utils/controller_utils.py | 3 ++- 8 files changed, 19 insertions(+), 4 deletions(-) diff --git a/sky/backends/backend_utils.py b/sky/backends/backend_utils.py index cf43cfdf2ed..ed9e0d00045 100644 --- a/sky/backends/backend_utils.py +++ b/sky/backends/backend_utils.py @@ -930,6 +930,8 @@ def write_cluster_config( # Command for waiting ray cluster to be ready on 
head. 'ray_head_wait_initialized_command': instance_setup.RAY_HEAD_WAIT_INITIALIZED_COMMAND, + 'sky_activate_python_env': + constants.ACTIVATE_SKY_REMOTE_PYTHON_ENV, # Cloud credentials for cloud storage. 'credentials': credentials, diff --git a/sky/jobs/core.py b/sky/jobs/core.py index ff9953489d5..7f9e0d757ea 100644 --- a/sky/jobs/core.py +++ b/sky/jobs/core.py @@ -98,7 +98,6 @@ def launch( 'dag_name': dag.name, 'retry_until_up': retry_until_up, 'remote_user_config_path': remote_user_config_path, - 'sky_python_cmd': skylet_constants.SKY_PYTHON_CMD, 'modified_catalogs': service_catalog_common.get_modified_catalog_file_mounts(), **controller_utils.shared_controller_vars_to_fill( diff --git a/sky/skylet/attempt_skylet.py b/sky/skylet/attempt_skylet.py index 609cfa09141..49775790932 100644 --- a/sky/skylet/attempt_skylet.py +++ b/sky/skylet/attempt_skylet.py @@ -21,6 +21,9 @@ def restart_skylet(): shell=True, check=False) subprocess.run( + # Activate python environment first to make sure skylet can find the + # cloud SDK for autostopping. 
+ f'{constants.ACTIVATE_SKY_REMOTE_PYTHON_ENV}; ' f'nohup {constants.SKY_PYTHON_CMD} -m sky.skylet.skylet' ' >> ~/.sky/skylet.log 2>&1 &', shell=True, diff --git a/sky/skylet/constants.py b/sky/skylet/constants.py index 1b1636907b3..d6f62901efb 100644 --- a/sky/skylet/constants.py +++ b/sky/skylet/constants.py @@ -166,6 +166,8 @@ f'&& {{ {SKY_PYTHON_CMD} -c "from sky.skylet.ray_patches import patch; patch()" ' '|| exit 1; };') +ACTIVATE_SKY_REMOTE_PYTHON_ENV = f'source {SKY_REMOTE_PYTHON_ENV}/bin/activate' + # The name for the environment variable that stores SkyPilot user hash, which # is mainly used to make sure sky commands runs on a VM launched by SkyPilot # will be recognized as the same user (e.g., jobs controller or sky serve diff --git a/sky/templates/azure-ray.yml.j2 b/sky/templates/azure-ray.yml.j2 index f1477d92132..803327f1032 100644 --- a/sky/templates/azure-ray.yml.j2 +++ b/sky/templates/azure-ray.yml.j2 @@ -164,14 +164,14 @@ setup_commands: # current num items (num SSH connections): 2 head_start_ray_commands: # NOTE: --disable-usage-stats in `ray start` saves 10 seconds of idle wait. 
- - {{ sky_ray_cmd }} stop; RAY_SCHEDULER_EVENTS=0 RAY_DEDUP_LOGS=0 {{ sky_ray_cmd }} start --disable-usage-stats --head --port={{ray_port}} --dashboard-port={{ray_dashboard_port}} --object-manager-port=8076 --autoscaling-config=~/ray_bootstrap_config.yaml {{"--num-gpus=%s" % num_gpus if num_gpus}} {{"--resources='%s'" % custom_resources if custom_resources}} --temp-dir {{ray_temp_dir}} || exit 1; + - {{ sky_activate_python_env }}; {{ sky_ray_cmd }} stop; RAY_SCHEDULER_EVENTS=0 RAY_DEDUP_LOGS=0 {{ sky_ray_cmd }} start --disable-usage-stats --head --port={{ray_port}} --dashboard-port={{ray_dashboard_port}} --object-manager-port=8076 --autoscaling-config=~/ray_bootstrap_config.yaml {{"--num-gpus=%s" % num_gpus if num_gpus}} {{"--resources='%s'" % custom_resources if custom_resources}} --temp-dir {{ray_temp_dir}} || exit 1; which prlimit && for id in $(pgrep -f raylet/raylet); do sudo prlimit --nofile=1048576:1048576 --pid=$id || true; done; {{dump_port_command}}; {{ray_head_wait_initialized_command}} {%- if num_nodes > 1 %} worker_start_ray_commands: - - {{ sky_ray_cmd }} stop; RAY_SCHEDULER_EVENTS=0 RAY_DEDUP_LOGS=0 {{ sky_ray_cmd }} start --disable-usage-stats --address=$RAY_HEAD_IP:{{ray_port}} --object-manager-port=8076 {{"--num-gpus=%s" % num_gpus if num_gpus}} {{"--resources='%s'" % custom_resources if custom_resources}} --temp-dir {{ray_temp_dir}} || exit 1; + - {{ sky_activate_python_env }}; {{ sky_ray_cmd }} stop; RAY_SCHEDULER_EVENTS=0 RAY_DEDUP_LOGS=0 {{ sky_ray_cmd }} start --disable-usage-stats --address=$RAY_HEAD_IP:{{ray_port}} --object-manager-port=8076 {{"--num-gpus=%s" % num_gpus if num_gpus}} {{"--resources='%s'" % custom_resources if custom_resources}} --temp-dir {{ray_temp_dir}} || exit 1; which prlimit && for id in $(pgrep -f raylet/raylet); do sudo prlimit --nofile=1048576:1048576 --pid=$id || true; done; {%- else %} worker_start_ray_commands: [] diff --git a/sky/templates/jobs-controller.yaml.j2 b/sky/templates/jobs-controller.yaml.j2 index 
65ae07dcc55..7d15dc680ac 100644 --- a/sky/templates/jobs-controller.yaml.j2 +++ b/sky/templates/jobs-controller.yaml.j2 @@ -10,6 +10,8 @@ file_mounts: {%- endfor %} setup: | + {{ sky_activate_python_env }} + {%- for cmd in cloud_dependencies_installation_commands %} {{cmd}} {%- endfor %} @@ -25,6 +27,7 @@ setup: | ((ps aux | grep -v nohup | grep -v grep | grep -q -- "-m sky.jobs.dashboard.dashboard") || (nohup {{ sky_python_cmd }} -m sky.jobs.dashboard.dashboard >> ~/.sky/job-dashboard.log 2>&1 &)); run: | + {{ sky_activate_python_env }} # Start the controller for the current managed job. python -u -m sky.jobs.controller {{remote_user_yaml_path}} \ --job-id $SKYPILOT_INTERNAL_JOB_ID {% if retry_until_up %}--retry-until-up{% endif %} diff --git a/sky/templates/sky-serve-controller.yaml.j2 b/sky/templates/sky-serve-controller.yaml.j2 index 8f79b653a2b..a4f1b49e3ed 100644 --- a/sky/templates/sky-serve-controller.yaml.j2 +++ b/sky/templates/sky-serve-controller.yaml.j2 @@ -3,6 +3,8 @@ name: {{service_name}} setup: | + {{ sky_activate_python_env }} + # Install all cloud dependencies. # This is for multicloud support. To allow controller launch on all clouds, # we need to install all cloud dependencies. @@ -24,6 +26,9 @@ file_mounts: {%- endfor %} run: | + # Activate the Python environment, so that cloud SDKs can be found in the + # PATH. + {{ sky_activate_python_env }} # Start sky serve service. 
python -u -m sky.serve.service \ --service-name {{service_name}} \ diff --git a/sky/utils/controller_utils.py b/sky/utils/controller_utils.py index 9908fa54286..e99d370670e 100644 --- a/sky/utils/controller_utils.py +++ b/sky/utils/controller_utils.py @@ -347,7 +347,8 @@ def shared_controller_vars_to_fill( remote_user_config_path: str) -> Dict[str, str]: vars_to_fill: Dict[str, Any] = { 'cloud_dependencies_installation_commands': - _get_cloud_dependencies_installation_commands(controller) + _get_cloud_dependencies_installation_commands(controller), + 'sky_activate_python_env': constants.ACTIVATE_SKY_REMOTE_PYTHON_ENV, } env_vars: Dict[str, str] = { env.value: '1' for env in env_options.Options if env.get() From 75f7833cd14b524f7b9da08aaf928ee6e3683740 Mon Sep 17 00:00:00 2001 From: Zhanghao Wu Date: Wed, 22 May 2024 05:47:37 +0000 Subject: [PATCH 12/38] move activate to maybe_skylet --- sky/provision/instance_setup.py | 3 ++- sky/skylet/attempt_skylet.py | 1 - 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/sky/provision/instance_setup.py b/sky/provision/instance_setup.py index 2e07f026616..07f2c513c7f 100644 --- a/sky/provision/instance_setup.py +++ b/sky/provision/instance_setup.py @@ -61,7 +61,8 @@ 'done;') # Restart skylet when the version does not match to keep the skylet up-to-date. -MAYBE_SKYLET_RESTART_CMD = (f'{constants.SKY_PYTHON_CMD} -m ' +MAYBE_SKYLET_RESTART_CMD = (f'{constants.ACTIVATE_SKY_REMOTE_PYTHON_ENV}; ' + f'{constants.SKY_PYTHON_CMD} -m ' 'sky.skylet.attempt_skylet;') diff --git a/sky/skylet/attempt_skylet.py b/sky/skylet/attempt_skylet.py index 49775790932..be1faa8c3b6 100644 --- a/sky/skylet/attempt_skylet.py +++ b/sky/skylet/attempt_skylet.py @@ -23,7 +23,6 @@ def restart_skylet(): subprocess.run( # Activate python environment first to make sure skylet can find the # cloud SDK for autostopping. 
- f'{constants.ACTIVATE_SKY_REMOTE_PYTHON_ENV}; ' f'nohup {constants.SKY_PYTHON_CMD} -m sky.skylet.skylet' ' >> ~/.sky/skylet.log 2>&1 &', shell=True, From cd85d4262049abbaa095146001da925188e56a5d Mon Sep 17 00:00:00 2001 From: Zhanghao Wu Date: Wed, 22 May 2024 06:51:53 +0000 Subject: [PATCH 13/38] Make axolotl example work for kubernetes --- llm/axolotl/axolotl-docker.yaml | 35 ++++++++++++++++++++++++++++ llm/axolotl/axolotl-spot.yaml | 17 +++----------- llm/axolotl/axolotl.yaml | 19 ++++----------- sky/backends/backend_utils.py | 5 +++- sky/provision/kubernetes/instance.py | 2 +- sky/setup_files/setup.py | 12 +++------- sky/skylet/constants.py | 22 +++++++++++------ sky/skylet/events.py | 9 +++++++ 8 files changed, 74 insertions(+), 47 deletions(-) create mode 100644 llm/axolotl/axolotl-docker.yaml diff --git a/llm/axolotl/axolotl-docker.yaml b/llm/axolotl/axolotl-docker.yaml new file mode 100644 index 00000000000..9cec1d1f331 --- /dev/null +++ b/llm/axolotl/axolotl-docker.yaml @@ -0,0 +1,35 @@ +# Usage: +# HF_TOKEN=abc sky launch -c axolotl axolotl.yaml --env HF_TOKEN -y -i30 --down + +name: axolotl + +resources: + accelerators: L4:1 + cloud: gcp # optional + +workdir: mistral + +setup: | + docker pull winglian/axolotl:main-py3.10-cu118-2.0.1 + +run: | + docker run --gpus all \ + -v ~/sky_workdir:/sky_workdir \ + -v /root/.cache:/root/.cache \ + winglian/axolotl:main-py3.10-cu118-2.0.1 \ + huggingface-cli login --token ${HF_TOKEN} + + docker run --gpus all \ + -v ~/sky_workdir:/sky_workdir \ + -v /root/.cache:/root/.cache \ + winglian/axolotl:main-py3.10-cu118-2.0.1 \ + accelerate launch -m axolotl.cli.train /sky_workdir/qlora.yaml + +envs: + HF_TOKEN: # TODO: Fill with your own huggingface token, or use --env to pass. 
+ + + + + + diff --git a/llm/axolotl/axolotl-spot.yaml b/llm/axolotl/axolotl-spot.yaml index 942f4ccc4ba..e444a796e51 100644 --- a/llm/axolotl/axolotl-spot.yaml +++ b/llm/axolotl/axolotl-spot.yaml @@ -12,6 +12,7 @@ resources: accelerators: A100:1 cloud: gcp # optional use_spot: True + image_id: docker:winglian/axolotl:main-py3.10-cu118-2.0.1 workdir: mistral @@ -20,22 +21,10 @@ file_mounts: name: ${BUCKET} mode: MOUNT -setup: | - docker pull winglian/axolotl:main-py3.10-cu118-2.0.1 - run: | - docker run --gpus all \ - -v ~/sky_workdir:/sky_workdir \ - -v /root/.cache:/root/.cache \ - winglian/axolotl:main-py3.10-cu118-2.0.1 \ - huggingface-cli login --token ${HF_TOKEN} + huggingface-cli login --token ${HF_TOKEN} - docker run --gpus all \ - -v ~/sky_workdir:/sky_workdir \ - -v /root/.cache:/root/.cache \ - -v /sky-notebook:/sky-notebook \ - winglian/axolotl:main-py3.10-cu118-2.0.1 \ - accelerate launch -m axolotl.cli.train /sky_workdir/qlora-checkpoint.yaml + accelerate launch -m axolotl.cli.train /sky_workdir/qlora-checkpoint.yaml envs: HF_TOKEN: # TODO: Fill with your own huggingface token, or use --env to pass. 
diff --git a/llm/axolotl/axolotl.yaml b/llm/axolotl/axolotl.yaml index 9cec1d1f331..636a344441d 100644 --- a/llm/axolotl/axolotl.yaml +++ b/llm/axolotl/axolotl.yaml @@ -5,25 +5,14 @@ name: axolotl resources: accelerators: L4:1 - cloud: gcp # optional + image_id: docker:winglian/axolotl:main-py3.10-cu118-2.0.1 workdir: mistral -setup: | - docker pull winglian/axolotl:main-py3.10-cu118-2.0.1 - run: | - docker run --gpus all \ - -v ~/sky_workdir:/sky_workdir \ - -v /root/.cache:/root/.cache \ - winglian/axolotl:main-py3.10-cu118-2.0.1 \ - huggingface-cli login --token ${HF_TOKEN} - - docker run --gpus all \ - -v ~/sky_workdir:/sky_workdir \ - -v /root/.cache:/root/.cache \ - winglian/axolotl:main-py3.10-cu118-2.0.1 \ - accelerate launch -m axolotl.cli.train /sky_workdir/qlora.yaml + huggingface-cli login --token ${HF_TOKEN} + + accelerate launch -m axolotl.cli.train /sky_workdir/qlora.yaml envs: HF_TOKEN: # TODO: Fill with your own huggingface token, or use --env to pass. diff --git a/sky/backends/backend_utils.py b/sky/backends/backend_utils.py index ed9e0d00045..e691fb06264 100644 --- a/sky/backends/backend_utils.py +++ b/sky/backends/backend_utils.py @@ -925,7 +925,10 @@ def write_cluster_config( 'dump_port_command': dump_port_command, # Sky-internal constants. 'sky_ray_cmd': constants.SKY_RAY_CMD, - 'sky_pip_cmd': constants.SKY_PIP_CMD, + # pip install needs to have python env activated to make sure + # installed packages are within the env path. + 'sky_pip_cmd': (f'{constants.ACTIVATE_SKY_REMOTE_PYTHON_ENV}; ' + f'{constants.SKY_PIP_CMD}'), 'ray_version': constants.SKY_REMOTE_RAY_VERSION, # Command for waiting ray cluster to be ready on head. 
'ray_head_wait_initialized_command': diff --git a/sky/provision/kubernetes/instance.py b/sky/provision/kubernetes/instance.py index 9068079701f..2c95e9d3734 100644 --- a/sky/provision/kubernetes/instance.py +++ b/sky/provision/kubernetes/instance.py @@ -241,7 +241,7 @@ def _wait_for_pods_to_run(namespace, new_nodes): 'the node. Error details: ' f'{container_status.state.waiting.message}.') # Reaching this point means that one of the pods had an issue, - # so break out of the loop + # so break out of the loop, and wait until next second. break if all_pods_running: diff --git a/sky/setup_files/setup.py b/sky/setup_files/setup.py index adde7d6ab84..fbec585f411 100644 --- a/sky/setup_files/setup.py +++ b/sky/setup_files/setup.py @@ -173,15 +173,9 @@ def parse_readme(readme: str) -> str: ] remote = [ - # Adopted from ray's setup.py: https://github.com/ray-project/ray/blob/ray-2.4.0/python/setup.py - # SkyPilot: != 1.48.0 is required to avoid the error where ray dashboard fails to start when - # ray start is called (#2054). 
- # Tracking issue: https://github.com/ray-project/ray/issues/30984 - "grpcio >= 1.32.0, <= 1.49.1, != 1.48.0; python_version < '3.10' and sys_platform == 'darwin'", # noqa:E501 - "grpcio >= 1.42.0, <= 1.49.1, != 1.48.0; python_version >= '3.10' and sys_platform == 'darwin'", # noqa:E501 - # Original issue: https://github.com/ray-project/ray/issues/33833 - "grpcio >= 1.32.0, <= 1.51.3, != 1.48.0; python_version < '3.10' and sys_platform != 'darwin'", # noqa:E501 - "grpcio >= 1.42.0, <= 1.51.3, != 1.48.0; python_version >= '3.10' and sys_platform != 'darwin'", # noqa:E501 + # Adopted from ray's setup.py: https://github.com/ray-project/ray/blob/ray-2.9.3/python/setup.py + 'grpcio >= 1.32.0, != 1.56.0; python_version < "3.10"', # noqa:E501 + 'grpcio >= 1.42.0, != 1.56.0; python_version >= "3.10"', # noqa:E501 # Adopted from ray's setup.py: # https://github.com/ray-project/ray/blob/ray-2.9.3/python/setup.py#L343 'protobuf >= 3.15.3, != 3.19.5', diff --git a/sky/skylet/constants.py b/sky/skylet/constants.py index d6f62901efb..8cc19f80764 100644 --- a/sky/skylet/constants.py +++ b/sky/skylet/constants.py @@ -42,6 +42,7 @@ # Separate env for SkyPilot runtime dependencies. SKY_REMOTE_PYTHON_ENV_NAME = 'skypilot-runtime' SKY_REMOTE_PYTHON_ENV = f'~/{SKY_REMOTE_PYTHON_ENV_NAME}' +ACTIVATE_SKY_REMOTE_PYTHON_ENV = f'source {SKY_REMOTE_PYTHON_ENV}/bin/activate' # The name for the environment variable that stores the unique ID of the # current task. This will stay the same across multiple recoveries of the @@ -100,14 +101,23 @@ 'conda config --set auto_activate_base true && ' # Use $(echo ~) instead of ~ to avoid the error "no such file or directory". # Also, not using $HOME to avoid the error HOME variable not set. 
- f'echo "$(echo ~)/miniconda3/bin/python" > {SKY_PYTHON_PATH_FILE}; }}; ' + f'conda activate base; }}; ' 'grep "# >>> conda initialize >>>" ~/.bashrc || ' '{ conda init && source ~/.bashrc; };' + # If Python version is larger then equal to 3.12, create a new conda env + # with Python 3.10. + # We don't use a separate conda env for SkyPilot dependencies because it is + # costly to create a new conda env, and venv should be a lightweight and + # faster alternative when the python version satisfies the requirement. + '[[ $(python3 --version | cut -d " " -f 2 | cut -d "." -f 2) -ge 12 ]] && ' + f'echo "Creating conda env with Python 3.10" && ' + f'conda create -y -n {SKY_REMOTE_PYTHON_ENV_NAME} python=3.10 && ' + f'conda activate {SKY_REMOTE_PYTHON_ENV_NAME};' # Create a separate conda environment for SkyPilot dependencies. f'[ -d {SKY_REMOTE_PYTHON_ENV} ] || ' f'{{ {SKY_PYTHON_CMD} -m venv {SKY_REMOTE_PYTHON_ENV} --system-site-packages && ' - f'echo $(echo {SKY_REMOTE_PYTHON_ENV})/bin/python > {SKY_PYTHON_PATH_FILE}; }};' -) + f'echo "$(echo {SKY_REMOTE_PYTHON_ENV})/bin/python" > {SKY_PYTHON_PATH_FILE}; }};' + f'{ACTIVATE_SKY_REMOTE_PYTHON_ENV};') _sky_version = str(version.parse(sky.__version__)) RAY_STATUS = f'RAY_ADDRESS=127.0.0.1:{SKY_REMOTE_RAY_PORT} {SKY_RAY_CMD} status' @@ -146,8 +156,8 @@ 'export PATH=$PATH:$HOME/.local/bin; ' # Writes ray path to file if it does not exist or the file is empty. 
f'[ -s {SKY_RAY_PATH_FILE} ] || ' - f'{{ source {SKY_REMOTE_PYTHON_ENV}/bin/activate && ' - f'which ray > {SKY_RAY_PATH_FILE}; }}' + f'{{ {ACTIVATE_SKY_REMOTE_PYTHON_ENV} && ' + f'which ray > {SKY_RAY_PATH_FILE} || exit 1; }}; ' # END ray package check and installation f'{{ {SKY_PIP_CMD} list | grep "skypilot " && ' '[ "$(cat ~/.sky/wheels/current_sky_wheel_hash)" == "{sky_wheel_hash}" ]; } || ' # pylint: disable=line-too-long @@ -166,8 +176,6 @@ f'&& {{ {SKY_PYTHON_CMD} -c "from sky.skylet.ray_patches import patch; patch()" ' '|| exit 1; };') -ACTIVATE_SKY_REMOTE_PYTHON_ENV = f'source {SKY_REMOTE_PYTHON_ENV}/bin/activate' - # The name for the environment variable that stores SkyPilot user hash, which # is mainly used to make sure sky commands runs on a VM launched by SkyPilot # will be recognized as the same user (e.g., jobs controller or sky serve diff --git a/sky/skylet/events.py b/sky/skylet/events.py index c63b42cc438..c5e2c235042 100644 --- a/sky/skylet/events.py +++ b/sky/skylet/events.py @@ -206,6 +206,15 @@ def _stop_cluster(self, autostop_config): # `. env=env) + # Stop the ray autoscaler to avoid scaling up, during + # stopping/terminating of the cluster. We do not rely `ray down` + # below for stopping ray cluster, as it will not use the correct + # ray path. 
+ logger.info('Stopping the ray cluster.') + subprocess.run(f'{constants.SKY_RAY_CMD} stop', + shell=True, + check=True) + logger.info('Running final ray down.') subprocess.run( f'{constants.SKY_RAY_CMD} down -y {config_path}', From d98dbd4893aebb141e989a2370a53bfd4c346b8b Mon Sep 17 00:00:00 2001 From: Zhanghao Wu Date: Wed, 22 May 2024 07:24:58 +0000 Subject: [PATCH 14/38] fix axolotl --- llm/axolotl/axolotl-spot.yaml | 3 ++- llm/axolotl/axolotl.yaml | 2 +- sky/skylet/constants.py | 2 +- 3 files changed, 4 insertions(+), 3 deletions(-) diff --git a/llm/axolotl/axolotl-spot.yaml b/llm/axolotl/axolotl-spot.yaml index e444a796e51..9d34f11193f 100644 --- a/llm/axolotl/axolotl-spot.yaml +++ b/llm/axolotl/axolotl-spot.yaml @@ -24,7 +24,7 @@ file_mounts: run: | huggingface-cli login --token ${HF_TOKEN} - accelerate launch -m axolotl.cli.train /sky_workdir/qlora-checkpoint.yaml + accelerate launch -m axolotl.cli.train qlora-checkpoint.yaml envs: HF_TOKEN: # TODO: Fill with your own huggingface token, or use --env to pass. @@ -35,3 +35,4 @@ envs: +4 diff --git a/llm/axolotl/axolotl.yaml b/llm/axolotl/axolotl.yaml index 636a344441d..f46588e9aae 100644 --- a/llm/axolotl/axolotl.yaml +++ b/llm/axolotl/axolotl.yaml @@ -12,7 +12,7 @@ workdir: mistral run: | huggingface-cli login --token ${HF_TOKEN} - accelerate launch -m axolotl.cli.train /sky_workdir/qlora.yaml + accelerate launch -m axolotl.cli.train qlora.yaml envs: HF_TOKEN: # TODO: Fill with your own huggingface token, or use --env to pass. diff --git a/sky/skylet/constants.py b/sky/skylet/constants.py index 8cc19f80764..7deb9f6d02f 100644 --- a/sky/skylet/constants.py +++ b/sky/skylet/constants.py @@ -95,7 +95,7 @@ # AWS's Deep Learning AMI's default conda environment. 
CONDA_INSTALLATION_COMMANDS = ( 'which conda > /dev/null 2>&1 || ' - '{ wget -nc https://repo.anaconda.com/miniconda/Miniconda3-py310_23.11.0-2-Linux-x86_64.sh -O Miniconda3-Linux-x86_64.sh && ' # pylint: disable=line-too-long + '{ wget -nc https://repo.anaconda.com/miniconda/Miniconda3-py310_23.11.0-2-Linux-x86_64.sh -O Miniconda3-Linux-x86_64.sh; ' # pylint: disable=line-too-long 'bash Miniconda3-Linux-x86_64.sh -b && ' 'eval "$(~/miniconda3/bin/conda shell.bash hook)" && conda init && ' 'conda config --set auto_activate_base true && ' From daf746143aaaa3e16608e88ef483288d06074f26 Mon Sep 17 00:00:00 2001 From: Zhanghao Wu Date: Wed, 22 May 2024 07:44:47 +0000 Subject: [PATCH 15/38] add test for 3.12 --- tests/test_smoke.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/tests/test_smoke.py b/tests/test_smoke.py index 1da2682412e..8b6ac25fac0 100644 --- a/tests/test_smoke.py +++ b/tests/test_smoke.py @@ -1026,8 +1026,11 @@ def test_kubernetes_storage_mounts(): 'docker:nvidia/cuda:11.8.0-devel-ubuntu18.04', 'docker:ubuntu:18.04', # Test latest image with python 3.11 installed by default. - # Does not work for python 3.12 due to ray's requirement for 3.11. 'docker:continuumio/miniconda3:24.1.2-0', + # Test python 3.12 where SkyPilot should automatically create a separate + # conda env for runtime with python 3.10. 
+ 'docker:continuumio/miniconda3:latest', + ]) def test_docker_storage_mounts(generic_cloud: str, image_id: str): # Tests bucket mounting on docker container From 833d735b2566063c8fe6a7c8d3b765405976db62 Mon Sep 17 00:00:00 2001 From: Zhanghao Wu Date: Wed, 22 May 2024 08:02:12 +0000 Subject: [PATCH 16/38] format --- tests/test_smoke.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/test_smoke.py b/tests/test_smoke.py index 8b6ac25fac0..221633dc1a4 100644 --- a/tests/test_smoke.py +++ b/tests/test_smoke.py @@ -1030,7 +1030,6 @@ def test_kubernetes_storage_mounts(): # Test python 3.12 where SkyPilot should automatically create a separate # conda env for runtime with python 3.10. 'docker:continuumio/miniconda3:latest', - ]) def test_docker_storage_mounts(generic_cloud: str, image_id: str): # Tests bucket mounting on docker container From 44bcb6bb4a55481b32664ed74a644dd49b6545fd Mon Sep 17 00:00:00 2001 From: Zhanghao Wu Date: Wed, 22 May 2024 09:02:53 +0000 Subject: [PATCH 17/38] Fix docker PATH --- sky/provision/docker_utils.py | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/sky/provision/docker_utils.py b/sky/provision/docker_utils.py index 10ae5dafc07..70214418168 100644 --- a/sky/provision/docker_utils.py +++ b/sky/provision/docker_utils.py @@ -244,6 +244,8 @@ def initialize(self) -> str: self._run(start_command) # SkyPilot: Setup Commands. + # TODO(zhwu): the following setups should be aligned with the kubernetes + # pod setup, like provision.kubernetes.instance::_set_env_vars_in_pods # TODO(tian): These setup commands assumed that the container is # debian-based. We should make it more general. 
# Most of docker images are using root as default user, so we set an @@ -296,8 +298,16 @@ def initialize(self) -> str: 'mkdir -p ~/.ssh;' 'cat /tmp/host_ssh_authorized_keys >> ~/.ssh/authorized_keys;' 'sudo service ssh start;' - 'sudo sed -i "s/mesg n/tty -s \&\& mesg n/" ~/.profile;', + 'sudo sed -i "s/mesg n/tty -s \&\& mesg n/" ~/.profile;' + # SkyPilot: configure environment variables. A docker image can have + # environment variables set in the Dockerfile. We need to export + # these variables to the shell environment, so that our ssh session + # can access them. + 'printenv | while IFS=\'=\' read -r key value; do echo "export $key=\\\"$value\\\""; done > ' # pylint: disable=line-too-long + '~/docker_env_var.sh && ' + 'sudo mv ~/docker_env_var.sh /etc/profile.d/docker_env_var.sh', run_env='docker') + # SkyPilot: End of Setup Commands. docker_user = self._run('whoami', run_env='docker') From abb0f8d518d5e2958777e61764dc2e37ffc26546 Mon Sep 17 00:00:00 2001 From: Zhanghao Wu Date: Wed, 22 May 2024 09:03:21 +0000 Subject: [PATCH 18/38] format --- sky/provision/docker_utils.py | 1 - 1 file changed, 1 deletion(-) diff --git a/sky/provision/docker_utils.py b/sky/provision/docker_utils.py index 70214418168..f597f3cb24d 100644 --- a/sky/provision/docker_utils.py +++ b/sky/provision/docker_utils.py @@ -307,7 +307,6 @@ def initialize(self) -> str: '~/docker_env_var.sh && ' 'sudo mv ~/docker_env_var.sh /etc/profile.d/docker_env_var.sh', run_env='docker') - # SkyPilot: End of Setup Commands. 
docker_user = self._run('whoami', run_env='docker') From b996cf9c53be47cb344e88855dd05df4f4172d0d Mon Sep 17 00:00:00 2001 From: Zhanghao Wu Date: Wed, 22 May 2024 09:31:52 +0000 Subject: [PATCH 19/38] add axolotl image in test --- llm/axolotl/mistral/qlora-checkpoint.yaml | 3 ++- llm/axolotl/mistral/qlora.yaml | 3 ++- sky/provision/kubernetes/instance.py | 2 ++ tests/test_smoke.py | 15 ++++++++++++--- 4 files changed, 18 insertions(+), 5 deletions(-) diff --git a/llm/axolotl/mistral/qlora-checkpoint.yaml b/llm/axolotl/mistral/qlora-checkpoint.yaml index 278a5d72b9a..1f1cc67446c 100644 --- a/llm/axolotl/mistral/qlora-checkpoint.yaml +++ b/llm/axolotl/mistral/qlora-checkpoint.yaml @@ -71,6 +71,7 @@ warmup_steps: 10 eval_steps: 0.05 eval_table_size: eval_table_max_new_tokens: 128 +eval_sample_packing: false save_steps: 2 ## increase based on your dataset save_strategy: steps debug: @@ -81,4 +82,4 @@ fsdp_config: special_tokens: bos_token: "" eos_token: "" - unk_token: "" \ No newline at end of file + unk_token: "" diff --git a/llm/axolotl/mistral/qlora.yaml b/llm/axolotl/mistral/qlora.yaml index 42c3742b52d..39b2c55b1ce 100644 --- a/llm/axolotl/mistral/qlora.yaml +++ b/llm/axolotl/mistral/qlora.yaml @@ -69,6 +69,7 @@ warmup_steps: 10 eval_steps: 0.05 eval_table_size: eval_table_max_new_tokens: 128 +eval_sample_packing: false save_steps: debug: deepspeed: @@ -78,4 +79,4 @@ fsdp_config: special_tokens: bos_token: "" eos_token: "" - unk_token: "" \ No newline at end of file + unk_token: "" diff --git a/sky/provision/kubernetes/instance.py b/sky/provision/kubernetes/instance.py index 2c95e9d3734..b5de890e518 100644 --- a/sky/provision/kubernetes/instance.py +++ b/sky/provision/kubernetes/instance.py @@ -540,6 +540,8 @@ def _create_pods(region: str, cluster_name_on_cloud: str, _wait_for_pods_to_schedule(namespace, wait_pods, provision_timeout) # Wait until the pods and their containers are up and running, and # fail early if there is an error + 
logger.debug(f'run_instances: waiting for pods to be running (pulling ' + f'images): {list(wait_pods_dict.keys())}') _wait_for_pods_to_run(namespace, wait_pods) logger.debug(f'run_instances: all pods are scheduled and running: ' f'{list(wait_pods_dict.keys())}') diff --git a/tests/test_smoke.py b/tests/test_smoke.py index 221633dc1a4..bc73d5b67be 100644 --- a/tests/test_smoke.py +++ b/tests/test_smoke.py @@ -1027,7 +1027,7 @@ def test_kubernetes_storage_mounts(): 'docker:ubuntu:18.04', # Test latest image with python 3.11 installed by default. 'docker:continuumio/miniconda3:24.1.2-0', - # Test python 3.12 where SkyPilot should automatically create a separate + # Test python>=3.12 where SkyPilot should automatically create a separate # conda env for runtime with python 3.10. 'docker:continuumio/miniconda3:latest', ]) @@ -1210,8 +1210,15 @@ def test_job_queue(generic_cloud: str): 'docker:nvidia/cuda:11.8.0-devel-ubuntu18.04', 'docker:ubuntu:18.04', # Test latest image with python 3.11 installed by default. - # Does not work for python 3.12 due to ray's requirement for 3.11. 'docker:continuumio/miniconda3:24.1.2-0', + # Test python>=3.12 where SkyPilot should automatically create a separate + # conda env for runtime with python 3.10. + 'docker:continuumio/miniconda3:latest', + # Axolotl image is a good example custom image that has its conda path + # set in PATH with dockerfile and uses python>=3.12. It could test: + # 1. we handle the env var set in dockerfile correctly + # 2. python>=3.12 works with SkyPilot runtime. + 'docker:winglian/axolotl:main-latest' ]) def test_job_queue_with_docker(generic_cloud: str, image_id: str): name = _get_cluster_name() + image_id[len('docker:'):][:4] @@ -2915,8 +2922,10 @@ def test_aws_custom_image(): 'docker:nvidia/cuda:11.8.0-devel-ubuntu18.04', 'docker:ubuntu:18.04', # Test latest image with python 3.11 installed by default. - # Does not work for python 3.12 due to ray's requirement for 3.11. 
'docker:continuumio/miniconda3:24.1.2-0', + # Test python>=3.12 where SkyPilot should automatically create a separate + # conda env for runtime with python 3.10. + 'docker:continuumio/miniconda3:latest', ]) def test_kubernetes_custom_image(image_id): """Test Kubernetes custom image""" From 6bf68fc23a335d1d64000b14d936512de60edd43 Mon Sep 17 00:00:00 2001 From: Zhanghao Wu Date: Wed, 22 May 2024 09:39:32 +0000 Subject: [PATCH 20/38] address comments --- sky/utils/command_runner.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/sky/utils/command_runner.py b/sky/utils/command_runner.py index 0b8ffc74171..3ed69d0e2a1 100644 --- a/sky/utils/command_runner.py +++ b/sky/utils/command_runner.py @@ -184,9 +184,8 @@ def _get_command_to_run( # cluster by 1 second. # sourcing ~/.bashrc is not required for internal executions command += [ - shlex.quote( - 'true && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore' - f' && ({cmd})') + shlex.quote('true && export OMP_NUM_THREADS=1 ' + f'PYTHONWARNINGS=ignore && ({cmd})') ] if not separate_stderr: command.append('2>&1') @@ -433,8 +432,7 @@ def run( process_stream, separate_stderr, # A hack to remove the following SSH warning+bash warnings (twice): - # Warning: Permanently added 'xx.xx.xx.xx' to the list of known - # hosts. + # Warning: Permanently added 'xx.xx.xx.xx' to the list of known... # bash: cannot set terminal process group # bash: no job control in this shell # When not source_bashrc, the bash warning will only show once. 
From 7ad8c343f2a277798c3f37a054ddeb74eaf124bf Mon Sep 17 00:00:00 2001 From: Zhanghao Wu Date: Wed, 22 May 2024 18:10:50 +0000 Subject: [PATCH 21/38] revert grpcio version as it is only installed in our runtime env --- sky/setup_files/setup.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/sky/setup_files/setup.py b/sky/setup_files/setup.py index fbec585f411..adde7d6ab84 100644 --- a/sky/setup_files/setup.py +++ b/sky/setup_files/setup.py @@ -173,9 +173,15 @@ def parse_readme(readme: str) -> str: ] remote = [ - # Adopted from ray's setup.py: https://github.com/ray-project/ray/blob/ray-2.9.3/python/setup.py - 'grpcio >= 1.32.0, != 1.56.0; python_version < "3.10"', # noqa:E501 - 'grpcio >= 1.42.0, != 1.56.0; python_version >= "3.10"', # noqa:E501 + # Adopted from ray's setup.py: https://github.com/ray-project/ray/blob/ray-2.4.0/python/setup.py + # SkyPilot: != 1.48.0 is required to avoid the error where ray dashboard fails to start when + # ray start is called (#2054). 
+ # Tracking issue: https://github.com/ray-project/ray/issues/30984 + "grpcio >= 1.32.0, <= 1.49.1, != 1.48.0; python_version < '3.10' and sys_platform == 'darwin'", # noqa:E501 + "grpcio >= 1.42.0, <= 1.49.1, != 1.48.0; python_version >= '3.10' and sys_platform == 'darwin'", # noqa:E501 + # Original issue: https://github.com/ray-project/ray/issues/33833 + "grpcio >= 1.32.0, <= 1.51.3, != 1.48.0; python_version < '3.10' and sys_platform != 'darwin'", # noqa:E501 + "grpcio >= 1.42.0, <= 1.51.3, != 1.48.0; python_version >= '3.10' and sys_platform != 'darwin'", # noqa:E501 # Adopted from ray's setup.py: # https://github.com/ray-project/ray/blob/ray-2.9.3/python/setup.py#L343 'protobuf >= 3.15.3, != 3.19.5', From 536c7ad3dbd01809c272d94ae3af53d4e2e6d052 Mon Sep 17 00:00:00 2001 From: Zhanghao Wu Date: Wed, 22 May 2024 19:08:32 +0000 Subject: [PATCH 22/38] refactor command for env set up --- sky/provision/docker_utils.py | 19 ++++++++++++------- sky/provision/kubernetes/instance.py | 9 ++------- 2 files changed, 14 insertions(+), 14 deletions(-) diff --git a/sky/provision/docker_utils.py b/sky/provision/docker_utils.py index f597f3cb24d..b9ed689fdaf 100644 --- a/sky/provision/docker_utils.py +++ b/sky/provision/docker_utils.py @@ -15,6 +15,17 @@ DOCKER_PERMISSION_DENIED_STR = ('permission denied while trying to connect to ' 'the Docker daemon socket') +# Configure environment variables. A docker image can have environment variables +# set in the Dockerfile with `ENV``. We need to export these variables to the +# shell environment, so that our ssh session can access them. 
+SETUP_ENV_VARS_CMD = ( + 'prefix_cmd() ' + '{ if [ $(id -u) -ne 0 ]; then echo "sudo"; else echo ""; fi; } && ' + 'printenv | while IFS=\'=\' read -r key value; do echo "export $key=\\\"$value\\\""; done > ' # pylint: disable=line-too-long + '~/container_env_var.sh && ' + '$(prefix_cmd) mv ~/container_env_var.sh /etc/profile.d/container_env_var.sh' +) + @dataclasses.dataclass class DockerLoginConfig: @@ -299,13 +310,7 @@ def initialize(self) -> str: 'cat /tmp/host_ssh_authorized_keys >> ~/.ssh/authorized_keys;' 'sudo service ssh start;' 'sudo sed -i "s/mesg n/tty -s \&\& mesg n/" ~/.profile;' - # SkyPilot: configure environment variables. A docker image can have - # environment variables set in the Dockerfile. We need to export - # these variables to the shell environment, so that our ssh session - # can access them. - 'printenv | while IFS=\'=\' read -r key value; do echo "export $key=\\\"$value\\\""; done > ' # pylint: disable=line-too-long - '~/docker_env_var.sh && ' - 'sudo mv ~/docker_env_var.sh /etc/profile.d/docker_env_var.sh', + f'{SETUP_ENV_VARS_CMD}', run_env='docker') # SkyPilot: End of Setup Commands. 
diff --git a/sky/provision/kubernetes/instance.py b/sky/provision/kubernetes/instance.py index b5de890e518..4f88293525f 100644 --- a/sky/provision/kubernetes/instance.py +++ b/sky/provision/kubernetes/instance.py @@ -10,6 +10,7 @@ from sky import status_lib from sky.adaptors import kubernetes from sky.provision import common +from sky.provision import docker_utils from sky.provision.kubernetes import config as config_lib from sky.provision.kubernetes import utils as kubernetes_utils from sky.utils import common_utils @@ -301,13 +302,7 @@ def _set_env_vars_in_pods(namespace: str, new_pods: List): set_k8s_env_var_cmd = [ '/bin/sh', '-c', - ( - 'prefix_cmd() ' - '{ if [ $(id -u) -ne 0 ]; then echo "sudo"; else echo ""; fi; } && ' - 'printenv | while IFS=\'=\' read -r key value; do echo "export $key=\\\"$value\\\""; done > ' # pylint: disable=line-too-long - '~/k8s_env_var.sh && ' - 'mv ~/k8s_env_var.sh /etc/profile.d/k8s_env_var.sh || ' - '$(prefix_cmd) mv ~/k8s_env_var.sh /etc/profile.d/k8s_env_var.sh') + docker_utils.SETUP_ENV_VARS_CMD, ] for new_pod in new_pods: From 7133cbca6f3714729484b5bb55583959f8c6561b Mon Sep 17 00:00:00 2001 From: Zhanghao Wu Date: Wed, 22 May 2024 19:23:25 +0000 Subject: [PATCH 23/38] switch to curl as CentOS may not have wget installed but have curl --- sky/skylet/constants.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sky/skylet/constants.py b/sky/skylet/constants.py index 8cc19f80764..2e69ccdc6cb 100644 --- a/sky/skylet/constants.py +++ b/sky/skylet/constants.py @@ -95,7 +95,7 @@ # AWS's Deep Learning AMI's default conda environment. 
CONDA_INSTALLATION_COMMANDS = ( 'which conda > /dev/null 2>&1 || ' - '{ wget -nc https://repo.anaconda.com/miniconda/Miniconda3-py310_23.11.0-2-Linux-x86_64.sh -O Miniconda3-Linux-x86_64.sh && ' # pylint: disable=line-too-long + '{ curl https://repo.anaconda.com/miniconda/Miniconda3-py310_23.11.0-2-Linux-x86_64.sh -o Miniconda3-Linux-x86_64.sh && ' # pylint: disable=line-too-long 'bash Miniconda3-Linux-x86_64.sh -b && ' 'eval "$(~/miniconda3/bin/conda shell.bash hook)" && conda init && ' 'conda config --set auto_activate_base true && ' From 006670caac3d00a5f83ebf690b090ebf116f0da4 Mon Sep 17 00:00:00 2001 From: Zhanghao Wu Date: Wed, 22 May 2024 20:27:07 +0000 Subject: [PATCH 24/38] add l4 in command --- tests/kubernetes/README.md | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/tests/kubernetes/README.md b/tests/kubernetes/README.md index 655daf55b55..d00e3032e13 100644 --- a/tests/kubernetes/README.md +++ b/tests/kubernetes/README.md @@ -32,7 +32,7 @@ sky local up ```bash PROJECT_ID=$(gcloud config get-value project) CLUSTER_NAME=testclusterromil - gcloud beta container --project "${PROJECT_ID}" clusters create "${CLUSTER_NAME}" --zone "us-central1-c" --no-enable-basic-auth --cluster-version "1.29.1-gke.1589020" --release-channel "regular" --machine-type "n1-standard-8" --accelerator "type=nvidia-tesla-t4,count=1" --image-type "COS_CONTAINERD" --disk-type "pd-balanced" --disk-size "100" --metadata disable-legacy-endpoints=true --scopes "https://www.googleapis.com/auth/devstorage.read_only","https://www.googleapis.com/auth/logging.write","https://www.googleapis.com/auth/monitoring","https://www.googleapis.com/auth/servicecontrol","https://www.googleapis.com/auth/service.management.readonly","https://www.googleapis.com/auth/trace.append" --num-nodes "2" --logging=SYSTEM,WORKLOAD --monitoring=SYSTEM --enable-ip-alias --network "projects/${PROJECT_ID}/global/networks/default" --subnetwork 
"projects/${PROJECT_ID}/regions/us-central1/subnetworks/default" --no-enable-intra-node-visibility --default-max-pods-per-node "110" --security-posture=standard --workload-vulnerability-scanning=disabled --no-enable-master-authorized-networks --addons HorizontalPodAutoscaling,HttpLoadBalancing,GcePersistentDiskCsiDriver --enable-autoupgrade --enable-autorepair --max-surge-upgrade 1 --max-unavailable-upgrade 0 --enable-managed-prometheus --enable-shielded-nodes --node-locations "us-central1-c" && gcloud beta container --project "${PROJECT_ID}" node-pools create "v100" --cluster "${CLUSTER_NAME}" --zone "us-central1-c" --machine-type "n1-standard-8" --accelerator "type=nvidia-tesla-v100,count=1" --image-type "COS_CONTAINERD" --disk-type "pd-balanced" --disk-size "100" --metadata disable-legacy-endpoints=true --scopes "https://www.googleapis.com/auth/devstorage.read_only","https://www.googleapis.com/auth/logging.write","https://www.googleapis.com/auth/monitoring","https://www.googleapis.com/auth/servicecontrol","https://www.googleapis.com/auth/service.management.readonly","https://www.googleapis.com/auth/trace.append" --num-nodes "2" --enable-autoupgrade --enable-autorepair --max-surge-upgrade 1 --max-unavailable-upgrade 0 --node-locations "us-central1-c" && gcloud beta container --project "${PROJECT_ID}" node-pools create "largecpu" --cluster "${CLUSTER_NAME}" --zone "us-central1-c" --machine-type "n1-standard-16" --image-type "COS_CONTAINERD" --disk-type "pd-balanced" --disk-size "100" --metadata disable-legacy-endpoints=true --scopes "https://www.googleapis.com/auth/devstorage.read_only","https://www.googleapis.com/auth/logging.write","https://www.googleapis.com/auth/monitoring","https://www.googleapis.com/auth/servicecontrol","https://www.googleapis.com/auth/service.management.readonly","https://www.googleapis.com/auth/trace.append" --num-nodes "2" --enable-autoupgrade --enable-autorepair --max-surge-upgrade 1 --max-unavailable-upgrade 0 --node-locations 
"us-central1-c" + gcloud beta container --project "${PROJECT_ID}" clusters create "${CLUSTER_NAME}" --zone "us-central1-c" --no-enable-basic-auth --cluster-version "1.29.1-gke.1589020" --release-channel "regular" --machine-type "n1-standard-8" --accelerator "type=nvidia-tesla-t4,count=1" --image-type "COS_CONTAINERD" --disk-type "pd-balanced" --disk-size "100" --metadata disable-legacy-endpoints=true --scopes "https://www.googleapis.com/auth/devstorage.read_only","https://www.googleapis.com/auth/logging.write","https://www.googleapis.com/auth/monitoring","https://www.googleapis.com/auth/servicecontrol","https://www.googleapis.com/auth/service.management.readonly","https://www.googleapis.com/auth/trace.append" --num-nodes "2" --logging=SYSTEM,WORKLOAD --monitoring=SYSTEM --enable-ip-alias --network "projects/${PROJECT_ID}/global/networks/default" --subnetwork "projects/${PROJECT_ID}/regions/us-central1/subnetworks/default" --no-enable-intra-node-visibility --default-max-pods-per-node "110" --security-posture=standard --workload-vulnerability-scanning=disabled --no-enable-master-authorized-networks --addons HorizontalPodAutoscaling,HttpLoadBalancing,GcePersistentDiskCsiDriver --enable-autoupgrade --enable-autorepair --max-surge-upgrade 1 --max-unavailable-upgrade 0 --enable-managed-prometheus --enable-shielded-nodes --node-locations "us-central1-c" && gcloud beta container --project "${PROJECT_ID}" node-pools create "v100" --cluster "${CLUSTER_NAME}" --zone "us-central1-c" --machine-type "n1-standard-8" --accelerator "type=nvidia-tesla-v100,count=1" --image-type "COS_CONTAINERD" --disk-type "pd-balanced" --disk-size "100" --metadata disable-legacy-endpoints=true --scopes 
"https://www.googleapis.com/auth/devstorage.read_only","https://www.googleapis.com/auth/logging.write","https://www.googleapis.com/auth/monitoring","https://www.googleapis.com/auth/servicecontrol","https://www.googleapis.com/auth/service.management.readonly","https://www.googleapis.com/auth/trace.append" --num-nodes "2" --enable-autoupgrade --enable-autorepair --max-surge-upgrade 1 --max-unavailable-upgrade 0 --node-locations "us-central1-c" && gcloud beta container --project "${PROJECT_ID}" node-pools create "largecpu" --cluster "${CLUSTER_NAME}" --zone "us-central1-c" --machine-type "n1-standard-16" --image-type "COS_CONTAINERD" --disk-type "pd-balanced" --disk-size "100" --metadata disable-legacy-endpoints=true --scopes "https://www.googleapis.com/auth/devstorage.read_only","https://www.googleapis.com/auth/logging.write","https://www.googleapis.com/auth/monitoring","https://www.googleapis.com/auth/servicecontrol","https://www.googleapis.com/auth/service.management.readonly","https://www.googleapis.com/auth/trace.append" --num-nodes "2" --enable-autoupgrade --enable-autorepair --max-surge-upgrade 1 --max-unavailable-upgrade 0 --node-locations "us-central1-c" && gcloud beta container --project "${PROJECT_ID}" node-pools create "l4" --cluster "${CLUSTER_NAME}" --zone "us-central1-c" --machine-type "g2-standard-4" --accelerator "type=nvidia-l4,count=1" --image-type "COS_CONTAINERD" --disk-type "pd-balanced" --disk-size "100" --metadata disable-legacy-endpoints=true --scopes "https://www.googleapis.com/auth/devstorage.read_only","https://www.googleapis.com/auth/logging.write","https://www.googleapis.com/auth/monitoring","https://www.googleapis.com/auth/servicecontrol","https://www.googleapis.com/auth/service.management.readonly","https://www.googleapis.com/auth/trace.append" --num-nodes "2" --enable-autoupgrade --enable-autorepair --max-surge-upgrade 1 --max-unavailable-upgrade 0 --node-locations "us-central1-c" ``` 2. 
Get the kubeconfig for your cluster and place it in `~/.kube/config`: ```bash @@ -45,9 +45,13 @@ sky local up ```bash # If using COS based nodes (e.g., in the example above): kubectl apply -f https://raw.githubusercontent.com/GoogleCloudPlatform/container-engine-accelerators/master/nvidia-driver-installer/cos/daemonset-preloaded.yaml + + kubectl apply -f https://raw.githubusercontent.com/GoogleCloudPlatform/container-engine-accelerators/master/nvidia-driver-installer/cos/daemonset-preloaded-latest.yaml # If using Ubuntu based nodes: kubectl apply -f https://raw.githubusercontent.com/GoogleCloudPlatform/container-engine-accelerators/master/nvidia-driver-installer/ubuntu/daemonset-preloaded.yaml + + kubectl apply -f https://raw.githubusercontent.com/GoogleCloudPlatform/container-engine-accelerators/master/nvidia-driver-installer/ubuntu/daemonset-preloaded-R525.yaml ``` This will create a resource like `nvidia.com/gpu: 1`. You can verify this resource is available by running: ```bash From 767377786271dfa2c5a04dd5f02e2080f5d8405e Mon Sep 17 00:00:00 2001 From: Zhanghao Wu Date: Wed, 22 May 2024 20:40:23 +0000 Subject: [PATCH 25/38] fix dependency for test --- tests/skyserve/cancel/cancel.yaml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tests/skyserve/cancel/cancel.yaml b/tests/skyserve/cancel/cancel.yaml index 1e4007878fb..a8dab23b822 100644 --- a/tests/skyserve/cancel/cancel.yaml +++ b/tests/skyserve/cancel/cancel.yaml @@ -13,4 +13,6 @@ resources: workdir: examples/serve/misc/cancel +setup: pip install aiohttp + run: python3 server.py --port 9000 From 32db6386d565c618e93c438c5f4092bdef511582 Mon Sep 17 00:00:00 2001 From: Zhanghao Wu Date: Thu, 23 May 2024 00:57:26 +0000 Subject: [PATCH 26/38] fix python path for ray executable --- sky/backends/backend_utils.py | 5 +---- sky/provision/instance_setup.py | 2 ++ sky/skylet/constants.py | 10 ++++++++-- sky/templates/azure-ray.yml.j2 | 4 ++-- tests/conftest.py | 2 +- 5 files changed, 14 insertions(+), 9 
deletions(-) diff --git a/sky/backends/backend_utils.py b/sky/backends/backend_utils.py index e691fb06264..97b4947fa6b 100644 --- a/sky/backends/backend_utils.py +++ b/sky/backends/backend_utils.py @@ -927,14 +927,11 @@ def write_cluster_config( 'sky_ray_cmd': constants.SKY_RAY_CMD, # pip install needs to have python env activated to make sure # installed packages are within the env path. - 'sky_pip_cmd': (f'{constants.ACTIVATE_SKY_REMOTE_PYTHON_ENV}; ' - f'{constants.SKY_PIP_CMD}'), + 'sky_pip_cmd': f'{constants.SKY_PIP_CMD}', 'ray_version': constants.SKY_REMOTE_RAY_VERSION, # Command for waiting ray cluster to be ready on head. 'ray_head_wait_initialized_command': instance_setup.RAY_HEAD_WAIT_INITIALIZED_COMMAND, - 'sky_activate_python_env': - constants.ACTIVATE_SKY_REMOTE_PYTHON_ENV, # Cloud credentials for cloud storage. 'credentials': credentials, diff --git a/sky/provision/instance_setup.py b/sky/provision/instance_setup.py index 07f2c513c7f..c81ecd78db4 100644 --- a/sky/provision/instance_setup.py +++ b/sky/provision/instance_setup.py @@ -61,6 +61,8 @@ 'done;') # Restart skylet when the version does not match to keep the skylet up-to-date. +# We need to activate the python environment to make sure autostop in skylet +# can find the cloud SDK/CLI in PATH. 
MAYBE_SKYLET_RESTART_CMD = (f'{constants.ACTIVATE_SKY_REMOTE_PYTHON_ENV}; ' f'{constants.SKY_PYTHON_CMD} -m ' 'sky.skylet.attempt_skylet;') diff --git a/sky/skylet/constants.py b/sky/skylet/constants.py index 2e69ccdc6cb..4372e354e84 100644 --- a/sky/skylet/constants.py +++ b/sky/skylet/constants.py @@ -37,7 +37,13 @@ SKY_PYTHON_CMD = f'$({SKY_GET_PYTHON_PATH_CMD})' SKY_PIP_CMD = f'{SKY_PYTHON_CMD} -m pip' # Ray executable, e.g., /opt/conda/bin/ray -SKY_RAY_CMD = (f'$([ -s {SKY_RAY_PATH_FILE} ] && ' +# We need to add SKY_PYTHON_CMD before ray executable because: +# The ray executable is a python script with a header like: +# #!/opt/conda/bin/python3 +# When we create the skypilot-runtime venv, the previously installed ray +# executable will be reused (due to --system-site-packages), and that will cause +# running ray CLI commands to use the wrong python executable. +SKY_RAY_CMD = (f'{SKY_PYTHON_CMD} $([ -s {SKY_RAY_PATH_FILE} ] && ' f'cat {SKY_RAY_PATH_FILE} 2> /dev/null || which ray)') # Separate env for SkyPilot runtime dependencies. SKY_REMOTE_PYTHON_ENV_NAME = 'skypilot-runtime' @@ -117,7 +123,7 @@ f'[ -d {SKY_REMOTE_PYTHON_ENV} ] || ' f'{{ {SKY_PYTHON_CMD} -m venv {SKY_REMOTE_PYTHON_ENV} --system-site-packages && ' f'echo "$(echo {SKY_REMOTE_PYTHON_ENV})/bin/python" > {SKY_PYTHON_PATH_FILE}; }};' - f'{ACTIVATE_SKY_REMOTE_PYTHON_ENV};') +) _sky_version = str(version.parse(sky.__version__)) RAY_STATUS = f'RAY_ADDRESS=127.0.0.1:{SKY_REMOTE_RAY_PORT} {SKY_RAY_CMD} status' diff --git a/sky/templates/azure-ray.yml.j2 b/sky/templates/azure-ray.yml.j2 index 803327f1032..f1477d92132 100644 --- a/sky/templates/azure-ray.yml.j2 +++ b/sky/templates/azure-ray.yml.j2 @@ -164,14 +164,14 @@ setup_commands: # current num items (num SSH connections): 2 head_start_ray_commands: # NOTE: --disable-usage-stats in `ray start` saves 10 seconds of idle wait. 
- - {{ sky_activate_python_env }}; {{ sky_ray_cmd }} stop; RAY_SCHEDULER_EVENTS=0 RAY_DEDUP_LOGS=0 {{ sky_ray_cmd }} start --disable-usage-stats --head --port={{ray_port}} --dashboard-port={{ray_dashboard_port}} --object-manager-port=8076 --autoscaling-config=~/ray_bootstrap_config.yaml {{"--num-gpus=%s" % num_gpus if num_gpus}} {{"--resources='%s'" % custom_resources if custom_resources}} --temp-dir {{ray_temp_dir}} || exit 1; + - {{ sky_ray_cmd }} stop; RAY_SCHEDULER_EVENTS=0 RAY_DEDUP_LOGS=0 {{ sky_ray_cmd }} start --disable-usage-stats --head --port={{ray_port}} --dashboard-port={{ray_dashboard_port}} --object-manager-port=8076 --autoscaling-config=~/ray_bootstrap_config.yaml {{"--num-gpus=%s" % num_gpus if num_gpus}} {{"--resources='%s'" % custom_resources if custom_resources}} --temp-dir {{ray_temp_dir}} || exit 1; which prlimit && for id in $(pgrep -f raylet/raylet); do sudo prlimit --nofile=1048576:1048576 --pid=$id || true; done; {{dump_port_command}}; {{ray_head_wait_initialized_command}} {%- if num_nodes > 1 %} worker_start_ray_commands: - - {{ sky_activate_python_env }}; {{ sky_ray_cmd }} stop; RAY_SCHEDULER_EVENTS=0 RAY_DEDUP_LOGS=0 {{ sky_ray_cmd }} start --disable-usage-stats --address=$RAY_HEAD_IP:{{ray_port}} --object-manager-port=8076 {{"--num-gpus=%s" % num_gpus if num_gpus}} {{"--resources='%s'" % custom_resources if custom_resources}} --temp-dir {{ray_temp_dir}} || exit 1; + - {{ sky_ray_cmd }} stop; RAY_SCHEDULER_EVENTS=0 RAY_DEDUP_LOGS=0 {{ sky_ray_cmd }} start --disable-usage-stats --address=$RAY_HEAD_IP:{{ray_port}} --object-manager-port=8076 {{"--num-gpus=%s" % num_gpus if num_gpus}} {{"--resources='%s'" % custom_resources if custom_resources}} --temp-dir {{ray_temp_dir}} || exit 1; which prlimit && for id in $(pgrep -f raylet/raylet); do sudo prlimit --nofile=1048576:1048576 --pid=$id || true; done; {%- else %} worker_start_ray_commands: [] diff --git a/tests/conftest.py b/tests/conftest.py index a7b7f10217a..ce92afd88c7 100644 --- 
a/tests/conftest.py +++ b/tests/conftest.py @@ -160,7 +160,7 @@ def pytest_collection_modifyitems(config, items): serial_mark = pytest.mark.xdist_group( name=f'serial_{generic_cloud_keyword}') # Handle generic tests - if generic_cloud in ['lambda', 'kubernetes']: + if generic_cloud in ['lambda']: for item in items: if (_is_generic_test(item) and f'no_{generic_cloud_keyword}' not in item.keywords): From 4358afbe8cf6fe115eb2e9c564ef30c93673efca Mon Sep 17 00:00:00 2001 From: Zhanghao Wu Date: Thu, 23 May 2024 01:22:32 +0000 Subject: [PATCH 27/38] Fix azure launch --- sky/backends/backend_utils.py | 2 ++ sky/templates/azure-ray.yml.j2 | 4 ++-- sky/templates/ibm-ray.yml.j2 | 4 ++-- sky/templates/lambda-ray.yml.j2 | 4 ++-- sky/templates/oci-ray.yml.j2 | 4 ++-- sky/templates/scp-ray.yml.j2 | 4 ++-- 6 files changed, 12 insertions(+), 10 deletions(-) diff --git a/sky/backends/backend_utils.py b/sky/backends/backend_utils.py index 97b4947fa6b..5ad68f2d212 100644 --- a/sky/backends/backend_utils.py +++ b/sky/backends/backend_utils.py @@ -928,6 +928,8 @@ def write_cluster_config( # pip install needs to have python env activated to make sure # installed packages are within the env path. 'sky_pip_cmd': f'{constants.SKY_PIP_CMD}', + 'sky_activate_python_env': + constants.ACTIVATE_SKY_REMOTE_PYTHON_ENV, 'ray_version': constants.SKY_REMOTE_RAY_VERSION, # Command for waiting ray cluster to be ready on head. 'ray_head_wait_initialized_command': diff --git a/sky/templates/azure-ray.yml.j2 b/sky/templates/azure-ray.yml.j2 index f1477d92132..803327f1032 100644 --- a/sky/templates/azure-ray.yml.j2 +++ b/sky/templates/azure-ray.yml.j2 @@ -164,14 +164,14 @@ setup_commands: # current num items (num SSH connections): 2 head_start_ray_commands: # NOTE: --disable-usage-stats in `ray start` saves 10 seconds of idle wait. 
- - {{ sky_ray_cmd }} stop; RAY_SCHEDULER_EVENTS=0 RAY_DEDUP_LOGS=0 {{ sky_ray_cmd }} start --disable-usage-stats --head --port={{ray_port}} --dashboard-port={{ray_dashboard_port}} --object-manager-port=8076 --autoscaling-config=~/ray_bootstrap_config.yaml {{"--num-gpus=%s" % num_gpus if num_gpus}} {{"--resources='%s'" % custom_resources if custom_resources}} --temp-dir {{ray_temp_dir}} || exit 1; + - {{ sky_activate_python_env }}; {{ sky_ray_cmd }} stop; RAY_SCHEDULER_EVENTS=0 RAY_DEDUP_LOGS=0 {{ sky_ray_cmd }} start --disable-usage-stats --head --port={{ray_port}} --dashboard-port={{ray_dashboard_port}} --object-manager-port=8076 --autoscaling-config=~/ray_bootstrap_config.yaml {{"--num-gpus=%s" % num_gpus if num_gpus}} {{"--resources='%s'" % custom_resources if custom_resources}} --temp-dir {{ray_temp_dir}} || exit 1; which prlimit && for id in $(pgrep -f raylet/raylet); do sudo prlimit --nofile=1048576:1048576 --pid=$id || true; done; {{dump_port_command}}; {{ray_head_wait_initialized_command}} {%- if num_nodes > 1 %} worker_start_ray_commands: - - {{ sky_ray_cmd }} stop; RAY_SCHEDULER_EVENTS=0 RAY_DEDUP_LOGS=0 {{ sky_ray_cmd }} start --disable-usage-stats --address=$RAY_HEAD_IP:{{ray_port}} --object-manager-port=8076 {{"--num-gpus=%s" % num_gpus if num_gpus}} {{"--resources='%s'" % custom_resources if custom_resources}} --temp-dir {{ray_temp_dir}} || exit 1; + - {{ sky_activate_python_env }}; {{ sky_ray_cmd }} stop; RAY_SCHEDULER_EVENTS=0 RAY_DEDUP_LOGS=0 {{ sky_ray_cmd }} start --disable-usage-stats --address=$RAY_HEAD_IP:{{ray_port}} --object-manager-port=8076 {{"--num-gpus=%s" % num_gpus if num_gpus}} {{"--resources='%s'" % custom_resources if custom_resources}} --temp-dir {{ray_temp_dir}} || exit 1; which prlimit && for id in $(pgrep -f raylet/raylet); do sudo prlimit --nofile=1048576:1048576 --pid=$id || true; done; {%- else %} worker_start_ray_commands: [] diff --git a/sky/templates/ibm-ray.yml.j2 b/sky/templates/ibm-ray.yml.j2 index 
cb527a85a55..728367f506c 100644 --- a/sky/templates/ibm-ray.yml.j2 +++ b/sky/templates/ibm-ray.yml.j2 @@ -118,13 +118,13 @@ head_start_ray_commands: # NOTE: --disable-usage-stats in `ray start` saves 10 seconds of idle wait. # Line "which prlimit ..": increase the limit of the number of open files for the raylet process, as the `ulimit` may not take effect at this point, because it requires # all the sessions to be reloaded. This is a workaround. - - {{ sky_ray_cmd }} stop; RAY_SCHEDULER_EVENTS=0 RAY_DEDUP_LOGS=0 {{ sky_ray_cmd }} start --disable-usage-stats --head --port={{ray_port}} --dashboard-port={{ray_dashboard_port}} --object-manager-port=8076 --autoscaling-config=~/ray_bootstrap_config.yaml {{"--resources='%s'" % custom_resources if custom_resources}} --temp-dir {{ray_temp_dir}} || exit 1; + - {{ sky_activate_python_env }}; {{ sky_ray_cmd }} stop; RAY_SCHEDULER_EVENTS=0 RAY_DEDUP_LOGS=0 {{ sky_ray_cmd }} start --disable-usage-stats --head --port={{ray_port}} --dashboard-port={{ray_dashboard_port}} --object-manager-port=8076 --autoscaling-config=~/ray_bootstrap_config.yaml {{"--resources='%s'" % custom_resources if custom_resources}} --temp-dir {{ray_temp_dir}} || exit 1; which prlimit && for id in $(pgrep -f raylet/raylet); do sudo prlimit --nofile=1048576:1048576 --pid=$id || true; done; {{dump_port_command}}; {{ray_head_wait_initialized_command}} {%- if num_nodes > 1 %} worker_start_ray_commands: - - {{ sky_ray_cmd }} stop; RAY_SCHEDULER_EVENTS=0 RAY_DEDUP_LOGS=0 {{ sky_ray_cmd }} start --disable-usage-stats --address=$RAY_HEAD_IP:{{ray_port}} --object-manager-port=8076 {{"--resources='%s'" % custom_resources if custom_resources}} --temp-dir {{ray_temp_dir}} || exit 1; + - {{ sky_activate_python_env }}; {{ sky_ray_cmd }} stop; RAY_SCHEDULER_EVENTS=0 RAY_DEDUP_LOGS=0 {{ sky_ray_cmd }} start --disable-usage-stats --address=$RAY_HEAD_IP:{{ray_port}} --object-manager-port=8076 {{"--resources='%s'" % custom_resources if custom_resources}} --temp-dir 
{{ray_temp_dir}} || exit 1; which prlimit && for id in $(pgrep -f raylet/raylet); do sudo prlimit --nofile=1048576:1048576 --pid=$id || true; done; {%- else %} worker_start_ray_commands: [] diff --git a/sky/templates/lambda-ray.yml.j2 b/sky/templates/lambda-ray.yml.j2 index 1aaf7edaddd..4e8b834503f 100644 --- a/sky/templates/lambda-ray.yml.j2 +++ b/sky/templates/lambda-ray.yml.j2 @@ -89,13 +89,13 @@ setup_commands: # Increment the following for catching performance bugs easier: # current num items (num SSH connections): 2 head_start_ray_commands: - - {{ sky_ray_cmd }} stop; RAY_SCHEDULER_EVENTS=0 RAY_DEDUP_LOGS=0 {{ sky_ray_cmd }} start --disable-usage-stats --head --port={{ray_port}} --dashboard-port={{ray_dashboard_port}} --object-manager-port=8076 --autoscaling-config=~/ray_bootstrap_config.yaml {{"--resources='%s'" % custom_resources if custom_resources}} --temp-dir {{ray_temp_dir}} || exit 1; + - {{ sky_activate_python_env }}; {{ sky_ray_cmd }} stop; RAY_SCHEDULER_EVENTS=0 RAY_DEDUP_LOGS=0 {{ sky_ray_cmd }} start --disable-usage-stats --head --port={{ray_port}} --dashboard-port={{ray_dashboard_port}} --object-manager-port=8076 --autoscaling-config=~/ray_bootstrap_config.yaml {{"--resources='%s'" % custom_resources if custom_resources}} --temp-dir {{ray_temp_dir}} || exit 1; which prlimit && for id in $(pgrep -f raylet/raylet); do sudo prlimit --nofile=1048576:1048576 --pid=$id || true; done; {{dump_port_command}}; {{ray_head_wait_initialized_command}} {%- if num_nodes > 1 %} worker_start_ray_commands: - - {{ sky_ray_cmd }} stop; RAY_SCHEDULER_EVENTS=0 RAY_DEDUP_LOGS=0 {{ sky_ray_cmd }} start --disable-usage-stats --address=$RAY_HEAD_IP:{{ray_port}} --object-manager-port=8076 {{"--resources='%s'" % custom_resources if custom_resources}} --temp-dir {{ray_temp_dir}} || exit 1; + - {{ sky_activate_python_env }}; {{ sky_ray_cmd }} stop; RAY_SCHEDULER_EVENTS=0 RAY_DEDUP_LOGS=0 {{ sky_ray_cmd }} start --disable-usage-stats --address=$RAY_HEAD_IP:{{ray_port}} 
--object-manager-port=8076 {{"--resources='%s'" % custom_resources if custom_resources}} --temp-dir {{ray_temp_dir}} || exit 1; which prlimit && for id in $(pgrep -f raylet/raylet); do sudo prlimit --nofile=1048576:1048576 --pid=$id || true; done; {%- else %} worker_start_ray_commands: [] diff --git a/sky/templates/oci-ray.yml.j2 b/sky/templates/oci-ray.yml.j2 index a15a53732b1..32bd6326ee2 100644 --- a/sky/templates/oci-ray.yml.j2 +++ b/sky/templates/oci-ray.yml.j2 @@ -114,13 +114,13 @@ head_start_ray_commands: # NOTE: --disable-usage-stats in `ray start` saves 10 seconds of idle wait. # Line "which prlimit ..": increase the limit of the number of open files for the raylet process, as the `ulimit` may not take effect at this point, because it requires # all the sessions to be reloaded. This is a workaround. - - {{ sky_ray_cmd }} stop; RAY_SCHEDULER_EVENTS=0 RAY_DEDUP_LOGS=0 {{ sky_ray_cmd }} start --disable-usage-stats --head --port={{ray_port}} --dashboard-port={{ray_dashboard_port}} --object-manager-port=8076 --autoscaling-config=~/ray_bootstrap_config.yaml {{"--resources='%s'" % custom_resources if custom_resources}} --temp-dir {{ray_temp_dir}} || exit 1; + - {{ sky_activate_python_env }}; {{ sky_ray_cmd }} stop; RAY_SCHEDULER_EVENTS=0 RAY_DEDUP_LOGS=0 {{ sky_ray_cmd }} start --disable-usage-stats --head --port={{ray_port}} --dashboard-port={{ray_dashboard_port}} --object-manager-port=8076 --autoscaling-config=~/ray_bootstrap_config.yaml {{"--resources='%s'" % custom_resources if custom_resources}} --temp-dir {{ray_temp_dir}} || exit 1; which prlimit && for id in $(pgrep -f raylet/raylet); do sudo prlimit --nofile=1048576:1048576 --pid=$id || true; done; {{dump_port_command}}; {{ray_head_wait_initialized_command}} {%- if num_nodes > 1 %} worker_start_ray_commands: - - {{ sky_ray_cmd }} stop; RAY_SCHEDULER_EVENTS=0 RAY_DEDUP_LOGS=0 {{ sky_ray_cmd }} start --disable-usage-stats --address=$RAY_HEAD_IP:{{ray_port}} --object-manager-port=8076 {{"--resources='%s'" % 
custom_resources if custom_resources}} --temp-dir {{ray_temp_dir}} || exit 1; + - {{ sky_activate_python_env }}; {{ sky_ray_cmd }} stop; RAY_SCHEDULER_EVENTS=0 RAY_DEDUP_LOGS=0 {{ sky_ray_cmd }} start --disable-usage-stats --address=$RAY_HEAD_IP:{{ray_port}} --object-manager-port=8076 {{"--resources='%s'" % custom_resources if custom_resources}} --temp-dir {{ray_temp_dir}} || exit 1; which prlimit && for id in $(pgrep -f raylet/raylet); do sudo prlimit --nofile=1048576:1048576 --pid=$id || true; done; {%- else %} worker_start_ray_commands: [] diff --git a/sky/templates/scp-ray.yml.j2 b/sky/templates/scp-ray.yml.j2 index 42126652920..907aa547d64 100644 --- a/sky/templates/scp-ray.yml.j2 +++ b/sky/templates/scp-ray.yml.j2 @@ -88,13 +88,13 @@ head_start_ray_commands: # NOTE: --disable-usage-stats in `ray start` saves 10 seconds of idle wait. # Line "which prlimit ..": increase the limit of the number of open files for the raylet process, as the `ulimit` may not take effect at this point, because it requires # all the sessions to be reloaded. This is a workaround. 
- - {{ sky_ray_cmd }} stop; RAY_SCHEDULER_EVENTS=0 RAY_DEDUP_LOGS=0 {{ sky_ray_cmd }} start --disable-usage-stats --head --port={{ray_port}} --dashboard-port={{ray_dashboard_port}} --object-manager-port=8076 --autoscaling-config=~/ray_bootstrap_config.yaml {{"--resources='%s'" % custom_resources if custom_resources}} --temp-dir {{ray_temp_dir}} || exit 1; + - {{ sky_activate_python_env }}; {{ sky_ray_cmd }} stop; RAY_SCHEDULER_EVENTS=0 RAY_DEDUP_LOGS=0 {{ sky_ray_cmd }} start --disable-usage-stats --head --port={{ray_port}} --dashboard-port={{ray_dashboard_port}} --object-manager-port=8076 --autoscaling-config=~/ray_bootstrap_config.yaml {{"--resources='%s'" % custom_resources if custom_resources}} --temp-dir {{ray_temp_dir}} || exit 1; which prlimit && for id in $(pgrep -f raylet/raylet); do sudo prlimit --nofile=1048576:1048576 --pid=$id || true; done; {{dump_port_command}}; {{ray_head_wait_initialized_command}} {%- if num_nodes > 1 %} worker_start_ray_commands: - - {{ sky_ray_cmd }} stop; RAY_SCHEDULER_EVENTS=0 RAY_DEDUP_LOGS=0 {{ sky_ray_cmd }} start --disable-usage-stats --address=$RAY_HEAD_IP:{{ray_port}} --object-manager-port=8076 {{"--resources='%s'" % custom_resources if custom_resources}} --temp-dir {{ray_temp_dir}} || exit 1; + - {{ sky_activate_python_env }}; {{ sky_ray_cmd }} stop; RAY_SCHEDULER_EVENTS=0 RAY_DEDUP_LOGS=0 {{ sky_ray_cmd }} start --disable-usage-stats --address=$RAY_HEAD_IP:{{ray_port}} --object-manager-port=8076 {{"--resources='%s'" % custom_resources if custom_resources}} --temp-dir {{ray_temp_dir}} || exit 1; which prlimit && for id in $(pgrep -f raylet/raylet); do sudo prlimit --nofile=1048576:1048576 --pid=$id || true; done; {%- else %} worker_start_ray_commands: [] From 16e63b59868184af9e9668238a3e529d4adfd3eb Mon Sep 17 00:00:00 2001 From: Zhanghao Wu Date: Thu, 23 May 2024 05:23:40 +0000 Subject: [PATCH 28/38] add comments --- sky/backends/backend_utils.py | 3 +++ sky/skylet/events.py | 5 +++-- sky/utils/controller_utils.py | 3 
+++ 3 files changed, 9 insertions(+), 2 deletions(-) diff --git a/sky/backends/backend_utils.py b/sky/backends/backend_utils.py index 5ad68f2d212..b1598c7c039 100644 --- a/sky/backends/backend_utils.py +++ b/sky/backends/backend_utils.py @@ -928,6 +928,9 @@ def write_cluster_config( # pip install needs to have python env activated to make sure # installed packages are within the env path. 'sky_pip_cmd': f'{constants.SKY_PIP_CMD}', + # Activate the SkyPilot runtime environment when starting the ray + # cluster, so that the ray autoscaler can access the cloud SDK and CLIs + # on the remote cluster. 'sky_activate_python_env': constants.ACTIVATE_SKY_REMOTE_PYTHON_ENV, 'ray_version': constants.SKY_REMOTE_RAY_VERSION, diff --git a/sky/skylet/events.py b/sky/skylet/events.py index c5e2c235042..2459ec38696 100644 --- a/sky/skylet/events.py +++ b/sky/skylet/events.py @@ -3,7 +3,6 @@ import os import re import subprocess -import sys import time import traceback @@ -193,7 +192,9 @@ def _stop_cluster(self, autostop_config): # Passing env inherited from os.environ is technically not # needed, because we call `python