diff --git a/docs/source/running-jobs/environment-variables.rst b/docs/source/running-jobs/environment-variables.rst index 8bc5e667e05..16502f70818 100644 --- a/docs/source/running-jobs/environment-variables.rst +++ b/docs/source/running-jobs/environment-variables.rst @@ -118,6 +118,9 @@ Environment variables for ``setup`` - sky-2023-07-06-21-18-31-563597_myclus_1 For managed spot jobs: sky-managed-2023-07-06-21-18-31-563597_my-job-name_1-0 + * - ``SKYPILOT_CLUSTER_INFO`` + - A JSON string containing information about the cluster. To read a field, parse the JSON string, for example in bash with ``echo $SKYPILOT_CLUSTER_INFO | jq .cloud`` or in Python with ``json.loads(os.environ['SKYPILOT_CLUSTER_INFO'])['cloud']``. + - {"cluster_name": "my-cluster-name", "cloud": "GCP", "region": "us-central1", "zone": "us-central1-a"} * - ``SKYPILOT_SERVE_REPLICA_ID`` - The ID of a replica within the service (starting from 1). Available only for a :ref:`service `'s replica task. - 1 @@ -157,6 +160,9 @@ Environment variables for ``run`` - sky-2023-07-06-21-18-31-563597_myclus_1 For managed spot jobs: sky-managed-2023-07-06-21-18-31-563597_my-job-name_1-0 + * - ``SKYPILOT_CLUSTER_INFO`` + - A JSON string containing information about the cluster. To read a field, parse the JSON string, for example in bash with ``echo $SKYPILOT_CLUSTER_INFO | jq .cloud`` or in Python with ``json.loads(os.environ['SKYPILOT_CLUSTER_INFO'])['cloud']``. + - {"cluster_name": "my-cluster-name", "cloud": "GCP", "region": "us-central1", "zone": "us-central1-a"} * - ``SKYPILOT_SERVE_REPLICA_ID`` - The ID of a replica within the service (starting from 1). Available only for a :ref:`service `'s replica task. 
- 1 diff --git a/sky/backends/cloud_vm_ray_backend.py b/sky/backends/cloud_vm_ray_backend.py index 378ccffd733..44ade8c9c5e 100644 --- a/sky/backends/cloud_vm_ray_backend.py +++ b/sky/backends/cloud_vm_ray_backend.py @@ -234,9 +234,10 @@ def add_prologue(self, job_id: int) -> None: import io import os import pathlib - import sys import selectors + import shlex import subprocess + import sys import tempfile import textwrap import time @@ -3016,6 +3017,7 @@ def _setup(self, handle: CloudVmRayResourceHandle, task: task_lib.Task, def _setup_node(node_id: int) -> None: setup_envs = task.envs.copy() + setup_envs.update(self._skypilot_predefined_env_vars(handle)) setup_envs['SKYPILOT_SETUP_NODE_IPS'] = '\n'.join(internal_ips) setup_envs['SKYPILOT_SETUP_NODE_RANK'] = str(node_id) runner = command_runner.SSHCommandRunner(ip_list[node_id], @@ -4521,6 +4523,25 @@ def get_storage_mounts_metadata( storage_metadata, sync_on_reconstruction=False) return storage_mounts + def _skypilot_predefined_env_vars( + self, handle: CloudVmRayResourceHandle) -> Dict[str, str]: + """Returns the SkyPilot predefined environment variables. + + TODO(zhwu): Check if a single variable for all the cluster info is more + desirable or separate variables for each piece of info. + NOTE: In order to avoid complication in a potential future separation + of the info into multiple env vars, we should not treat this json format + as a sink for all the cluster info. 
+ """ + return { + 'SKYPILOT_CLUSTER_INFO': json.dumps({ + 'cluster_name': handle.cluster_name, + 'cloud': str(handle.launched_resources.cloud), + 'region': handle.launched_resources.region, + 'zone': handle.launched_resources.zone, + }) + } + def _get_task_env_vars(self, task: task_lib.Task, job_id: int, handle: CloudVmRayResourceHandle) -> Dict[str, str]: """Returns the environment variables for the task.""" @@ -4533,6 +4554,7 @@ def _get_task_env_vars(self, task: task_lib.Task, job_id: int, self.run_timestamp, cluster_name=handle.cluster_name, job_id=str(job_id)) + env_vars.update(self._skypilot_predefined_env_vars(handle)) return env_vars def _execute_task_one_node(self, handle: CloudVmRayResourceHandle, diff --git a/sky/skylet/log_lib.py b/sky/skylet/log_lib.py index a1d040dbcdb..e8b4de8b7fa 100644 --- a/sky/skylet/log_lib.py +++ b/sky/skylet/log_lib.py @@ -6,6 +6,7 @@ import io import multiprocessing.pool import os +import shlex import subprocess import sys import tempfile @@ -275,7 +276,7 @@ def make_task_bash_script(codegen: str, ] if env_vars is not None: for k, v in env_vars.items(): - script.append(f'export {k}="{v}"') + script.append(f'export {k}={shlex.quote(str(v))}') script += [ codegen, '', # New line at EOF. diff --git a/tests/test_smoke.py b/tests/test_smoke.py index 95717eeb3e9..944818f05dc 100644 --- a/tests/test_smoke.py +++ b/tests/test_smoke.py @@ -282,6 +282,11 @@ def test_minimal(generic_cloud: str): # Ensure the raylet process has the correct file descriptor limit. f'sky exec {name} "prlimit -n --pid=\$(pgrep -f \'raylet/raylet --raylet_socket_name\') | grep \'"\'1048576 1048576\'"\'"', f'sky logs {name} 2 --status', # Ensure the job succeeded. + # Check the cluster info + f'sky exec {name} \'echo $SKYPILOT_CLUSTER_INFO | jq .cluster_name | grep {name}\'', + f'sky logs {name} 3 --status', # Ensure the job succeeded. 
+ f'sky exec {name} \'echo $SKYPILOT_CLUSTER_INFO | jq .cloud | grep -i {generic_cloud}\'', + f'sky logs {name} 4 --status', # Ensure the job succeeded. ], f'sky down -y {name}', _get_timeout(generic_cloud), @@ -300,6 +305,8 @@ def test_aws_region(): f'sky exec {name} examples/minimal.yaml', f'sky logs {name} 1 --status', # Ensure the job succeeded. f'sky status --all | grep {name} | grep us-east-2', # Ensure the region is correct. + f'sky exec {name} \'echo $SKYPILOT_CLUSTER_INFO | jq .region | grep us-east-2\'', + f'sky logs {name} 2 --status', # Ensure the job succeeded. ], f'sky down -y {name}', ) @@ -318,6 +325,8 @@ def test_gcp_region_and_service_account(): f'sky exec {name} \'curl -H "Metadata-Flavor: Google" "http://metadata.google.internal/computeMetadata/v1/instance/service-accounts/default/identity?format=standard&audience=gcp"\'', f'sky logs {name} 2 --status', # Ensure the job succeeded. f'sky status --all | grep {name} | grep us-central1', # Ensure the region is correct. + f'sky exec {name} \'echo $SKYPILOT_CLUSTER_INFO | jq .region | grep us-central1\'', + f'sky logs {name} 3 --status', # Ensure the job succeeded. ], f'sky down -y {name}', ) @@ -351,6 +360,10 @@ def test_azure_region(): f'sky exec {name} tests/test_yamls/minimal.yaml', f'sky logs {name} 1 --status', # Ensure the job succeeded. f'sky status --all | grep {name} | grep eastus2', # Ensure the region is correct. + f'sky exec {name} \'echo $SKYPILOT_CLUSTER_INFO | jq .region | grep eastus2\'', + f'sky logs {name} 2 --status', # Ensure the job succeeded. + f'sky exec {name} \'echo $SKYPILOT_CLUSTER_INFO | jq .zone | grep null\'', + f'sky logs {name} 3 --status', # Ensure the job succeeded. ], f'sky down -y {name}', ) @@ -420,6 +433,8 @@ def test_aws_images(): f'sky launch -y -c {name} examples/minimal.yaml', f'sky logs {name} 2 --status', f'sky logs {name} --status | grep "Job 2: SUCCEEDED"', # Equivalent. 
+ f'sky exec {name} \'echo $SKYPILOT_CLUSTER_INFO | jq .cloud | grep -i aws\'', + f'sky logs {name} 3 --status', # Ensure the job succeeded. ], f'sky down -y {name}', ) @@ -438,6 +453,8 @@ def test_gcp_images(): f'sky launch -y -c {name} tests/test_yamls/minimal.yaml', f'sky logs {name} 2 --status', f'sky logs {name} --status | grep "Job 2: SUCCEEDED"', # Equivalent. + f'sky exec {name} \'echo $SKYPILOT_CLUSTER_INFO | jq .cloud | grep -i gcp\'', + f'sky logs {name} 3 --status', # Ensure the job succeeded. ], f'sky down -y {name}', ) @@ -456,6 +473,8 @@ def test_azure_images(): f'sky launch -y -c {name} tests/test_yamls/minimal.yaml', f'sky logs {name} 2 --status', f'sky logs {name} --status | grep "Job 2: SUCCEEDED"', # Equivalent. + f'sky exec {name} \'echo $SKYPILOT_CLUSTER_INFO | jq .cloud | grep -i azure\'', + f'sky logs {name} 3 --status', # Ensure the job succeeded. ], f'sky down -y {name}', )