Skip to content

Commit

Permalink
[UX] Add cluster info in task envs (#3426)
Browse files Browse the repository at this point in the history
* Add task name for the spot job

* Add dag name and task name for spot job

* fix dag

* starting 1

* format

* Address comments

* add env vars

* new line

* Add cluster info in the env vars

* add spot in the cluster info

* fix env var docs

* cloud change to str

* Fix quoting

* Add example for parsing json string

* format

* address comments

* Add smoke tests

* format

* update doc
  • Loading branch information
Michaelvll authored Apr 7, 2024
1 parent 48a5c63 commit c65b258
Show file tree
Hide file tree
Showing 4 changed files with 50 additions and 2 deletions.
6 changes: 6 additions & 0 deletions docs/source/running-jobs/environment-variables.rst
Original file line number Diff line number Diff line change
Expand Up @@ -118,6 +118,9 @@ Environment variables for ``setup``
- sky-2023-07-06-21-18-31-563597_myclus_1

For managed spot jobs: sky-managed-2023-07-06-21-18-31-563597_my-job-name_1-0
* - ``SKYPILOT_CLUSTER_INFO``
- A JSON string containing information about the cluster. To access a field, parse the JSON string, for example in bash with ``echo $SKYPILOT_CLUSTER_INFO | jq .cloud`` or in Python with ``json.loads(os.environ['SKYPILOT_CLUSTER_INFO'])['cloud']``.
- {"cluster_name": "my-cluster-name", "cloud": "GCP", "region": "us-central1", "zone": "us-central1-a"}
* - ``SKYPILOT_SERVE_REPLICA_ID``
- The ID of a replica within the service (starting from 1). Available only for a :ref:`service <sky-serve>`'s replica task.
- 1
Expand Down Expand Up @@ -157,6 +160,9 @@ Environment variables for ``run``
- sky-2023-07-06-21-18-31-563597_myclus_1

For managed spot jobs: sky-managed-2023-07-06-21-18-31-563597_my-job-name_1-0
* - ``SKYPILOT_CLUSTER_INFO``
- A JSON string containing information about the cluster. To access a field, parse the JSON string, for example in bash with ``echo $SKYPILOT_CLUSTER_INFO | jq .cloud`` or in Python with ``json.loads(os.environ['SKYPILOT_CLUSTER_INFO'])['cloud']``.
- {"cluster_name": "my-cluster-name", "cloud": "GCP", "region": "us-central1", "zone": "us-central1-a"}
* - ``SKYPILOT_SERVE_REPLICA_ID``
- The ID of a replica within the service (starting from 1). Available only for a :ref:`service <sky-serve>`'s replica task.
- 1
Expand Down
24 changes: 23 additions & 1 deletion sky/backends/cloud_vm_ray_backend.py
Original file line number Diff line number Diff line change
Expand Up @@ -234,9 +234,10 @@ def add_prologue(self, job_id: int) -> None:
import io
import os
import pathlib
import sys
import selectors
import shlex
import subprocess
import sys
import tempfile
import textwrap
import time
Expand Down Expand Up @@ -3016,6 +3017,7 @@ def _setup(self, handle: CloudVmRayResourceHandle, task: task_lib.Task,

def _setup_node(node_id: int) -> None:
setup_envs = task.envs.copy()
setup_envs.update(self._skypilot_predefined_env_vars(handle))
setup_envs['SKYPILOT_SETUP_NODE_IPS'] = '\n'.join(internal_ips)
setup_envs['SKYPILOT_SETUP_NODE_RANK'] = str(node_id)
runner = command_runner.SSHCommandRunner(ip_list[node_id],
Expand Down Expand Up @@ -4521,6 +4523,25 @@ def get_storage_mounts_metadata(
storage_metadata, sync_on_reconstruction=False)
return storage_mounts

def _skypilot_predefined_env_vars(
        self, handle: CloudVmRayResourceHandle) -> Dict[str, str]:
    """Build the environment variables SkyPilot predefines for a cluster.

    The result holds a single entry, ``SKYPILOT_CLUSTER_INFO``, whose value
    is a JSON-encoded object with the keys ``cluster_name``, ``cloud``
    (the launched cloud converted with ``str``), ``region`` and ``zone``,
    all read from ``handle``.

    TODO(zhwu): Check if a single variable for all the cluster info is more
    desirable or separate variables for each piece of info.

    NOTE: In order to avoid complication in a potential future separation
    of the info into multiple env vars, we should not treat this json format
    as a sink for all the cluster info.

    Args:
        handle: Handle of the cluster; its ``cluster_name`` and
            ``launched_resources`` attributes are read.

    Returns:
        A mapping from environment variable name to its string value.
    """
    # Keys are inserted in a fixed order so the serialized JSON is stable.
    cluster_info = {'cluster_name': handle.cluster_name}
    resources = handle.launched_resources
    cluster_info['cloud'] = str(resources.cloud)
    cluster_info['region'] = resources.region
    cluster_info['zone'] = resources.zone

    encoded_info = json.dumps(cluster_info)
    return {'SKYPILOT_CLUSTER_INFO': encoded_info}

def _get_task_env_vars(self, task: task_lib.Task, job_id: int,
handle: CloudVmRayResourceHandle) -> Dict[str, str]:
"""Returns the environment variables for the task."""
Expand All @@ -4533,6 +4554,7 @@ def _get_task_env_vars(self, task: task_lib.Task, job_id: int,
self.run_timestamp,
cluster_name=handle.cluster_name,
job_id=str(job_id))
env_vars.update(self._skypilot_predefined_env_vars(handle))
return env_vars

def _execute_task_one_node(self, handle: CloudVmRayResourceHandle,
Expand Down
3 changes: 2 additions & 1 deletion sky/skylet/log_lib.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
import io
import multiprocessing.pool
import os
import shlex
import subprocess
import sys
import tempfile
Expand Down Expand Up @@ -275,7 +276,7 @@ def make_task_bash_script(codegen: str,
]
if env_vars is not None:
for k, v in env_vars.items():
script.append(f'export {k}="{v}"')
script.append(f'export {k}={shlex.quote(str(v))}')
script += [
codegen,
'', # New line at EOF.
Expand Down
19 changes: 19 additions & 0 deletions tests/test_smoke.py
Original file line number Diff line number Diff line change
Expand Up @@ -282,6 +282,11 @@ def test_minimal(generic_cloud: str):
# Ensure the raylet process has the correct file descriptor limit.
f'sky exec {name} "prlimit -n --pid=\$(pgrep -f \'raylet/raylet --raylet_socket_name\') | grep \'"\'1048576 1048576\'"\'"',
f'sky logs {name} 2 --status', # Ensure the job succeeded.
# Check the cluster info
f'sky exec {name} \'echo $SKYPILOT_CLUSTER_INFO | jq .cluster_name | grep {name}\'',
f'sky logs {name} 3 --status', # Ensure the job succeeded.
f'sky exec {name} \'echo $SKYPILOT_CLUSTER_INFO | jq .cloud | grep -i {generic_cloud}\'',
f'sky logs {name} 4 --status', # Ensure the job succeeded.
],
f'sky down -y {name}',
_get_timeout(generic_cloud),
Expand All @@ -300,6 +305,8 @@ def test_aws_region():
f'sky exec {name} examples/minimal.yaml',
f'sky logs {name} 1 --status', # Ensure the job succeeded.
f'sky status --all | grep {name} | grep us-east-2', # Ensure the region is correct.
f'sky exec {name} \'echo $SKYPILOT_CLUSTER_INFO | jq .region | grep us-east-2\'',
f'sky logs {name} 2 --status', # Ensure the job succeeded.
],
f'sky down -y {name}',
)
Expand All @@ -318,6 +325,8 @@ def test_gcp_region_and_service_account():
f'sky exec {name} \'curl -H "Metadata-Flavor: Google" "http://metadata.google.internal/computeMetadata/v1/instance/service-accounts/default/identity?format=standard&audience=gcp"\'',
f'sky logs {name} 2 --status', # Ensure the job succeeded.
f'sky status --all | grep {name} | grep us-central1', # Ensure the region is correct.
f'sky exec {name} \'echo $SKYPILOT_CLUSTER_INFO | jq .region | grep us-central1\'',
f'sky logs {name} 3 --status', # Ensure the job succeeded.
],
f'sky down -y {name}',
)
Expand Down Expand Up @@ -351,6 +360,10 @@ def test_azure_region():
f'sky exec {name} tests/test_yamls/minimal.yaml',
f'sky logs {name} 1 --status', # Ensure the job succeeded.
f'sky status --all | grep {name} | grep eastus2', # Ensure the region is correct.
f'sky exec {name} \'echo $SKYPILOT_CLUSTER_INFO | jq .region | grep eastus2\'',
f'sky logs {name} 2 --status', # Ensure the job succeeded.
f'sky exec {name} \'echo $SKYPILOT_CLUSTER_INFO | jq .zone | grep null\'',
f'sky logs {name} 3 --status', # Ensure the job succeeded.
],
f'sky down -y {name}',
)
Expand Down Expand Up @@ -420,6 +433,8 @@ def test_aws_images():
f'sky launch -y -c {name} examples/minimal.yaml',
f'sky logs {name} 2 --status',
f'sky logs {name} --status | grep "Job 2: SUCCEEDED"', # Equivalent.
f'sky exec {name} \'echo $SKYPILOT_CLUSTER_INFO | jq .cloud | grep -i aws\'',
f'sky logs {name} 3 --status', # Ensure the job succeeded.
],
f'sky down -y {name}',
)
Expand All @@ -438,6 +453,8 @@ def test_gcp_images():
f'sky launch -y -c {name} tests/test_yamls/minimal.yaml',
f'sky logs {name} 2 --status',
f'sky logs {name} --status | grep "Job 2: SUCCEEDED"', # Equivalent.
f'sky exec {name} \'echo $SKYPILOT_CLUSTER_INFO | jq .cloud | grep -i gcp\'',
f'sky logs {name} 3 --status', # Ensure the job succeeded.
],
f'sky down -y {name}',
)
Expand All @@ -456,6 +473,8 @@ def test_azure_images():
f'sky launch -y -c {name} tests/test_yamls/minimal.yaml',
f'sky logs {name} 2 --status',
f'sky logs {name} --status | grep "Job 2: SUCCEEDED"', # Equivalent.
f'sky exec {name} \'echo $SKYPILOT_CLUSTER_INFO | jq .cloud | grep -i azure\'',
f'sky logs {name} 3 --status', # Ensure the job succeeded.
],
f'sky down -y {name}',
)
Expand Down

0 comments on commit c65b258

Please sign in to comment.