From 50d7bc08302919ee8c89e87b2741db76010a5882 Mon Sep 17 00:00:00 2001 From: Zhanghao Wu Date: Wed, 20 Nov 2024 13:37:26 -0800 Subject: [PATCH] [Jobs] Disable deduplication for logs (#4388) Disable dedup --- sky/backends/cloud_vm_ray_backend.py | 7 +++++++ sky/provision/instance_setup.py | 9 --------- 2 files changed, 7 insertions(+), 9 deletions(-) diff --git a/sky/backends/cloud_vm_ray_backend.py b/sky/backends/cloud_vm_ray_backend.py index d00560ece23..2ea67285b8d 100644 --- a/sky/backends/cloud_vm_ray_backend.py +++ b/sky/backends/cloud_vm_ray_backend.py @@ -269,6 +269,13 @@ def add_prologue(self, job_id: int) -> None: import time from typing import Dict, List, Optional, Tuple, Union + # Set the environment variables to avoid deduplicating logs and + # scheduler events. This should be set in driver code, since we are + # not using `ray job submit` anymore, and the environment variables + # from the ray cluster are not inherited. + os.environ['RAY_DEDUP_LOGS'] = '0' + os.environ['RAY_SCHEDULER_EVENTS'] = '0' + import ray import ray.util as ray_util diff --git a/sky/provision/instance_setup.py b/sky/provision/instance_setup.py index 8c390adaf87..df1a96427c1 100644 --- a/sky/provision/instance_setup.py +++ b/sky/provision/instance_setup.py @@ -277,16 +277,8 @@ def start_ray_on_head_node(cluster_name: str, custom_resource: Optional[str], for key, value in cluster_info.custom_ray_options.items(): ray_options += f' --{key}={value}' - # Unset AWS_ACCESS_KEY_ID AWS_SECRET_ACCESS_KEY to avoid using credentials - # from environment variables set by user. SkyPilot's ray cluster should use - # the `~/.aws/` credentials, as that is the one used to create the cluster, - # and the autoscaler module started by the `ray start` command should use - # the same credentials. Otherwise, `ray status` will fail to fetch the - # available nodes. 
- # Reference: https://github.com/skypilot-org/skypilot/issues/2441 cmd = ( f'{constants.SKY_RAY_CMD} stop; ' - 'unset AWS_ACCESS_KEY_ID AWS_SECRET_ACCESS_KEY; ' 'RAY_SCHEDULER_EVENTS=0 RAY_DEDUP_LOGS=0 ' # worker_maximum_startup_concurrency controls the maximum number of # workers that can be started concurrently. However, it also controls @@ -372,7 +364,6 @@ def start_ray_on_worker_nodes(cluster_name: str, no_restart: bool, # Unset AWS_ACCESS_KEY_ID AWS_SECRET_ACCESS_KEY, see the comment in # `start_ray_on_head_node`. cmd = ( - f'unset AWS_ACCESS_KEY_ID AWS_SECRET_ACCESS_KEY; ' 'RAY_SCHEDULER_EVENTS=0 RAY_DEDUP_LOGS=0 ' f'{constants.SKY_RAY_CMD} start --disable-usage-stats {ray_options} || ' 'exit 1;' + _RAY_PRLIMIT)