Skip to content

Commit

Permalink
source bashrc for ray cluster
Browse files: browse the repository at this point in the history
  • Loading branch information
Michaelvll committed May 8, 2024
1 parent b53131e commit 65001c8
Show file tree
Hide file tree
Showing 2 changed files with 17 additions and 10 deletions.
3 changes: 1 addition & 2 deletions sky/backends/cloud_vm_ray_backend.py
Original file line number Diff line number Diff line change
Expand Up @@ -3287,8 +3287,7 @@ def _exec_code_on_head(
returncode, stdout, stderr = self.run_on_head(handle,
job_submit_cmd,
stream_logs=False,
- require_outputs=True,
- source_bashrc=True)
+ require_outputs=True)

# Happens when someone calls `sky exec` but remote is outdated
# necessitating calling `sky launch`.
Expand Down
24 changes: 16 additions & 8 deletions sky/provision/instance_setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -295,10 +295,14 @@ def start_ray_on_head_node(cluster_name: str, custom_resource: Optional[str],
_RAY_PRLIMIT + _DUMP_RAY_PORTS + RAY_HEAD_WAIT_INITIALIZED_COMMAND)
logger.info(f'Running command on head node: {cmd}')
# TODO(zhwu): add the output to log files.
- returncode, stdout, stderr = head_runner.run(cmd,
- stream_logs=False,
- log_path=log_path_abs,
- require_outputs=True)
+ returncode, stdout, stderr = head_runner.run(
+ cmd,
+ stream_logs=False,
+ log_path=log_path_abs,
+ require_outputs=True,
+ # Source bashrc for starting ray cluster to make sure actors started by
+ # ray will have the correct PATH.
+ source_bashrc=True)
if returncode:
raise RuntimeError('Failed to start ray on the head node '
f'(exit code {returncode}). Error: \n'
Expand Down Expand Up @@ -382,10 +386,14 @@ def _setup_ray_worker(runner_and_id: Tuple[command_runner.CommandRunner,
runner, instance_id = runner_and_id
log_dir = metadata_utils.get_instance_log_dir(cluster_name, instance_id)
log_path_abs = str(log_dir / ('ray_cluster' + '.log'))
- return runner.run(cmd,
- stream_logs=False,
- require_outputs=True,
- log_path=log_path_abs)
+ return runner.run(
+ cmd,
+ stream_logs=False,
+ require_outputs=True,
+ log_path=log_path_abs,
+ # Source bashrc for starting ray cluster to make sure actors started
+ # by ray will have the correct PATH.
+ source_bashrc=True)

results = subprocess_utils.run_in_parallel(
_setup_ray_worker, list(zip(worker_runners, cache_ids)))
Expand Down

0 comments on commit 65001c8

Please sign in to comment.