Skip to content

Commit

Permalink
Fix venv
Browse files Browse the repository at this point in the history
  • Loading branch information
Michaelvll committed Nov 19, 2023
1 parent 6753b2d commit db60092
Show file tree
Hide file tree
Showing 8 changed files with 31 additions and 32 deletions.
7 changes: 3 additions & 4 deletions sky/skylet/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -95,8 +95,9 @@ def run_in_python_env(command):
# cause error and waiting for the error to be reported: #2273.
'which conda | grep /opt/conda || conda init > /dev/null;'
# Create a separate Python virtual environment (venv) for SkyPilot dependencies.
f'[ -d ~/{SKY_REMOTE_PYTHON_ENV} ] || '
f'python -m venv ~/{SKY_REMOTE_PYTHON_ENV}; '
f'[ -d {SKY_REMOTE_PYTHON_ENV} ] || '
f'python -m venv {SKY_REMOTE_PYTHON_ENV}; '
f'source {SKY_REMOTE_PYTHON_ENV}/bin/activate; '
f'echo "function skypy () {{ {_RUN_PYTHON} }}" >> ~/.bashrc;'
f'echo "function skypip () {{ {_RUN_PIP} }}" >> ~/.bashrc;'
f'echo "function skyray () {{ {_RUN_RAY} }}" >> ~/.bashrc;')
Expand All @@ -105,10 +106,8 @@ def run_in_python_env(command):
'(type -a python | grep -q python3) || '
'echo "alias python=python3" >> ~/.bashrc;'
'(type -a pip | grep -q pip3) || echo "alias pip=pip3" >> ~/.bashrc;'
'source ~/.bashrc;'
'mkdir -p ~/sky_workdir && mkdir -p ~/.sky/sky_app && '
'touch ~/.sudo_as_admin_successful;'
f'source {SKY_REMOTE_PYTHON_ENV}/bin/activate; '
f'(pip list | grep "ray " | grep "{SKY_REMOTE_RAY_VERSION}" '
'2>&1 > /dev/null || '
f'pip install --exists-action w -U '
Expand Down
8 changes: 4 additions & 4 deletions sky/templates/azure-ray.yml.j2
Original file line number Diff line number Diff line change
Expand Up @@ -161,14 +161,14 @@ setup_commands:
# current num items (num SSH connections): 2
head_start_ray_commands:
# NOTE: --disable-usage-stats in `ray start` saves 10 seconds of idle wait.
- skyray stop; export RAY_SCHEDULER_EVENTS=0 RAY_DEDUP_LOGS=0; skyray start --disable-usage-stats --head --port={{ray_port}} --dashboard-port={{ray_dashboard_port}} --object-manager-port=8076 --autoscaling-config=~/ray_bootstrap_config.yaml {{"--resources='%s'" % custom_resources if custom_resources}} --temp-dir {{ray_temp_dir}} || exit 1;
- source ~/skypilot-runtime/bin/activate; ray stop; export RAY_SCHEDULER_EVENTS=0 RAY_DEDUP_LOGS=0; ray start --disable-usage-stats --head --port={{ray_port}} --dashboard-port={{ray_dashboard_port}} --object-manager-port=8076 --autoscaling-config=~/ray_bootstrap_config.yaml {{"--resources='%s'" % custom_resources if custom_resources}} --temp-dir {{ray_temp_dir}} || exit 1;
which prlimit && for id in $(pgrep -f raylet/raylet); do sudo prlimit --nofile=1048576:1048576 --pid=$id || true; done;
{{dump_port_command}};
{{dump_port_command}}; deactivate

{%- if num_nodes > 1 %}
worker_start_ray_commands:
- skyray stop; export RAY_SCHEDULER_EVENTS=0 RAY_DEDUP_LOGS=0; skyray start --disable-usage-stats --address=$RAY_HEAD_IP:{{ray_port}} --object-manager-port=8076 {{"--resources='%s'" % custom_resources if custom_resources}} --temp-dir {{ray_temp_dir}} || exit 1;
which prlimit && for id in $(pgrep -f raylet/raylet); do sudo prlimit --nofile=1048576:1048576 --pid=$id || true; done;
- source ~/skypilot-runtime/bin/activate; ray stop; export RAY_SCHEDULER_EVENTS=0 RAY_DEDUP_LOGS=0; ray start --disable-usage-stats --address=$RAY_HEAD_IP:{{ray_port}} --object-manager-port=8076 {{"--resources='%s'" % custom_resources if custom_resources}} --temp-dir {{ray_temp_dir}} || exit 1;
which prlimit && for id in $(pgrep -f raylet/raylet); do sudo prlimit --nofile=1048576:1048576 --pid=$id || true; done; deactivate
{%- else %}
worker_start_ray_commands: []
{%- endif %}
Expand Down
8 changes: 4 additions & 4 deletions sky/templates/gcp-ray.yml.j2
Original file line number Diff line number Diff line change
Expand Up @@ -327,16 +327,16 @@ head_start_ray_commands:
# Line "which prlimit ..": increase the limit of the number of open files for the raylet process, as the `ulimit` may not take effect at this point, because it requires
# all the sessions to be reloaded. This is a workaround.
- export SKYPILOT_NUM_GPUS=0 && which nvidia-smi > /dev/null && SKYPILOT_NUM_GPUS=$(nvidia-smi --query-gpu=index,name --format=csv,noheader | wc -l);
skyray stop; export RAY_SCHEDULER_EVENTS=0 RAY_DEDUP_LOGS=0; skyray start --disable-usage-stats --head --port={{ray_port}} --dashboard-port={{ray_dashboard_port}} --object-manager-port=8076 --autoscaling-config=~/ray_bootstrap_config.yaml {{"--resources='%s'" % custom_resources if custom_resources}} --num-gpus=$SKYPILOT_NUM_GPUS --temp-dir {{ray_temp_dir}} || exit 1;
source ~/skypilot-runtime/bin/activate; ray stop; export RAY_SCHEDULER_EVENTS=0 RAY_DEDUP_LOGS=0; ray start --disable-usage-stats --head --port={{ray_port}} --dashboard-port={{ray_dashboard_port}} --object-manager-port=8076 --autoscaling-config=~/ray_bootstrap_config.yaml {{"--resources='%s'" % custom_resources if custom_resources}} --num-gpus=$SKYPILOT_NUM_GPUS --temp-dir {{ray_temp_dir}} || exit 1;
which prlimit && for id in $(pgrep -f raylet/raylet); do sudo prlimit --nofile=1048576:1048576 --pid=$id || true; done;
{{dump_port_command}};
{{dump_port_command}}; deactivate

# Worker commands are needed for TPU VM Pods
{%- if num_nodes > 1 or tpu_vm %}
worker_start_ray_commands:
- SKYPILOT_NUM_GPUS=0 && which nvidia-smi > /dev/null && SKYPILOT_NUM_GPUS=$(nvidia-smi --query-gpu=index,name --format=csv,noheader | wc -l);
skyray stop; export RAY_SCHEDULER_EVENTS=0 RAY_DEDUP_LOGS=0; skyray start --disable-usage-stats --address=$RAY_HEAD_IP:{{ray_port}} --object-manager-port=8076 {{"--resources='%s'" % custom_resources if custom_resources}} --num-gpus=$SKYPILOT_NUM_GPUS --temp-dir {{ray_temp_dir}} || exit 1;
which prlimit && for id in $(pgrep -f raylet/raylet); do sudo prlimit --nofile=1048576:1048576 --pid=$id || true; done;
source ~/skypilot-runtime/bin/activate; ray stop; export RAY_SCHEDULER_EVENTS=0 RAY_DEDUP_LOGS=0; ray start --disable-usage-stats --address=$RAY_HEAD_IP:{{ray_port}} --object-manager-port=8076 {{"--resources='%s'" % custom_resources if custom_resources}} --num-gpus=$SKYPILOT_NUM_GPUS --temp-dir {{ray_temp_dir}} || exit 1;
which prlimit && for id in $(pgrep -f raylet/raylet); do sudo prlimit --nofile=1048576:1048576 --pid=$id || true; done; deactivate
{%- else %}
worker_start_ray_commands: []
{%- endif %}
Expand Down
8 changes: 4 additions & 4 deletions sky/templates/ibm-ray.yml.j2
Original file line number Diff line number Diff line change
Expand Up @@ -118,14 +118,14 @@ head_start_ray_commands:
# NOTE: --disable-usage-stats in `ray start` saves 10 seconds of idle wait.
# Line "which prlimit ..": increase the limit of the number of open files for the raylet process, as the `ulimit` may not take effect at this point, because it requires
# all the sessions to be reloaded. This is a workaround.
- skyray stop; RAY_SCHEDULER_EVENTS=0 RAY_DEDUP_LOGS=0 skyray start --disable-usage-stats --head --port={{ray_port}} --dashboard-port={{ray_dashboard_port}} --object-manager-port=8076 --autoscaling-config=~/ray_bootstrap_config.yaml {{"--resources='%s'" % custom_resources if custom_resources}} --temp-dir {{ray_temp_dir}} || exit 1;
- source ~/skypilot-runtime/bin/activate; ray stop; RAY_SCHEDULER_EVENTS=0 RAY_DEDUP_LOGS=0 ray start --disable-usage-stats --head --port={{ray_port}} --dashboard-port={{ray_dashboard_port}} --object-manager-port=8076 --autoscaling-config=~/ray_bootstrap_config.yaml {{"--resources='%s'" % custom_resources if custom_resources}} --temp-dir {{ray_temp_dir}} || exit 1;
which prlimit && for id in $(pgrep -f raylet/raylet); do sudo prlimit --nofile=1048576:1048576 --pid=$id || true; done;
{{dump_port_command}};
{{dump_port_command}}; deactivate

{%- if num_nodes > 1 %}
worker_start_ray_commands:
- skyray stop; RAY_SCHEDULER_EVENTS=0 RAY_DEDUP_LOGS=0 skyray start --disable-usage-stats --address=$RAY_HEAD_IP:{{ray_port}} --object-manager-port=8076 {{"--resources='%s'" % custom_resources if custom_resources}} --temp-dir {{ray_temp_dir}} || exit 1;
which prlimit && for id in $(pgrep -f raylet/raylet); do sudo prlimit --nofile=1048576:1048576 --pid=$id || true; done;
- source ~/skypilot-runtime/bin/activate; ray stop; RAY_SCHEDULER_EVENTS=0 RAY_DEDUP_LOGS=0 ray start --disable-usage-stats --address=$RAY_HEAD_IP:{{ray_port}} --object-manager-port=8076 {{"--resources='%s'" % custom_resources if custom_resources}} --temp-dir {{ray_temp_dir}} || exit 1;
which prlimit && for id in $(pgrep -f raylet/raylet); do sudo prlimit --nofile=1048576:1048576 --pid=$id || true; done; deactivate
{%- else %}
worker_start_ray_commands: []
{%- endif %}
Expand Down
8 changes: 4 additions & 4 deletions sky/templates/kubernetes-ray.yml.j2
Original file line number Diff line number Diff line change
Expand Up @@ -311,14 +311,14 @@ head_start_ray_commands:
# all the sessions to be reloaded. This is a workaround.
# We manually set --object-store-memory=500000000 to avoid ray from allocating a very large object store in each pod that may cause problems for other pods.
- ((ps aux | grep -v nohup | grep -v grep | grep -q -- "python3 -m sky.skylet.skylet") || nohup python3 -m sky.skylet.skylet >> ~/.sky/skylet.log 2>&1 &);
skyray stop; RAY_SCHEDULER_EVENTS=0 RAY_DEDUP_LOGS=0 skyray start --disable-usage-stats --head --port={{ray_port}} --dashboard-port={{ray_dashboard_port}} --dashboard-host 0.0.0.0 --object-manager-port=8076 --autoscaling-config=~/ray_bootstrap_config.yaml {{"--resources='%s'" % custom_resources if custom_resources}} --temp-dir {{ray_temp_dir}} --object-store-memory=500000000 || exit 1;
source ~/skypilot-runtime/bin/activate; ray stop; RAY_SCHEDULER_EVENTS=0 RAY_DEDUP_LOGS=0 ray start --disable-usage-stats --head --port={{ray_port}} --dashboard-port={{ray_dashboard_port}} --dashboard-host 0.0.0.0 --object-manager-port=8076 --autoscaling-config=~/ray_bootstrap_config.yaml {{"--resources='%s'" % custom_resources if custom_resources}} --temp-dir {{ray_temp_dir}} --object-store-memory=500000000 || exit 1;
which prlimit && for id in $(pgrep -f raylet/raylet); do sudo prlimit --nofile=1048576:1048576 --pid=$id || true; done;
{{dump_port_command}};
{{dump_port_command}}; deactivate

{%- if num_nodes > 1 %}
worker_start_ray_commands:
- skyray stop; RAY_SCHEDULER_EVENTS=0 RAY_DEDUP_LOGS=0 skyray start --disable-usage-stats --address=$RAY_HEAD_IP:{{ray_port}} --object-manager-port=8076 {{"--resources='%s'" % custom_resources if custom_resources}} --temp-dir {{ray_temp_dir}} --object-store-memory=500000000 || exit 1;
which prlimit && for id in $(pgrep -f raylet/raylet); do sudo prlimit --nofile=1048576:1048576 --pid=$id || true; done;
- source ~/skypilot-runtime/bin/activate; ray stop; RAY_SCHEDULER_EVENTS=0 RAY_DEDUP_LOGS=0 ray start --disable-usage-stats --address=$RAY_HEAD_IP:{{ray_port}} --object-manager-port=8076 {{"--resources='%s'" % custom_resources if custom_resources}} --temp-dir {{ray_temp_dir}} --object-store-memory=500000000 || exit 1;
which prlimit && for id in $(pgrep -f raylet/raylet); do sudo prlimit --nofile=1048576:1048576 --pid=$id || true; done; deactivate
{%- else %}
worker_start_ray_commands: []
{%- endif %}
Expand Down
8 changes: 4 additions & 4 deletions sky/templates/lambda-ray.yml.j2
Original file line number Diff line number Diff line change
Expand Up @@ -88,14 +88,14 @@ setup_commands:
# Increment the following for catching performance bugs easier:
# current num items (num SSH connections): 2
head_start_ray_commands:
- skyray stop; RAY_SCHEDULER_EVENTS=0 RAY_DEDUP_LOGS=0 skyray start --disable-usage-stats --head --port={{ray_port}} --dashboard-port={{ray_dashboard_port}} --object-manager-port=8076 --autoscaling-config=~/ray_bootstrap_config.yaml {{"--resources='%s'" % custom_resources if custom_resources}} --temp-dir {{ray_temp_dir}} || exit 1;
- source ~/skypilot-runtime/bin/activate; ray stop; RAY_SCHEDULER_EVENTS=0 RAY_DEDUP_LOGS=0 ray start --disable-usage-stats --head --port={{ray_port}} --dashboard-port={{ray_dashboard_port}} --object-manager-port=8076 --autoscaling-config=~/ray_bootstrap_config.yaml {{"--resources='%s'" % custom_resources if custom_resources}} --temp-dir {{ray_temp_dir}} || exit 1;
which prlimit && for id in $(pgrep -f raylet/raylet); do sudo prlimit --nofile=1048576:1048576 --pid=$id || true; done;
{{dump_port_command}};
{{dump_port_command}}; deactivate

{%- if num_nodes > 1 %}
worker_start_ray_commands:
- skyray stop; RAY_SCHEDULER_EVENTS=0 RAY_DEDUP_LOGS=0 skyray start --disable-usage-stats --address=$RAY_HEAD_IP:{{ray_port}} --object-manager-port=8076 {{"--resources='%s'" % custom_resources if custom_resources}} --temp-dir {{ray_temp_dir}} || exit 1;
which prlimit && for id in $(pgrep -f raylet/raylet); do sudo prlimit --nofile=1048576:1048576 --pid=$id || true; done;
- source ~/skypilot-runtime/bin/activate; ray stop; RAY_SCHEDULER_EVENTS=0 RAY_DEDUP_LOGS=0 ray start --disable-usage-stats --address=$RAY_HEAD_IP:{{ray_port}} --object-manager-port=8076 {{"--resources='%s'" % custom_resources if custom_resources}} --temp-dir {{ray_temp_dir}} || exit 1;
which prlimit && for id in $(pgrep -f raylet/raylet); do sudo prlimit --nofile=1048576:1048576 --pid=$id || true; done; deactivate
{%- else %}
worker_start_ray_commands: []
{%- endif %}
Expand Down
8 changes: 4 additions & 4 deletions sky/templates/oci-ray.yml.j2
Original file line number Diff line number Diff line change
Expand Up @@ -113,14 +113,14 @@ head_start_ray_commands:
# NOTE: --disable-usage-stats in `ray start` saves 10 seconds of idle wait.
# Line "which prlimit ..": increase the limit of the number of open files for the raylet process, as the `ulimit` may not take effect at this point, because it requires
# all the sessions to be reloaded. This is a workaround.
- skyray stop; RAY_SCHEDULER_EVENTS=0 RAY_DEDUP_LOGS=0 skyray start --disable-usage-stats --head --port={{ray_port}} --dashboard-port={{ray_dashboard_port}} --object-manager-port=8076 --autoscaling-config=~/ray_bootstrap_config.yaml {{"--resources='%s'" % custom_resources if custom_resources}} --temp-dir {{ray_temp_dir}} || exit 1;
- source ~/skypilot-runtime/bin/activate; ray stop; RAY_SCHEDULER_EVENTS=0 RAY_DEDUP_LOGS=0 ray start --disable-usage-stats --head --port={{ray_port}} --dashboard-port={{ray_dashboard_port}} --object-manager-port=8076 --autoscaling-config=~/ray_bootstrap_config.yaml {{"--resources='%s'" % custom_resources if custom_resources}} --temp-dir {{ray_temp_dir}} || exit 1;
which prlimit && for id in $(pgrep -f raylet/raylet); do sudo prlimit --nofile=1048576:1048576 --pid=$id || true; done;
{{dump_port_command}};
{{dump_port_command}}; deactivate

{%- if num_nodes > 1 %}
worker_start_ray_commands:
- skyray stop; RAY_SCHEDULER_EVENTS=0 RAY_DEDUP_LOGS=0 skyray start --disable-usage-stats --address=$RAY_HEAD_IP:{{ray_port}} --object-manager-port=8076 {{"--resources='%s'" % custom_resources if custom_resources}} --temp-dir {{ray_temp_dir}} || exit 1;
which prlimit && for id in $(pgrep -f raylet/raylet); do sudo prlimit --nofile=1048576:1048576 --pid=$id || true; done;
- source ~/skypilot-runtime/bin/activate; ray stop; RAY_SCHEDULER_EVENTS=0 RAY_DEDUP_LOGS=0 ray start --disable-usage-stats --address=$RAY_HEAD_IP:{{ray_port}} --object-manager-port=8076 {{"--resources='%s'" % custom_resources if custom_resources}} --temp-dir {{ray_temp_dir}} || exit 1;
which prlimit && for id in $(pgrep -f raylet/raylet); do sudo prlimit --nofile=1048576:1048576 --pid=$id || true; done; deactivate
{%- else %}
worker_start_ray_commands: []
{%- endif %}
Expand Down
8 changes: 4 additions & 4 deletions sky/templates/scp-ray.yml.j2
Original file line number Diff line number Diff line change
Expand Up @@ -88,14 +88,14 @@ head_start_ray_commands:
# NOTE: --disable-usage-stats in `ray start` saves 10 seconds of idle wait.
# Line "which prlimit ..": increase the limit of the number of open files for the raylet process, as the `ulimit` may not take effect at this point, because it requires
# all the sessions to be reloaded. This is a workaround.
- skyray stop; RAY_SCHEDULER_EVENTS=0 RAY_DEDUP_LOGS=0 skyray start --disable-usage-stats --head --port={{ray_port}} --dashboard-port={{ray_dashboard_port}} --object-manager-port=8076 --autoscaling-config=~/ray_bootstrap_config.yaml {{"--resources='%s'" % custom_resources if custom_resources}} --temp-dir {{ray_temp_dir}} || exit 1;
which prlimit && for id in $(pgrep -f raylet/raylet); do sudo prlimit --nofile=1048576:1048576 --pid=$id || true; done;
- source ~/skypilot-runtime/bin/activate; ray stop; RAY_SCHEDULER_EVENTS=0 RAY_DEDUP_LOGS=0 ray start --disable-usage-stats --head --port={{ray_port}} --dashboard-port={{ray_dashboard_port}} --object-manager-port=8076 --autoscaling-config=~/ray_bootstrap_config.yaml {{"--resources='%s'" % custom_resources if custom_resources}} --temp-dir {{ray_temp_dir}} || exit 1;
which prlimit && for id in $(pgrep -f raylet/raylet); do sudo prlimit --nofile=1048576:1048576 --pid=$id || true; done; deactivate
{{dump_port_command}};

{%- if num_nodes > 1 %}
worker_start_ray_commands:
- skyray stop; RAY_SCHEDULER_EVENTS=0 RAY_DEDUP_LOGS=0 skyray start --disable-usage-stats --address=$RAY_HEAD_IP:{{ray_port}} --object-manager-port=8076 {{"--resources='%s'" % custom_resources if custom_resources}} --temp-dir {{ray_temp_dir}} || exit 1;
which prlimit && for id in $(pgrep -f raylet/raylet); do sudo prlimit --nofile=1048576:1048576 --pid=$id || true; done;
- source ~/skypilot-runtime/bin/activate; ray stop; RAY_SCHEDULER_EVENTS=0 RAY_DEDUP_LOGS=0 ray start --disable-usage-stats --address=$RAY_HEAD_IP:{{ray_port}} --object-manager-port=8076 {{"--resources='%s'" % custom_resources if custom_resources}} --temp-dir {{ray_temp_dir}} || exit 1;
which prlimit && for id in $(pgrep -f raylet/raylet); do sudo prlimit --nofile=1048576:1048576 --pid=$id || true; done; deactivate
{%- else %}
worker_start_ray_commands: []
{%- endif %}
Expand Down

0 comments on commit db60092

Please sign in to comment.