Commit b11446e: Address comments

Michaelvll committed Jan 13, 2024 (1 parent: 07498d9)

Showing 4 changed files with 14 additions and 48 deletions.
sky/clouds/runpod.py (4 changes: 1 addition & 3 deletions)

@@ -178,8 +178,6 @@ def _get_feasible_launchable_resources(
         self, resources: 'resources_lib.Resources'
     ) -> Tuple[List['resources_lib.Resources'], List[str]]:
         """Returns a list of feasible resources for the given resources."""
-        if resources.use_spot:
-            return ([], [])
         if resources.instance_type is not None:
             assert resources.is_launchable(), resources
             resources = resources.copy(accelerators=None)
@@ -244,7 +242,7 @@ def check_credentials(cls) -> Tuple[bool, Optional[str]]:
             return True, None

         except ImportError:
-            return False, ('Failed to import runpod.'
+            return False, ('Failed to import runpod. '
                            'To install, run: pip install skypilot[runpod]')

     def get_credential_file_mounts(self) -> Dict[str, str]:
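The second hunk's one-character change is worth a note: Python implicitly concatenates adjacent string literals, so without the trailing space the two fragments run together in the error message. A minimal standalone sketch of the behavior (not SkyPilot code):

    # Adjacent string literals are concatenated at compile time, with no
    # separator inserted between them.
    before = ('Failed to import runpod.'
              'To install, run: pip install skypilot[runpod]')
    after = ('Failed to import runpod. '
             'To install, run: pip install skypilot[runpod]')

    print(before)  # Failed to import runpod.To install, run: pip install skypilot[runpod]
    print(after)   # Failed to import runpod. To install, run: pip install skypilot[runpod]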
sky/provision/runpod/instance.py (16 changes: 8 additions & 8 deletions)

@@ -159,18 +159,18 @@ def get_cluster_info(
     running_instances = _filter_instances(cluster_name_on_cloud, ['RUNNING'])
     instances: Dict[str, List[common.InstanceInfo]] = {}
     head_instance_id = None
-    for node_id, node_info in running_instances.items():
-        instances[node_id] = [
+    for instance_id, instance_info in running_instances.items():
+        instances[instance_id] = [
             common.InstanceInfo(
-                instance_id=node_id,
-                internal_ip=node_info['internal_ip'],
-                external_ip=node_info['external_ip'],
-                ssh_port=node_info['ssh_port'],
+                instance_id=instance_id,
+                internal_ip=instance_info['internal_ip'],
+                external_ip=instance_info['external_ip'],
+                ssh_port=instance_info['ssh_port'],
                 tags={},
             )
         ]
-        if node_info['name'].endswith('-head'):
-            head_instance_id = node_id
+        if instance_info['name'].endswith('-head'):
+            head_instance_id = instance_id

     return common.ClusterInfo(
         instances=instances,
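For context, a minimal standalone sketch of the convention this hunk encodes: instances are keyed by the provider's pod id, and the head node is identified by a '-head' suffix on the pod name. The record below is hypothetical; only the field names are taken from the diff above:

    from typing import Any, Dict, Optional

    # Hypothetical record in the shape `running_instances` provides above.
    running_instances: Dict[str, Dict[str, Any]] = {
        'pod-123': {
            'name': 'mycluster-2ea4-head',
            'internal_ip': '10.0.0.2',
            'external_ip': '203.0.113.7',
            'ssh_port': 22,
        },
    }

    head_instance_id: Optional[str] = None
    for instance_id, instance_info in running_instances.items():
        # The '-head' suffix on the pod name marks the cluster's head node.
        if instance_info['name'].endswith('-head'):
            head_instance_id = instance_id

    assert head_instance_id == 'pod-123'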
sky/provision/runpod/utils.py (6 changes: 3 additions & 3 deletions)

@@ -1,7 +1,7 @@
 """RunPod library wrapper for SkyPilot."""

 import time
-from typing import Dict, List
+from typing import Any, Dict, List

 from sky import sky_logging
 from sky.adaptors import runpod

@@ -64,11 +64,11 @@ def wrapper(*args, **kwargs):
     return wrapper


-def list_instances() -> Dict[str, dict]:
+def list_instances() -> Dict[str, Dict[str, Any]]:
     """Lists instances associated with API key."""
     instances = runpod.runpod().get_pods()

-    instance_dict: Dict[str, dict] = {}
+    instance_dict: Dict[str, Dict[str, Any]] = {}
     for instance in instances:
         info = {}
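The annotation change is purely static: in a type position, bare `dict` is treated as `Dict[Any, Any]`, so `Dict[str, Dict[str, Any]]` additionally documents, and lets a checker enforce, that the inner mapping is string-keyed. A minimal standalone sketch (not SkyPilot code):

    from typing import Any, Dict

    # Bare `dict` as a value type leaves the inner mapping unchecked.
    loose: Dict[str, dict] = {'pod-123': {'gpu_count': 1}}
    loose['pod-123'][42] = 'ok'  # passes type checking

    # The explicit form enforces string keys on the inner mapping.
    precise: Dict[str, Dict[str, Any]] = {'pod-123': {'gpu_count': 1}}
    precise['pod-123'][42] = 'no'  # mypy: invalid index type "int"; expected "str"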
sky/templates/runpod-ray.yml.j2 (36 changes: 2 additions & 34 deletions)

@@ -72,37 +72,5 @@ setup_commands:
   python3 -c "from sky.skylet.ray_patches import patch; patch()" || exit 1;
   [ -f /etc/fuse.conf ] && sudo sed -i 's/#user_allow_other/user_allow_other/g' /etc/fuse.conf || (sudo sh -c 'echo "user_allow_other" > /etc/fuse.conf');

-# Command to start ray on the head node. You don't need to change this.
-# NOTE: these are very performance-sensitive. Each new item opens/closes an SSH
-# connection, which is expensive. Try your best to co-locate commands into fewer
-# items! The same comment applies for worker_start_ray_commands.
-#
-# Increment the following for catching performance bugs easier:
-# current num items (num SSH connections): 1
-head_start_ray_commands:
-  # NOTE: --disable-usage-stats in `ray start` saves 10 seconds of idle wait.
-  # Line "which prlimit ..": increase the limit of the number of open files for the raylet process, as the `ulimit` may not take effect at this point, because it requires
-  # all the sessions to be reloaded. This is a workaround.
-  - export SKYPILOT_NUM_GPUS=0 && which nvidia-smi > /dev/null && SKYPILOT_NUM_GPUS=$(nvidia-smi --query-gpu=index,name --format=csv,noheader | wc -l);
-    ray stop; RAY_SCHEDULER_EVENTS=0 RAY_DEDUP_LOGS=0 ray start --disable-usage-stats --head --port={{ray_port}} --dashboard-port={{ray_dashboard_port}} --object-manager-port=8076 --autoscaling-config=~/ray_bootstrap_config.yaml {{"--resources='%s'" % custom_resources if custom_resources}} --num-gpus=$SKYPILOT_NUM_GPUS --temp-dir {{ray_temp_dir}} || exit 1;
-    which prlimit && for id in $(pgrep -f raylet/raylet); do sudo prlimit --nofile=1048576:1048576 --pid=$id || true; done;
-    {{dump_port_command}};
-
-# Worker commands are needed for TPU VM Pods
-{%- if num_nodes > 1 or tpu_vm %}
-worker_start_ray_commands:
-  - SKYPILOT_NUM_GPUS=0 && which nvidia-smi > /dev/null && SKYPILOT_NUM_GPUS=$(nvidia-smi --query-gpu=index,name --format=csv,noheader | wc -l);
-    ray stop; RAY_SCHEDULER_EVENTS=0 RAY_DEDUP_LOGS=0 ray start --disable-usage-stats --address=$RAY_HEAD_IP:{{ray_port}} --object-manager-port=8076 {{"--resources='%s'" % custom_resources if custom_resources}} --num-gpus=$SKYPILOT_NUM_GPUS --temp-dir {{ray_temp_dir}} || exit 1;
-    which prlimit && for id in $(pgrep -f raylet/raylet); do sudo prlimit --nofile=1048576:1048576 --pid=$id || true; done;
-{%- else %}
-worker_start_ray_commands: []
-{%- endif %}
-
-head_node: {}
-worker_nodes: {}
-
-# These fields are required for external cloud providers.
-head_setup_commands: []
-worker_setup_commands: []
-cluster_synced_files: []
-file_mounts_sync_continuously: False
+# Commands to start Ray clusters are now placed in `sky.provision.instance_setup`.
+# We do not need to list them here anymore.

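The deleted YAML ran `ray start` through per-node SSH command items rendered into every cloud's template; the replacement comment says this logic now lives in `sky.provision.instance_setup`. As a rough illustration only (a hypothetical sketch, not SkyPilot's actual API), the same head-node command can be assembled and executed from Python; ports and flags below mirror the deleted template lines:

    import subprocess

    # Hypothetical helper showing the direction of the change: the `ray start`
    # invocation is built in Python instead of being templated into cluster YAML.
    def start_ray_on_head(ray_port: int, dashboard_port: int,
                          temp_dir: str) -> None:
        cmd = ('ray stop; '
               'RAY_SCHEDULER_EVENTS=0 RAY_DEDUP_LOGS=0 '
               f'ray start --disable-usage-stats --head --port={ray_port} '
               f'--dashboard-port={dashboard_port} --object-manager-port=8076 '
               f'--temp-dir {temp_dir}')
        # In the real provisioner this would run on the head node (e.g. over
        # SSH); here it simply runs locally for illustration.
        subprocess.run(cmd, shell=True, check=True)

    # Example (commented out so the sketch has no side effects):
    # start_ray_on_head(ray_port=6380, dashboard_port=8266, temp_dir='/tmp/ray')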