From b11446e3c37fc6b8632efced49d628f638383136 Mon Sep 17 00:00:00 2001 From: Zhanghao Wu Date: Sat, 13 Jan 2024 03:56:27 +0000 Subject: [PATCH] Address comments --- sky/clouds/runpod.py | 4 +--- sky/provision/runpod/instance.py | 16 +++++++------- sky/provision/runpod/utils.py | 6 +++--- sky/templates/runpod-ray.yml.j2 | 36 ++------------------------------ 4 files changed, 14 insertions(+), 48 deletions(-) diff --git a/sky/clouds/runpod.py b/sky/clouds/runpod.py index c362ab39d04..f07bd18ee46 100644 --- a/sky/clouds/runpod.py +++ b/sky/clouds/runpod.py @@ -178,8 +178,6 @@ def _get_feasible_launchable_resources( self, resources: 'resources_lib.Resources' ) -> Tuple[List['resources_lib.Resources'], List[str]]: """Returns a list of feasible resources for the given resources.""" - if resources.use_spot: - return ([], []) if resources.instance_type is not None: assert resources.is_launchable(), resources resources = resources.copy(accelerators=None) @@ -244,7 +242,7 @@ def check_credentials(cls) -> Tuple[bool, Optional[str]]: return True, None except ImportError: - return False, ('Failed to import runpod.' + return False, ('Failed to import runpod. 
' 'To install, run: pip install skypilot[runpod]') def get_credential_file_mounts(self) -> Dict[str, str]: diff --git a/sky/provision/runpod/instance.py b/sky/provision/runpod/instance.py index c51d8b9ab3f..9f3a1d92886 100644 --- a/sky/provision/runpod/instance.py +++ b/sky/provision/runpod/instance.py @@ -159,18 +159,18 @@ def get_cluster_info( running_instances = _filter_instances(cluster_name_on_cloud, ['RUNNING']) instances: Dict[str, List[common.InstanceInfo]] = {} head_instance_id = None - for node_id, node_info in running_instances.items(): - instances[node_id] = [ + for instance_id, instance_info in running_instances.items(): + instances[instance_id] = [ common.InstanceInfo( - instance_id=node_id, - internal_ip=node_info['internal_ip'], - external_ip=node_info['external_ip'], - ssh_port=node_info['ssh_port'], + instance_id=instance_id, + internal_ip=instance_info['internal_ip'], + external_ip=instance_info['external_ip'], + ssh_port=instance_info['ssh_port'], tags={}, ) ] - if node_info['name'].endswith('-head'): - head_instance_id = node_id + if instance_info['name'].endswith('-head'): + head_instance_id = instance_id return common.ClusterInfo( instances=instances, diff --git a/sky/provision/runpod/utils.py b/sky/provision/runpod/utils.py index 3040b0e329b..00b24aee0a8 100644 --- a/sky/provision/runpod/utils.py +++ b/sky/provision/runpod/utils.py @@ -1,7 +1,7 @@ """RunPod library wrapper for SkyPilot.""" import time -from typing import Dict, List +from typing import Any, Dict, List from sky import sky_logging from sky.adaptors import runpod @@ -64,11 +64,11 @@ def wrapper(*args, **kwargs): return wrapper -def list_instances() -> Dict[str, dict]: +def list_instances() -> Dict[str, Dict[str, Any]]: """Lists instances associated with API key.""" instances = runpod.runpod().get_pods() - instance_dict: Dict[str, dict] = {} + instance_dict: Dict[str, Dict[str, Any]] = {} for instance in instances: info = {} diff --git a/sky/templates/runpod-ray.yml.j2 
b/sky/templates/runpod-ray.yml.j2 index a8350ae265a..fa3598e429e 100644 --- a/sky/templates/runpod-ray.yml.j2 +++ b/sky/templates/runpod-ray.yml.j2 @@ -72,37 +72,5 @@ setup_commands: python3 -c "from sky.skylet.ray_patches import patch; patch()" || exit 1; [ -f /etc/fuse.conf ] && sudo sed -i 's/#user_allow_other/user_allow_other/g' /etc/fuse.conf || (sudo sh -c 'echo "user_allow_other" > /etc/fuse.conf'); -# Command to start ray on the head node. You don't need to change this. -# NOTE: these are very performance-sensitive. Each new item opens/closes an SSH -# connection, which is expensive. Try your best to co-locate commands into fewer -# items! The same comment applies for worker_start_ray_commands. -# -# Increment the following for catching performance bugs easier: -# current num items (num SSH connections): 1 -head_start_ray_commands: - # NOTE: --disable-usage-stats in `ray start` saves 10 seconds of idle wait. - # Line "which prlimit ..": increase the limit of the number of open files for the raylet process, as the `ulimit` may not take effect at this point, because it requires - # all the sessions to be reloaded. This is a workaround. 
- - export SKYPILOT_NUM_GPUS=0 && which nvidia-smi > /dev/null && SKYPILOT_NUM_GPUS=$(nvidia-smi --query-gpu=index,name --format=csv,noheader | wc -l); - ray stop; RAY_SCHEDULER_EVENTS=0 RAY_DEDUP_LOGS=0 ray start --disable-usage-stats --head --port={{ray_port}} --dashboard-port={{ray_dashboard_port}} --object-manager-port=8076 --autoscaling-config=~/ray_bootstrap_config.yaml {{"--resources='%s'" % custom_resources if custom_resources}} --num-gpus=$SKYPILOT_NUM_GPUS --temp-dir {{ray_temp_dir}} || exit 1; - which prlimit && for id in $(pgrep -f raylet/raylet); do sudo prlimit --nofile=1048576:1048576 --pid=$id || true; done; - {{dump_port_command}}; - -# Worker commands are needed for TPU VM Pods -{%- if num_nodes > 1 or tpu_vm %} -worker_start_ray_commands: - - SKYPILOT_NUM_GPUS=0 && which nvidia-smi > /dev/null && SKYPILOT_NUM_GPUS=$(nvidia-smi --query-gpu=index,name --format=csv,noheader | wc -l); - ray stop; RAY_SCHEDULER_EVENTS=0 RAY_DEDUP_LOGS=0 ray start --disable-usage-stats --address=$RAY_HEAD_IP:{{ray_port}} --object-manager-port=8076 {{"--resources='%s'" % custom_resources if custom_resources}} --num-gpus=$SKYPILOT_NUM_GPUS --temp-dir {{ray_temp_dir}} || exit 1; - which prlimit && for id in $(pgrep -f raylet/raylet); do sudo prlimit --nofile=1048576:1048576 --pid=$id || true; done; -{%- else %} -worker_start_ray_commands: [] -{%- endif %} - -head_node: {} -worker_nodes: {} - -# These fields are required for external cloud providers. -head_setup_commands: [] -worker_setup_commands: [] -cluster_synced_files: [] -file_mounts_sync_continuously: False +# Commands to start Ray clusters are now placed in `sky.provision.instance_setup`. +# We do not need to list them here anymore.