Skip to content

Commit

Permalink
[Spot] Allow 2x spot jobs on a spot controller (#3191)
Browse files Browse the repository at this point in the history
* Allow 2x more spot jobs on a controller

* rename

* format

* Add additional requirement for controller memory

* refactor

* remove unused import

* Update sky/spot/constants.py

Co-authored-by: Zongheng Yang <[email protected]>

* address comments

* rephrase

* wording

---------

Co-authored-by: Zongheng Yang <[email protected]>
  • Loading branch information
Michaelvll and concretevitamin authored Feb 21, 2024
1 parent 0c27d4d commit 40fa245
Show file tree
Hide file tree
Showing 7 changed files with 31 additions and 13 deletions.
13 changes: 6 additions & 7 deletions sky/backends/backend_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,6 @@
from sky import exceptions
from sky import global_user_state
from sky import provision as provision_lib
from sky import serve as serve_lib
from sky import sky_logging
from sky import skypilot_config
from sky import status_lib
Expand Down Expand Up @@ -2467,11 +2466,11 @@ def get_task_demands_dict(task: 'task_lib.Task') -> Dict[str, float]:
optionally accelerator demands.
"""
# TODO: Custom CPU and other memory resources are not supported yet.
# For sky serve controller task, we set the CPU resource to a smaller
# value to support a larger number of services.
# For sky spot/serve controller task, we set the CPU resource to a smaller
# value to support a larger number of spot jobs and services.
resources_dict = {
'CPU': (serve_lib.SERVICES_TASK_CPU_DEMAND
if task.service_name is not None else DEFAULT_TASK_CPU_DEMAND)
'CPU': (constants.CONTROLLER_PROCESS_CPU_DEMAND
if task.is_controller_task() else DEFAULT_TASK_CPU_DEMAND)
}
if task.best_resources is not None:
resources = task.best_resources
Expand All @@ -2491,8 +2490,8 @@ def get_task_resources_str(task: 'task_lib.Task') -> str:
The resources string is only used as a display purpose, so we only show
the accelerator demands (if any). Otherwise, the CPU demand is shown.
"""
task_cpu_demand = (serve_lib.SERVICES_TASK_CPU_DEMAND if task.service_name
is not None else DEFAULT_TASK_CPU_DEMAND)
task_cpu_demand = (constants.CONTROLLER_PROCESS_CPU_DEMAND if
task.is_controller_task() else DEFAULT_TASK_CPU_DEMAND)
if task.best_resources is not None:
accelerator_dict = task.best_resources.accelerators
if accelerator_dict is None:
Expand Down
1 change: 0 additions & 1 deletion sky/serve/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,6 @@
from sky.serve.constants import ENDPOINT_PROBE_INTERVAL_SECONDS
from sky.serve.constants import INITIAL_VERSION
from sky.serve.constants import LB_CONTROLLER_SYNC_INTERVAL_SECONDS
from sky.serve.constants import SERVICES_TASK_CPU_DEMAND
from sky.serve.constants import SKYSERVE_METADATA_DIR
from sky.serve.core import down
from sky.serve.core import status
Expand Down
9 changes: 6 additions & 3 deletions sky/serve/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,9 +53,12 @@
# do some log rotation.
CONTROLLER_RESOURCES = {'cpus': '4+', 'disk_size': 200}

# A default controller with 4 vCPU and 16 GB memory can run up to 16 services.
SERVICES_MEMORY_USAGE_GB = 1.0
SERVICES_TASK_CPU_DEMAND = 0.25
# Each serve controller process is launched with sky job (which uses a ray job
# under the hood) and consumes CPU/memory, so we need to reserve some
# CPU/memory for each serve controller process.
# Serve: A default controller with 4 vCPU and 16 GB memory can run up to 16
# services.
CONTROLLER_MEMORY_USAGE_GB = 1.0

# A period of time to initialize your service. Any readiness probe failures
# during this period will be ignored.
Expand Down
3 changes: 2 additions & 1 deletion sky/serve/serve_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,8 @@
SKY_SERVE_CONTROLLER_NAME: str = (
f'sky-serve-controller-{common_utils.get_user_hash()}')
_SYSTEM_MEMORY_GB = psutil.virtual_memory().total // (1024**3)
NUM_SERVICE_THRESHOLD = _SYSTEM_MEMORY_GB // constants.SERVICES_MEMORY_USAGE_GB
NUM_SERVICE_THRESHOLD = (_SYSTEM_MEMORY_GB //
constants.CONTROLLER_MEMORY_USAGE_GB)
_CONTROLLER_URL = 'http://localhost:{CONTROLLER_PORT}'

_SKYPILOT_PROVISION_LOG_PATTERN = r'.*tail -n100 -f (.*provision\.log).*'
Expand Down
9 changes: 9 additions & 0 deletions sky/skylet/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -107,3 +107,12 @@
# controller and sky serve controller.
# TODO(tian): Refactor to controller_utils. Current blocker: circular import.
CONTROLLER_IDLE_MINUTES_TO_AUTOSTOP = 10

# Each controller process is launched with sky job (which uses a ray job under
# the hood) and consumes CPU/memory, so we need to reserve some CPU/memory for
# each spot/serve controller process.
# Spot: A default controller with 8 vCPU and 32 GB memory can manage up to 32
# spot jobs.
# Serve: A default controller with 4 vCPU and 16 GB memory can run up to 16
# services.
CONTROLLER_PROCESS_CPU_DEMAND = 0.25
5 changes: 4 additions & 1 deletion sky/spot/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,8 +9,11 @@
# Use default CPU instance type for spot controller, i.e.
# m6i.2xlarge (8vCPUs, 32 GB) for AWS, Standard_D8s_v4 (8vCPUs, 32 GB)
# for Azure, and n1-standard-8 (8 vCPUs, 32 GB) for GCP.
# Based on profiling, the memory (in GB) should be at least 3x the number of
# vCPUs to avoid OOM: each vCPU can host 4 spot controller processes, since we
# set the CPU requirement to 0.25, and 3 GB is barely enough for 4 spot
# processes.
# We use 50 GB disk size to reduce the cost.
CONTROLLER_RESOURCES = {'disk_size': 50}
CONTROLLER_RESOURCES = {'memory': '3x', 'disk_size': 50}

# Max length of the cluster name for GCP is 35, the user hash to be attached is
# 4+1 chars, and we assume the maximum length of the job id is 4+1, so the max
Expand Down
4 changes: 4 additions & 0 deletions sky/task.py
Original file line number Diff line number Diff line change
Expand Up @@ -983,6 +983,10 @@ def get_local_to_remote_file_mounts(self) -> Optional[Dict[str, str]]:
d[k] = v
return d

def is_controller_task(self) -> bool:
    """Tell whether this task is a spot/serve controller process.

    Returns:
        True when either ``self.spot_dag`` or ``self.service_name`` is set
        (i.e. is not None); False when both are None.
    """
    # A set spot DAG is enough on its own; the service name is only checked
    # when no spot DAG is set.
    if self.spot_dag is not None:
        return True
    return self.service_name is not None

def get_cloud_to_remote_file_mounts(self) -> Optional[Dict[str, str]]:
"""Returns file mounts of the form (dst=VM path, src=cloud URL).
Expand Down

0 comments on commit 40fa245

Please sign in to comment.