diff --git a/sky/backends/backend_utils.py b/sky/backends/backend_utils.py index 7ba052bb7c2..58f621c8596 100644 --- a/sky/backends/backend_utils.py +++ b/sky/backends/backend_utils.py @@ -32,6 +32,7 @@ from sky import exceptions from sky import global_user_state from sky import provision as provision_lib +from sky import serve as serve_lib from sky import sky_logging from sky import skypilot_config from sky import status_lib @@ -2525,7 +2526,7 @@ def get_task_demands_dict(task: 'task_lib.Task') -> Dict[str, float]: # For sky serve controller task, we set the CPU resource to a smaller # value to support a larger number of services. resources_dict = { - 'CPU': (constants.SERVICES_TASK_CPU_DEMAND + 'CPU': (serve_lib.SERVICES_TASK_CPU_DEMAND if task.service_name is not None else DEFAULT_TASK_CPU_DEMAND) } if task.best_resources is not None: @@ -2546,7 +2547,7 @@ def get_task_resources_str(task: 'task_lib.Task') -> str: The resources string is only used as a display purpose, so we only show the accelerator demands (if any). Otherwise, the CPU demand is shown. """ - task_cpu_demand = (constants.SERVICES_TASK_CPU_DEMAND if task.service_name + task_cpu_demand = (serve_lib.SERVICES_TASK_CPU_DEMAND if task.service_name is not None else DEFAULT_TASK_CPU_DEMAND) if task.best_resources is not None: accelerator_dict = task.best_resources.accelerators diff --git a/sky/execution.py b/sky/execution.py index e4e3c6e7843..55094ba7695 100644 --- a/sky/execution.py +++ b/sky/execution.py @@ -143,7 +143,7 @@ def _maybe_clone_disk_from_cluster(clone_disk_from: Optional[str], return task -def execute( +def _execute( entrypoint: Union['sky.Task', 'sky.Dag'], dryrun: bool = False, down: bool = False, @@ -402,6 +402,7 @@ def launch( # pylint: disable=invalid-name _is_launched_by_spot_controller: bool = False, _is_launched_by_sky_serve_controller: bool = False, + _disable_controller_check: bool = False, ) -> Tuple[Optional[int], Optional[backends.ResourceHandle]]: # NOTE(dev): Keep the docstring consistent between the Python API and CLI. """Launch a cluster or task. @@ -490,10 +491,11 @@ def launch( if dryrun. """ entrypoint = task - controller_utils.check_cluster_name_not_controller( - cluster_name, operation_str='sky.launch') + if not _disable_controller_check: + controller_utils.check_cluster_name_not_controller( + cluster_name, operation_str='sky.launch') - return execute( + return _execute( entrypoint=entrypoint, dryrun=dryrun, down=down, @@ -590,7 +592,7 @@ def exec( # pylint: disable=redefined-builtin operation='executing tasks', check_cloud_vm_ray_backend=False, dryrun=dryrun) - return execute( + return _execute( entrypoint=entrypoint, dryrun=dryrun, down=down, @@ -701,7 +703,7 @@ def spot_launch( f'Launching managed spot job {dag.name!r} from spot controller...' f'{colorama.Style.RESET_ALL}') print('Launching spot controller...') - execute( + _execute( entrypoint=controller_task, stream_logs=stream_logs, cluster_name=controller_name, diff --git a/sky/serve/__init__.py b/sky/serve/__init__.py index d6976abce11..7f905add412 100644 --- a/sky/serve/__init__.py +++ b/sky/serve/__init__.py @@ -4,6 +4,7 @@ from sky.serve.constants import ENDPOINT_PROBE_INTERVAL_SECONDS from sky.serve.constants import INITIAL_VERSION from sky.serve.constants import LB_CONTROLLER_SYNC_INTERVAL_SECONDS +from sky.serve.constants import SERVICES_TASK_CPU_DEMAND from sky.serve.constants import SKYSERVE_METADATA_DIR from sky.serve.core import down from sky.serve.core import status diff --git a/sky/serve/constants.py b/sky/serve/constants.py index 5b8fa8f206a..4f11e8a1f3c 100644 --- a/sky/serve/constants.py +++ b/sky/serve/constants.py @@ -53,6 +53,10 @@ # do some log rotation. CONTROLLER_RESOURCES = {'cpus': '4+', 'disk_size': 200} +# A default controller with 4 vCPU and 16 GB memory can run up to 16 services. +SERVICES_MEMORY_USAGE_GB = 1.0 +SERVICES_TASK_CPU_DEMAND = 0.25 + # A period of time to initialize your service. Any readiness probe failures # during this period will be ignored. DEFAULT_INITIAL_DELAY_SECONDS = 1200 diff --git a/sky/serve/core.py b/sky/serve/core.py index 1ac2202aa1f..bce384b14bf 100644 --- a/sky/serve/core.py +++ b/sky/serve/core.py @@ -42,10 +42,6 @@ def up( task: sky.Task to serve up. service_name: Name of the service. """ - # This is to avoid circular import. - # pylint: disable=import-outside-toplevel - from sky import execution - if service_name is None: service_name = serve_utils.generate_service_name() @@ -163,14 +159,15 @@ def up( # whether the service is already running. If the id is the same # with the current job id, we know the service is up and running # for the first time; otherwise it is a name conflict. - controller_job_id, controller_handle = execution.execute( - entrypoint=controller_task, + controller_job_id, controller_handle = sky.launch( + task=controller_task, stream_logs=False, cluster_name=controller_name, detach_run=True, idle_minutes_to_autostop=constants. CONTROLLER_IDLE_MINUTES_TO_AUTOSTOP, retry_until_up=True, + _disable_controller_check=True, ) style = colorama.Style diff --git a/sky/serve/serve_utils.py b/sky/serve/serve_utils.py index 9e8176e08a6..8431dfab9f8 100644 --- a/sky/serve/serve_utils.py +++ b/sky/serve/serve_utils.py @@ -25,7 +25,6 @@ from sky import status_lib from sky.serve import constants from sky.serve import serve_state -from sky.skylet import constants as skylet_constants from sky.skylet import job_lib from sky.utils import common_utils from sky.utils import log_utils @@ -38,8 +37,7 @@ SKY_SERVE_CONTROLLER_NAME: str = ( f'sky-serve-controller-{common_utils.get_user_hash()}') _SYSTEM_MEMORY_GB = psutil.virtual_memory().total // (1024**3) -NUM_SERVICE_THRESHOLD = (_SYSTEM_MEMORY_GB // - skylet_constants.SERVICES_MEMORY_USAGE_GB) +NUM_SERVICE_THRESHOLD = _SYSTEM_MEMORY_GB // constants.SERVICES_MEMORY_USAGE_GB _CONTROLLER_URL = 'http://localhost:{CONTROLLER_PORT}' _SKYPILOT_PROVISION_LOG_PATTERN = r'.*tail -n100 -f (.*provision\.log).*' diff --git a/sky/skylet/constants.py b/sky/skylet/constants.py index 5692d641665..1ef51ca2c7b 100644 --- a/sky/skylet/constants.py +++ b/sky/skylet/constants.py @@ -54,12 +54,6 @@ # Port on the remote spot controller that the dashboard is running on. SPOT_DASHBOARD_REMOTE_PORT = 5000 -# A default controller with 4 vCPU and 16 GB memory can run up to 16 services. -# TODO(tian): This is to fix circular imports. Move this back to -# sky.serve.constants. -SERVICES_MEMORY_USAGE_GB = 1.0 -SERVICES_TASK_CPU_DEMAND = 0.25 - # Docker default options DEFAULT_DOCKER_CONTAINER_NAME = 'sky_container' DEFAULT_DOCKER_PORT = 10022