Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add Envoy as an alternative Sky Serve load balancer implementation #4256

Open
wants to merge 8 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions sky/clouds/cloud.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,7 @@ class CloudImplementationFeatures(enum.Enum):
STORAGE_MOUNTING = 'storage_mounting'
HOST_CONTROLLERS = 'host_controllers' # Can run jobs/serve controllers
AUTO_TERMINATE = 'auto_terminate' # Pod/VM can stop or down itself
HOST_ENVOY_LOAD_BALANCER = 'host_envoy_load_balancer'


class Region(collections.namedtuple('Region', ['name'])):
Expand Down
4 changes: 4 additions & 0 deletions sky/clouds/kubernetes.py
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,10 @@ class Kubernetes(clouds.Cloud):
'tiers are not '
'supported in '
'Kubernetes.',
clouds.CloudImplementationFeatures.HOST_ENVOY_LOAD_BALANCER:
'Envoy load balancer is not '
'supported on Kubernetes '
'controllers.'
}

IMAGE_CPU = 'skypilot:custom-cpu-ubuntu-2004'
Expand Down
14 changes: 14 additions & 0 deletions sky/serve/constants.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
"""Constants used for SkyServe."""

from enum import Enum

CONTROLLER_TEMPLATE = 'sky-serve-controller.yaml.j2'

SKYSERVE_METADATA_DIR = '~/.sky/serve'
Expand Down Expand Up @@ -100,3 +102,15 @@
TERMINATE_REPLICA_VERSION_MISMATCH_ERROR = (
'The version of service is outdated and does not support manually '
'terminating replicas. Please terminate the service and spin up again.')

# TODO(ejj) ultimately these should be configurable by users.
ENVOY_THREADS = '1'
ENVOY_VERSION = '1.32.0'


class LbType(Enum):
    """Load balancer implementations that a SkyServe service can select.

    The member values are the strings compared against the service's
    `load_balancer_type` config field.
    """
    PYTHON = 'python'
    ENVOY = 'envoy'


# String values of every LbType member, in definition order.
ALL_LB_TYPES = [t.value for t in LbType]
11 changes: 10 additions & 1 deletion sky/serve/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@

import sky
from sky import backends
from sky import clouds
from sky import exceptions
from sky import sky_logging
from sky import task as task_lib
Expand Down Expand Up @@ -152,9 +153,17 @@ def up(
serve_utils.generate_remote_config_yaml_file_name(service_name))
controller_log_file = (
serve_utils.generate_remote_controller_log_file_name(service_name))

requested_features = set()
lb_type = task_config.get('service', {}).get('load_balancer_type', None)
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm wondering if we want a load_balancer field to host both the policy and type fields.

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Could we do this as a follow-up PR? Since we've already got the load_balancer_policy field, I think it would make more sense.

That said if you prefer I can tack it on to this one

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think the more PRs we split this into, the more backward compatibility we will need to handle in the future. But cc @Michaelvll for some input here.

if lb_type == serve_constants.LbType.ENVOY.value:
requested_features.add(
clouds.CloudImplementationFeatures.HOST_ENVOY_LOAD_BALANCER)

controller_resources = controller_utils.get_controller_resources(
controller=controller_utils.Controllers.SKY_SERVE_CONTROLLER,
task_resources=task.resources)
task_resources=task.resources,
requested_features=requested_features)

vars_to_fill = {
'remote_task_yaml_path': remote_tmp_task_yaml_path,
Expand Down
Loading