Merge branch 'master' of https://github.com/skypilot-org/skypilot into k8s_enable_job_controller
romilbhardwaj committed May 8, 2024
2 parents 4e3a3cd + 0a03995 commit 9c09025
Showing 5 changed files with 38 additions and 12 deletions.
2 changes: 1 addition & 1 deletion .gitignore
@@ -11,4 +11,4 @@ sky_logs/
sky/clouds/service_catalog/data_fetchers/*.csv
.vscode/
.idea/

.env
10 changes: 10 additions & 0 deletions sky/clouds/aws.py
@@ -81,6 +81,8 @@ class AWSIdentityType(enum.Enum):

IAM_ROLE = 'iam-role'

CONTAINER_ROLE = 'container-role'

# Name       Value        Type    Location
# ----       -----        ----    --------
# profile    <not set>    None    None
@@ -545,6 +547,12 @@ def check_credentials(cls) -> Tuple[bool, Optional[str]]:
# jobs-controller) created by an SSO account, i.e. the VM will be
# assigned the IAM role: skypilot-v1.
hints = f'AWS IAM role is set.{single_cloud_hint}'
elif identity_type == AWSIdentityType.CONTAINER_ROLE:
# Similar to the IAM role case, an ECS container may not store credentials
# in the ~/.aws/credentials file, so we don't check for the file's
# existence; i.e., the container will be assigned the IAM role of the
# task: skypilot-v1.
hints = f'AWS container-role is set.{single_cloud_hint}'
else:
# This file is required by the VMs launched on other clouds to access
# private S3 buckets and resources like EC2.
@@ -604,6 +612,8 @@ def _is_access_key_of_type(type_str: str) -> bool:
return AWSIdentityType.SSO
elif _is_access_key_of_type(AWSIdentityType.IAM_ROLE.value):
return AWSIdentityType.IAM_ROLE
elif _is_access_key_of_type(AWSIdentityType.CONTAINER_ROLE.value):
return AWSIdentityType.CONTAINER_ROLE
elif _is_access_key_of_type(AWSIdentityType.ENV.value):
return AWSIdentityType.ENV
else:
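The new CONTAINER_ROLE branch reuses the same `aws configure list`-based detection as the other identity types. As a rough illustration, here is a minimal sketch of how _is_access_key_of_type could classify credentials; the parsing details are an assumption, only the CLI command and its Name/Value/Type/Location columns (documented in the comment table above) are given by the diff:

import subprocess

def _is_access_key_of_type(type_str: str) -> bool:
    # Run `aws configure list` and check whether the access_key row reports
    # the given credential type (e.g. 'container-role'). Hypothetical
    # parsing; the real implementation may differ.
    proc = subprocess.run(['aws', 'configure', 'list'],
                          capture_output=True, text=True, check=False)
    if proc.returncode != 0:
        return False
    return any('access_key' in line and type_str in line
               for line in proc.stdout.splitlines())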
8 changes: 7 additions & 1 deletion sky/skylet/autostop_lib.py
@@ -75,10 +75,16 @@ def set_autostopping_started() -> None:
configs.set_config(_AUTOSTOP_INDICATOR, str(psutil.boot_time()))


def get_is_autostopping_payload() -> str:
def get_is_autostopping() -> bool:
"""Returns whether the cluster is in the process of autostopping."""
result = configs.get_config(_AUTOSTOP_INDICATOR)
is_autostopping = (result == str(psutil.boot_time()))
return is_autostopping


def get_is_autostopping_payload() -> str:
"""Payload for whether the cluster is in the process of autostopping."""
is_autostopping = get_is_autostopping()
return common_utils.encode_payload(is_autostopping)


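This refactor splits the boolean check out of the payload helper so other modules (such as the Azure node provider below) can call get_is_autostopping() directly. A self-contained sketch of the boot-time indicator pattern it relies on; the dict is a hypothetical stand-in for skylet's configs store, while the comparison logic mirrors the diff:

import psutil

_AUTOSTOP_INDICATOR = 'autostop_indicator'
_store = {}  # hypothetical stand-in for skylet's configs key-value store

def set_autostopping_started() -> None:
    # Record the current boot time as the marker. A reboot changes
    # psutil.boot_time(), so a marker written before the reboot no longer
    # matches and the cluster is no longer considered autostopping.
    _store[_AUTOSTOP_INDICATOR] = str(psutil.boot_time())

def get_is_autostopping() -> bool:
    return _store.get(_AUTOSTOP_INDICATOR) == str(psutil.boot_time())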
2 changes: 2 additions & 0 deletions sky/skylet/providers/azure/config.py
@@ -120,6 +120,8 @@ def _configure_resource_group(config):
create_or_update = get_azure_sdk_function(
client=resource_client.deployments, function_name="create_or_update"
)
# TODO (skypilot): this takes a long time (> 40 seconds) when stopping an
# Azure VM, and it can be called twice during ray down.
outputs = (
create_or_update(
resource_group_name=resource_group,
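For context on the create_or_update resolution above: get_azure_sdk_function is presumably a small compatibility shim, since newer azure-mgmt SDKs expose long-running operations under begin_<name>. A hedged sketch of what such a helper typically looks like; the fallback logic is an assumption, not necessarily SkyPilot's actual code:

def get_azure_sdk_function(client, function_name: str):
    # Prefer the plain method name; fall back to the begin_-prefixed
    # variant used by newer azure-mgmt clients.
    func = getattr(client, function_name,
                   getattr(client, 'begin_' + function_name, None))
    if func is None:
        raise AttributeError(
            f'SDK client {client!r} has no function {function_name!r}')
    return func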
28 changes: 18 additions & 10 deletions sky/skylet/providers/azure/node_provider.py
@@ -15,6 +15,7 @@
bootstrap_azure,
get_azure_sdk_function,
)
from sky.skylet import autostop_lib
from sky.skylet.providers.command_runner import SkyDockerCommandRunner
from sky.provision import docker_utils

@@ -61,16 +62,23 @@ class AzureNodeProvider(NodeProvider):

def __init__(self, provider_config, cluster_name):
NodeProvider.__init__(self, provider_config, cluster_name)
# TODO(suquark): This is a temporary patch for the resource group.
# By default, the Ray autoscaler assumes the resource group still exists even
# after the whole cluster is destroyed. However, we now delete the resource
# group after tearing down the cluster. To appease the autoscaler, we need
# to create/update it here, so the resource group always exists.
from sky.skylet.providers.azure.config import _configure_resource_group

_configure_resource_group(
{"cluster_name": cluster_name, "provider": provider_config}
)
if not autostop_lib.get_is_autostopping():
# TODO(suquark): This is a temporary patch for the resource group.
# By default, the Ray autoscaler assumes the resource group still
# exists even after the whole cluster is destroyed. However, we now
# delete the resource group after tearing down the cluster. To
# appease the autoscaler, we need to create/update it here, so the
# resource group always exists.
#
# We should not re-configure the resource group when this code runs
# on the remote VM and autostopping is in progress: the VM is still
# running, which guarantees the resource group exists.
from sky.skylet.providers.azure.config import _configure_resource_group

_configure_resource_group(
{"cluster_name": cluster_name, "provider": provider_config}
)
subscription_id = provider_config["subscription_id"]
self.cache_stopped_nodes = provider_config.get("cache_stopped_nodes", True)
# Sky only supports Azure CLI credential for now.
