Skip to content

Commit

Permalink
make cloud init more readable
Browse files Browse the repository at this point in the history
  • Loading branch information
Michaelvll committed Jul 7, 2024
1 parent 8bd49c9 commit ae344ad
Show file tree
Hide file tree
Showing 6 changed files with 35 additions and 51 deletions.
31 changes: 0 additions & 31 deletions sky/authentication.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,6 @@
is an exception, due to the limitation of the cloud provider. See the
comments in setup_lambda_authentication)
"""
import base64
import copy
import functools
import os
Expand Down Expand Up @@ -270,36 +269,6 @@ def setup_gcp_authentication(config: Dict[str, Any]) -> Dict[str, Any]:
return configure_ssh_info(config)


# In Azure, cloud-init script must be encoded in base64. See
# https://learn.microsoft.com/en-us/azure/virtual-machines/custom-data
# for more information. Here we decode it and replace the ssh user
# and public key content, then encode it back.
def setup_azure_authentication(config: Dict[str, Any]) -> Dict[str, Any]:
    """Substitute the SSH user and public key placeholders in an Azure config.

    The cloud-init script of every node type is stored base64-encoded under
    ``node_config.azure_arm_parameters.cloudInitSetupCommands``, so it is
    decoded, patched, and re-encoded in place. Afterwards the same
    placeholders are replaced across the YAML dump of the whole config.

    Args:
        config: Cluster config. Must contain ``auth.ssh_user`` and
            ``available_node_types``. The cloud-init entries of this dict
            are modified in place.

    Returns:
        A new config dict parsed back from the patched YAML text.
    """
    _, public_key_path = get_or_generate_keys()
    with open(public_key_path, 'r', encoding='utf-8') as key_file:
        public_key = key_file.read().strip()

    def _fill_placeholders(text: str) -> str:
        # Same replacement order as for the full config: user first, then key.
        text = text.replace('skypilot:ssh_user', config['auth']['ssh_user'])
        return text.replace('skypilot:ssh_public_key_content', public_key)

    for node_type_config in config['available_node_types'].values():
        node_config = node_type_config['node_config']
        encoded_script = (
            node_config['azure_arm_parameters']['cloudInitSetupCommands'])
        # The placeholders are not visible in the base64 text, so patch the
        # decoded script and encode it back.
        script = base64.b64decode(encoded_script).decode('utf-8')
        patched_script = _fill_placeholders(script)
        node_config['azure_arm_parameters']['cloudInitSetupCommands'] = (
            base64.b64encode(patched_script.encode('utf-8')).decode('utf-8'))

    # Patch any remaining placeholders elsewhere in the config via its YAML
    # text representation.
    patched_yaml = _fill_placeholders(common_utils.dump_yaml_str(config))
    return yaml.safe_load(patched_yaml)


def setup_lambda_authentication(config: Dict[str, Any]) -> Dict[str, Any]:

get_or_generate_keys()
Expand Down
16 changes: 11 additions & 5 deletions sky/backends/backend_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -156,7 +156,8 @@
('provider', 'tpu_node'),
('provider', 'security_group', 'GroupName'),
('available_node_types', 'ray.head.default', 'node_config', 'UserData'),
('available_node_types', 'ray.worker.default', 'node_config', 'UserData'),
('available_node_types', 'ray.head.default', 'node_config',
'azure_arm_parameters', 'cloudInitSetupCommands'),
]


Expand Down Expand Up @@ -1029,13 +1030,18 @@ def _add_auth_to_cluster_config(cloud: clouds.Cloud, cluster_config_file: str):
"""
config = common_utils.read_yaml(cluster_config_file)
# Check the availability of the cloud type.
if isinstance(cloud, (clouds.AWS, clouds.OCI, clouds.SCP, clouds.Vsphere,
clouds.Cudo, clouds.Paperspace)):
if isinstance(cloud, (
clouds.AWS,
clouds.OCI,
clouds.SCP,
clouds.Vsphere,
clouds.Cudo,
clouds.Paperspace,
clouds.Azure,
)):
config = auth.configure_ssh_info(config)
elif isinstance(cloud, clouds.GCP):
config = auth.setup_gcp_authentication(config)
elif isinstance(cloud, clouds.Azure):
config = auth.setup_azure_authentication(config)
elif isinstance(cloud, clouds.Lambda):
config = auth.setup_lambda_authentication(config)
elif isinstance(cloud, clouds.Kubernetes):
Expand Down
6 changes: 2 additions & 4 deletions sky/clouds/azure.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
"""Azure."""
import base64
import functools
import json
import os
Expand Down Expand Up @@ -324,8 +323,7 @@ def make_deploy_resources_variables(self,
# restarted, identified by the existence of the file /tmp/__restarted.
# Also, add default user to docker group.
# pylint: disable=line-too-long
cloud_init_setup_commands = base64.b64encode(
textwrap.dedent("""\
cloud_init_setup_commands = textwrap.dedent("""\
#cloud-config
runcmd:
- sed -i 's/#Banner none/Banner none/' /etc/ssh/sshd_config
Expand All @@ -341,7 +339,7 @@ def make_deploy_resources_variables(self,
- path: /etc/apt/apt.conf.d/10cloudinit-disable
content: |
APT::Periodic::Enable "0";
""").encode('utf-8')).decode('utf-8')
""").split('\n')

def _failover_disk_tier() -> Optional[resources_utils.DiskTier]:
if (r.disk_tier is not None and
Expand Down
4 changes: 4 additions & 0 deletions sky/provision/azure/instance.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
"""Azure instance provisioning."""
import base64
import copy
import enum
import json
Expand Down Expand Up @@ -223,6 +224,9 @@ def _create_instances(
template_params['msi'] = provider_config['msi']
template_params['nsg'] = provider_config['nsg']
template_params['subnet'] = provider_config['subnet']
template_params['cloudInitSetupCommands'] = base64.b64encode(
template_params['cloudInitSetupCommands'].encode('utf-8')).decode(
'utf-8')

if node_config.get('need_nvidia_driver_extension', False):
# pylint: disable=line-too-long
Expand Down
22 changes: 13 additions & 9 deletions sky/provision/docker_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -166,15 +166,19 @@ def _run(self,
stream_logs=False,
separate_stderr=separate_stderr,
log_path=self.log_path)
if (not wait_for_docker_daemon or
DOCKER_PERMISSION_DENIED_STR not in stdout + stderr):
break

if time.time() - start > _DOCKER_WAIT_FOR_SOCKET_TIMEOUT_SECONDS:
break
logger.info(
'Failed to run docker command, retrying in 15 seconds...')
time.sleep(15)
if (DOCKER_PERMISSION_DENIED_STR in stdout + stderr and
wait_for_docker_daemon):
if time.time() - start > _DOCKER_SOCKET_WAIT_TIMEOUT_SECONDS:
if rc == 0:
# Set returncode to 1 if failed to connect to docker
# daemon after timeout.
rc = 1
break
logger.info('Failed to connect to docker daemon. It might be '
'initializing, retrying in 30 seconds...')
time.sleep(30)
continue
break
subprocess_utils.handle_returncode(
rc,
cmd,
Expand Down
7 changes: 5 additions & 2 deletions sky/templates/azure-ray.yml.j2
Original file line number Diff line number Diff line change
Expand Up @@ -72,14 +72,17 @@ available_node_types:
imageVersion: {{image_version}}
osDiskSizeGB: {{disk_size}}
osDiskTier: {{disk_tier}}
cloudInitSetupCommands: {{cloud_init_setup_commands}}
# optionally set priority to use Spot instances
{%- if use_spot %}
# optionally set priority to use Spot instances
priority: Spot
# set a maximum price for spot instances if desired
# billingProfile:
# maxPrice: -1
{%- endif %}
cloudInitSetupCommands: |-
{%- for cmd in cloud_init_setup_commands %}
{{ cmd }}
{%- endfor %}
need_nvidia_driver_extension: {{need_nvidia_driver_extension}}
# TODO: attach disk

Expand Down

0 comments on commit ae344ad

Please sign in to comment.