Skip to content

Commit

Permalink
Update Azure default image (#2468)
Browse files Browse the repository at this point in the history
* use latest ubuntu-hpc by default

* Update sky/clouds/azure.py

Co-authored-by: Wei-Lin Chiang <[email protected]>

* Update sky/clouds/azure.py

Co-authored-by: Wei-Lin Chiang <[email protected]>

* add skip for spot smoke test when azure is used

* finish user group setting. TODO: debug why Permission denied (publickey).

* fix

* deprecate cloud init

* remove debug msg

* change azure timeout for autostop/down

* add more timeouts

* add comments for azure cloud init

* stash some changes

* merge to master and inline _replace_ssh_info_in_cloud_init

* update comments

* apply suggestions from code review

---------

Co-authored-by: Wei-Lin Chiang <[email protected]>
  • Loading branch information
cblmemo and infwinston authored Nov 16, 2023
1 parent ef8839a commit 4464aee
Show file tree
Hide file tree
Showing 4 changed files with 99 additions and 30 deletions.
31 changes: 31 additions & 0 deletions sky/authentication.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
is an exception, due to the limitation of the cloud provider. See the
comments in setup_lambda_authentication)
"""
import base64
import copy
import functools
import os
Expand Down Expand Up @@ -262,6 +263,36 @@ def setup_gcp_authentication(config: Dict[str, Any]) -> Dict[str, Any]:
return configure_ssh_info(config)


# In Azure, the cloud-init script must be base64-encoded. See
# https://learn.microsoft.com/en-us/azure/virtual-machines/custom-data
# for more information. We therefore decode it, substitute the ssh user
# and public key placeholders, and encode it back.
def setup_azure_authentication(config: Dict[str, Any]) -> Dict[str, Any]:
    """Fill in SSH credentials for an Azure cluster config.

    Replaces the ``skypilot:ssh_user`` and
    ``skypilot:ssh_public_key_content`` placeholders, both inside the
    base64-encoded cloud-init payload of every node type and in the rest
    of the YAML config, and returns the updated config dict.
    """
    _, public_key_path = get_or_generate_keys()
    with open(public_key_path, 'r') as f:
        public_key = f.read().strip()
    ssh_user = config['auth']['ssh_user']
    for node_type_config in config['available_node_types'].values():
        arm_params = node_type_config['node_config']['azure_arm_parameters']
        # Decode, substitute placeholders, then re-encode the cloud-init
        # payload (Azure requires it to stay base64-encoded).
        decoded = base64.b64decode(
            arm_params['cloudInitSetupCommands']).decode('utf-8')
        decoded = decoded.replace('skypilot:ssh_user', ssh_user)
        decoded = decoded.replace('skypilot:ssh_public_key_content',
                                  public_key)
        arm_params['cloudInitSetupCommands'] = base64.b64encode(
            decoded.encode('utf-8')).decode('utf-8')
    # Substitute any remaining placeholders elsewhere in the config via a
    # YAML round-trip (the base64 payload above is opaque to this pass).
    config_str = common_utils.dump_yaml_str(config)
    config_str = config_str.replace('skypilot:ssh_user', ssh_user)
    config_str = config_str.replace('skypilot:ssh_public_key_content',
                                    public_key)
    return yaml.safe_load(config_str)


def setup_lambda_authentication(config: Dict[str, Any]) -> Dict[str, Any]:

get_or_generate_keys()
Expand Down
4 changes: 3 additions & 1 deletion sky/backends/backend_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -1171,10 +1171,12 @@ def _add_auth_to_cluster_config(cloud: clouds.Cloud, cluster_config_file: str):
"""
config = common_utils.read_yaml(cluster_config_file)
# Check the availability of the cloud type.
if isinstance(cloud, (clouds.AWS, clouds.Azure, clouds.OCI, clouds.SCP)):
if isinstance(cloud, (clouds.AWS, clouds.OCI, clouds.SCP)):
config = auth.configure_ssh_info(config)
elif isinstance(cloud, clouds.GCP):
config = auth.setup_gcp_authentication(config)
elif isinstance(cloud, clouds.Azure):
config = auth.setup_azure_authentication(config)
elif isinstance(cloud, clouds.Lambda):
config = auth.setup_lambda_authentication(config)
elif isinstance(cloud, clouds.Kubernetes):
Expand Down
54 changes: 33 additions & 21 deletions sky/clouds/azure.py
Original file line number Diff line number Diff line change
Expand Up @@ -134,37 +134,47 @@ def get_default_instance_type(
clouds='azure')

def _get_image_config(self, gen_version, instance_type):
# TODO(tian): images for Azure is not well organized. We should refactor
# it to images.csv like AWS.
# az vm image list \
# --publisher microsoft-dsvm --all --output table
# nvidia-driver: 495.29.05, cuda: 11.5

# The latest image 2022.09.14/2022.08.11/22.06.10/22.05.11/
# 22.04.27/22.04.05 has even older nvidia driver 470.57.02,
# cuda: 11.4
# nvidia-driver: 535.54.03, cuda: 12.2
# see: https://github.com/Azure/azhpc-images/releases/tag/ubuntu-hpc-20230803
# All A100 instances are of gen2, so they will always use
# the latest ubuntu-hpc:2204 image.
image_config = {
'image_publisher': 'microsoft-dsvm',
'image_offer': 'ubuntu-2004',
'image_sku': '2004-gen2',
'image_version': '21.11.04'
'image_offer': 'ubuntu-hpc',
'image_sku': '2204',
'image_version': '22.04.2023080201'
}

# ubuntu-2004 v21.10.21 and v21.11.04 do not work on K80
# due to an NVIDIA driver issue.
# K80 requires the ubuntu-2004 v21.08.30 image, which has an older
# NVIDIA driver version
acc = self.get_accelerators_from_instance_type(instance_type)
if acc is not None:
acc_name = list(acc.keys())[0]
if acc_name == 'K80':
image_config['image_version'] = '21.08.30'

# ubuntu-2004 does not work on A100
if instance_type in [
'Standard_ND96asr_v4', 'Standard_ND96amsr_A100_v4'
]:
image_config['image_offer'] = 'ubuntu-hpc'
image_config['image_sku'] = '2004'
image_config['image_version'] = '20.04.2021120101'
image_config = {
'image_publisher': 'microsoft-dsvm',
'image_offer': 'ubuntu-2004',
'image_sku': '2004-gen2',
'image_version': '21.08.30'
}

# ubuntu-2004 v21.11.04 is the image we previously used for
# V1 HyperV instances before we changed the default image to ubuntu-hpc.
# In Azure, all instances with K80 (Standard_NC series), some
# instances with M60 (Standard_NV series) and some cpu instances
# (Basic_A, Standard_D, ...) are V1 instance. For these instances,
# we use the previous image.
if gen_version == 'V1':
image_config['image_sku'] = '2004'
image_config = {
'image_publisher': 'microsoft-dsvm',
'image_offer': 'ubuntu-2004',
'image_sku': '2004',
'image_version': '21.11.04'
}

return image_config

@classmethod
Expand Down Expand Up @@ -251,13 +261,15 @@ def make_deploy_resources_variables(
# This script will modify /etc/ssh/sshd_config and add a bash script
# into .bashrc. The bash script will restart sshd if it has not been
# restarted, identified by a file /tmp/__restarted is existing.
# Also, add default user to docker group.
# pylint: disable=line-too-long
cloud_init_setup_commands = base64.b64encode(
textwrap.dedent("""\
#cloud-config
runcmd:
- sed -i 's/#Banner none/Banner none/' /etc/ssh/sshd_config
- echo '\\nif [ ! -f "/tmp/__restarted" ]; then\\n sudo systemctl restart ssh\\n sleep 2\\n touch /tmp/__restarted\\nfi' >> /home/azureuser/.bashrc
- echo '\\nif [ ! -f "/tmp/__restarted" ]; then\\n sudo systemctl restart ssh\\n sleep 2\\n touch /tmp/__restarted\\nfi' >> /home/skypilot:ssh_user/.bashrc
- usermod -aG docker skypilot:ssh_user
write_files:
- path: /etc/apt/apt.conf.d/20auto-upgrades
content: |
Expand Down
40 changes: 32 additions & 8 deletions tests/test_smoke.py
Original file line number Diff line number Diff line change
Expand Up @@ -733,13 +733,15 @@ def test_gcp_stale_job_manual_restart():
@pytest.mark.no_scp # SCP does not support num_nodes > 1 yet
def test_env_check(generic_cloud: str):
name = _get_cluster_name()
total_timeout_minutes = 25 if generic_cloud == 'azure' else 15
test = Test(
'env_check',
[
f'sky launch -y -c {name} --cloud {generic_cloud} --detach-setup examples/env_check.yaml',
f'sky logs {name} 1 --status', # Ensure the job succeeded.
],
f'sky down -y {name}',
timeout=total_timeout_minutes * 60,
)
run_one_test(test)

Expand Down Expand Up @@ -1150,6 +1152,7 @@ def test_scp_job_queue():
@pytest.mark.no_kubernetes # Kubernetes not support num_nodes > 1 yet
def test_job_queue_multinode(generic_cloud: str):
name = _get_cluster_name()
total_timeout_minutes = 30 if generic_cloud == 'azure' else 15
test = Test(
'job_queue_multinode',
[
Expand Down Expand Up @@ -1180,6 +1183,7 @@ def test_job_queue_multinode(generic_cloud: str):
f'sky logs {name} 7 --status',
],
f'sky down -y {name}',
timeout=total_timeout_minutes * 60,
)
run_one_test(test)

Expand Down Expand Up @@ -1444,6 +1448,7 @@ def test_tpu_vm_pod():
@pytest.mark.no_scp # SCP does not support num_nodes > 1 yet
def test_multi_hostname(generic_cloud: str):
name = _get_cluster_name()
total_timeout_minutes = 25 if generic_cloud == 'azure' else 15
test = Test(
'multi_hostname',
[
Expand All @@ -1454,6 +1459,7 @@ def test_multi_hostname(generic_cloud: str):
f'sky logs {name} 2 --status', # Ensure the job succeeded.
],
f'sky down -y {name}',
timeout=total_timeout_minutes * 60,
)
run_one_test(test)

Expand Down Expand Up @@ -1589,6 +1595,12 @@ def test_azure_start_stop():
@pytest.mark.no_kubernetes # Kubernetes does not autostop yet
def test_autostop(generic_cloud: str):
name = _get_cluster_name()
# Azure takes ~ 7m15s (435s) to autostop a VM, so here we use 600 to ensure
# the VM is stopped.
autostop_timeout = 600 if generic_cloud == 'azure' else 250
# Launching and starting Azure clusters can take a long time too, e.g.,
# restarting a stopped Azure cluster can take 7m. So we set the total
# timeout to 70m.
total_timeout_minutes = 70 if generic_cloud == 'azure' else 20
test = Test(
'autostop',
[
Expand All @@ -1603,7 +1615,7 @@ def test_autostop(generic_cloud: str):
f's=$(sky status {name} --refresh); echo "$s"; echo; echo; echo "$s" | grep {name} | grep UP',

# Ensure the cluster is STOPPED.
'sleep 250',
f'sleep {autostop_timeout}',
f's=$(sky status {name} --refresh); echo "$s"; echo; echo; echo "$s" | grep {name} | grep STOPPED',

# Ensure the cluster is UP and the autostop setting is reset ('-').
Expand All @@ -1621,7 +1633,7 @@ def test_autostop(generic_cloud: str):
f'sky autostop -y {name} -i 1', # Should restart the timer.
'sleep 45',
f's=$(sky status {name} --refresh); echo "$s"; echo; echo; echo "$s" | grep {name} | grep UP',
'sleep 250',
f'sleep {autostop_timeout}',
f's=$(sky status {name} --refresh); echo "$s"; echo; echo; echo "$s" | grep {name} | grep STOPPED',

# Test restarting the idleness timer via exec:
Expand All @@ -1632,11 +1644,11 @@ def test_autostop(generic_cloud: str):
f'sky exec {name} echo hi', # Should restart the timer.
'sleep 45',
f's=$(sky status {name} --refresh); echo "$s"; echo; echo; echo "$s" | grep {name} | grep UP',
'sleep 250',
f'sleep {autostop_timeout}',
f's=$(sky status {name} --refresh); echo "$s"; echo; echo; echo "$s" | grep {name} | grep STOPPED',
],
f'sky down -y {name}',
timeout=20 * 60,
timeout=total_timeout_minutes * 60,
)
run_one_test(test)

Expand All @@ -1645,6 +1657,10 @@ def test_autostop(generic_cloud: str):
@pytest.mark.no_scp # SCP does not support num_nodes > 1 yet. Run test_scp_autodown instead.
def test_autodown(generic_cloud: str):
name = _get_cluster_name()
# Azure takes ~ 13m30s (810s) to autodown a VM, so here we use 900 to ensure
# the VM is terminated.
autodown_timeout = 900 if generic_cloud == 'azure' else 240
total_timeout_minutes = 90 if generic_cloud == 'azure' else 20
test = Test(
'autodown',
[
Expand All @@ -1656,23 +1672,23 @@ def test_autodown(generic_cloud: str):
'sleep 45',
f's=$(sky status {name} --refresh); echo "$s"; echo; echo; echo "$s" | grep {name} | grep UP',
# Ensure the cluster is terminated.
'sleep 200',
f'sleep {autodown_timeout}',
f's=$(SKYPILOT_DEBUG=0 sky status {name} --refresh) && echo "$s" && {{ echo "$s" | grep {name} | grep "Autodowned cluster\|terminated on the cloud"; }} || {{ echo "$s" | grep {name} && exit 1 || exit 0; }}',
f'sky launch -y -d -c {name} --cloud {generic_cloud} --num-nodes 2 --down tests/test_yamls/minimal.yaml',
f'sky status | grep {name} | grep UP', # Ensure the cluster is UP.
f'sky exec {name} --cloud {generic_cloud} tests/test_yamls/minimal.yaml',
f'sky status | grep {name} | grep "1m (down)"',
'sleep 240',
f'sleep {autodown_timeout}',
# Ensure the cluster is terminated.
f's=$(SKYPILOT_DEBUG=0 sky status {name} --refresh) && echo "$s" && {{ echo "$s" | grep {name} | grep "Autodowned cluster\|terminated on the cloud"; }} || {{ echo "$s" | grep {name} && exit 1 || exit 0; }}',
f'sky launch -y -d -c {name} --cloud {generic_cloud} --num-nodes 2 --down tests/test_yamls/minimal.yaml',
f'sky autostop -y {name} --cancel',
'sleep 240',
f'sleep {autodown_timeout}',
# Ensure the cluster is still UP.
f's=$(SKYPILOT_DEBUG=0 sky status {name} --refresh) && echo "$s" && echo "$s" | grep {name} | grep UP',
],
f'sky down -y {name}',
timeout=20 * 60,
timeout=total_timeout_minutes * 60,
)
run_one_test(test)

Expand Down Expand Up @@ -1813,6 +1829,7 @@ def test_cancel_ibm():


# ---------- Testing use-spot option ----------
@pytest.mark.no_azure # Azure does not support spot instances
@pytest.mark.no_lambda_cloud # Lambda Cloud does not support spot instances
@pytest.mark.no_ibm # IBM Cloud does not support spot instances
@pytest.mark.no_scp # SCP does not support spot instances
Expand All @@ -1834,6 +1851,7 @@ def test_use_spot(generic_cloud: str):


# ---------- Testing managed spot ----------
@pytest.mark.no_azure # Azure does not support spot instances
@pytest.mark.no_lambda_cloud # Lambda Cloud does not support spot instances
@pytest.mark.no_ibm # IBM Cloud does not support spot instances
@pytest.mark.no_scp # SCP does not support spot instances
Expand Down Expand Up @@ -1867,6 +1885,7 @@ def test_spot(generic_cloud: str):
run_one_test(test)


@pytest.mark.no_azure # Azure does not support spot instances
@pytest.mark.no_lambda_cloud # Lambda Cloud does not support spot instances
@pytest.mark.no_ibm # IBM Cloud does not support spot instances
@pytest.mark.no_scp # SCP does not support spot instances
Expand Down Expand Up @@ -1906,6 +1925,7 @@ def test_spot_pipeline(generic_cloud: str):
run_one_test(test)


@pytest.mark.no_azure # Azure does not support spot instances
@pytest.mark.no_lambda_cloud # Lambda Cloud does not support spot instances
@pytest.mark.no_ibm # IBM Cloud does not support spot instances
@pytest.mark.no_scp # SCP does not support spot instances
Expand All @@ -1929,6 +1949,7 @@ def test_spot_failed_setup(generic_cloud: str):
run_one_test(test)


@pytest.mark.no_azure # Azure does not support spot instances
@pytest.mark.no_lambda_cloud # Lambda Cloud does not support spot instances
@pytest.mark.no_ibm # IBM Cloud does not support spot instances
@pytest.mark.no_scp # SCP does not support spot instances
Expand Down Expand Up @@ -2121,6 +2142,7 @@ def test_spot_pipeline_recovery_gcp():
run_one_test(test)


@pytest.mark.no_azure # Azure does not support spot instances
@pytest.mark.no_lambda_cloud # Lambda Cloud does not support spot instances
@pytest.mark.no_ibm # IBM Cloud does not support spot instances
@pytest.mark.no_scp # SCP does not support spot instances
Expand Down Expand Up @@ -2342,6 +2364,7 @@ def test_spot_cancellation_gcp():


# ---------- Testing storage for managed spot ----------
@pytest.mark.no_azure # Azure does not support spot instances
@pytest.mark.no_lambda_cloud # Lambda Cloud does not support spot instances
@pytest.mark.no_ibm # IBM Cloud does not support spot instances
@pytest.mark.no_scp # SCP does not support spot instances
Expand Down Expand Up @@ -2397,6 +2420,7 @@ def test_spot_tpu():


# ---------- Testing env for spot ----------
@pytest.mark.no_azure # Azure does not support spot instances
@pytest.mark.no_lambda_cloud # Lambda Cloud does not support spot instances
@pytest.mark.no_ibm # IBM Cloud does not support spot instances
@pytest.mark.no_scp # SCP does not support spot instances
Expand Down

0 comments on commit 4464aee

Please sign in to comment.