Skip to content

Commit

Permalink
Separate resource group region and VM, attached resources
Browse files Browse the repository at this point in the history
  • Loading branch information
landscapepainter committed Jul 31, 2024
1 parent 15b4d75 commit d97f1ba
Show file tree
Hide file tree
Showing 6 changed files with 52 additions and 32 deletions.
2 changes: 1 addition & 1 deletion sky/clouds/azure.py
Original file line number Diff line number Diff line change
Expand Up @@ -322,7 +322,7 @@ def make_deploy_resources_variables(

# Determine resource group for deploying the instance.
resource_group_name = skypilot_config.get_nested(
('azure', 'resource_group'), None)
('azure', 'resource_group_vm'), None)
use_external_resource_group = resource_group_name is not None
if resource_group_name is None:
resource_group_name = f'{cluster_name.name_on_cloud}-{region_name}'
Expand Down
8 changes: 7 additions & 1 deletion sky/provision/azure/azure-config-template.json
Original file line number Diff line number Diff line change
Expand Up @@ -13,11 +13,17 @@
"metadata": {
"description": "Subnet parameters."
}
},
"location": {
"type": "string",
"metadata": {
"description": "Location of where the resources are allocated."
}
}
},
"variables": {
"contributor": "[subscriptionResourceId('Microsoft.Authorization/roleDefinitions', 'b24988ac-6180-42a0-ab88-20f7382dd24c')]",
"location": "[resourceGroup().location]",
"location": "[parameters('location')]",
"msiName": "[concat('sky-', parameters('clusterId'), '-msi')]",
"roleAssignmentName": "[concat('sky-', parameters('clusterId'), '-ra')]",
"nsgName": "[concat('sky-', parameters('clusterId'), '-nsg')]",
Expand Down
8 changes: 7 additions & 1 deletion sky/provision/azure/azure-vm-template.json
Original file line number Diff line number Diff line change
Expand Up @@ -123,10 +123,16 @@
"metadata": {
"description": "Base64 encoded cloud-init setup commands."
}
},
"location": {
"type": "string",
"metadata": {
"description": "Location of where the resources are allocated."
}
}
},
"variables": {
"location": "[resourceGroup().location]",
"location": "[parameters('location')]",
"networkInterfaceNamePrivate": "[concat(parameters('vmName'), '-nic')]",
"networkInterfaceNamePublic": "[concat(parameters('vmName'), '-nic-public')]",
"networkInterfaceName": "[if(parameters('provisionPublicIp'), variables('networkInterfaceNamePublic'), variables('networkInterfaceNamePrivate'))]",
Expand Down
63 changes: 35 additions & 28 deletions sky/provision/azure/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -70,34 +70,38 @@ def bootstrap_instances(
if 'tags' in provider_config:
params['tags'] = provider_config['tags']

logger.info(f'Creating/Updating resource group: {resource_group}')
rg_create_or_update = get_azure_sdk_function(
client=resource_client.resource_groups,
function_name='create_or_update')
rg_creation_start = time.time()
retry = 0
while (time.time() - rg_creation_start <
_RESOURCE_GROUP_WAIT_FOR_DELETION_TIMEOUT):
try:
rg_create_or_update(resource_group_name=resource_group,
parameters=params)
break
except azure.exceptions().ResourceExistsError as e:
if 'ResourceGroupBeingDeleted' in str(e):
if retry % 5 == 0:
logger.info(
f'Azure resource group {resource_group} of a recent '
f'terminated cluster {cluster_name_on_cloud} is being '
'deleted. It can only be provisioned after it is fully'
'deleted. Waiting...')
time.sleep(1)
retry += 1
continue
raise
else:
raise TimeoutError(
f'Timed out waiting for resource group {resource_group} to be '
'deleted.')
# When the resource group is user-specified, it already exists in a
# particular region, so it is only created/updated when it is not external.
if not use_external_resource_group:
logger.info(f'Creating/Updating resource group: {resource_group}')
rg_create_or_update = get_azure_sdk_function(
client=resource_client.resource_groups,
function_name='create_or_update')
rg_creation_start = time.time()
retry = 0
while (time.time() - rg_creation_start <
_RESOURCE_GROUP_WAIT_FOR_DELETION_TIMEOUT):
try:
rg_create_or_update(resource_group_name=resource_group,
parameters=params)
break
except azure.exceptions().ResourceExistsError as e:
if 'ResourceGroupBeingDeleted' in str(e):
if retry % 5 == 0:
logger.info(
f'Azure resource group {resource_group} of a '
'recent terminated cluster '
f'{cluster_name_on_cloud} is being deleted. It can'
' only be provisioned after it is fully deleted. '
'Waiting...')
time.sleep(1)
retry += 1
continue
raise
else:
raise TimeoutError(
f'Timed out waiting for resource group {resource_group} to be '
'deleted.')

# load the template file
current_path = Path(__file__).parent
Expand Down Expand Up @@ -127,6 +131,9 @@ def bootstrap_instances(
# as we have already appended the user hash to the cluster
# name.
'value': cluster_name_on_cloud
},
'location': {
'value': params['location']
}
},
}
Expand Down
1 change: 1 addition & 0 deletions sky/provision/azure/instance.py
Original file line number Diff line number Diff line change
Expand Up @@ -227,6 +227,7 @@ def _create_instances(
template_params['msi'] = provider_config['msi']
template_params['nsg'] = provider_config['nsg']
template_params['subnet'] = provider_config['subnet']
template_params['location'] = provider_config['location']
# In Azure, cloud-init script must be encoded in base64. For more
# information, see:
# https://learn.microsoft.com/en-us/azure/virtual-machines/custom-data
Expand Down
2 changes: 1 addition & 1 deletion sky/utils/schemas.py
Original file line number Diff line number Diff line change
Expand Up @@ -756,7 +756,7 @@ def get_config_schema():
'storage_account': {
'type': 'string',
},
'resource_group': {
'resource_group_vm': {
'type': 'string',
},
}
Expand Down

0 comments on commit d97f1ba

Please sign in to comment.