Skip to content

Commit

Permalink
[SKY-1304] AWS throws InvalidGroup.Duplicate when user submits a lot of parallel launch (#4584)
Browse files Browse the repository at this point in the history

* wip

* wip

* wip

* wip
  • Loading branch information
weih1121 authored Jan 21, 2025
1 parent 34b3ded commit 8cf6c86
Showing 1 changed file with 48 additions and 26 deletions.
74 changes: 48 additions & 26 deletions sky/provision/aws/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -553,17 +553,28 @@ def _configure_security_group(ec2, vpc_id: str, expected_sg_name: str,

def _get_or_create_vpc_security_group(ec2, vpc_id: str,
expected_sg_name: str) -> Any:
# Figure out which security groups with this name exist for each VPC...
vpc_to_existing_sg = {
sg.vpc_id: sg for sg in _get_security_groups_from_vpc_ids(
ec2,
[vpc_id],
[expected_sg_name],
)
}
"""Find or create a security group in the specified VPC.
if vpc_id in vpc_to_existing_sg:
return vpc_to_existing_sg[vpc_id]
Args:
ec2: The initialized EC2 client object.
vpc_id: The ID of the VPC where the security group should be queried
or created.
expected_sg_name: The expected name of the security group.
Returns:
The security group object containing the details of the security group.
Raises:
exceptions.NoClusterLaunchedError: If the security group creation fails
and is not due to an existing duplicate.
botocore.exceptions.ClientError: If the security group creation fails
due to AWS service issues.
"""
# Figure out which security groups with this name exist for each VPC...
security_group = _get_security_group_from_vpc_id(ec2, vpc_id,
expected_sg_name)
if security_group is not None:
return security_group

try:
# create a new security group
Expand All @@ -573,34 +584,45 @@ def _get_or_create_vpc_security_group(ec2, vpc_id: str,
VpcId=vpc_id,
)
except ec2.meta.client.exceptions.ClientError as e:
if e.response['Error']['Code'] == 'InvalidGroup.Duplicate':
# The security group already exists, but we didn't see it
# because of eventual consistency.
logger.warning(f'{expected_sg_name} already exists when creating.')
security_group = _get_security_group_from_vpc_id(
ec2, vpc_id, expected_sg_name)
assert (security_group is not None and
security_group.group_name == expected_sg_name), (
f'Expected {expected_sg_name} but got {security_group}')
logger.info(
f'Found existing security group {colorama.Style.BRIGHT}'
f'{security_group.group_name}{colorama.Style.RESET_ALL} '
f'[id={security_group.id}]')
return security_group
message = ('Failed to create security group. Error: '
f'{common_utils.format_exception(e)}')
logger.warning(message)
raise exceptions.NoClusterLaunchedError(message) from e

security_group = _get_security_groups_from_vpc_ids(ec2, [vpc_id],
[expected_sg_name])

assert security_group, 'Failed to create security group'
security_group = security_group[0]

security_group = _get_security_group_from_vpc_id(ec2, vpc_id,
expected_sg_name)
assert security_group is not None, 'Failed to create security group'
logger.info(f'Created new security group {colorama.Style.BRIGHT}'
f'{security_group.group_name}{colorama.Style.RESET_ALL} '
f'[id={security_group.id}]')
return security_group


def _get_security_groups_from_vpc_ids(ec2, vpc_ids: List[str],
group_names: List[str]) -> List[Any]:
unique_vpc_ids = list(set(vpc_ids))
unique_group_names = set(group_names)

def _get_security_group_from_vpc_id(ec2, vpc_id: str,
group_name: str) -> Optional[Any]:
"""Get security group by VPC ID and group name."""
existing_groups = list(
ec2.security_groups.filter(Filters=[{
'Name': 'vpc-id',
'Values': unique_vpc_ids
'Values': [vpc_id]
}]))
filtered_groups = [
sg for sg in existing_groups if sg.group_name in unique_group_names
]
return filtered_groups

for sg in existing_groups:
if sg.group_name == group_name:
return sg

return None

0 comments on commit 8cf6c86

Please sign in to comment.