Skip to content

refact/lb/sg: isolate security group deletion fragments from EnsureLoadBalancerDeleted #1159

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 1 commit into
base: master
Choose a base branch
from
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
238 changes: 141 additions & 97 deletions pkg/providers/v1/aws.go
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,7 @@ import (
v1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/types"
"k8s.io/apimachinery/pkg/util/sets"
"k8s.io/apimachinery/pkg/util/wait"
"k8s.io/client-go/informers"
informercorev1 "k8s.io/client-go/informers/core/v1"
clientset "k8s.io/client-go/kubernetes"
Expand Down Expand Up @@ -2872,6 +2873,139 @@ func (c *Cloud) updateInstanceSecurityGroupsForLoadBalancer(ctx context.Context,
return nil
}

// deleteSecurityGroupsWithBackoff deletes a list of security group IDs with retries and exponential backoff.
// The function attempts to delete each security group in the provided list, handling potential dependency violations
// caused by resources still being associated with the security groups (e.g., load balancers in the process of deletion).
//
// Parameters:
// - `ctx`: The context for the operation.
// - `svcName`: The name of the service associated with the security groups.
// - `securityGroupIDs`: A map of security group IDs to be deleted.
//
// Behavior:
// - If the list of security group IDs is empty, the function returns immediately.
// - The function retries deletion for up to 10 minutes, with an initial backoff of 5 seconds that doubles with each retry.
// - Dependency violations are logged and ignored, allowing retries until the timeout is reached.
// - If all security groups are successfully deleted, the function exits.
// - If the timeout is reached and some security groups remain, an error is returned.
//
// Returns:
// - `error`: An error if any security groups could not be deleted within the timeout period.
func (c *Cloud) deleteSecurityGroupsWithBackoff(ctx context.Context, svcName string, securityGroupIDs map[string]struct{}) error {
if len(securityGroupIDs) == 0 {
return nil
}
err := wait.PollUntilContextTimeout(ctx, 5*time.Second, 10*time.Minute, true, func(ctx context.Context) (bool, error) {
for securityGroupID := range securityGroupIDs {
_, err := c.ec2.DeleteSecurityGroup(ctx, &ec2.DeleteSecurityGroupInput{
GroupId: &securityGroupID,
})
if err == nil {
delete(securityGroupIDs, securityGroupID)
continue
}
ignore := false
var ae smithy.APIError
if errors.As(err, &ae) {
if ae.ErrorCode() == "DependencyViolation" {
klog.V(2).Infof("Ignoring DependencyViolation while deleting load-balancer security group (%s), assuming because LB is in process of deleting", securityGroupID)
ignore = true
}
}
if !ignore {
return true, fmt.Errorf("error while deleting load balancer security group (%s): %q", securityGroupID, err)
}
}

if len(securityGroupIDs) == 0 {
klog.V(2).Info("Deleted all security groups for load balancer: ", svcName)
return true, nil
}

klog.V(2).Infof("Waiting for load-balancer %q to delete so we can delete security groups: %v", svcName, securityGroupIDs)
return false, nil
})
if err != nil {
ids := []string{}
for id := range securityGroupIDs {
ids = append(ids, id)
}
return fmt.Errorf("could not delete security groups %v for Load Balancer %q: %w", strings.Join(ids, ","), svcName, err)
}
return nil
}

// buildSecurityGroupsToDelete evaluates all deletion criteria and creates a list of valid security group IDs to be deleted.
// It returns two maps:
// - `securityGroupIDs`: A map of security group IDs that are eligible for deletion.
// - `taggedLBSecurityGroups`: A map of security group IDs that are tagged and associated with the load balancer.
// The function filters security groups based on the following criteria:
// - Excludes security groups defined in the Cloud Configuration.
// - Excludes security groups with no cluster tags.
// - Excludes security groups annotated with `service.beta.kubernetes.io/aws-load-balancer-security-groups` or
// `service.beta.kubernetes.io/aws-load-balancer-extra-security-groups`.
//
// Parameters:
// - `ctx`: The context for the operation.
// - `service`: The Kubernetes service object.
// - `lbSecurityGroups`: A list of security group IDs associated with the load balancer.
// Returns:
// - `securityGroupIDs`: A map of security group IDs to be deleted.
// - `taggedLBSecurityGroups`: A map of tagged security group IDs.
// - `error`: An error if the operation fails.
func (c *Cloud) buildSecurityGroupsToDelete(ctx context.Context, service *v1.Service, lbSecurityGroups []string) (map[string]struct{}, map[string]struct{}, error) {
securityGroupIDs := map[string]struct{}{}
taggedLBSecurityGroups := map[string]struct{}{}

describeRequest := &ec2.DescribeSecurityGroupsInput{}
describeRequest.Filters = []ec2types.Filter{
newEc2Filter("group-id", lbSecurityGroups...),
}
response, err := c.ec2.DescribeSecurityGroups(ctx, describeRequest)
if err != nil {
return nil, nil, fmt.Errorf("error querying security groups for ELB: %q", err)
}

annotatedSgSet := map[string]bool{}
annotatedSgsList := getSGListFromAnnotation(service.Annotations[ServiceAnnotationLoadBalancerSecurityGroups])
annotatedExtraSgsList := getSGListFromAnnotation(service.Annotations[ServiceAnnotationLoadBalancerExtraSecurityGroups])
annotatedSgsList = append(annotatedSgsList, annotatedExtraSgsList...)

for _, sg := range annotatedSgsList {
annotatedSgSet[sg] = true
}

for _, sg := range response {
sgID := aws.StringValue(sg.GroupId)

if sgID == c.cfg.Global.ElbSecurityGroup {
//We don't want to delete a security group that was defined in the Cloud Configuration.
continue
}
if sgID == "" {
klog.Warningf("Ignoring empty security group in %s", service.Name)
continue
}

if !c.tagging.hasClusterTag(sg.Tags) {
klog.Warningf("Ignoring security group with no cluster tag in %s", service.Name)
continue
} else {
taggedLBSecurityGroups[sgID] = struct{}{}
}

// This is an extra protection of deletion of non provisioned Security Group which is annotated with `service.beta.kubernetes.io/aws-load-balancer-security-groups`.
if _, ok := annotatedSgSet[sgID]; ok {
klog.Warningf("Ignoring security group with annotation `service.beta.kubernetes.io/aws-load-balancer-security-groups` or service.beta.kubernetes.io/aws-load-balancer-extra-security-groups in %s", service.Name)
continue
}

securityGroupIDs[sgID] = struct{}{}
}

return securityGroupIDs, taggedLBSecurityGroups, nil
}

// EnsureLoadBalancerDeleted implements LoadBalancer.EnsureLoadBalancerDeleted.
func (c *Cloud) EnsureLoadBalancerDeleted(ctx context.Context, clusterName string, service *v1.Service) error {
if isLBExternal(service.Annotations) {
Expand Down Expand Up @@ -2943,57 +3077,13 @@ func (c *Cloud) EnsureLoadBalancerDeleted(ctx context.Context, clusterName strin
// if the load balancer security group is being deleted.
securityGroupIDs := map[string]struct{}{}
taggedLBSecurityGroups := map[string]struct{}{}
{
// Delete the security group(s) for the load balancer
// Note that this is annoying: the load balancer disappears from the API immediately, but it is still
// deleting in the background. We get a DependencyViolation until the load balancer has deleted itself

var loadBalancerSGs = lb.SecurityGroups

describeRequest := &ec2.DescribeSecurityGroupsInput{}
describeRequest.Filters = []ec2types.Filter{
newEc2Filter("group-id", loadBalancerSGs...),
}
response, err := c.ec2.DescribeSecurityGroups(ctx, describeRequest)
if err != nil {
return fmt.Errorf("error querying security groups for ELB: %q", err)
}
annotatedSgSet := map[string]bool{}
annotatedSgsList := getSGListFromAnnotation(service.Annotations[ServiceAnnotationLoadBalancerSecurityGroups])
annotatedExtraSgsList := getSGListFromAnnotation(service.Annotations[ServiceAnnotationLoadBalancerExtraSecurityGroups])
annotatedSgsList = append(annotatedSgsList, annotatedExtraSgsList...)

for _, sg := range annotatedSgsList {
annotatedSgSet[sg] = true
}

for _, sg := range response {
sgID := aws.StringValue(sg.GroupId)

if sgID == c.cfg.Global.ElbSecurityGroup {
//We don't want to delete a security group that was defined in the Cloud Configuration.
continue
}
if sgID == "" {
klog.Warningf("Ignoring empty security group in %s", service.Name)
continue
}

if !c.tagging.hasClusterTag(sg.Tags) {
klog.Warningf("Ignoring security group with no cluster tag in %s", service.Name)
continue
} else {
taggedLBSecurityGroups[sgID] = struct{}{}
}

// This is an extra protection of deletion of non provisioned Security Group which is annotated with `service.beta.kubernetes.io/aws-load-balancer-security-groups`.
if _, ok := annotatedSgSet[sgID]; ok {
klog.Warningf("Ignoring security group with annotation `service.beta.kubernetes.io/aws-load-balancer-security-groups` or service.beta.kubernetes.io/aws-load-balancer-extra-security-groups in %s", service.Name)
continue
}

securityGroupIDs[sgID] = struct{}{}
}
// Delete the security group(s) for the load balancer
// Note that this is annoying: the load balancer disappears from the API immediately, but it is still
// deleting in the background. We get a DependencyViolation until the load balancer has deleted itself
securityGroupIDs, taggedLBSecurityGroups, err = c.buildSecurityGroupsToDelete(ctx, service, lb.SecurityGroups)
if err != nil {
return fmt.Errorf("unable to build security groups to delete: %w", err)
}

{
Expand Down Expand Up @@ -3028,53 +3118,7 @@ func (c *Cloud) EnsureLoadBalancerDeleted(ctx context.Context, clusterName strin
}
}

{

// Loop through and try to delete them
timeoutAt := time.Now().Add(time.Second * 600)
for {
for securityGroupID := range securityGroupIDs {
request := &ec2.DeleteSecurityGroupInput{}
request.GroupId = &securityGroupID
_, err := c.ec2.DeleteSecurityGroup(ctx, request)
if err == nil {
delete(securityGroupIDs, securityGroupID)
} else {
ignore := false
var ae smithy.APIError
if errors.As(err, &ae) {
if ae.ErrorCode() == "DependencyViolation" {
klog.V(2).Infof("Ignoring DependencyViolation while deleting load-balancer security group (%s), assuming because LB is in process of deleting", securityGroupID)
ignore = true
}
}
if !ignore {
return fmt.Errorf("error while deleting load balancer security group (%s): %q", securityGroupID, err)
}
}
}

if len(securityGroupIDs) == 0 {
klog.V(2).Info("Deleted all security groups for load balancer: ", service.Name)
break
}

if time.Now().After(timeoutAt) {
ids := []string{}
for id := range securityGroupIDs {
ids = append(ids, id)
}

return fmt.Errorf("timed out deleting ELB: %s. Could not delete security groups %v", service.Name, strings.Join(ids, ","))
}

klog.V(2).Info("Waiting for load-balancer to delete so we can delete security groups: ", service.Name)

time.Sleep(10 * time.Second)
}
}

return nil
return c.deleteSecurityGroupsWithBackoff(ctx, service.Name, securityGroupIDs)
}

// UpdateLoadBalancer implements LoadBalancer.UpdateLoadBalancer
Expand Down