diff --git a/docs/log-cleanup.md b/docs/log-cleanup.md new file mode 100644 index 000000000..edda1d4bd --- /dev/null +++ b/docs/log-cleanup.md @@ -0,0 +1,235 @@ +# Log Cleanup Functionality + +This document describes the log cleanup functionality added to address issue #147, which ensures that EMF (Embedded Metric Format) and Container Insights logs are cleaned up after test execution. + +## Overview + +The CloudWatch Agent Test framework now automatically cleans up log groups created during test execution to prevent accumulation of test artifacts in CloudWatch Logs. This helps reduce costs and keeps the AWS account clean. + +## Features + +### Automatic Cleanup Patterns + +The cleanup functionality automatically identifies and cleans up log groups matching these patterns: + +**Container Insights:** +- `/aws/ecs/containerinsights/*/performance` +- `/aws/ecs/containerinsights/*/application` +- `/aws/eks/containerinsights/*/performance` +- `/aws/eks/containerinsights/*/application` +- `/aws/containerinsights/*` + +**EMF (Embedded Metric Format):** +- `*EMF*` (case variations) +- `/aws/lambda/*` (Lambda EMF logs) +- `EMFECSNameSpace` (ECS test namespace) +- `EMFEKSNameSpace` (EKS test namespace) + +**ECS Task Logs:** +- `/ecs/*` +- `/aws/ecs/*` + +**Test-specific Patterns:** +- `*-test-*` +- `*test*` +- `cwagent-*` +- `cloudwatch-agent-*` + +### Safety Features + +1. **Dry Run by Default**: All cleanup operations default to dry-run mode for safety +2. **Age-based Protection**: Only cleans logs older than specified age (default: 1-2 hours) +3. **Exclude Patterns**: Never touches production logs (patterns with `production` or `prod`) +4. **Error Isolation**: Cleanup errors don't fail tests +5. 
**Detailed Logging**: Comprehensive logging of all cleanup operations + +## Configuration + +### Environment Variables + +| Variable | Default | Description | +|----------|---------|-------------| +| `CWAGENT_SKIP_LOG_CLEANUP` | `false` | Set to `true` to completely disable log cleanup | +| `CWAGENT_FORCE_LOG_CLEANUP` | `false` | Set to `true` to perform actual deletion (not dry-run) | + +### Examples + +```bash +# Skip all cleanup (useful for debugging) +export CWAGENT_SKIP_LOG_CLEANUP=true + +# Enable actual cleanup (use with caution) +export CWAGENT_FORCE_LOG_CLEANUP=true + +# Default behavior (dry-run only) +unset CWAGENT_SKIP_LOG_CLEANUP +unset CWAGENT_FORCE_LOG_CLEANUP +``` + +## Integration with Test Framework + +### Automatic Integration + +Cleanup is automatically integrated into the test framework: + +- **BaseTestRunner**: Includes default cleanup for all tests +- **ECSTestRunner**: Specialized cleanup for ECS-specific log groups +- **Test Suites**: Cleanup runs after each test completion + +### Manual Usage + +```go +import "github.com/aws/amazon-cloudwatch-agent-test/util/awsservice" + +// Simple cleanup with defaults +err := awsservice.CleanupTestLogGroups(true) // dry-run +if err != nil { + log.Printf("Cleanup failed: %v", err) +} + +maxAge := 2 * time.Hour // Custom cleanup configuration; patterns are Go regular expressions, not globs +config := awsservice.LogGroupCleanupConfig{ + IncludePatterns: []string{"^my-test-logs-.*"}, + ExcludePatterns: []string{".*-important-.*"}, + DryRun: false, + MaxAge: &maxAge, +} + +result, err := awsservice.CleanupLogGroupsByPattern(config) +if err != nil { + log.Printf("Cleanup failed: %v", err) +} else { + log.Printf("Deleted %d log groups", len(result.DeletedLogGroups)) +} +``` + +## Best Practices + +### For Development + +1. **Always test with dry-run first**: + ```bash + # This is the default, but be explicit + unset CWAGENT_FORCE_LOG_CLEANUP + ``` + +2. 
**Review cleanup results**: + ```bash + # Look for cleanup logs in test output + grep -i "cleanup" test_output.log + ``` + +3. **Use skip for debugging**: + ```bash + # When you need to examine logs after test failure + export CWAGENT_SKIP_LOG_CLEANUP=true + ``` + +### For CI/CD + +1. **Enable cleanup in CI pipelines**: + ```yaml + environment: + CWAGENT_FORCE_LOG_CLEANUP: "true" + ``` + +2. **Monitor cleanup in logs**: + ```bash + # Check if cleanup is working + grep "cleanup completed" ci_logs.txt + ``` + +3. **Handle cleanup failures gracefully**: + - Cleanup failures don't fail tests + - Monitor for cleanup warnings in CI logs + +## Troubleshooting + +### Common Issues + +1. **Permission Errors**: + ``` + Error: failed to delete log group: AccessDenied + ``` + **Solution**: Ensure IAM role has `logs:DeleteLogGroup` permission + +2. **Resource Not Found**: + ``` + Error: ResourceNotFoundException + ``` + **Solution**: This is normal - log group was already deleted + +3. **Too Many Log Groups**: + ``` + Warning: Found 1000+ log groups to evaluate + ``` + **Solution**: Consider more specific include patterns + +### Debug Mode + +To debug cleanup issues: + +```bash +# Enable verbose logging and dry-run +export CWAGENT_SKIP_LOG_CLEANUP=false +unset CWAGENT_FORCE_LOG_CLEANUP # Ensures dry-run + +# Run your test and check output +go test -v ./test/emf/ 2>&1 | grep -i cleanup +``` + +### Manual Cleanup + +For manual cleanup operations: + +```go +package main + +import ( + "log" + "github.com/aws/amazon-cloudwatch-agent-test/util/awsservice" +) + +func main() { + // List what would be cleaned up + logGroups, err := awsservice.ListEMFAndContainerInsightsLogGroups() + if err != nil { + log.Fatalf("Failed to list log groups: %v", err) + } + + log.Printf("Found %d log groups that match cleanup patterns:", len(logGroups)) + for _, group := range logGroups { + log.Printf(" - %s", group) + } + + // Perform actual cleanup (use with caution) + // err = 
awsservice.CleanupTestLogGroups(false) +} +``` + +## Monitoring and Metrics + +The cleanup functionality provides detailed metrics: + +- **Deleted Log Groups**: Count of successfully deleted log groups +- **Skipped Log Groups**: Count of log groups skipped (age, exclusions) +- **Errors**: Count and details of cleanup errors +- **Total Processed**: Total number of log groups evaluated + +## Security Considerations + +1. **IAM Permissions**: Requires `logs:DescribeLogGroups` and `logs:DeleteLogGroup` +2. **Exclude Patterns**: Production logs are automatically excluded +3. **Age Constraints**: Only older logs are eligible for cleanup +4. **Dry Run Default**: Safe defaults prevent accidental deletions + +## Contributing + +When adding new test types that create log groups: + +1. **Add appropriate patterns** to the cleanup configuration +2. **Test with dry-run** to ensure patterns work correctly +3. **Document new patterns** in this file +4. **Consider safety exclusions** for any special cases + +For questions or issues, please refer to the main project documentation or create an issue. 
diff --git a/test/emf/emf_container_test.go b/test/emf/emf_container_test.go
index 488b1963a..47ee0fcbe 100644
--- a/test/emf/emf_container_test.go
+++ b/test/emf/emf_container_test.go
@@ -6,6 +6,7 @@
 package emf
 
 import (
+	"log"
 	"time"
 
 	"github.com/aws/aws-sdk-go-v2/aws"
@@ -15,6 +16,7 @@ import (
 	"github.com/aws/amazon-cloudwatch-agent-test/test/metric/dimension"
 	"github.com/aws/amazon-cloudwatch-agent-test/test/status"
 	"github.com/aws/amazon-cloudwatch-agent-test/test/test_runner"
+	"github.com/aws/amazon-cloudwatch-agent-test/util/awsservice"
 )
 
 type EMFTestRunner struct {
@@ -54,6 +56,48 @@ func (t *EMFTestRunner) GetMeasuredMetrics() []string {
 	return []string{"EMFCounter"}
 }
 
+// CleanupAfterTest implements EMF-specific cleanup. Cleanup is best-effort:
+// failures are logged and nil is always returned, so that cleanup problems
+// never fail a test.
+func (t *EMFTestRunner) CleanupAfterTest() error {
+	if awsservice.LogCleanupSkipped() {
+		log.Printf("EMF log cleanup skipped due to CWAGENT_SKIP_LOG_CLEANUP environment variable")
+		return nil
+	}
+
+	// Add age constraint for safety - only clean logs older than 1 hour
+	maxAge := 1 * time.Hour
+
+	// EMF-specific cleanup patterns (unanchored regular expressions)
+	emfCleanupConfig := awsservice.LogGroupCleanupConfig{
+		IncludePatterns: []string{
+			"EMFECSNameSpace", // ECS test namespace
+			"EMFEKSNameSpace", // EKS test namespace
+			".*EMF.*",         // General EMF patterns
+			".*emf.*",         // Lowercase variants
+		},
+		ExcludePatterns: []string{
+			".*production.*",
+			".*prod.*",
+		},
+		DryRun: awsservice.LogCleanupDryRun(),
+		MaxAge: &maxAge,
+	}
+
+	log.Printf("Starting EMF-specific log cleanup (dry run: %v)", emfCleanupConfig.DryRun)
+	result, err := awsservice.CleanupLogGroupsByPattern(emfCleanupConfig)
+	if err != nil {
+		// Don't fail the test due to cleanup issues
+		log.Printf("Warning: EMF log cleanup failed: %v", err)
+		return nil
+	}
+
+	log.Printf("EMF log cleanup completed. Deleted: %d, Skipped: %d, Errors: %d",
+		len(result.DeletedLogGroups), len(result.SkippedLogGroups), len(result.Errors))
+
+	return nil
+}
+
 func (t *EMFTestRunner) validateEMFMetrics(metricName string) status.TestResult {
 	namespace := ""
 	var dims []types.Dimension
diff --git a/test/test_runner/base_test_runner.go b/test/test_runner/base_test_runner.go
index b7f7e6b50..b0f348e07 100644
--- a/test/test_runner/base_test_runner.go
+++ b/test/test_runner/base_test_runner.go
@@ -33,6 +33,7 @@ type ITestRunner interface {
 	GetMeasuredMetrics() []string
 	SetupBeforeAgentRun() error
 	SetupAfterAgentRun() error
+	CleanupAfterTest() error // Runs after the test completes, whether it passed or failed
 	UseSSM() bool
 	SSMParameterName() string
 	SetUpConfig() error
@@ -84,6 +85,30 @@ func (t *BaseTestRunner) SetupAfterAgentRun() error {
 	return nil
 }
 
+// CleanupAfterTest provides the default cleanup for EMF and Container Insights
+// log groups. Cleanup is best-effort: failures are logged and nil is always
+// returned, so that cleanup problems never fail a test.
+func (t *BaseTestRunner) CleanupAfterTest() error {
+	if awsservice.LogCleanupSkipped() {
+		log.Printf("Log cleanup skipped due to CWAGENT_SKIP_LOG_CLEANUP environment variable")
+		return nil
+	}
+
+	// Dry run is the default for safety; deletion must be requested explicitly.
+	dryRun := awsservice.LogCleanupDryRun()
+	if !dryRun {
+		log.Printf("Performing actual log cleanup due to CWAGENT_FORCE_LOG_CLEANUP environment variable")
+	}
+
+	log.Printf("Starting log cleanup (dry run: %v)", dryRun)
+	if err := awsservice.CleanupTestLogGroups(dryRun); err != nil {
+		// Don't fail the test due to cleanup issues
+		log.Printf("Warning: Log cleanup failed: %v", err)
+	}
+
+	return nil
+}
+
 func (t *BaseTestRunner) GetAgentRunDuration() time.Duration {
 	return 30 * time.Second
 }
@@ -100,9 +125,28 @@ func (t *BaseTestRunner) SetAgentConfig(agentConfig AgentConfig) {
 	t.AgentConfig = agentConfig
 }
 
-func (t *TestRunner) Run() status.TestGroupResult {
+// Run executes the test and then performs log cleanup, whether or not the
+// test succeeded. The result is a named return value so that the deferred
+// cleanup can append its outcome after the body has produced the result.
+func (t *TestRunner) Run() (result status.TestGroupResult) {
 	testName := t.TestRunner.GetTestName()
 	log.Printf("Running %v", testName)
+
+	defer func() {
+		log.Printf("Performing cleanup for test: %s", testName)
+		cleanupErr := t.TestRunner.CleanupAfterTest()
+		if cleanupErr == nil {
+			return
+		}
+		log.Printf("Cleanup completed with warnings: %v", cleanupErr)
+		// Cleanup issues are reported but never fail the test.
+		result.TestResults = append(result.TestResults, status.TestResult{
+			Name:   "Log Cleanup",
+			Status: status.SUCCESSFUL,
+			Reason: fmt.Errorf("cleanup completed with warnings: %w", cleanupErr),
+		})
+	}()
+
 	err := t.RunAgent()
 	if err != nil {
 		log.Printf("%v test group failed while running agent: %v", testName, err)
diff --git a/test/test_runner/ecs_test_runner.go b/test/test_runner/ecs_test_runner.go
index bb2e9ef3d..3fcf2da05 100644
--- a/test/test_runner/ecs_test_runner.go
+++ b/test/test_runner/ecs_test_runner.go
@@ -58,6 +58,20 @@ func (t *ECSTestRunner) Run(s ITestSuite, e *environment.MetaData) {
 	name := t.Runner.GetTestName()
 	log.Printf("Running %s", name)
 
+	// Cleanup normally runs right after validation (below) so that its outcome
+	// can be attached to the published result. The deferred call only covers
+	// paths that leave Run before reaching that point.
+	cleanupDone := false
+	defer func() {
+		if cleanupDone {
+			return
+		}
+		log.Printf("Performing ECS log cleanup for test: %s", name)
+		if cleanupErr := t.performCleanup(); cleanupErr != nil {
+			log.Printf("ECS cleanup completed with warnings: %v", cleanupErr)
+		}
+	}()
+
 	//runs agent restart with given config only when it's available
 	agentConfigFileName := t.Runner.GetAgentConfigFileName()
 	if len(agentConfigFileName) != 0 {
@@ -81,8 +95,68 @@ func (t *ECSTestRunner) Run(s ITestSuite, e *environment.MetaData) {
 
 	testGroupResult := t.Runner.Validate()
 
+	// Clean up before publishing the result so that cleanup problems can be
+	// reported alongside it. They are informational and never fail the test.
+	cleanupDone = true
+	log.Printf("Performing ECS log cleanup for test: %s", name)
+	if cleanupErr := t.performCleanup(); cleanupErr != nil {
+		log.Printf("ECS cleanup completed with warnings: %v", cleanupErr)
+		testGroupResult.TestResults = append(testGroupResult.TestResults, status.TestResult{
+			Name:   "Log Cleanup",
+			Status: status.SUCCESSFUL, // Don't fail tests due to cleanup issues
+			Reason: fmt.Errorf("cleanup completed with warnings: %w", cleanupErr),
+		})
+	}
+
 	s.AddToSuiteResult(testGroupResult)
 	if testGroupResult.GetStatus() != status.SUCCESSFUL {
 		log.Printf("%s test group failed", name)
 	}
 }
+
+// performCleanup deletes the Container Insights and EMF log groups created by
+// ECS tests. It returns nil when cleanup is skipped or fully succeeds, and an
+// error when cleanup could not run or any log group could not be deleted.
+//
+// NOTE(review): this patch carries no import hunk for this file; confirm that
+// fmt, time and awsservice are already imported here.
+func (t *ECSTestRunner) performCleanup() error {
+	if awsservice.LogCleanupSkipped() {
+		log.Printf("ECS log cleanup skipped due to CWAGENT_SKIP_LOG_CLEANUP environment variable")
+		return nil
+	}
+
+	// Add age constraint for safety: only clean logs older than 2 hours for ECS
+	maxAge := 2 * time.Hour
+	ecsCleanupConfig := awsservice.LogGroupCleanupConfig{
+		// ECS-specific cleanup patterns (unanchored regular expressions)
+		IncludePatterns: []string{
+			"/aws/ecs/containerinsights/.*/performance",
+			"/aws/ecs/containerinsights/.*/application",
+			"/ecs/.*",
+			".*EMFECSNameSpace.*",
+			"cwagent-ecs-.*",
+		},
+		ExcludePatterns: []string{
+			".*production.*",
+			".*prod.*",
+		},
+		DryRun: awsservice.LogCleanupDryRun(),
+		MaxAge: &maxAge,
+	}
+
+	log.Printf("Starting ECS-specific log cleanup (dry run: %v)", ecsCleanupConfig.DryRun)
+	result, err := awsservice.CleanupLogGroupsByPattern(ecsCleanupConfig)
+	if err != nil {
+		return fmt.Errorf("ECS log cleanup failed: %w", err)
+	}
+
+	log.Printf("ECS log cleanup completed. Deleted: %d, Skipped: %d, Errors: %d",
+		len(result.DeletedLogGroups), len(result.SkippedLogGroups), len(result.Errors))
+
+	if len(result.Errors) > 0 {
+		return fmt.Errorf("ECS cleanup completed with %d errors", len(result.Errors))
+	}
+
+	return nil
+}
diff --git a/util/awsservice/log_cleanup.go b/util/awsservice/log_cleanup.go
new file mode 100644
index 000000000..725980e41
--- /dev/null
+++ b/util/awsservice/log_cleanup.go
@@ -0,0 +1,420 @@
+// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
+// SPDX-License-Identifier: MIT
+
+package awsservice
+
+import (
+	"context"
+	"errors"
+	"fmt"
+	"log"
+	"os"
+	"regexp"
+	"strconv"
+	"time"
+
+	"github.com/aws/aws-sdk-go-v2/aws"
+	"github.com/aws/aws-sdk-go-v2/service/cloudwatchlogs"
+	"github.com/aws/aws-sdk-go-v2/service/cloudwatchlogs/types"
+)
+
+// Environment variables that control log cleanup.
+const (
+	// SkipLogCleanupEnv disables log cleanup entirely when set to a true value.
+	SkipLogCleanupEnv = "CWAGENT_SKIP_LOG_CLEANUP"
+	// ForceLogCleanupEnv turns dry-run mode off when set to a true value.
+	ForceLogCleanupEnv = "CWAGENT_FORCE_LOG_CLEANUP"
+)
+
+// Common log group patterns for EMF and Container Insights. Every pattern is a
+// regular expression matched with Regexp.MatchString, i.e. unanchored: it
+// matches any log group name that contains a match.
+var (
+	// Container Insights patterns
+	ContainerInsightsPatterns = []string{
+		"/aws/ecs/containerinsights/.*/performance",
+		"/aws/ecs/containerinsights/.*/application",
+		"/aws/eks/containerinsights/.*/performance",
+		"/aws/eks/containerinsights/.*/application",
+		"/aws/containerinsights/.*",
+	}
+
+	// EMF patterns - these are more flexible since EMF logs can be in various namespaces
+	EMFPatterns = []string{
+		".*EMF.*",         // Common EMF pattern
+		".*emf.*",         // Lowercase variant
+		"/aws/lambda/.*",  // Lambda EMF logs
+		"EMFECSNameSpace", // From the test configuration
+		"EMFEKSNameSpace", // From the test configuration
+	}
+
+	// ECS Task patterns
+	ECSTaskPatterns = []string{
+		"/ecs/.*",
+		"/aws/ecs/.*",
+	}
+
+	// Test-specific patterns that might be created during integration tests
+	TestPatterns = []string{
+		".*-test-.*",
+		".*test.*",
+		"cwagent-.*",
+		"cloudwatch-agent-.*",
+	}
+)
+
+// LogGroupCleanupConfig defines the configuration for log group cleanup
+type LogGroupCleanupConfig struct {
+	IncludePatterns []string       // Regular expressions selecting log groups for cleanup
+	ExcludePatterns []string       // Regular expressions protecting log groups from cleanup (safety)
+	DryRun          bool           // If true, only list what would be deleted
+	MaxAge          *time.Duration // Only delete log groups older than this; nil disables the age check
+	BatchSize       int            // Defaulted by CleanupEMFAndContainerInsightsLogs; not read by the cleanup loop
+}
+
+// LogGroupCleanupResult contains the results of a cleanup operation
+type LogGroupCleanupResult struct {
+	DeletedLogGroups []string // Deleted log groups; in dry-run mode, the groups that would be deleted
+	SkippedLogGroups []string // Matching log groups left alone (excluded, too recent or of unknown age)
+	Errors           []error  // One error per log group that could not be deleted
+	TotalProcessed   int      // Number of log groups evaluated
+}
+
+// LogCleanupSkipped reports whether log cleanup is disabled through the
+// CWAGENT_SKIP_LOG_CLEANUP environment variable.
+func LogCleanupSkipped() bool {
+	return envBool(SkipLogCleanupEnv)
+}
+
+// LogCleanupDryRun reports whether cleanup must only list what it would
+// delete. Dry run is the default; it is turned off by setting the
+// CWAGENT_FORCE_LOG_CLEANUP environment variable to a true value.
+func LogCleanupDryRun() bool {
+	return !envBool(ForceLogCleanupEnv)
+}
+
+// envBool parses the named environment variable with strconv.ParseBool and
+// treats an unset or unparsable value as false.
+func envBool(name string) bool {
+	enabled, err := strconv.ParseBool(os.Getenv(name))
+	return err == nil && enabled
+}
+
+// CleanupEMFAndContainerInsightsLogs performs cleanup of EMF and Container Insights log groups.
+// This is the main entry point for cleaning up logs after test execution. When
+// config.IncludePatterns is empty, all of the default patterns above are used.
+func CleanupEMFAndContainerInsightsLogs(config LogGroupCleanupConfig) (*LogGroupCleanupResult, error) {
+	log.Printf("Starting cleanup of EMF and Container Insights log groups (DryRun: %v)", config.DryRun)
+
+	// Combine all common patterns if no specific patterns provided. A fresh
+	// slice is built so that the caller's slice is never appended to.
+	if len(config.IncludePatterns) == 0 {
+		var patterns []string
+		for _, group := range [][]string{ContainerInsightsPatterns, EMFPatterns, ECSTaskPatterns, TestPatterns} {
+			patterns = append(patterns, group...)
+		}
+		config.IncludePatterns = patterns
+	}
+
+	// Set default batch size
+	if config.BatchSize <= 0 {
+		config.BatchSize = 50
+	}
+
+	return CleanupLogGroupsByPattern(config)
+}
+
+// CleanupLogGroupsByPattern cleans up log groups matching specific patterns.
+// A log group is deleted (or, in dry-run mode, only reported) when its name
+// matches an include pattern, matches no exclude pattern and, if config.MaxAge
+// is set, was created at least that long ago. Deletion failures are collected
+// in the result's Errors; the returned error is non-nil only when the log
+// groups cannot be listed or a pattern does not compile.
+func CleanupLogGroupsByPattern(config LogGroupCleanupConfig) (*LogGroupCleanupResult, error) {
+	result := &LogGroupCleanupResult{
+		DeletedLogGroups: make([]string, 0),
+		SkippedLogGroups: make([]string, 0),
+		Errors:           make([]error, 0),
+	}
+
+	// Get all log groups
+	logGroups, err := listAllLogGroups()
+	if err != nil {
+		return result, fmt.Errorf("failed to list log groups: %w", err)
+	}
+
+	log.Printf("Found %d total log groups to evaluate", len(logGroups))
+
+	// Compile regex patterns for efficiency
+	includeRegexes, err := compilePatterns(config.IncludePatterns)
+	if err != nil {
+		return result, fmt.Errorf("failed to compile include patterns: %w", err)
+	}
+
+	excludeRegexes, err := compilePatterns(config.ExcludePatterns)
+	if err != nil {
+		return result, fmt.Errorf("failed to compile exclude patterns: %w", err)
+	}
+
+	// Process log groups
+	for _, logGroup := range logGroups {
+		result.TotalProcessed++
+
+		// The SDK exposes the name as a pointer; a group without a name can be
+		// neither matched nor deleted.
+		if logGroup.LogGroupName == nil {
+			continue
+		}
+		logGroupName := *logGroup.LogGroupName
+
+		// Check if log group matches any include pattern
+		if !matchesAnyPattern(logGroupName, includeRegexes) {
+			continue
+		}
+
+		// Check if log group matches any exclude pattern
+		if matchesAnyPattern(logGroupName, excludeRegexes) {
+			log.Printf("Skipping log group %s (matches exclude pattern)", logGroupName)
+			result.SkippedLogGroups = append(result.SkippedLogGroups, logGroupName)
+			continue
+		}
+
+		// Check age constraint if specified
+		if config.MaxAge != nil {
+			// Without a creation time the age cannot be verified, so the
+			// group is kept and recorded as skipped.
+			if logGroup.CreationTime == nil {
+				log.Printf("Skipping log group %s (unknown creation time)", logGroupName)
+				result.SkippedLogGroups = append(result.SkippedLogGroups, logGroupName)
+				continue
+			}
+			creationTime := time.UnixMilli(*logGroup.CreationTime)
+			if time.Since(creationTime) < *config.MaxAge {
+				log.Printf("Skipping log group %s (too recent: %v)", logGroupName, creationTime)
+				result.SkippedLogGroups = append(result.SkippedLogGroups, logGroupName)
+				continue
+			}
+		}
+
+		if config.DryRun {
+			log.Printf("[DRY RUN] Would delete log group: %s", logGroupName)
+			result.DeletedLogGroups = append(result.DeletedLogGroups, logGroupName)
+			continue
+		}
+
+		log.Printf("Deleting log group: %s", logGroupName)
+		if err := deleteLogGroupWithRetry(logGroupName); err != nil {
+			log.Printf("Failed to delete log group %s: %v", logGroupName, err)
+			result.Errors = append(result.Errors, fmt.Errorf("failed to delete %s: %w", logGroupName, err))
+			continue
+		}
+		result.DeletedLogGroups = append(result.DeletedLogGroups, logGroupName)
+	}
+
+	log.Printf("Cleanup completed. Deleted: %d, Skipped: %d, Errors: %d",
+		len(result.DeletedLogGroups), len(result.SkippedLogGroups), len(result.Errors))
+
+	return result, nil
+}
+
+// CleanupTestLogGroups is a convenience function for cleaning up logs after tests.
+// It uses safe defaults and common test patterns. It returns an error when
+// cleanup could not run or when any log group could not be deleted.
+func CleanupTestLogGroups(dryRun bool) error {
+	maxAge := 1 * time.Hour // Only clean up logs older than 1 hour for safety
+	config := LogGroupCleanupConfig{
+		DryRun: dryRun,
+		MaxAge: &maxAge,
+		// Add some exclude patterns for safety
+		ExcludePatterns: []string{
+			"/aws/lambda/.*", // Don't delete Lambda logs unless specifically targeted
+			".*production.*", // Never delete production logs
+			".*prod.*",       // Never delete prod logs
+		},
+	}
+
+	result, err := CleanupEMFAndContainerInsightsLogs(config)
+	if err != nil {
+		return err
+	}
+
+	if len(result.Errors) > 0 {
+		return fmt.Errorf("cleanup completed with %d errors: %w", len(result.Errors), result.Errors[0])
+	}
+
+	return nil
+}
+
+// ListEMFAndContainerInsightsLogGroups lists log groups that match EMF and Container Insights patterns.
+// It performs a dry run with the default patterns and no age or exclude filters.
+func ListEMFAndContainerInsightsLogGroups() ([]string, error) {
+	config := LogGroupCleanupConfig{
+		DryRun: true,
+	}
+
+	result, err := CleanupEMFAndContainerInsightsLogs(config)
+	if err != nil {
+		return nil, err
+	}
+
+	return result.DeletedLogGroups, nil // In dry run mode, these are the groups that would be deleted
+}
+
+// Helper functions
+
+// listAllLogGroups returns every log group visible to CwlClient, following
+// NextToken until the listing is exhausted.
+func listAllLogGroups() ([]types.LogGroup, error) {
+	var allLogGroups []types.LogGroup
+	var nextToken *string
+
+	for {
+		input := &cloudwatchlogs.DescribeLogGroupsInput{
+			NextToken: nextToken,
+			Limit:     aws.Int32(50), // AWS limit
+		}
+
+		output, err := CwlClient.DescribeLogGroups(context.TODO(), input)
+		if err != nil {
+			return nil, err
+		}
+
+		allLogGroups = append(allLogGroups, output.LogGroups...)
+
+		if output.NextToken == nil {
+			break
+		}
+		nextToken = output.NextToken
+	}
+
+	return allLogGroups, nil
+}
+
+// compilePatterns compiles each pattern as a regular expression and fails on
+// the first pattern that is not valid.
+func compilePatterns(patterns []string) ([]*regexp.Regexp, error) {
+	var regexes []*regexp.Regexp
+	for _, pattern := range patterns {
+		regex, err := regexp.Compile(pattern)
+		if err != nil {
+			return nil, fmt.Errorf("invalid pattern '%s': %w", pattern, err)
+		}
+		regexes = append(regexes, regex)
+	}
+	return regexes, nil
+}
+
+// matchesAnyPattern reports whether text contains a match of at least one pattern.
+func matchesAnyPattern(text string, patterns []*regexp.Regexp) bool {
+	for _, pattern := range patterns {
+		if pattern.MatchString(text) {
+			return true
+		}
+	}
+	return false
+}
+
+// deleteLogGroupWithRetry deletes a log group, making up to three attempts with
+// a growing pause between them. A log group that no longer exists counts as
+// success. The error of the last attempt is wrapped in the returned error.
+func deleteLogGroupWithRetry(logGroupName string) error {
+	const maxRetries = 3
+
+	var lastErr error
+	for attempt := 1; attempt <= maxRetries; attempt++ {
+		lastErr = deleteLogGroupSafe(logGroupName)
+		if lastErr == nil {
+			return nil
+		}
+
+		// The log group is already gone, which is the desired end state.
+		var notFound *types.ResourceNotFoundException
+		if errors.As(lastErr, &notFound) {
+			return nil
+		}
+
+		if attempt < maxRetries {
+			log.Printf("Retry %d/%d for deleting log group %s: %v", attempt, maxRetries, logGroupName, lastErr)
+			time.Sleep(time.Duration(attempt) * time.Second)
+		}
+	}
+
+	return fmt.Errorf("failed to delete log group %s after %d retries: %w", logGroupName, maxRetries, lastErr)
+}
+
+// deleteLogGroupSafe issues a single DeleteLogGroup call and returns its error.
+func deleteLogGroupSafe(logGroupName string) error {
+	_, err := CwlClient.DeleteLogGroup(context.TODO(), &cloudwatchlogs.DeleteLogGroupInput{
+		LogGroupName: aws.String(logGroupName),
+	})
+	return err
+}
+
+// GetLogGroupsByPrefix returns the names of the log groups matching a specific prefix
+func GetLogGroupsByPrefix(prefix string) ([]string, error) {
+	input := &cloudwatchlogs.DescribeLogGroupsInput{
+		LogGroupNamePrefix: aws.String(prefix),
+	}
+
+	var logGroupNames []string
+	var nextToken *string
+
+	for {
+		input.NextToken = nextToken
+		output, err := CwlClient.DescribeLogGroups(context.TODO(), input)
+		if err != nil {
+			return nil, err
+		}
+
+		for _, logGroup := range output.LogGroups {
+			// Guard the pointer field instead of dereferencing it blindly.
+			if logGroup.LogGroupName != nil {
+				logGroupNames = append(logGroupNames, *logGroup.LogGroupName)
+			}
+		}
+
+		if output.NextToken == nil {
+			break
+		}
+		nextToken = output.NextToken
+	}
+
+	return logGroupNames, nil
+}
+
+// CleanupLogGroupsByPrefix deletes all log groups with a specific prefix. An
+// empty prefix is rejected so that a missing prefix can never be taken to mean
+// "all log groups". Deletion failures do not stop the loop; they are logged
+// and summarized in the returned error.
+func CleanupLogGroupsByPrefix(prefix string, dryRun bool) error {
+	if prefix == "" {
+		return errors.New("refusing to clean up log groups with an empty prefix")
+	}
+
+	logGroups, err := GetLogGroupsByPrefix(prefix)
+	if err != nil {
+		return err
+	}
+
+	log.Printf("Found %d log groups with prefix '%s'", len(logGroups), prefix)
+
+	var failures []error
+	for _, logGroupName := range logGroups {
+		if dryRun {
+			log.Printf("[DRY RUN] Would delete log group: %s", logGroupName)
+			continue
+		}
+
+		log.Printf("Deleting log group: %s", logGroupName)
+		if err := deleteLogGroupWithRetry(logGroupName); err != nil {
+			log.Printf("Failed to delete log group %s: %v", logGroupName, err)
+			failures = append(failures, err)
+		}
+	}
+
+	if len(failures) > 0 {
+		return fmt.Errorf("cleanup of prefix '%s' completed with %d errors: %w", prefix, len(failures), failures[0])
+	}
+
+	return nil
+}