diff --git a/IaC/cdk/README.md b/IaC/cdk/README.md new file mode 100644 index 0000000000..99cc28eb4d --- /dev/null +++ b/IaC/cdk/README.md @@ -0,0 +1,21 @@ +# CDK Projects + +AWS CDK infrastructure as code for automated resource management. + +## openshift-resources-cleanup + +Automated Lambda that cleans up expired OpenShift/ROSA clusters. Scans all AWS regions every 15 minutes, validates TTL tags, and deletes cluster infrastructure (VPC, ELB, Route53, S3, EC2). + +```bash +cd openshift-resources-cleanup +just install && just bootstrap +just deploy # Deploy LIVE mode +just logs # View logs +``` + +See [openshift-resources-cleanup/README.md](openshift-resources-cleanup/README.md) for full documentation. + +## Requirements + +- AWS CLI configured +- `brew install uv just` diff --git a/IaC/cdk/openshift-resources-cleanup/.gitignore b/IaC/cdk/openshift-resources-cleanup/.gitignore new file mode 100644 index 0000000000..1ae5968a31 --- /dev/null +++ b/IaC/cdk/openshift-resources-cleanup/.gitignore @@ -0,0 +1,84 @@ +# CDK +cdk.out/ +.cdk.staging/ +cdk.context.json + +# Python +__pycache__/ +*.py[cod] +*$py.class +*.so +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# Virtual environments +venv/ +ENV/ +env/ +.venv + +# Testing +.pytest_cache/ +.coverage +htmlcov/ +.tox/ + +# IDEs +.vscode/ +.idea/ +*.swp +*.swo +*~ + +# OS +.DS_Store +Thumbs.db + +# Lambda artifacts +*.zip +/tmp/ + +# Lambda dependencies (installed at build time) +lambda/*.dist-info/ +lambda/bin/ +lambda/boto3/ +lambda/boto3-*.dist-info/ +lambda/botocore/ +lambda/botocore-*.dist-info/ +lambda/aws_lambda_powertools/ +lambda/aws_lambda_powertools-*.dist-info/ +lambda/aws_xray_sdk/ +lambda/aws_xray_sdk-*.dist-info/ +lambda/s3transfer/ +lambda/s3transfer-*.dist-info/ +lambda/jmespath/ +lambda/jmespath-*.dist-info/ +lambda/dateutil/ +lambda/python_dateutil-*.dist-info/ +lambda/urllib3/ +lambda/urllib3-*.dist-info/ +lambda/six.py +lambda/six-*.dist-info/ +lambda/typing_extensions.py +lambda/typing_extensions-*.dist-info/ +lambda/wrapt/ +lambda/wrapt-*.dist-info/ +lambda/.lock + +# Logs +*.log diff --git a/IaC/cdk/openshift-resources-cleanup/README.md b/IaC/cdk/openshift-resources-cleanup/README.md new file mode 100644 index 0000000000..6d4aeae25e --- /dev/null +++ b/IaC/cdk/openshift-resources-cleanup/README.md @@ -0,0 +1,56 @@ +# OpenShift Cluster Cleanup + +Automated Lambda that cleans up expired OpenShift/ROSA clusters. Runs every 15 minutes, scans for expired clusters, deletes all resources (VPC, ELB, Route53, S3, EC2). 
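A cluster's expiry is derived from two instance tags, `creation-time` and `delete-cluster-after-hours`. The snippet below is only an illustrative sketch of that tag format (the instance ID and values are hypothetical; `creation-time` may be a Unix timestamp as shown, or an ISO-8601 string):

```bash
# Hypothetical instance ID and values, shown only to illustrate the TTL tags the Lambda reads
aws ec2 create-tags --resources i-0123456789abcdef0 --tags \
  Key=creation-time,Value=1761053127 \
  Key=delete-cluster-after-hours,Value=8
```

Per the handler logic, instances missing either tag are treated as unmanaged infrastructure and marked for deletion.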
+ +## Configuration + +| Setting | Default | How to Change | +|---------|---------|---------------| +| **Lambda Location** | `us-east-2` | `AWS_REGION=us-west-1 just deploy` | +| **Scan Regions** (comma-separated) | `all` | `just deploy us-east-2` or `just deploy us-east-1,eu-west-1,ap-south-1` | +| **AWS Profile** | `default` | `AWS_PROFILE=myprofile just deploy` | +| **Mode** | `LIVE` | `just deploy-dry` | + +## Quick Start + +```bash +brew install uv just && aws configure +cd IaC/cdk/openshift-resources-cleanup +just install && just bootstrap +just deploy # Scans all regions (default) +just deploy us-east-2 # Scans us-east-2 only +AWS_REGION=us-west-1 just deploy # Deploy Lambda to us-west-1 +``` + +## Commands + +```bash +just deploy [regions] # Deploy LIVE (default: all regions) +just deploy-dry [regions]# Deploy DRY_RUN (toggles LIVE off) +just logs # Tail CloudWatch logs +just params # Show configuration +just test # Run tests +``` + +Run `just` for all commands. + +## How It Works + +1. **Detect**: Scans EC2 for OpenShift/ROSA clusters (tags: `red-hat-clustertype: rosa` or name: `*-master-*`) +2. **Check TTL**: Reads `creation-time` + `delete-cluster-after-hours` tags, skips if not expired +3. **Delete**: Removes all resources in dependency order (instances → ELB → NAT → VPC → Route53 → S3) + +## Logs & Troubleshooting + +```bash +just logs # View real-time logs +just params # Check configuration +``` + +**Example output:** +``` +Detected OpenShift ROSA cluster: jvp-rosa1-qmdkk +Cluster TTL not expired (5.45 hours remaining) +``` + +**Common issues:** Missing TTL tags (`creation-time`, `delete-cluster-after-hours`), `DryRunMode=true`, permissions diff --git a/IaC/cdk/openshift-resources-cleanup/app.py b/IaC/cdk/openshift-resources-cleanup/app.py new file mode 100644 index 0000000000..9af2667570 --- /dev/null +++ b/IaC/cdk/openshift-resources-cleanup/app.py @@ -0,0 +1,28 @@ +#!/usr/bin/env python3 +"""CDK app for OpenShift Cluster Cleanup Lambda.""" + +import os +import aws_cdk as cdk +from stacks.resource_cleanup_stack import ResourceCleanupStack + +# Stack name (single source of truth) +STACK_NAME = "OpenShiftResourcesCleanupStack" + +app = cdk.App() + +ResourceCleanupStack( + app, + STACK_NAME, + description="OpenShift cluster infrastructure cleanup for AWS", + env=cdk.Environment( + account=os.getenv('CDK_DEFAULT_ACCOUNT'), + region=os.getenv('CDK_DEFAULT_REGION', 'us-east-2') + ), + tags={ + "Project": "PlatformEngineering", + "ManagedBy": "CDK", + "iit-billing-tag": "openshift-cleanup" + } +) + +app.synth() diff --git a/IaC/cdk/openshift-resources-cleanup/cdk.json b/IaC/cdk/openshift-resources-cleanup/cdk.json new file mode 100644 index 0000000000..92a7592a42 --- /dev/null +++ b/IaC/cdk/openshift-resources-cleanup/cdk.json @@ -0,0 +1,114 @@ +{ + "app": "python3 app.py", + "watch": { + "include": [ + "**" + ], + "exclude": [ + "README.md", + "cdk*.json", + "requirements*.txt", + "source.bat", + "**/__pycache__", + "**/*.pyc", + ".pytest_cache" + ] + }, + "context": { + "@aws-cdk/aws-lambda:recognizeLayerVersion": true, + "@aws-cdk/core:checkSecretUsage": true, + "@aws-cdk/core:target-partitions": [ + "aws", + "aws-cn" + ], + "@aws-cdk-containers/ecs-service-extensions:enableDefaultLogDriver": true, + "@aws-cdk/aws-ec2:uniqueImdsv2TemplateName": true, + "@aws-cdk/aws-ecs:arnFormatIncludesClusterName": true, + "@aws-cdk/aws-iam:minimizePolicies": true, + "@aws-cdk/core:validateSnapshotRemovalPolicy": true, + "@aws-cdk/aws-codepipeline:crossAccountKeyAliasStackSafeResourceName": 
true, + "@aws-cdk/aws-s3:createDefaultLoggingPolicy": true, + "@aws-cdk/aws-sns-subscriptions:restrictSqsDescryption": true, + "@aws-cdk/aws-apigateway:disableCloudWatchRole": true, + "@aws-cdk/core:enablePartitionLiterals": true, + "@aws-cdk/aws-events:eventsTargetQueueSameAccount": true, + "@aws-cdk/aws-iam:standardizedServicePrincipals": true, + "@aws-cdk/aws-ecs:disableExplicitDeploymentControllerForCircuitBreaker": true, + "@aws-cdk/aws-iam:importedRoleStackSafeDefaultPolicyName": true, + "@aws-cdk/aws-s3:serverAccessLogsUseBucketPolicy": true, + "@aws-cdk/aws-route53-patternslibrary:useCertificate": true, + "@aws-cdk/customresources:installLatestAwsSdkDefault": false, + "@aws-cdk/aws-rds:databaseProxyUniqueResourceName": true, + "@aws-cdk/aws-codedeploy:removeAlarmsFromDeploymentGroup": true, + "@aws-cdk/aws-apigateway:authorizerChangeDeploymentLogicalId": true, + "@aws-cdk/aws-ec2:launchTemplateDefaultUserData": true, + "@aws-cdk/aws-secretsmanager:useAttachedSecretResourcePolicyForSecretTargetAttachments": true, + "@aws-cdk/aws-redshift:columnId": true, + "@aws-cdk/aws-stepfunctions-tasks:enableEmrServicePolicyV2": true, + "@aws-cdk/aws-ec2:restrictDefaultSecurityGroup": true, + "@aws-cdk/aws-apigateway:requestValidatorUniqueId": true, + "@aws-cdk/aws-kms:aliasNameRef": true, + "@aws-cdk/aws-autoscaling:generateLaunchTemplateInsteadOfLaunchConfig": true, + "@aws-cdk/core:includePrefixInUniqueNameGeneration": true, + "@aws-cdk/aws-efs:denyAnonymousAccess": true, + "@aws-cdk/aws-opensearchservice:enableOpensearchMultiAzWithStandby": true, + "@aws-cdk/aws-lambda-nodejs:useLatestRuntimeVersion": true, + "@aws-cdk/aws-efs:mountTargetOrderInsensitiveLogicalId": true, + "@aws-cdk/aws-rds:auroraClusterChangeScopeOfInstanceParameterGroupWithEachParameters": true, + "@aws-cdk/aws-appsync:useArnForSourceApiAssociationIdentifier": true, + "@aws-cdk/aws-rds:preventRenderingDeprecatedCredentials": true, + "@aws-cdk/aws-codepipeline-actions:useNewDefaultBranchForCodeCommitSource": true, + "@aws-cdk/aws-cloudwatch-actions:changeLambdaPermissionLogicalIdForLambdaAction": true, + "@aws-cdk/aws-codepipeline:crossAccountKeysDefaultValueToFalse": true, + "@aws-cdk/aws-codepipeline:defaultPipelineTypeToV2": true, + "@aws-cdk/aws-kms:reduceCrossAccountRegionPolicyScope": true, + "@aws-cdk/aws-eks:nodegroupNameAttribute": true, + "@aws-cdk/aws-ec2:ebsDefaultGp3Volume": true, + "@aws-cdk/aws-ecs:removeDefaultDeploymentAlarm": true, + "@aws-cdk/custom-resources:logApiResponseDataPropertyTrueDefault": false, + "@aws-cdk/aws-s3:keepNotificationInImportedBucket": false, + "@aws-cdk/aws-apigateway:usagePlanKeyOrderInsensitiveId": true, + "@aws-cdk/aws-appsync:appSyncGraphQLAPIScopeLambdaPermission": true, + "@aws-cdk/aws-cloudfront:defaultSecurityPolicyTLSv1.2_2021": true, + "@aws-cdk/aws-dynamodb:resourcePolicyPerReplica": true, + "@aws-cdk/aws-dynamodb:retainTableReplica": true, + "@aws-cdk/aws-ec2-alpha:useResourceIdForVpcV2Migration": false, + "@aws-cdk/aws-ec2:bastionHostUseAmazonLinux2023ByDefault": true, + "@aws-cdk/aws-ec2:ec2SumTImeoutEnabled": true, + "@aws-cdk/aws-ec2:requirePrivateSubnetsForEgressOnlyInternetGateway": true, + "@aws-cdk/aws-ecs-patterns:secGroupsDisablesImplicitOpenListener": true, + "@aws-cdk/aws-ecs:disableEcsImdsBlocking": true, + "@aws-cdk/aws-ecs:enableImdsBlockingDeprecatedFeature": false, + "@aws-cdk/aws-ecs:reduceEc2FargateCloudWatchPermissions": true, + "@aws-cdk/aws-elasticloadbalancingV2:albDualstackWithoutPublicIpv4SecurityGroupRulesDefault": true, + 
"@aws-cdk/aws-events:requireEventBusPolicySid": true, + "@aws-cdk/aws-iam:oidcRejectUnauthorizedConnections": true, + "@aws-cdk/aws-kms:applyImportedAliasPermissionsToPrincipal": true, + "@aws-cdk/aws-lambda-nodejs:sdkV3ExcludeSmithyPackages": true, + "@aws-cdk/aws-lambda:createNewPoliciesWithAddToRolePolicy": false, + "@aws-cdk/aws-lambda:recognizeVersionProps": true, + "@aws-cdk/aws-lambda:useCdkManagedLogGroup": true, + "@aws-cdk/aws-rds:lowercaseDbIdentifier": true, + "@aws-cdk/aws-rds:setCorrectValueForDatabaseInstanceReadReplicaInstanceResourceId": true, + "@aws-cdk/aws-route53-patters:useCertificate": true, + "@aws-cdk/aws-route53-targets:userPoolDomainNameMethodWithoutCustomResource": true, + "@aws-cdk/aws-s3:publicAccessBlockedByDefault": true, + "@aws-cdk/aws-s3:setUniqueReplicationRoleName": true, + "@aws-cdk/aws-signer:signingProfileNamePassedToCfn": true, + "@aws-cdk/aws-stepfunctions-tasks:fixRunEcsTaskPolicy": true, + "@aws-cdk/aws-stepfunctions-tasks:useNewS3UriParametersForBedrockInvokeModelTask": true, + "@aws-cdk/aws-stepfunctions:useDistributedMapResultWriterV2": true, + "@aws-cdk/cognito:logUserPoolClientSecretValue": false, + "@aws-cdk/core:aspectPrioritiesMutating": true, + "@aws-cdk/core:aspectStabilization": true, + "@aws-cdk/core:cfnIncludeRejectComplexResourceUpdateCreatePolicyIntrinsics": true, + "@aws-cdk/core:enableAdditionalMetadataCollection": true, + "@aws-cdk/core:explicitStackTags": true, + "@aws-cdk/core:newStyleStackSynthesis": true, + "@aws-cdk/core:stackRelativeExports": true, + "@aws-cdk/pipelines:reduceAssetRoleTrustScope": true, + "@aws-cdk/pipelines:reduceCrossAccountActionRoleTrustScope": true, + "@aws-cdk/pipelines:reduceStageRoleTrustScope": true, + "@aws-cdk/s3-notifications:addS3TrustKeyPolicyForSnsSubscriptions": true + } +} diff --git a/IaC/cdk/openshift-resources-cleanup/justfile b/IaC/cdk/openshift-resources-cleanup/justfile new file mode 100644 index 0000000000..85e5723b82 --- /dev/null +++ b/IaC/cdk/openshift-resources-cleanup/justfile @@ -0,0 +1,324 @@ +# OpenShift Resources Cleanup - CDK Deployment Automation +# Usage: just + +# Default AWS profile and region +profile := env_var_or_default("AWS_PROFILE", "default") +region := env_var_or_default("AWS_REGION", "us-east-2") + +# Stack name (single source of truth) +stack_name := "OpenShiftResourcesCleanupStack" + +# Lambda function name from CDK stack outputs (dynamically retrieved) +# This ensures justfile always uses the function name defined in the CDK stack +# Falls back to "LambdaOpenShiftCleanup" if stack doesn't exist yet +lambda_function := `aws cloudformation describe-stacks --stack-name {{stack_name}} --profile ${AWS_PROFILE:-default} --region ${AWS_REGION:-us-east-2} --query "Stacks[0].Outputs[?OutputKey=='LambdaFunctionName'].OutputValue | [0]" --output text 2>/dev/null || echo "LambdaOpenShiftCleanup"` + +# Default recipe - show available commands +default: + @echo "OpenShift Resources Cleanup - CDK Deployment" + @echo "" + @echo "Quick Start:" + @echo " just install Install dependencies" + @echo " just deploy Deploy LIVE (scans all regions)" + @echo " just logs Tail CloudWatch logs" + @echo " just invoke-aws Test Lambda execution" + @echo "" + @echo "Deployment (Lambda deploys to {{region}}, configurable via AWS_REGION):" + @echo " just deploy Deploy LIVE (scans all regions)" + @echo " just deploy us-east-2 Deploy LIVE (scans us-east-2 only)" + @echo " just deploy us-east-1,eu-west-1 Deploy LIVE (scans specific regions)" + @echo " just deploy-dry Deploy DRY_RUN (scans all 
regions)" + @echo " just deploy-dry us-east-2 Deploy DRY_RUN (scans us-east-2)" + @echo "" + @echo "Examples:" + @echo " AWS_REGION=us-west-1 just deploy # Deploy Lambda to us-west-1" + @echo " just deploy us-east-2 # Scan only us-east-2" + @echo " just diff Preview infrastructure changes" + @echo " just synth Generate CloudFormation template" + @echo " just destroy Destroy the entire stack" + @echo "" + @echo "Monitoring:" + @echo " just logs Tail CloudWatch logs (follow)" + @echo " just logs-recent Show logs from last hour" + @echo " just invoke-aws Manually invoke Lambda" + @echo " just info Show Lambda configuration" + @echo " just outputs Show stack outputs" + @echo " just params Show stack parameters" + @echo "" + @echo "Testing & Quality:" + @echo " just test Run unit tests" + @echo " just test-coverage Run tests with detailed coverage" + @echo " just lint Run linters" + @echo " just format Format code" + @echo " just ci Full CI pipeline (lint + test + synth)" + @echo "" + @echo "Maintenance:" + @echo " just update-code Fast Lambda code update (no CDK)" + @echo " just upgrade Upgrade all dependencies" + @echo " just versions Show installed versions" + @echo " just clean Clean build artifacts" + @echo " just validate Validate CloudFormation template" + @echo "" + @echo "Run 'just --list' for all commands" + +# Install dependencies +install: + @echo "Installing CDK and Lambda dependencies..." + uv pip install -r requirements.txt + @echo "Installing Lambda dependencies..." + cd lambda && uv pip install -r openshift_resource_cleanup/requirements.txt + +# Bootstrap CDK (first time only) +bootstrap: + @echo "Bootstrapping CDK in {{region}} with profile {{profile}}..." + uv run cdk bootstrap aws://$(aws sts get-caller-identity --profile {{profile}} --query Account --output text)/{{region}} \ + --profile {{profile}} \ + --region {{region}} + +# Synthesize CloudFormation template +synth: bundle-lambda + @echo "Synthesizing CloudFormation template..." + uv run cdk synth --profile {{profile}} --region {{region}} + +# Check and cleanup failed stack state +check-stack: + #!/usr/bin/env bash + set -euo pipefail + STACK_STATUS=$(aws cloudformation describe-stacks --stack-name {{stack_name}} --profile {{profile}} --region {{region}} --query 'Stacks[0].StackStatus' --output text 2>/dev/null || echo "NONE") + if [ "$STACK_STATUS" = "ROLLBACK_COMPLETE" ]; then + echo "[!] Stack in ROLLBACK_COMPLETE state, cleaning up..." + aws logs delete-log-group --log-group-name /aws/lambda/{{lambda_function}} --profile {{profile}} --region {{region}} 2>/dev/null || true + aws cloudformation delete-stack --stack-name {{stack_name}} --profile {{profile}} --region {{region}} + aws cloudformation wait stack-delete-complete --stack-name {{stack_name}} --profile {{profile}} --region {{region}} + echo "[OK] Cleanup complete" + elif [ "$STACK_STATUS" != "NONE" ] && [ "$STACK_STATUS" != "CREATE_COMPLETE" ] && [ "$STACK_STATUS" != "UPDATE_COMPLETE" ]; then + echo "[!] Stack in unexpected state: $STACK_STATUS" + echo "Please check CloudFormation console and clean up manually" + exit 1 + fi + +# Deploy in LIVE mode (default: scan all regions) +deploy SCAN_REGIONS="all": check-stack bundle-lambda + @echo "[!] Deploying Lambda in LIVE mode" + @echo "Lambda deployment region: {{region}}" + @echo "Will scan regions: {{SCAN_REGIONS}}" + @echo "Press Ctrl+C to cancel, or Enter to continue..." 
+ @read _ + uv run cdk deploy \ + --profile {{profile}} \ + --region {{region}} \ + --parameters DryRunMode=false \ + --parameters TargetRegions={{SCAN_REGIONS}} \ + --require-approval never + +# Deploy in DRY_RUN mode for testing +deploy-dry SCAN_REGIONS="all": check-stack bundle-lambda + @echo "Deploying in DRY_RUN mode (safe, logs only)" + @echo "Lambda deployment region: {{region}}" + @echo "Will scan regions: {{SCAN_REGIONS}}" + uv run cdk deploy \ + --profile {{profile}} \ + --region {{region}} \ + --parameters DryRunMode=true \ + --parameters TargetRegions={{SCAN_REGIONS}} \ + --require-approval never + +# Deploy with custom parameters +deploy-custom DRY_RUN="true" THRESHOLD="30" EMAIL="" REGIONS="all": bundle-lambda + @echo "Deploying with custom parameters..." + uv run cdk deploy \ + --profile {{profile}} \ + --region {{region}} \ + --parameters DryRunMode={{DRY_RUN}} \ + --parameters UntaggedThresholdMinutes={{THRESHOLD}} \ + --parameters NotificationEmail={{EMAIL}} \ + --parameters TargetRegions={{REGIONS}} + +# Destroy the stack (cleanup) +destroy: + @echo "[!] WARNING: This will destroy the entire stack!" + @echo "Stack: {{stack_name}}" + @echo "Region: {{region}}" + @echo "Profile: {{profile}}" + @echo "" + @echo "Press Ctrl+C to cancel, or Enter to continue..." + @read _ + uv run cdk destroy --profile {{profile}} --region {{region}} --force + +# Diff against deployed stack +diff: + @echo "Comparing local changes with deployed stack..." + uv run cdk diff --profile {{profile}} --region {{region}} + +# Tail CloudWatch logs +logs: + @echo "Tailing CloudWatch logs for Lambda..." + @aws logs tail /aws/lambda/{{lambda_function}} \ + --follow \ + --format short \ + --profile {{profile}} \ + --region {{region}} 2>/dev/null || echo "Log group not found. Lambda may not be deployed or hasn't been invoked yet. Deploy with: just deploy" + +# Tail recent logs (last hour) +logs-recent: + @echo "Showing logs from last hour..." + @aws logs tail /aws/lambda/{{lambda_function}} \ + --since 1h \ + --format short \ + --profile {{profile}} \ + --region {{region}} 2>/dev/null || echo "Log group not found. Lambda may not be deployed or hasn't been invoked yet. Deploy with: just deploy" + +# Invoke Lambda manually (AWS) +invoke-aws: + @echo "Invoking Lambda in AWS..." + @aws lambda invoke \ + --function-name {{lambda_function}} \ + --profile {{profile}} \ + --region {{region}} \ + --log-type Tail \ + /tmp/lambda-response.json 2>/dev/null && \ + (echo "\nResponse:" && cat /tmp/lambda-response.json | jq '.' && rm /tmp/lambda-response.json) || \ + echo "Lambda not found. Deploy first with: just deploy" + +# Get Lambda function info +info: + @echo "Lambda function information:" + @aws lambda get-function \ + --function-name {{lambda_function}} \ + --profile {{profile}} \ + --region {{region}} \ + --query 'Configuration.{Name:FunctionName,Runtime:Runtime,Memory:MemorySize,Timeout:Timeout,Modified:LastModified,Architecture:Architectures[0]}' \ + --output table 2>/dev/null || echo "Lambda not found. Deploy first with: just deploy" + +# Run unit tests +test: + @echo "Running unit tests..." + PYTHONPATH=lambda:$$PYTHONPATH uv run --python 3.13 --with pytest --with pytest-cov --with 'aws-lambda-powertools[tracer]' --with boto3 --with botocore --with freezegun pytest tests/ -v --cov=openshift_resource_cleanup + +# Run unit tests with detailed coverage report +test-coverage: + @echo "Running unit tests with coverage report..." 
+ PYTHONPATH=lambda:$$PYTHONPATH uv run --python 3.13 --with pytest --with pytest-cov --with 'aws-lambda-powertools[tracer]' --with boto3 --with botocore --with freezegun pytest tests/ -v --cov=openshift_resource_cleanup --cov-report=term-missing + +# Run linting +lint: + @echo "Running linters..." + uv run --with ruff ruff check lambda/openshift_resource_cleanup/ + uv run --with black black --check lambda/openshift_resource_cleanup/ + cd lambda/openshift_resource_cleanup && uv run --with mypy mypy . + +# Format code +format: + @echo "Formatting code..." + uv run --with black black lambda/openshift_resource_cleanup/ + uv run --with ruff ruff check --fix lambda/openshift_resource_cleanup/ + +# Clean build artifacts +clean: + @echo "Cleaning build artifacts..." + rg -g '*.pyc' --files | xargs rm -f || true + rg -g '__pycache__' --files | xargs rm -rf || true + rm -rf cdk.out + rm -rf .pytest_cache + rm -rf tests/.pytest_cache + rm -rf lambda/openshift_resource_cleanup.egg-info + rm -f /tmp/lambda-response.json + @echo "Cleaning bundled Lambda dependencies..." + find lambda -maxdepth 1 -type f -name "*.py" ! -name "__init__.py" -delete || true + find lambda -maxdepth 1 -type d ! -name lambda ! -name openshift_resource_cleanup -exec rm -rf {} + 2>/dev/null || true + +# Full CI pipeline (lint, test, synth) +ci: lint test synth + @echo "[OK] CI pipeline completed successfully" + +# Watch for changes and auto-deploy +watch: + @echo "Watching for changes (auto-deploy on save)..." + uv run cdk watch --profile {{profile}} --region {{region}} + +# Show stack outputs +outputs: + @echo "Stack outputs:" + @aws cloudformation describe-stacks \ + --stack-name {{stack_name}} \ + --profile {{profile}} \ + --region {{region}} \ + --query 'Stacks[0].Outputs' \ + --output table 2>/dev/null || echo "Stack '{{stack_name}}' not found in region {{region}}. Deploy first with: just deploy" + +# Show stack parameters +params: + @echo "Stack parameters:" + @aws cloudformation describe-stacks \ + --stack-name {{stack_name}} \ + --profile {{profile}} \ + --region {{region}} \ + --query 'Stacks[0].Parameters' \ + --output table 2>/dev/null || echo "Stack '{{stack_name}}' not found in region {{region}}. Deploy first with: just deploy" + +# List all Lambda functions +list-lambdas: + @echo "All Lambda functions in {{region}}:" + aws lambda list-functions \ + --profile {{profile}} \ + --region {{region}} \ + --query 'Functions[?starts_with(FunctionName, `Lambda`)].{Name:FunctionName,Runtime:Runtime,Size:CodeSize,Modified:LastModified}' \ + --output table + +# Bundle Lambda with dependencies +bundle-lambda: + @echo "Bundling Lambda with dependencies..." + @echo "Installing dependencies in lambda directory..." + uv pip install -r lambda/openshift_resource_cleanup/requirements.txt --target lambda/ + @echo "[OK] Lambda bundled with dependencies" + +# Update Lambda code only (faster than full deploy) +update-code: bundle-lambda + @echo "Building Lambda package..." + @cd lambda && zip -r /tmp/lambda-code.zip . >/dev/null + @echo "Updating Lambda function code..." + @aws lambda update-function-code \ + --function-name {{lambda_function}} \ + --zip-file fileb:///tmp/lambda-code.zip \ + --profile {{profile}} \ + --region {{region}} >/dev/null 2>&1 && \ + (rm /tmp/lambda-code.zip && echo "[OK] Lambda code updated") || \ + (rm -f /tmp/lambda-code.zip && echo "Lambda not found. Deploy first with: just deploy") + +# Update Lambda environment variables +update-env DRY_RUN="true": + @echo "Updating Lambda environment variables..." 
+ @aws lambda update-function-configuration \ + --function-name {{lambda_function}} \ + --environment "Variables={DRY_RUN={{DRY_RUN}}}" \ + --profile {{profile}} \ + --region {{region}} >/dev/null 2>&1 && \ + echo "[OK] Environment updated to DRY_RUN={{DRY_RUN}}" || \ + echo "Lambda not found. Deploy first with: just deploy" + +# Validate CloudFormation template +validate: + @echo "Validating CloudFormation template..." + uv run cdk synth --profile {{profile}} --region {{region}} > /tmp/template.yaml + aws cloudformation validate-template \ + --template-body file:///tmp/template.yaml \ + --profile {{profile}} \ + --region {{region}} + @rm /tmp/template.yaml + @echo "[OK] Template is valid" + +# Upgrade all dependencies +upgrade: + @echo "Upgrading CDK and Python dependencies..." + uv pip install --upgrade aws-cdk-lib constructs boto3 + @echo "Upgrading dev tools..." + uv pip install --upgrade pytest pytest-cov moto ruff black mypy + @echo "[OK] All dependencies upgraded" + @echo "Run 'just synth' to verify CDK works" + +# Show versions +versions: + @echo "CDK version:" + @uv run cdk --version + @echo "\nPython packages:" + @uv pip list | grep -E "(aws-cdk-lib|constructs|boto3|pytest|ruff|black|mypy)" diff --git a/IaC/cdk/openshift-resources-cleanup/lambda/openshift_resource_cleanup/__init__.py b/IaC/cdk/openshift-resources-cleanup/lambda/openshift_resource_cleanup/__init__.py new file mode 100644 index 0000000000..5d48d651aa --- /dev/null +++ b/IaC/cdk/openshift-resources-cleanup/lambda/openshift_resource_cleanup/__init__.py @@ -0,0 +1,8 @@ +"""OpenShift Cluster Cleanup Lambda for AWS.""" + +from .handler import lambda_handler + +__version__ = "3.0.0" +__description__ = "Automated OpenShift cluster infrastructure cleanup" + +__all__ = ["lambda_handler"] diff --git a/IaC/cdk/openshift-resources-cleanup/lambda/openshift_resource_cleanup/ec2/__init__.py b/IaC/cdk/openshift-resources-cleanup/lambda/openshift_resource_cleanup/ec2/__init__.py new file mode 100644 index 0000000000..037eaffe05 --- /dev/null +++ b/IaC/cdk/openshift-resources-cleanup/lambda/openshift_resource_cleanup/ec2/__init__.py @@ -0,0 +1,7 @@ +"""EC2 operations for OpenShift cluster cleanup.""" + +from .instances import execute_cleanup_action + +__all__ = [ + "execute_cleanup_action", +] diff --git a/IaC/cdk/openshift-resources-cleanup/lambda/openshift_resource_cleanup/ec2/instances.py b/IaC/cdk/openshift-resources-cleanup/lambda/openshift_resource_cleanup/ec2/instances.py new file mode 100644 index 0000000000..9e8ab920d6 --- /dev/null +++ b/IaC/cdk/openshift-resources-cleanup/lambda/openshift_resource_cleanup/ec2/instances.py @@ -0,0 +1,93 @@ +"""EC2 instance operations for OpenShift cluster cleanup.""" + +from __future__ import annotations +import boto3 +from botocore.exceptions import ClientError +from ..models import CleanupAction +from ..models.config import DRY_RUN, OPENSHIFT_CLEANUP_ENABLED +from ..utils import get_logger +from ..openshift.orchestrator import destroy_openshift_cluster +from ..openshift.detection import detect_openshift_infra_id + +logger = get_logger() + + +def execute_cleanup_action(action: CleanupAction, region: str) -> bool: + """Execute OpenShift cluster cleanup action.""" + ec2 = boto3.client("ec2", region_name=region) + + try: + if action.action == "TERMINATE_OPENSHIFT_CLUSTER": + if not action.cluster_name: + logger.error( + "Missing cluster_name for TERMINATE_OPENSHIFT_CLUSTER action", + extra={"instance_id": action.instance_id, "action": action.action}, + ) + return False + + if 
OPENSHIFT_CLEANUP_ENABLED: + cluster_name = action.cluster_name + infra_id = detect_openshift_infra_id(cluster_name, region) + if infra_id: + if DRY_RUN: + logger.info( + "Would TERMINATE_OPENSHIFT_CLUSTER", + extra={ + "dry_run": True, + "cluster_name": cluster_name, + "infra_id": infra_id, + "cluster_type": "openshift", + "region": region, + }, + ) + else: + logger.info( + "TERMINATE_OPENSHIFT_CLUSTER", + extra={ + "cluster_name": cluster_name, + "infra_id": infra_id, + "cluster_type": "openshift", + "region": region, + }, + ) + destroy_openshift_cluster(cluster_name, infra_id, region) + if DRY_RUN: + logger.info( + "Would TERMINATE instance for cluster", + extra={ + "dry_run": True, + "instance_id": action.instance_id, + "cluster_name": cluster_name, + }, + ) + else: + logger.info( + "TERMINATE instance for cluster", + extra={ + "instance_id": action.instance_id, + "cluster_name": cluster_name, + }, + ) + ec2.terminate_instances(InstanceIds=[action.instance_id]) + else: + logger.info( + "OpenShift cleanup disabled", + extra={ + "instance_id": action.instance_id, + "action": "SKIP", + }, + ) + return True + + except ClientError as e: + logger.error( + "Failed to execute cleanup action", + extra={ + "action": action.action, + "instance_id": action.instance_id, + "error": str(e), + }, + ) + return False + + return False diff --git a/IaC/cdk/openshift-resources-cleanup/lambda/openshift_resource_cleanup/handler.py b/IaC/cdk/openshift-resources-cleanup/lambda/openshift_resource_cleanup/handler.py new file mode 100644 index 0000000000..36f21f4fea --- /dev/null +++ b/IaC/cdk/openshift-resources-cleanup/lambda/openshift_resource_cleanup/handler.py @@ -0,0 +1,530 @@ +"""Main Lambda handler for OpenShift cluster cleanup.""" + +from __future__ import annotations +import json +import time +import datetime +import boto3 +from typing import Any + +from aws_lambda_powertools import Tracer, Metrics +from aws_lambda_powertools.metrics import MetricUnit +from aws_lambda_powertools.utilities.typing import LambdaContext + +from .models import CleanupAction +from .models.config import ( + DRY_RUN, + SNS_TOPIC_ARN, + TARGET_REGIONS, + OPENSHIFT_CLEANUP_ENABLED, + OPENSHIFT_BASE_DOMAIN, + LOG_LEVEL, +) +from .utils import convert_tags_to_dict, get_logger +from .ec2 import execute_cleanup_action + +logger = get_logger() +tracer = Tracer(service="openshift-cleanup") +metrics = Metrics(namespace="Percona/OpenShiftCleanup", service="openshift-cleanup") + + +def send_notification(actions: list[CleanupAction], region: str) -> None: + """Send SNS notification about OpenShift cleanup actions.""" + if not SNS_TOPIC_ARN or not actions: + return + + try: + sns = boto3.client("sns") + + message_lines = [ + f"OpenShift Cluster Cleanup Report - {region}", + f"Mode: {'DRY-RUN' if DRY_RUN else 'LIVE'}", + f"Timestamp: {datetime.datetime.now(datetime.timezone.utc).strftime('%Y-%m-%d %H:%M:%S UTC')}", + "", + f"Total Clusters: {len(actions)}", + "", + ] + + for action in actions: + message_lines.append(f"Cluster: {action.cluster_name or 'Unknown'}") + message_lines.append(f" Instance: {action.instance_id}") + message_lines.append(f" Name: {action.name}") + message_lines.append(f" Action: {action.action}") + message_lines.append(f" Reason: {action.reason}") + message_lines.append(f" Billing Tag: {action.billing_tag}") + if action.owner: + message_lines.append(f" Owner: {action.owner}") + message_lines.append("") + + message = "\n".join(message_lines) + subject = f"[{'DRY-RUN' if DRY_RUN else 'LIVE'}] OpenShift Cleanup: 
{len(actions)} clusters in {region}" + + sns.publish( + TopicArn=SNS_TOPIC_ARN, + Subject=subject[:100], + Message=message, + ) + + logger.info( + "Sent SNS notification", + extra={"clusters_count": len(actions), "region": region}, + ) + + except Exception as e: + logger.error(f"Failed to send SNS notification: {e}") + + +def check_cluster_ttl(tags_dict: dict[str, str]) -> tuple[bool, float]: + """Check if cluster TTL has expired. + + Args: + tags_dict: Dictionary of instance tags + + Returns: + Tuple of (should_delete, days_overdue): + - (True, days_overdue) if TTL expired or no TTL tags (unmanaged) + - (False, 0.0) if TTL not expired or malformed (fail-safe) + """ + creation_time_str = tags_dict.get("creation-time") + ttl_hours_str = tags_dict.get("delete-cluster-after-hours") + + # No TTL tags = unmanaged infrastructure, should delete + if not creation_time_str or not ttl_hours_str: + logger.info( + "Cluster has no TTL tags, marking for deletion (unmanaged infrastructure)", + extra={"tags": tags_dict}, + ) + return (True, 0.0) + + try: + # Parse creation time - try Unix timestamp first (real clusters use this) + # Then fall back to ISO format (for compatibility) + try: + # Try parsing as Unix timestamp (e.g., "1761053127") + creation_timestamp = float(creation_time_str) + creation_time = datetime.datetime.fromtimestamp( + creation_timestamp, tz=datetime.timezone.utc + ) + except (ValueError, OSError): + # Fall back to ISO format parsing (e.g., "2025-01-15T05:00:00Z") + creation_time = datetime.datetime.fromisoformat( + creation_time_str.replace("Z", "+00:00") + ) + + # Parse TTL hours + ttl_hours = float(ttl_hours_str) + + # Calculate expiry time + expiry_time = creation_time + datetime.timedelta(hours=ttl_hours) + current_time = datetime.datetime.now(datetime.timezone.utc) + + # Calculate time difference + time_diff = current_time - expiry_time + days_overdue = time_diff.total_seconds() / (24 * 3600) + + if days_overdue >= 0: + # TTL expired + logger.info( + "Cluster TTL expired", + extra={ + "creation_time": creation_time_str, + "ttl_hours": ttl_hours, + "days_overdue": round(days_overdue, 2), + }, + ) + return (True, days_overdue) + else: + # TTL not expired yet + hours_remaining = -days_overdue * 24 + logger.info( + "Cluster TTL not expired, skipping deletion", + extra={ + "creation_time": creation_time_str, + "ttl_hours": ttl_hours, + "hours_remaining": round(hours_remaining, 2), + }, + ) + return (False, 0.0) + + except (ValueError, TypeError) as e: + # Malformed TTL tags - fail-safe: don't delete + logger.warning( + f"Failed to parse TTL tags, skipping deletion (fail-safe): {e}", + extra={ + "creation_time": creation_time_str, + "ttl_hours": ttl_hours_str, + "error": str(e), + }, + ) + return (False, 0.0) + + +def is_openshift_instance(instance: dict, region: str) -> tuple[bool, str | None]: + """Check if instance belongs to an OpenShift cluster (not EKS or other K8s). 
+ + Args: + instance: EC2 instance dictionary + region: AWS region name + + Returns: + Tuple of (is_openshift, infra_id): + - (True, infra_id) if this is an OpenShift instance + - (False, None) if not OpenShift + """ + tags_dict = convert_tags_to_dict(instance.get("Tags", [])) + + # Check 1: Red Hat ROSA specific tag (most reliable) + if tags_dict.get("red-hat-clustertype") == "rosa": + # Extract infra ID from kubernetes.io/cluster tag + for tag in instance.get("Tags", []): + if tag["Key"].startswith("kubernetes.io/cluster/"): + infra_id = tag["Key"].split("/")[-1] + logger.info( + "Detected OpenShift ROSA cluster via red-hat-clustertype tag", + extra={"infra_id": infra_id, "instance_id": instance["InstanceId"]}, + ) + return (True, infra_id) + + # Check 2: Red Hat managed tag + if tags_dict.get("red-hat-managed") == "true": + for tag in instance.get("Tags", []): + if tag["Key"].startswith("kubernetes.io/cluster/"): + infra_id = tag["Key"].split("/")[-1] + logger.info( + "Detected OpenShift cluster via red-hat-managed tag", + extra={"infra_id": infra_id, "instance_id": instance["InstanceId"]}, + ) + return (True, infra_id) + + # Check 3: OpenShift Cluster API tag (not used by EKS) + for tag in instance.get("Tags", []): + if tag["Key"].startswith("sigs.k8s.io/cluster-api-provider-aws/cluster/"): + infra_id = tag["Key"].split("/")[-1] + logger.info( + "Detected OpenShift cluster via cluster-api tag", + extra={"infra_id": infra_id, "instance_id": instance["InstanceId"]}, + ) + return (True, infra_id) + + # Check 4: Instance name pattern (fallback for older detection) + instance_name = tags_dict.get("Name", "") + if "-master-" in instance_name or "openshift" in instance_name.lower(): + # Try to extract cluster name from instance name + # Format: clustername-xxxxx-master-0 + parts = instance_name.split("-") + if len(parts) >= 3: + cluster_name = None + for i, part in enumerate(parts): + if part == "master" and i > 0: + cluster_name = "-".join(parts[: i - 1]) + break + + if cluster_name: + # Verify it's actually OpenShift by checking for infra ID + from .openshift.detection import detect_openshift_infra_id + + infra_id = detect_openshift_infra_id(cluster_name, region) + if infra_id: + logger.info( + "Detected OpenShift cluster via instance name pattern", + extra={ + "infra_id": infra_id, + "cluster_name": cluster_name, + "instance_id": instance["InstanceId"], + }, + ) + return (True, infra_id) + + return (False, None) + + +def extract_cluster_name_from_infra_id(infra_id: str) -> str: + """Extract base cluster name from infra ID. 
+ + Example: jvp-rosa1-qmdkk -> jvp-rosa1 + """ + parts = infra_id.split("-") + if len(parts) >= 2: + # Infra ID format is typically: clustername-randomid + # Return everything except the last part (random ID) + return "-".join(parts[:-1]) + return infra_id + + +@tracer.capture_method +def cleanup_region(region: str, execution_id: str | None = None) -> list[CleanupAction]: + """Process OpenShift cluster cleanup for a single region.""" + start_time = time.time() + logger.info( + "Processing region for OpenShift cleanup", + extra={ + "region": region, + "execution_id": execution_id, + "stage": "region_start", + }, + ) + + ec2 = boto3.client("ec2", region_name=region) + actions = [] + + # Track instance scan statistics + instance_scan_count = 0 + openshift_clusters_found = 0 + + try: + # Scan for running instances to detect OpenShift clusters + response = ec2.describe_instances( + Filters=[{"Name": "instance-state-name", "Values": ["running", "stopped"]}] + ) + + # Track clusters we've already processed (by infra_id) + processed_clusters = set() + + for reservation in response["Reservations"]: + for instance in reservation["Instances"]: + instance_scan_count += 1 + tags_dict = convert_tags_to_dict(instance.get("Tags", [])) + instance_name = tags_dict.get("Name", "") + + # Check if this is an OpenShift instance (not EKS or other K8s) + is_openshift, infra_id = is_openshift_instance(instance, region) + + if is_openshift and infra_id: + # Avoid processing the same cluster multiple times + if infra_id in processed_clusters: + continue + + processed_clusters.add(infra_id) + openshift_clusters_found += 1 + + # Extract cluster name from infra ID + cluster_name = extract_cluster_name_from_infra_id(infra_id) + + # Check TTL before marking for deletion + should_delete, days_overdue = check_cluster_ttl(tags_dict) + + if should_delete: + # Create cleanup action for this cluster + reason = ( + f"OpenShift cluster TTL expired ({days_overdue:.2f} days overdue)" + if days_overdue > 0 + else "OpenShift cluster has no TTL tags (unmanaged infrastructure)" + ) + action = CleanupAction( + instance_id=instance["InstanceId"], + region=region, + name=instance_name, + action="TERMINATE_OPENSHIFT_CLUSTER", + reason=reason, + days_overdue=days_overdue, + billing_tag=tags_dict.get("iit-billing-tag", ""), + cluster_name=cluster_name, + owner=tags_dict.get("owner", None), + ) + actions.append(action) + + # Log scan summary + logger.info( + "OpenShift scan complete", + extra={ + "region": region, + "execution_id": execution_id, + "stage": "scan_complete", + "statistics": { + "instances_scanned": instance_scan_count, + "openshift_clusters_found": openshift_clusters_found, + "actions_count": len(actions), + }, + }, + ) + + # Emit metrics with region dimension + metrics.add_dimension(name="Region", value=region) + metrics.add_metric( + name="InstancesScanned", unit=MetricUnit.Count, value=instance_scan_count + ) + metrics.add_metric( + name="OpenShiftClustersFound", + unit=MetricUnit.Count, + value=openshift_clusters_found, + ) + metrics.add_metric( + name="CleanupActions", unit=MetricUnit.Count, value=len(actions) + ) + + # Execute cleanup actions + for action in actions: + execute_cleanup_action(action, region) + + # Send notification + if actions: + send_notification(actions, region) + + # Region completion with timing + duration = time.time() - start_time + logger.info( + "Region cleanup complete", + extra={ + "region": region, + "execution_id": execution_id, + "stage": "region_complete", + "performance": { + 
"duration_seconds": round(duration, 2), + "instances_per_second": ( + round(instance_scan_count / duration, 2) if duration > 0 else 0 + ), + }, + "summary": { + "instances_scanned": instance_scan_count, + "openshift_clusters_found": openshift_clusters_found, + "total_actions": len(actions), + }, + }, + ) + + except Exception as e: + logger.error(f"Error processing region {region}: {e}") + + return actions + + +@logger.inject_lambda_context +@tracer.capture_lambda_handler +@metrics.log_metrics(capture_cold_start_metric=True) +def lambda_handler(event: dict[str, Any], context: LambdaContext) -> dict[str, Any]: + """Main Lambda handler for OpenShift cleanup.""" + start_time = time.time() + execution_id = context.aws_request_id + + # Log configuration at startup + logger.info( + "OpenShift Cleanup Lambda initialized", + extra={ + "execution_id": execution_id, + "lambda_name": context.function_name, + "lambda_version": context.function_version, + "mode": "DRY_RUN" if DRY_RUN else "LIVE", + "configuration": { + "dry_run": DRY_RUN, + "log_level": LOG_LEVEL, + "cleanup_features": { + "openshift_cleanup_enabled": OPENSHIFT_CLEANUP_ENABLED, + "openshift_base_domain": OPENSHIFT_BASE_DOMAIN, + }, + "notifications": { + "sns_enabled": bool(SNS_TOPIC_ARN), + "sns_topic": SNS_TOPIC_ARN if SNS_TOPIC_ARN else "disabled", + }, + "regions": { + "target_regions": ( + TARGET_REGIONS if TARGET_REGIONS != "all" else "all" + ), + }, + }, + "event": event if event else {}, + }, + ) + + try: + ec2 = boto3.client("ec2") + all_regions = [ + region["RegionName"] for region in ec2.describe_regions()["Regions"] + ] + + # Filter regions based on TARGET_REGIONS parameter + if TARGET_REGIONS and TARGET_REGIONS.lower() != "all": + target_list = [r.strip() for r in TARGET_REGIONS.split(",") if r.strip()] + regions = [r for r in all_regions if r in target_list] + logger.info( + "Target regions configured", + extra={ + "execution_id": execution_id, + "regions": regions, + "regions_count": len(regions), + "filter_type": "specific", + }, + ) + else: + regions = all_regions + logger.info( + "Target regions configured", + extra={ + "execution_id": execution_id, + "regions_count": len(regions), + "filter_type": "all", + }, + ) + + all_actions = [] + regions_processed = [] + regions_with_actions = [] + + for region in regions: + region_actions = cleanup_region(region, execution_id) + all_actions.extend(region_actions) + regions_processed.append(region) + if region_actions: + regions_with_actions.append(region) + + # Calculate summary statistics + total_duration = time.time() - start_time + action_counts: dict[str, int] = {} + + for action in all_actions: + action_counts[action.action] = action_counts.get(action.action, 0) + 1 + + summary = { + "execution_id": execution_id, + "stage": "execution_complete", + "mode": "DRY_RUN" if DRY_RUN else "LIVE", + "performance": { + "total_duration_seconds": round(total_duration, 2), + "regions_per_second": ( + round(len(regions) / total_duration, 2) if total_duration > 0 else 0 + ), + "actions_per_second": ( + round(len(all_actions) / total_duration, 2) + if total_duration > 0 + else 0 + ), + }, + "regions": { + "total_regions": len(regions), + "regions_processed": len(regions_processed), + "regions_with_actions": len(regions_with_actions), + "regions_list": regions_with_actions, + }, + "openshift": { + "total_clusters_found": len(all_actions), + "clusters_by_region": action_counts, + }, + } + + logger.info("OpenShift Cleanup execution complete", extra=summary) + + # Emit summary metrics + 
metrics.add_metric( + name="TotalActions", unit=MetricUnit.Count, value=len(all_actions) + ) + metrics.add_metric( + name="RegionsProcessed", unit=MetricUnit.Count, value=len(regions) + ) + metrics.add_metric( + name="ExecutionDuration", unit=MetricUnit.Seconds, value=total_duration + ) + + return { + "statusCode": 200, + "body": json.dumps( + { + "dry_run": DRY_RUN, + "total_actions": len(all_actions), + "by_action": action_counts, + "actions": [action.to_dict() for action in all_actions], + } + ), + } + + except Exception as e: + logger.error(f"Lambda execution failed: {e}") + raise diff --git a/IaC/cdk/openshift-resources-cleanup/lambda/openshift_resource_cleanup/models/__init__.py b/IaC/cdk/openshift-resources-cleanup/lambda/openshift_resource_cleanup/models/__init__.py new file mode 100644 index 0000000000..d2fd12e016 --- /dev/null +++ b/IaC/cdk/openshift-resources-cleanup/lambda/openshift_resource_cleanup/models/__init__.py @@ -0,0 +1,6 @@ +"""Data models for EC2 cleanup Lambda.""" + +from .cleanup_action import CleanupAction +from .config import Config + +__all__ = ["CleanupAction", "Config"] diff --git a/IaC/cdk/openshift-resources-cleanup/lambda/openshift_resource_cleanup/models/cleanup_action.py b/IaC/cdk/openshift-resources-cleanup/lambda/openshift_resource_cleanup/models/cleanup_action.py new file mode 100644 index 0000000000..665dc311c4 --- /dev/null +++ b/IaC/cdk/openshift-resources-cleanup/lambda/openshift_resource_cleanup/models/cleanup_action.py @@ -0,0 +1,26 @@ +"""CleanupAction data class.""" + +from __future__ import annotations +from dataclasses import dataclass, asdict +from typing import Any + + +@dataclass +class CleanupAction: + """Represents an OpenShift cluster cleanup action.""" + + instance_id: str + region: str + name: str + action: str # TERMINATE_OPENSHIFT_CLUSTER + reason: str + days_overdue: float + billing_tag: str = "" + cluster_name: str | None = None + owner: str | None = None + + def to_dict(self) -> dict[str, Any]: + """Convert to dictionary for JSON serialization.""" + data = asdict(self) + data["days_overdue"] = round(self.days_overdue, 2) + return data diff --git a/IaC/cdk/openshift-resources-cleanup/lambda/openshift_resource_cleanup/models/config.py b/IaC/cdk/openshift-resources-cleanup/lambda/openshift_resource_cleanup/models/config.py new file mode 100644 index 0000000000..887095df44 --- /dev/null +++ b/IaC/cdk/openshift-resources-cleanup/lambda/openshift_resource_cleanup/models/config.py @@ -0,0 +1,31 @@ +"""Configuration from environment variables.""" + +import os + +# Core configuration +DRY_RUN = os.environ.get("DRY_RUN", "true").lower() == "true" +SNS_TOPIC_ARN = os.environ.get("SNS_TOPIC_ARN", "") + +# OpenShift cleanup configuration +OPENSHIFT_CLEANUP_ENABLED = ( + os.environ.get("OPENSHIFT_CLEANUP_ENABLED", "true").lower() == "true" +) +OPENSHIFT_BASE_DOMAIN = os.environ.get("OPENSHIFT_BASE_DOMAIN", "cd.percona.com") + +# Region filtering +TARGET_REGIONS = os.environ.get("TARGET_REGIONS", "all") + +# Logging configuration +LOG_LEVEL = os.environ.get("LOG_LEVEL", "INFO").upper() + + +class Config: + """Configuration singleton for OpenShift cleanup.""" + + def __init__(self): + self.dry_run = DRY_RUN + self.sns_topic_arn = SNS_TOPIC_ARN + self.openshift_cleanup_enabled = OPENSHIFT_CLEANUP_ENABLED + self.openshift_base_domain = OPENSHIFT_BASE_DOMAIN + self.target_regions = TARGET_REGIONS + self.log_level = LOG_LEVEL diff --git a/IaC/cdk/openshift-resources-cleanup/lambda/openshift_resource_cleanup/openshift/__init__.py 
b/IaC/cdk/openshift-resources-cleanup/lambda/openshift_resource_cleanup/openshift/__init__.py new file mode 100644 index 0000000000..ed6a832b25 --- /dev/null +++ b/IaC/cdk/openshift-resources-cleanup/lambda/openshift_resource_cleanup/openshift/__init__.py @@ -0,0 +1,35 @@ +"""OpenShift cluster comprehensive cleanup.""" + +from .detection import detect_openshift_infra_id +from .compute import delete_load_balancers +from .network import ( + delete_nat_gateways, + release_elastic_ips, + cleanup_network_interfaces, + delete_vpc_endpoints, + delete_security_groups, + delete_subnets, + delete_route_tables, + delete_internet_gateway, + delete_vpc, +) +from .dns import cleanup_route53_records +from .storage import cleanup_s3_state +from .orchestrator import destroy_openshift_cluster + +__all__ = [ + "detect_openshift_infra_id", + "delete_load_balancers", + "delete_nat_gateways", + "release_elastic_ips", + "cleanup_network_interfaces", + "delete_vpc_endpoints", + "delete_security_groups", + "delete_subnets", + "delete_route_tables", + "delete_internet_gateway", + "delete_vpc", + "cleanup_route53_records", + "cleanup_s3_state", + "destroy_openshift_cluster", +] diff --git a/IaC/cdk/openshift-resources-cleanup/lambda/openshift_resource_cleanup/openshift/compute.py b/IaC/cdk/openshift-resources-cleanup/lambda/openshift_resource_cleanup/openshift/compute.py new file mode 100644 index 0000000000..f41f9a4c4f --- /dev/null +++ b/IaC/cdk/openshift-resources-cleanup/lambda/openshift_resource_cleanup/openshift/compute.py @@ -0,0 +1,87 @@ +"""OpenShift compute resources (EC2, Load Balancers).""" + +import boto3 +from ..models.config import DRY_RUN +from ..utils import get_logger + +logger = get_logger() + + +def delete_load_balancers(infra_id: str, region: str): + """Delete Classic ELBs and ALB/NLBs for OpenShift cluster.""" + try: + elb = boto3.client("elb", region_name=region) + elbv2 = boto3.client("elbv2", region_name=region) + ec2 = boto3.client("ec2", region_name=region) + + # Get VPC ID for cluster + vpcs = ec2.describe_vpcs( + Filters=[ + {"Name": "tag:kubernetes.io/cluster/" + infra_id, "Values": ["owned"]} + ] + )["Vpcs"] + vpc_id = vpcs[0]["VpcId"] if vpcs else None + + # Delete Classic ELBs + classic_elbs = elb.describe_load_balancers().get("LoadBalancerDescriptions", []) + for lb in classic_elbs: + if infra_id in lb["LoadBalancerName"] or ( + vpc_id and lb.get("VPCId") == vpc_id + ): + if DRY_RUN: + logger.info( + "Would DELETE load_balancer", + extra={ + "dry_run": True, + "load_balancer_name": lb["LoadBalancerName"], + "load_balancer_type": "classic", + "vpc_id": vpc_id, + "infra_id": infra_id, + }, + ) + else: + elb.delete_load_balancer(LoadBalancerName=lb["LoadBalancerName"]) + logger.info( + "DELETE load_balancer", + extra={ + "load_balancer_name": lb["LoadBalancerName"], + "load_balancer_type": "classic", + "vpc_id": vpc_id, + "infra_id": infra_id, + }, + ) + + # Delete ALB/NLBs + alb_nlbs = elbv2.describe_load_balancers().get("LoadBalancers", []) + for lb in alb_nlbs: + if infra_id in lb["LoadBalancerName"] or ( + vpc_id and lb.get("VpcId") == vpc_id + ): + lb_type = lb.get("Type", "unknown") + if DRY_RUN: + logger.info( + "Would DELETE load_balancer", + extra={ + "dry_run": True, + "load_balancer_name": lb["LoadBalancerName"], + "load_balancer_arn": lb["LoadBalancerArn"], + "load_balancer_type": lb_type, + "vpc_id": vpc_id, + "infra_id": infra_id, + }, + ) + else: + elbv2.delete_load_balancer(LoadBalancerArn=lb["LoadBalancerArn"]) + logger.info( + "DELETE load_balancer", + extra={ + 
"load_balancer_name": lb["LoadBalancerName"], + "load_balancer_arn": lb["LoadBalancerArn"], + "load_balancer_type": lb_type, + "vpc_id": vpc_id, + "infra_id": infra_id, + }, + ) + + except Exception as e: + logger.error(f"Error deleting load balancers: {e}") diff --git a/IaC/cdk/openshift-resources-cleanup/lambda/openshift_resource_cleanup/openshift/detection.py b/IaC/cdk/openshift-resources-cleanup/lambda/openshift_resource_cleanup/openshift/detection.py new file mode 100644 index 0000000000..988a24cfde --- /dev/null +++ b/IaC/cdk/openshift-resources-cleanup/lambda/openshift_resource_cleanup/openshift/detection.py @@ -0,0 +1,45 @@ +"""OpenShift cluster detection.""" + +from __future__ import annotations +import boto3 +from ..utils import get_logger + +logger = get_logger() + + +def detect_openshift_infra_id(cluster_name: str, region: str) -> str | None: + """Detect OpenShift infrastructure ID from cluster name.""" + try: + ec2 = boto3.client("ec2", region_name=region) + + # Try exact match first + vpcs = ec2.describe_vpcs( + Filters=[ + {"Name": "tag-key", "Values": [f"kubernetes.io/cluster/{cluster_name}"]} + ] + )["Vpcs"] + + # Try wildcard match if exact doesn't work + if not vpcs: + vpcs = ec2.describe_vpcs( + Filters=[ + { + "Name": "tag-key", + "Values": [f"kubernetes.io/cluster/{cluster_name}-*"], + } + ] + )["Vpcs"] + + if vpcs: + for tag in vpcs[0].get("Tags", []): + if tag["Key"].startswith("kubernetes.io/cluster/"): + infra_id: str = tag["Key"].split("/")[-1] + logger.info( + f"Detected OpenShift infra ID: {infra_id} from cluster: {cluster_name}" + ) + return infra_id + + except Exception as e: + logger.error(f"Error detecting OpenShift infra ID: {e}") + + return None diff --git a/IaC/cdk/openshift-resources-cleanup/lambda/openshift_resource_cleanup/openshift/dns.py b/IaC/cdk/openshift-resources-cleanup/lambda/openshift_resource_cleanup/openshift/dns.py new file mode 100644 index 0000000000..47e8239e37 --- /dev/null +++ b/IaC/cdk/openshift-resources-cleanup/lambda/openshift_resource_cleanup/openshift/dns.py @@ -0,0 +1,82 @@ +"""OpenShift Route53 DNS cleanup.""" + +import boto3 +from ..models.config import DRY_RUN, OPENSHIFT_BASE_DOMAIN +from ..utils import get_logger + +logger = get_logger() + + +def cleanup_route53_records(cluster_name: str, region: str): + """Clean up Route53 DNS records for OpenShift cluster.""" + try: + route53 = boto3.client("route53") + + # Find the hosted zone for the base domain + zones = route53.list_hosted_zones()["HostedZones"] + zone_id = None + for zone in zones: + if zone["Name"].rstrip(".") == OPENSHIFT_BASE_DOMAIN: + zone_id = zone["Id"].split("/")[-1] + break + + if not zone_id: + logger.warning(f"Hosted zone for {OPENSHIFT_BASE_DOMAIN} not found") + return + + # Get all DNS records for this zone + records = route53.list_resource_record_sets(HostedZoneId=zone_id)[ + "ResourceRecordSets" + ] + + # Find records for this cluster + changes = [] + for record in records: + name = record["Name"].rstrip(".") + # Match api.cluster.domain or *.apps.cluster.domain + if ( + f"api.{cluster_name}.{OPENSHIFT_BASE_DOMAIN}" in name + or f"apps.{cluster_name}.{OPENSHIFT_BASE_DOMAIN}" in name + ): + changes.append({"Action": "DELETE", "ResourceRecordSet": record}) + + # Log each DNS record being deleted + for change in changes: + record = change["ResourceRecordSet"] + if DRY_RUN: + logger.info( + "Would DELETE route53_record", + extra={ + "dry_run": True, + "record_name": record["Name"].rstrip("."), + "record_type": record["Type"], + "cluster_name": cluster_name, 
+ "hosted_zone_id": zone_id, + }, + ) + else: + logger.info( + "DELETE route53_record", + extra={ + "record_name": record["Name"].rstrip("."), + "record_type": record["Type"], + "cluster_name": cluster_name, + "hosted_zone_id": zone_id, + }, + ) + + if changes and not DRY_RUN: + route53.change_resource_record_sets( + HostedZoneId=zone_id, ChangeBatch={"Changes": changes} + ) + logger.info( + f"Deleted {len(changes)} Route53 records for {cluster_name}", + extra={ + "hosted_zone_id": zone_id, + "records_deleted": len(changes), + "cluster_name": cluster_name, + }, + ) + + except Exception as e: + logger.error(f"Error cleaning up Route53 records: {e}") diff --git a/IaC/cdk/openshift-resources-cleanup/lambda/openshift_resource_cleanup/openshift/network.py b/IaC/cdk/openshift-resources-cleanup/lambda/openshift_resource_cleanup/openshift/network.py new file mode 100644 index 0000000000..6ba27552ed --- /dev/null +++ b/IaC/cdk/openshift-resources-cleanup/lambda/openshift_resource_cleanup/openshift/network.py @@ -0,0 +1,358 @@ +"""OpenShift network resources cleanup.""" + +import boto3 +from botocore.exceptions import ClientError +from ..models.config import DRY_RUN +from ..utils import get_logger + +logger = get_logger() + + +def delete_nat_gateways(infra_id: str, region: str): + """Delete NAT gateways for OpenShift cluster.""" + try: + ec2 = boto3.client("ec2", region_name=region) + nat_gws = ec2.describe_nat_gateways( + Filters=[ + {"Name": "tag:kubernetes.io/cluster/" + infra_id, "Values": ["owned"]}, + {"Name": "state", "Values": ["available", "pending"]}, + ] + )["NatGateways"] + + for nat in nat_gws: + if DRY_RUN: + logger.info( + "Would DELETE nat_gateway", + extra={ + "dry_run": True, + "nat_gateway_id": nat["NatGatewayId"], + "infra_id": infra_id, + }, + ) + else: + ec2.delete_nat_gateway(NatGatewayId=nat["NatGatewayId"]) + logger.info( + "DELETE nat_gateway", + extra={"nat_gateway_id": nat["NatGatewayId"], "infra_id": infra_id}, + ) + + except Exception as e: + logger.error("Error deleting NAT gateways", extra={"error": str(e)}) + + +def release_elastic_ips(infra_id: str, region: str): + """Release Elastic IPs for OpenShift cluster.""" + try: + ec2 = boto3.client("ec2", region_name=region) + eips = ec2.describe_addresses( + Filters=[ + {"Name": "tag:kubernetes.io/cluster/" + infra_id, "Values": ["owned"]} + ] + )["Addresses"] + + for eip in eips: + if "AllocationId" in eip: + if DRY_RUN: + logger.info( + "Would DELETE elastic_ip", + extra={ + "dry_run": True, + "allocation_id": eip["AllocationId"], + "infra_id": infra_id, + }, + ) + else: + try: + ec2.release_address(AllocationId=eip["AllocationId"]) + logger.info( + "DELETE elastic_ip", + extra={ + "allocation_id": eip["AllocationId"], + "infra_id": infra_id, + }, + ) + except ClientError: + pass # May already be released + + except Exception as e: + logger.error("Error releasing EIPs", extra={"error": str(e)}) + + +def cleanup_network_interfaces(vpc_id: str, region: str): + """Clean up orphaned network interfaces.""" + try: + ec2 = boto3.client("ec2", region_name=region) + enis = ec2.describe_network_interfaces( + Filters=[ + {"Name": "vpc-id", "Values": [vpc_id]}, + {"Name": "status", "Values": ["available"]}, + ] + )["NetworkInterfaces"] + + for eni in enis: + if DRY_RUN: + logger.info( + "Would DELETE network_interface", + extra={ + "dry_run": True, + "network_interface_id": eni["NetworkInterfaceId"], + "vpc_id": vpc_id, + }, + ) + else: + try: + ec2.delete_network_interface( + NetworkInterfaceId=eni["NetworkInterfaceId"] + ) + 
logger.info( + "DELETE network_interface", + extra={ + "network_interface_id": eni["NetworkInterfaceId"], + "vpc_id": vpc_id, + }, + ) + except ClientError: + pass # May already be deleted + + except Exception as e: + logger.error("Error cleaning up ENIs", extra={"error": str(e)}) + + +def delete_vpc_endpoints(vpc_id: str, region: str): + """Delete VPC endpoints.""" + try: + ec2 = boto3.client("ec2", region_name=region) + endpoints = ec2.describe_vpc_endpoints( + Filters=[{"Name": "vpc-id", "Values": [vpc_id]}] + )["VpcEndpoints"] + + for endpoint in endpoints: + if DRY_RUN: + logger.info( + "Would DELETE vpc_endpoint", + extra={ + "dry_run": True, + "vpc_endpoint_id": endpoint["VpcEndpointId"], + "vpc_id": vpc_id, + }, + ) + else: + try: + ec2.delete_vpc_endpoints(VpcEndpointIds=[endpoint["VpcEndpointId"]]) + logger.info( + "DELETE vpc_endpoint", + extra={ + "vpc_endpoint_id": endpoint["VpcEndpointId"], + "vpc_id": vpc_id, + }, + ) + except ClientError: + pass + + except Exception as e: + logger.error("Error deleting VPC endpoints", extra={"error": str(e)}) + + +def delete_security_groups(vpc_id: str, region: str): + """Delete security groups with dependency handling.""" + try: + ec2 = boto3.client("ec2", region_name=region) + sgs = ec2.describe_security_groups( + Filters=[{"Name": "vpc-id", "Values": [vpc_id]}] + )["SecurityGroups"] + + # First pass: remove all ingress rules to break circular dependencies + for sg in sgs: + if sg["GroupName"] == "default": + continue + try: + if sg.get("IpPermissions"): + if not DRY_RUN: + ec2.revoke_security_group_ingress( + GroupId=sg["GroupId"], IpPermissions=sg["IpPermissions"] + ) + except ClientError: + pass + + # Second pass: delete security groups + for sg in sgs: + if sg["GroupName"] == "default": + continue + if DRY_RUN: + logger.info( + "Would DELETE security_group", + extra={ + "dry_run": True, + "security_group_id": sg["GroupId"], + "vpc_id": vpc_id, + }, + ) + else: + try: + ec2.delete_security_group(GroupId=sg["GroupId"]) + logger.info( + "DELETE security_group", + extra={"security_group_id": sg["GroupId"], "vpc_id": vpc_id}, + ) + except ClientError: + pass + + except Exception as e: + logger.error("Error deleting security groups", extra={"error": str(e)}) + + +def delete_subnets(vpc_id: str, region: str): + """Delete subnets.""" + try: + ec2 = boto3.client("ec2", region_name=region) + subnets = ec2.describe_subnets( + Filters=[{"Name": "vpc-id", "Values": [vpc_id]}] + )["Subnets"] + + for subnet in subnets: + if DRY_RUN: + logger.info( + "Would DELETE subnet", + extra={ + "dry_run": True, + "subnet_id": subnet["SubnetId"], + "vpc_id": vpc_id, + }, + ) + else: + try: + ec2.delete_subnet(SubnetId=subnet["SubnetId"]) + logger.info( + "DELETE subnet", + extra={"subnet_id": subnet["SubnetId"], "vpc_id": vpc_id}, + ) + except ClientError: + pass + + except Exception as e: + logger.error("Error deleting subnets", extra={"error": str(e)}) + + +def delete_route_tables(vpc_id: str, region: str): + """Delete route tables.""" + try: + ec2 = boto3.client("ec2", region_name=region) + rts = ec2.describe_route_tables( + Filters=[{"Name": "vpc-id", "Values": [vpc_id]}] + )["RouteTables"] + + for rt in rts: + # Skip main route table + is_main = any( + assoc.get("Main", False) for assoc in rt.get("Associations", []) + ) + if is_main: + continue + + if DRY_RUN: + logger.info( + "Would DELETE route_table", + extra={ + "dry_run": True, + "route_table_id": rt["RouteTableId"], + "vpc_id": vpc_id, + }, + ) + else: + try: + 
ec2.delete_route_table(RouteTableId=rt["RouteTableId"]) + logger.info( + "DELETE route_table", + extra={"route_table_id": rt["RouteTableId"], "vpc_id": vpc_id}, + ) + except ClientError: + pass + + except Exception as e: + logger.error("Error deleting route tables", extra={"error": str(e)}) + + +def delete_internet_gateway(vpc_id: str, region: str): + """Detach and delete internet gateway.""" + try: + ec2 = boto3.client("ec2", region_name=region) + igws = ec2.describe_internet_gateways( + Filters=[{"Name": "attachment.vpc-id", "Values": [vpc_id]}] + )["InternetGateways"] + + for igw in igws: + if DRY_RUN: + logger.info( + "Would DELETE internet_gateway", + extra={ + "dry_run": True, + "internet_gateway_id": igw["InternetGatewayId"], + "vpc_id": vpc_id, + }, + ) + else: + try: + ec2.detach_internet_gateway( + InternetGatewayId=igw["InternetGatewayId"], VpcId=vpc_id + ) + ec2.delete_internet_gateway( + InternetGatewayId=igw["InternetGatewayId"] + ) + logger.info( + "DELETE internet_gateway", + extra={ + "internet_gateway_id": igw["InternetGatewayId"], + "vpc_id": vpc_id, + }, + ) + except ClientError: + pass + + except Exception as e: + logger.error("Error deleting IGW", extra={"error": str(e)}) + + +def delete_vpc(vpc_id: str, region: str) -> bool: + """ + Delete VPC. + + Returns: + True if VPC was deleted successfully + False if VPC still has dependencies + """ + try: + ec2 = boto3.client("ec2", region_name=region) + if DRY_RUN: + logger.info( + "Would DELETE vpc", + extra={"dry_run": True, "vpc_id": vpc_id, "region": region}, + ) + return True # In DRY_RUN, assume success + else: + try: + ec2.delete_vpc(VpcId=vpc_id) + logger.info("DELETE vpc", extra={"vpc_id": vpc_id, "region": region}) + return True + except ClientError as e: + error_code = e.response.get("Error", {}).get("Code", "") + if error_code == "DependencyViolation": + logger.info( + "VPC still has dependencies, cannot delete yet", + extra={"vpc_id": vpc_id, "error_code": error_code}, + ) + return False + else: + # Other errors (permissions, etc.) should be logged + logger.error( + "Error deleting VPC", + extra={ + "vpc_id": vpc_id, + "error": str(e), + "error_code": error_code, + }, + ) + return False + + except Exception as e: + logger.error("Unexpected error deleting VPC", extra={"error": str(e)}) + return False diff --git a/IaC/cdk/openshift-resources-cleanup/lambda/openshift_resource_cleanup/openshift/orchestrator.py b/IaC/cdk/openshift-resources-cleanup/lambda/openshift_resource_cleanup/openshift/orchestrator.py new file mode 100644 index 0000000000..74f77e4f53 --- /dev/null +++ b/IaC/cdk/openshift-resources-cleanup/lambda/openshift_resource_cleanup/openshift/orchestrator.py @@ -0,0 +1,136 @@ +"""OpenShift cluster destruction orchestration. + +Single-pass cleanup with dependency order enforcement. +EventBridge schedule (every 15 minutes) handles retries naturally. +""" + +import boto3 +from botocore.exceptions import ClientError +from ..utils import get_logger +from .compute import delete_load_balancers +from .network import ( + delete_nat_gateways, + release_elastic_ips, + cleanup_network_interfaces, + delete_vpc_endpoints, + delete_security_groups, + delete_subnets, + delete_route_tables, + delete_internet_gateway, + delete_vpc, +) +from .dns import cleanup_route53_records +from .storage import cleanup_s3_state + +logger = get_logger() + + +def destroy_openshift_cluster(cluster_name: str, infra_id: str, region: str) -> bool: + """ + Single-pass OpenShift cluster cleanup. + + Deletes resources in dependency order. 
If resources still have dependencies, + exits gracefully and relies on next EventBridge schedule (15min) to retry. + + Returns: + True if VPC successfully deleted (cleanup complete) + False if resources remain (will retry on next schedule) + """ + logger.info( + "Starting OpenShift cluster cleanup", + extra={ + "cluster_name": cluster_name, + "infra_id": infra_id, + "cluster_type": "openshift", + "region": region, + }, + ) + + try: + ec2 = boto3.client("ec2", region_name=region) + + # Check if VPC still exists + vpcs = ec2.describe_vpcs( + Filters=[ + { + "Name": "tag:kubernetes.io/cluster/" + infra_id, + "Values": ["owned"], + } + ] + )["Vpcs"] + + if not vpcs: + logger.info( + "VPC not found - cleanup complete", + extra={"cluster_name": cluster_name, "infra_id": infra_id}, + ) + # Clean up Route53 and S3 when VPC is gone + cleanup_route53_records(cluster_name, region) + cleanup_s3_state(cluster_name, region) + return True + + vpc_id = vpcs[0]["VpcId"] + logger.info( + "Found VPC, proceeding with cleanup", + extra={"cluster_name": cluster_name, "vpc_id": vpc_id}, + ) + + # Delete resources in dependency order + # Each function handles its own DependencyViolation errors gracefully + delete_load_balancers(infra_id, region) + delete_nat_gateways(infra_id, region) + release_elastic_ips(infra_id, region) + cleanup_network_interfaces(vpc_id, region) + delete_vpc_endpoints(vpc_id, region) + delete_security_groups(vpc_id, region) + delete_subnets(vpc_id, region) + delete_route_tables(vpc_id, region) + delete_internet_gateway(vpc_id, region) + + # Try to delete VPC - if it fails due to dependencies, we'll retry on next run + vpc_deleted = delete_vpc(vpc_id, region) + + if vpc_deleted: + logger.info( + "Successfully deleted VPC", + extra={"cluster_name": cluster_name, "vpc_id": vpc_id}, + ) + # Clean up Route53 and S3 when VPC is successfully deleted + cleanup_route53_records(cluster_name, region) + cleanup_s3_state(cluster_name, region) + return True + else: + logger.info( + "VPC still has dependencies, will retry on next schedule", + extra={ + "cluster_name": cluster_name, + "vpc_id": vpc_id, + "retry_interval_minutes": 15, + }, + ) + return False + + except ClientError as e: + error_code = e.response.get("Error", {}).get("Code", "") + if error_code == "DependencyViolation": + logger.info( + "Dependencies remain, will retry on next schedule", + extra={"cluster_name": cluster_name, "error_code": error_code}, + ) + return False + else: + logger.error( + "Error during OpenShift cleanup", + extra={ + "cluster_name": cluster_name, + "error": str(e), + "error_code": error_code, + }, + ) + raise + except Exception as e: + logger.error( + "Unexpected error during OpenShift cleanup", + extra={"cluster_name": cluster_name, "error": str(e)}, + ) + raise diff --git a/IaC/cdk/openshift-resources-cleanup/lambda/openshift_resource_cleanup/openshift/storage.py b/IaC/cdk/openshift-resources-cleanup/lambda/openshift_resource_cleanup/openshift/storage.py new file mode 100644 index 0000000000..6bca1a3f16 --- /dev/null +++ b/IaC/cdk/openshift-resources-cleanup/lambda/openshift_resource_cleanup/openshift/storage.py @@ -0,0 +1,75 @@ +"""OpenShift S3 state storage cleanup.""" + +import boto3 +from botocore.exceptions import ClientError +from ..models.config import DRY_RUN +from ..utils import get_logger + +logger = get_logger() + + +def cleanup_s3_state(cluster_name: str, region: str): + """Clean up S3 state bucket for OpenShift cluster.""" + try: + s3 = boto3.client("s3", region_name=region) + sts = boto3.client("sts") 
+ + # Determine S3 bucket name (standard naming convention) + account_id = sts.get_caller_identity()["Account"] + bucket_name = f"openshift-clusters-{account_id}-{region}" + + try: + # List objects with cluster name prefix + objects = s3.list_objects_v2(Bucket=bucket_name, Prefix=f"{cluster_name}/") + + if "Contents" in objects: + object_count = len(objects["Contents"]) + if DRY_RUN: + logger.info( + f"[DRY-RUN] Would delete {object_count} S3 objects for {cluster_name}", + extra={ + "dry_run": True, + "bucket_name": bucket_name, + "prefix": f"{cluster_name}/", + "object_count": object_count, + }, + ) + # Log each object that would be deleted + for obj in objects["Contents"]: + logger.info( + "Would DELETE s3_object", + extra={ + "dry_run": True, + "bucket_name": bucket_name, + "object_key": obj["Key"], + "cluster_name": cluster_name, + }, + ) + else: + # Delete and log each object + for obj in objects["Contents"]: + s3.delete_object(Bucket=bucket_name, Key=obj["Key"]) + logger.info( + "DELETE s3_object", + extra={ + "bucket_name": bucket_name, + "object_key": obj["Key"], + "cluster_name": cluster_name, + }, + ) + logger.info( + f"Deleted S3 state for {cluster_name}", + extra={ + "bucket_name": bucket_name, + "objects_deleted": object_count, + "cluster_name": cluster_name, + }, + ) + except ClientError as e: + if "NoSuchBucket" in str(e): + logger.info(f"S3 bucket {bucket_name} does not exist") + else: + raise + + except Exception as e: + logger.error(f"Error cleaning up S3 state: {e}") diff --git a/IaC/cdk/openshift-resources-cleanup/lambda/openshift_resource_cleanup/requirements.txt b/IaC/cdk/openshift-resources-cleanup/lambda/openshift_resource_cleanup/requirements.txt new file mode 100644 index 0000000000..85c16d7a50 --- /dev/null +++ b/IaC/cdk/openshift-resources-cleanup/lambda/openshift_resource_cleanup/requirements.txt @@ -0,0 +1,6 @@ +# AWS SDK (included in Lambda runtime, but specified for local development) +boto3>=1.40.53 +botocore>=1.40.53 + +# AWS Lambda Powertools for observability +aws-lambda-powertools[tracer]>=3.3.0 diff --git a/IaC/cdk/openshift-resources-cleanup/lambda/openshift_resource_cleanup/utils/__init__.py b/IaC/cdk/openshift-resources-cleanup/lambda/openshift_resource_cleanup/utils/__init__.py new file mode 100644 index 0000000000..dbb464a4eb --- /dev/null +++ b/IaC/cdk/openshift-resources-cleanup/lambda/openshift_resource_cleanup/utils/__init__.py @@ -0,0 +1,15 @@ +"""Utility functions for EC2 cleanup Lambda.""" + +from .aws_helpers import ( + convert_tags_to_dict, + has_valid_billing_tag, + extract_cluster_name, +) +from .logging_config import get_logger + +__all__ = [ + "convert_tags_to_dict", + "has_valid_billing_tag", + "extract_cluster_name", + "get_logger", +] diff --git a/IaC/cdk/openshift-resources-cleanup/lambda/openshift_resource_cleanup/utils/aws_helpers.py b/IaC/cdk/openshift-resources-cleanup/lambda/openshift_resource_cleanup/utils/aws_helpers.py new file mode 100644 index 0000000000..77006c0fc7 --- /dev/null +++ b/IaC/cdk/openshift-resources-cleanup/lambda/openshift_resource_cleanup/utils/aws_helpers.py @@ -0,0 +1,64 @@ +"""AWS helper functions.""" + +from __future__ import annotations +import datetime +from typing import Any +from .logging_config import get_logger + +logger = get_logger() + + +def convert_tags_to_dict(tags: list[dict[str, str]] | None) -> dict[str, str]: + """Convert AWS tag list to dictionary.""" + return {tag["Key"]: tag["Value"] for tag in tags} if tags else {} + + +def has_valid_billing_tag( + tags_dict: dict[str, str], 
instance_launch_time: Any = None +) -> bool: + """ + Check if instance has a valid iit-billing-tag. + + For regular instances: any non-empty value is valid + For timestamp-based tags: check if Unix timestamp is in the future + """ + if "iit-billing-tag" not in tags_dict: + return False + + tag_value = tags_dict["iit-billing-tag"] + + # Empty tag is invalid + if not tag_value: + return False + + # Try to parse as Unix timestamp (for EKS auto-expiration) + try: + expiration_timestamp = int(tag_value) + current_timestamp = int( + datetime.datetime.now(datetime.timezone.utc).timestamp() + ) + + # If it's a valid future timestamp, check if it's expired + if expiration_timestamp > current_timestamp: + return True + else: + logger.debug( + "Billing tag expired", + extra={ + "expiration_timestamp": expiration_timestamp, + "current_timestamp": current_timestamp, + "expired_seconds_ago": current_timestamp - expiration_timestamp, + }, + ) + return False + except ValueError: + # Not a timestamp, treat as category string (e.g., "pmm-staging", "CirrusCI") + return True + + +def extract_cluster_name(tags_dict: dict[str, str]) -> str | None: + """Extract cluster name from kubernetes tags.""" + for key in tags_dict.keys(): + if key.startswith("kubernetes.io/cluster/"): + return key.split("/")[-1] + return tags_dict.get("aws:eks:cluster-name") diff --git a/IaC/cdk/openshift-resources-cleanup/lambda/openshift_resource_cleanup/utils/logging_config.py b/IaC/cdk/openshift-resources-cleanup/lambda/openshift_resource_cleanup/utils/logging_config.py new file mode 100644 index 0000000000..9e359ed2c9 --- /dev/null +++ b/IaC/cdk/openshift-resources-cleanup/lambda/openshift_resource_cleanup/utils/logging_config.py @@ -0,0 +1,26 @@ +"""Logging configuration using AWS Lambda Powertools.""" + +import os + +from aws_lambda_powertools import Logger + +# Read log level from environment (default to INFO) +LOG_LEVEL = os.environ.get("LOG_LEVEL", "INFO").upper() + +# Set up Powertools logger with service name +# This provides structured logging with automatic Lambda context injection +logger = Logger( + service="aws-resource-cleanup", + level=LOG_LEVEL, +) + + +def get_logger(): + """Get the configured logger instance. + + Returns Powertools Logger with: + - Structured JSON logging + - Automatic Lambda context (request_id, function_name, etc.) 
+ - CloudWatch Logs Insights ready + """ + return logger diff --git a/IaC/cdk/openshift-resources-cleanup/mypy.ini b/IaC/cdk/openshift-resources-cleanup/mypy.ini new file mode 100644 index 0000000000..668df7540e --- /dev/null +++ b/IaC/cdk/openshift-resources-cleanup/mypy.ini @@ -0,0 +1,15 @@ +[mypy] +python_version = 3.12 +warn_return_any = True +warn_unused_configs = True +disallow_untyped_defs = False +ignore_missing_imports = True +exclude = (?x)( + ^lambda/(?!openshift_resource_cleanup).*\.py$ # Exclude bundled dependencies + ) + +[mypy-boto3.*] +ignore_missing_imports = True + +[mypy-botocore.*] +ignore_missing_imports = True diff --git a/IaC/cdk/openshift-resources-cleanup/requirements.txt b/IaC/cdk/openshift-resources-cleanup/requirements.txt new file mode 100644 index 0000000000..80c400f37b --- /dev/null +++ b/IaC/cdk/openshift-resources-cleanup/requirements.txt @@ -0,0 +1,20 @@ +# CDK core and constructs +aws-cdk-lib>=2.220.0 +constructs>=10.4.2 + +# Python requirements +boto3>=1.40.53 + +# Testing +pytest>=8.4.2 +pytest-cov>=7.0.0 +moto>=5.1.14 +freezegun>=1.5.1 + +# Linting and formatting +ruff>=0.14.0 +black>=25.9.0 +mypy>=1.18.2 + +# AWS CLI and CDK CLI +awscli>=1.42.53 diff --git a/IaC/cdk/openshift-resources-cleanup/stacks/__init__.py b/IaC/cdk/openshift-resources-cleanup/stacks/__init__.py new file mode 100644 index 0000000000..11b6ce5901 --- /dev/null +++ b/IaC/cdk/openshift-resources-cleanup/stacks/__init__.py @@ -0,0 +1,5 @@ +"""CDK stacks for AWS resource cleanup.""" + +from .resource_cleanup_stack import ResourceCleanupStack + +__all__ = ['ResourceCleanupStack'] diff --git a/IaC/cdk/openshift-resources-cleanup/stacks/resource_cleanup_stack.py b/IaC/cdk/openshift-resources-cleanup/stacks/resource_cleanup_stack.py new file mode 100644 index 0000000000..2a7bd89d2d --- /dev/null +++ b/IaC/cdk/openshift-resources-cleanup/stacks/resource_cleanup_stack.py @@ -0,0 +1,314 @@ +"""CDK Stack for OpenShift Cluster Cleanup Lambda.""" + +from aws_cdk import ( + Stack, + Duration, + aws_lambda as lambda_, + aws_iam as iam, + aws_sns as sns, + aws_sns_subscriptions as subscriptions, + aws_events as events, + aws_events_targets as targets, + aws_logs as logs, + aws_cloudwatch as cloudwatch, + aws_cloudwatch_actions as cw_actions, + CfnParameter, + CfnOutput, + Tags +) +from constructs import Construct + + +class ResourceCleanupStack(Stack): + """ + CDK Stack for OpenShift cluster infrastructure cleanup. + + Manages OpenShift cluster cleanup including VPC, ELB, Route53, and S3 resources with: + - Comprehensive resource discovery and dependency-ordered deletion + - Configurable dry-run mode for safe testing + - SNS notifications for cleanup actions + - Multi-region support + """ + + def __init__(self, scope: Construct, construct_id: str, **kwargs) -> None: + super().__init__(scope, construct_id, **kwargs) + + # Resource naming (single source of truth) + lambda_function_name = "LambdaOpenShiftCleanup" + iam_role_name = "RoleOpenShiftCleanup" + sns_topic_name = "OpenShiftCleanupNotifications" + schedule_rule_name = "OpenShiftCleanupSchedule" + alarm_prefix = "OpenShiftCleanup" + + # Parameters + dry_run_param = CfnParameter( + self, "DryRunMode", + type="String", + default="true", + allowed_values=["true", "false"], + description="[SAFETY] Safe mode - logs all actions without executing them. Set to 'false' only when ready for actual resource deletion. Always test with 'true' first." 
+ ) + + notification_email_param = CfnParameter( + self, "NotificationEmail", + type="String", + default="", + description="[NOTIFICATIONS] Email address for cleanup action reports. Leave empty to disable SNS notifications. Subscribe to SNS topic manually after deployment." + ) + + openshift_cleanup_param = CfnParameter( + self, "OpenShiftCleanupEnabled", + type="String", + default="true", + allowed_values=["true", "false"], + description="[OPENSHIFT] Enable comprehensive OpenShift cluster cleanup including VPC, load balancers, Route53 DNS, and S3 buckets." + ) + + openshift_domain_param = CfnParameter( + self, "OpenShiftBaseDomain", + type="String", + default="cd.percona.com", + description="[OPENSHIFT] Base domain for Route53 DNS record cleanup. Only records under this domain will be removed. Must match your OpenShift installation domain." + ) + + # Scheduling + schedule_rate_param = CfnParameter( + self, "ScheduleRateMinutes", + type="Number", + default=15, + description="[SCHEDULING] Execution frequency in minutes. Lambda scans all target regions at this interval. Recommended: 15 for normal use, 5 for aggressive cleanup, 60 for light monitoring." + ) + + # Region Filter + regions_param = CfnParameter( + self, "TargetRegions", + type="String", + default="all", + description="Regions to scan for clusters. Use 'all' (default) or comma-separated list (e.g., 'us-east-1,eu-west-1')." + ) + + # Logging + log_retention_param = CfnParameter( + self, "LogRetentionDays", + type="Number", + default=30, + description="[LOGGING] CloudWatch log retention period in days. Valid options: 1, 3, 7, 14, 30, 60, 90, 120, 180. Affects storage costs - longer retention = higher costs." + ) + + log_level_param = CfnParameter( + self, "LogLevel", + type="String", + default="INFO", + allowed_values=["DEBUG", "INFO", "WARNING", "ERROR"], + description="[LOGGING] Log verbosity. DEBUG = detailed, INFO = standard (actions + summaries), WARNING = issues only, ERROR = failures only." 
+ ) + + # SNS Topic for notifications + sns_topic = sns.Topic( + self, "CleanupNotificationTopic", + topic_name=sns_topic_name, + display_name="OpenShift Cluster Cleanup Notifications" + ) + + Tags.of(sns_topic).add("iit-billing-tag", "openshift-cleanup") + + # IAM Role for Lambda + lambda_role = iam.Role( + self, "ResourceCleanupRole", + role_name=iam_role_name, + assumed_by=iam.ServicePrincipal("lambda.amazonaws.com"), + managed_policies=[ + iam.ManagedPolicy.from_aws_managed_policy_name( + "service-role/AWSLambdaBasicExecutionRole" + ) + ] + ) + + Tags.of(lambda_role).add("iit-billing-tag", "openshift-cleanup") + + # IAM Policy for Lambda (OpenShift-only permissions) + lambda_role.add_to_policy(iam.PolicyStatement( + effect=iam.Effect.ALLOW, + actions=[ + # EC2 - Basic operations + "ec2:DescribeRegions", + "ec2:DescribeInstances", + "ec2:TerminateInstances", + + # EC2 - VPC and network cleanup + "ec2:DescribeSecurityGroups", + "ec2:RevokeSecurityGroupIngress", + "ec2:DeleteSecurityGroup", + "ec2:DescribeVpcs", + "ec2:DeleteVpc", + "ec2:DescribeSubnets", + "ec2:DeleteSubnet", + "ec2:DescribeInternetGateways", + "ec2:DetachInternetGateway", + "ec2:DeleteInternetGateway", + "ec2:DescribeNatGateways", + "ec2:DeleteNatGateway", + "ec2:DescribeAddresses", + "ec2:ReleaseAddress", + "ec2:DescribeNetworkInterfaces", + "ec2:DeleteNetworkInterface", + "ec2:DescribeVpcEndpoints", + "ec2:DeleteVpcEndpoints", + "ec2:DescribeRouteTables", + "ec2:DisassociateRouteTable", + "ec2:DeleteRoute", + "ec2:DeleteRouteTable", + + # ELB - Load balancer cleanup + "elasticloadbalancing:DescribeLoadBalancers", + "elasticloadbalancing:DeleteLoadBalancer", + "elasticloadbalancing:DescribeTargetGroups", + "elasticloadbalancing:DeleteTargetGroup", + + # Route53 - DNS cleanup + "route53:ListHostedZones", + "route53:ListResourceRecordSets", + "route53:ChangeResourceRecordSets", + "route53:GetChange", + + # S3 - State bucket cleanup + "s3:ListBucket", + "s3:DeleteObject", + "s3:DeleteObjectVersion", + "s3:GetBucketLocation", + + # STS - Identity verification + "sts:GetCallerIdentity" + ], + resources=["*"] + )) + + # SNS publish permission + lambda_role.add_to_policy(iam.PolicyStatement( + effect=iam.Effect.ALLOW, + actions=["sns:Publish"], + resources=[sns_topic.topic_arn] + )) + + # Map log retention parameter to CDK enum + log_retention_mapping = { + 1: logs.RetentionDays.ONE_DAY, + 3: logs.RetentionDays.THREE_DAYS, + 7: logs.RetentionDays.ONE_WEEK, + 14: logs.RetentionDays.TWO_WEEKS, + 30: logs.RetentionDays.ONE_MONTH, + 60: logs.RetentionDays.TWO_MONTHS, + 90: logs.RetentionDays.THREE_MONTHS, + 120: logs.RetentionDays.FOUR_MONTHS, + 180: logs.RetentionDays.SIX_MONTHS, + } + + # Create log group with retention + log_group = logs.LogGroup( + self, "ResourceCleanupLogGroup", + log_group_name=f"/aws/lambda/{lambda_function_name}", + retention=log_retention_mapping.get( + log_retention_param.value_as_number, + logs.RetentionDays.ONE_MONTH + ), + ) + + # Lambda Function + cleanup_lambda = lambda_.Function( + self, "ResourceCleanupLambda", + function_name=lambda_function_name, + description="OpenShift cluster cleanup: VPC, ELB, Route53, S3, and EC2 instances", + runtime=lambda_.Runtime.PYTHON_3_13, + architecture=lambda_.Architecture.ARM_64, + handler="openshift_resource_cleanup.handler.lambda_handler", + code=lambda_.Code.from_asset("lambda"), + role=lambda_role, + timeout=Duration.seconds(600), + memory_size=1024, + reserved_concurrent_executions=1, + log_group=log_group, + environment={ + "DRY_RUN": 
dry_run_param.value_as_string, + "SNS_TOPIC_ARN": sns_topic.topic_arn, + "OPENSHIFT_CLEANUP_ENABLED": openshift_cleanup_param.value_as_string, + "OPENSHIFT_BASE_DOMAIN": openshift_domain_param.value_as_string, + "TARGET_REGIONS": regions_param.value_as_string, + "LOG_LEVEL": log_level_param.value_as_string + } + ) + + Tags.of(cleanup_lambda).add("iit-billing-tag", "openshift-cleanup") + + # EventBridge Rule (configurable schedule) + schedule_rule = events.Rule( + self, "CleanupScheduleRule", + rule_name=schedule_rule_name, + description=f"Executes every {schedule_rate_param.value_as_number} minutes for OpenShift cluster cleanup", + schedule=events.Schedule.rate(Duration.minutes(schedule_rate_param.value_as_number)), + enabled=True + ) + + # Add target with retry policy for failed invocations + schedule_rule.add_target(targets.LambdaFunction( + cleanup_lambda, + retry_attempts=2, + max_event_age=Duration.hours(1) + )) + + # CloudWatch Alarms for monitoring + lambda_errors_alarm = cloudwatch.Alarm( + self, "LambdaErrorsAlarm", + alarm_name=f"{alarm_prefix}-LambdaErrors", + alarm_description="Alert when cleanup Lambda encounters errors", + metric=cleanup_lambda.metric_errors( + period=Duration.minutes(15), + statistic="Sum" + ), + threshold=1, + evaluation_periods=1, + treat_missing_data=cloudwatch.TreatMissingData.NOT_BREACHING + ) + lambda_errors_alarm.add_alarm_action(cw_actions.SnsAction(sns_topic)) + + # Alarm for Lambda timeout + lambda_timeout_alarm = cloudwatch.Alarm( + self, "LambdaTimeoutAlarm", + alarm_name=f"{alarm_prefix}-LambdaTimeout", + alarm_description="Alert when cleanup Lambda approaches timeout (>8 minutes)", + metric=cleanup_lambda.metric_duration( + period=Duration.minutes(15), + statistic="Maximum" + ), + threshold=480000, + evaluation_periods=1, + comparison_operator=cloudwatch.ComparisonOperator.GREATER_THAN_THRESHOLD, + treat_missing_data=cloudwatch.TreatMissingData.NOT_BREACHING + ) + lambda_timeout_alarm.add_alarm_action(cw_actions.SnsAction(sns_topic)) + + # Outputs + CfnOutput( + self, "LambdaFunctionName", + description="Name of the Lambda function", + value=cleanup_lambda.function_name, + export_name="OpenShiftCleanupLambdaName" + ) + + CfnOutput( + self, "LambdaFunctionArn", + description="ARN of the Lambda function", + value=cleanup_lambda.function_arn, + export_name="OpenShiftCleanupLambdaArn" + ) + + CfnOutput( + self, "SNSTopicArn", + description="ARN of the SNS topic for notifications", + value=sns_topic.topic_arn + ) + + CfnOutput( + self, "DryRunModeOutput", + description="Current dry-run mode setting", + value=dry_run_param.value_as_string + ) diff --git a/IaC/cdk/openshift-resources-cleanup/tests/README.md b/IaC/cdk/openshift-resources-cleanup/tests/README.md new file mode 100644 index 0000000000..6fa30910eb --- /dev/null +++ b/IaC/cdk/openshift-resources-cleanup/tests/README.md @@ -0,0 +1,29 @@ +# Tests + +Unit, integration, and e2e tests for the OpenShift cleanup Lambda. 
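The TTL rule these suites exercise comes from two instance tags: `creation-time` (a Unix timestamp) and `delete-cluster-after-hours` (an integer). A cluster counts as expired once the current time passes `creation-time` plus that many hours; malformed tags are treated as "do not delete" (fail safe), while clusters carrying no TTL tags at all are treated as unmanaged infrastructure and cleaned up anyway. A minimal sketch of that rule, for orientation only (the helper name and signature below are illustrative, not the production code):

```python
import datetime


def ttl_hours_remaining(tags: dict[str, str], now: datetime.datetime) -> float | None:
    """Hours until the cluster's TTL elapses (negative once it has expired).

    Returns None when either TTL tag is missing or malformed; the cleanup
    logic skips malformed tags (fail safe) and deletes tag-less clusters as
    unmanaged infrastructure.
    """
    try:
        created = int(tags["creation-time"])                 # Unix timestamp set at cluster creation
        ttl_hours = int(tags["delete-cluster-after-hours"])  # requested lifetime in hours
    except (KeyError, ValueError):
        return None
    expires_at = created + ttl_hours * 3600
    return (expires_at - now.timestamp()) / 3600
```

Expired clusters surface in the handler output as `TERMINATE_OPENSHIFT_CLUSTER` actions with a positive `days_overdue`.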
+ +## Running Tests + +```bash +just test # Unit tests only +just test-coverage # With coverage report +pytest -m openshift # OpenShift-specific tests +pytest -m "not slow" # Skip slow tests +``` + +## Key Fixtures + +**`make_instance`** - Create test instances: +```python +make_instance(name="test", billing_tag="pmm-staging") +make_instance(ttl_expired=True, hours_old=3) +make_instance(protected=True) +make_instance(openshift=True, infra_id="my-infra-123") +``` + +**`time_utils`** - Time helpers: +```python +time_utils.hours_ago(3) +time_utils.days_ago(30) +time_utils.now() +``` \ No newline at end of file diff --git a/IaC/cdk/openshift-resources-cleanup/tests/__init__.py b/IaC/cdk/openshift-resources-cleanup/tests/__init__.py new file mode 100644 index 0000000000..71bb083614 --- /dev/null +++ b/IaC/cdk/openshift-resources-cleanup/tests/__init__.py @@ -0,0 +1 @@ +"""Unit tests for AWS resource cleanup Lambda.""" diff --git a/IaC/cdk/openshift-resources-cleanup/tests/conftest.py b/IaC/cdk/openshift-resources-cleanup/tests/conftest.py new file mode 100644 index 0000000000..ba21cd0a17 --- /dev/null +++ b/IaC/cdk/openshift-resources-cleanup/tests/conftest.py @@ -0,0 +1,471 @@ +"""Pytest configuration and shared fixtures for AWS resource cleanup tests. + +This file contains: +1. InstanceBuilder - Builder pattern for creating test EC2 instances +2. Fixture factories - Reusable functions for creating test data (make_instance, time_utils) +3. Time utilities - Helpers for time-based test scenarios +4. Legacy fixtures - Deprecated fixtures kept for backward compatibility +""" + +from __future__ import annotations +import datetime +import pytest +from typing import Any, Callable + + +class VolumeBuilder: + """Builder pattern for creating test EBS volumes. + + This builder helps create test volume data structures with various + configurations without needing to mock AWS services. + """ + + def __init__(self): + self._volume = { + "VolumeId": "vol-test123456", + "State": "available", + "CreateTime": datetime.datetime.now(datetime.timezone.utc), + "Size": 10, + "VolumeType": "gp3", + "Tags": [], + } + + def with_volume_id(self, volume_id: str) -> VolumeBuilder: + """Set volume ID.""" + self._volume["VolumeId"] = volume_id + return self + + def with_name(self, name: str) -> VolumeBuilder: + """Set Name tag.""" + self._add_tag("Name", name) + return self + + def with_state(self, state: str) -> VolumeBuilder: + """Set volume state (available, in-use, creating, deleting).""" + self._volume["State"] = state + return self + + def with_create_time(self, create_time: datetime.datetime) -> VolumeBuilder: + """Set create time.""" + self._volume["CreateTime"] = create_time + return self + + def with_size(self, size_gb: int) -> VolumeBuilder: + """Set volume size in GB.""" + self._volume["Size"] = size_gb + return self + + def with_billing_tag(self, billing_tag: str) -> VolumeBuilder: + """Add iit-billing-tag.""" + self._add_tag("iit-billing-tag", billing_tag) + return self + + def with_tag(self, key: str, value: str) -> VolumeBuilder: + """Add custom tag.""" + self._add_tag(key, value) + return self + + def _add_tag(self, key: str, value: str): + """Internal method to add a tag.""" + self._volume["Tags"].append({"Key": key, "Value": value}) + + def build(self) -> dict[str, Any]: + """Build and return the volume dictionary.""" + return self._volume + + +class InstanceBuilder: + """Builder pattern for creating test EC2 instances. 
+ + This builder helps create test instance data structures with various + configurations without needing to mock AWS services. + """ + + def __init__(self): + self._instance = { + "InstanceId": "i-test123456", + "State": {"Name": "running"}, + "LaunchTime": datetime.datetime.now(datetime.timezone.utc), + "Tags": [], + } + + def with_instance_id(self, instance_id: str) -> InstanceBuilder: + """Set instance ID.""" + self._instance["InstanceId"] = instance_id + return self + + def with_name(self, name: str) -> InstanceBuilder: + """Set Name tag.""" + self._add_tag("Name", name) + return self + + def with_state(self, state: str) -> InstanceBuilder: + """Set instance state (running, stopped).""" + self._instance["State"]["Name"] = state + return self + + def with_launch_time(self, launch_time: datetime.datetime) -> InstanceBuilder: + """Set launch time.""" + self._instance["LaunchTime"] = launch_time + return self + + def with_ttl_tags( + self, creation_time: int, delete_after_hours: int + ) -> InstanceBuilder: + """Add TTL tags (creation-time and delete-cluster-after-hours).""" + self._add_tag("creation-time", str(creation_time)) + self._add_tag("delete-cluster-after-hours", str(delete_after_hours)) + return self + + def with_billing_tag(self, billing_tag: str) -> InstanceBuilder: + """Add iit-billing-tag.""" + self._add_tag("iit-billing-tag", billing_tag) + return self + + def with_owner(self, owner: str) -> InstanceBuilder: + """Add owner tag.""" + self._add_tag("owner", owner) + return self + + def with_cluster_name(self, cluster_name: str) -> InstanceBuilder: + """Add cluster-name tag.""" + self._add_tag("cluster-name", cluster_name) + return self + + def with_stop_after_days(self, days: int) -> InstanceBuilder: + """Add stop-after-days tag.""" + self._add_tag("stop-after-days", str(days)) + return self + + def with_openshift_tags(self, infra_id: str) -> InstanceBuilder: + """Add OpenShift-specific tags.""" + self._add_tag("iit-billing-tag", "openshift") + self._add_tag(f"kubernetes.io/cluster/{infra_id}", "owned") + return self + + def with_eks_tags(self, cluster_name: str) -> InstanceBuilder: + """Add EKS-specific tags.""" + self._add_tag("iit-billing-tag", "eks") + self._add_tag(f"kubernetes.io/cluster/{cluster_name}", "owned") + return self + + def with_tag(self, key: str, value: str) -> InstanceBuilder: + """Add custom tag.""" + self._add_tag(key, value) + return self + + def _add_tag(self, key: str, value: str): + """Internal method to add a tag.""" + self._instance["Tags"].append({"Key": key, "Value": value}) + + def build(self) -> dict[str, Any]: + """Build and return the instance dictionary.""" + return self._instance + + +# ===== Core Fixtures ===== + + +@pytest.fixture +def instance_builder(): + """Fixture that returns a new InstanceBuilder.""" + return InstanceBuilder() + + +@pytest.fixture +def volume_builder(): + """Fixture that returns a new VolumeBuilder.""" + return VolumeBuilder() + + +@pytest.fixture +def current_time(): + """Fixture for current time as Unix timestamp.""" + return 1000000 + + +# ===== Fixture Factories ===== + + +@pytest.fixture +def make_instance(instance_builder, current_time): + """Factory fixture for creating test instances with various configurations. + + This replaces multiple similar fixtures with a single flexible factory. 
+ + Args: + name: Instance name (default: "test-instance") + state: Instance state (default: "running") + billing_tag: Billing tag value (default: None) + ttl_expired: Whether TTL should be expired (default: False) + ttl_hours: TTL duration in hours (default: 1) + hours_old: How many hours ago instance was launched (default: 0) + days_old: How many days ago instance was launched (default: 0) + protected: Use protected billing tag (default: False) + openshift: Add OpenShift tags (default: False) + eks: Add EKS tags (default: False) + owner: Owner tag (default: None) + cluster_name: Cluster name tag (default: None) + stop_after_days: Add stop-after-days tag (default: None) + **kwargs: Additional custom tags + + Returns: + dict: Instance data structure + + Example: + # Simple instance + instance = make_instance(name="test", billing_tag="pmm-staging") + + # Expired TTL instance + instance = make_instance(ttl_expired=True, ttl_hours=1, hours_old=3) + + # Protected OpenShift instance + instance = make_instance(protected=True, openshift=True) + """ + def _make( + name: str = "test-instance", + state: str = "running", + billing_tag: str | None = None, + ttl_expired: bool = False, + ttl_hours: int = 1, + hours_old: int = 0, + days_old: int = 0, + protected: bool = False, + openshift: bool = False, + eks: bool = False, + owner: str | None = None, + cluster_name: str | None = None, + stop_after_days: int | None = None, + **kwargs + ) -> dict[str, Any]: + # Calculate launch time + total_seconds = (days_old * 86400) + (hours_old * 3600) + launch_time = datetime.datetime.fromtimestamp( + current_time - total_seconds, + tz=datetime.timezone.utc + ) + + # Build instance + builder = ( + instance_builder + .with_name(name) + .with_state(state) + .with_launch_time(launch_time) + ) + + # Apply protection + if protected: + builder = builder.with_billing_tag("jenkins-dev-pmm") + elif billing_tag: + builder = builder.with_billing_tag(billing_tag) + + # Apply TTL tags + if ttl_expired: + creation_time = current_time - (ttl_hours * 3600 + 3600) # Expired by 1 hour + builder = builder.with_ttl_tags(creation_time, ttl_hours) + + # Apply cluster tags + if openshift: + infra_id = kwargs.pop('infra_id', 'test-infra-123') + builder = builder.with_openshift_tags(infra_id) + if not cluster_name: + cluster_name = 'test-openshift' + + if eks: + eks_cluster = kwargs.pop('eks_cluster', 'test-eks-cluster') + builder = builder.with_eks_tags(eks_cluster) + if not cluster_name: + cluster_name = eks_cluster + + # Apply optional tags + if owner: + builder = builder.with_owner(owner) + if cluster_name: + builder = builder.with_cluster_name(cluster_name) + if stop_after_days is not None: + builder = builder.with_stop_after_days(stop_after_days) + + # Apply custom tags + for key, value in kwargs.items(): + builder = builder.with_tag(key, str(value)) + + return builder.build() + + return _make + + +@pytest.fixture +def time_utils(current_time): + """Utility functions for time-based test scenarios. + + Provides consistent time handling across all tests. 
+ + Example: + # Get times relative to current_time + three_hours_ago = time_utils.hours_ago(3) + thirty_days_ago = time_utils.days_ago(30) + + # Get timestamps + ts = time_utils.timestamp() + old_ts = time_utils.timestamp(time_utils.days_ago(5)) + """ + class TimeUtils: + @staticmethod + def now() -> datetime.datetime: + """Get current time as datetime.""" + return datetime.datetime.fromtimestamp( + current_time, + tz=datetime.timezone.utc + ) + + @staticmethod + def timestamp(dt: datetime.datetime | None = None) -> int: + """Convert datetime to Unix timestamp.""" + if dt is None: + return current_time + return int(dt.timestamp()) + + @staticmethod + def hours_ago(hours: int) -> datetime.datetime: + """Get datetime N hours in the past.""" + return datetime.datetime.fromtimestamp( + current_time - (hours * 3600), + tz=datetime.timezone.utc + ) + + @staticmethod + def days_ago(days: int) -> datetime.datetime: + """Get datetime N days in the past.""" + return datetime.datetime.fromtimestamp( + current_time - (days * 86400), + tz=datetime.timezone.utc + ) + + @staticmethod + def seconds_ago(seconds: int) -> datetime.datetime: + """Get datetime N seconds in the past.""" + return datetime.datetime.fromtimestamp( + current_time - seconds, + tz=datetime.timezone.utc + ) + + return TimeUtils() + + +# ===== Legacy Fixtures (Deprecated - Use make_instance instead) ===== +# These fixtures are kept for backward compatibility during migration. +# New tests should use make_instance fixture factory. + + +@pytest.fixture +def instance_with_valid_billing_tag(instance_builder): + """Instance with valid billing tag.""" + return ( + instance_builder.with_name("test-instance") + .with_billing_tag("pmm-staging") + .with_owner("test-user") + .build() + ) + + +@pytest.fixture +def instance_with_expired_ttl(instance_builder, current_time): + """Instance with expired TTL (created 2 hours ago, TTL 1 hour).""" + creation_time = current_time - 7200 # 2 hours ago + return ( + instance_builder.with_name("expired-instance") + .with_ttl_tags(creation_time, 1) # 1 hour TTL + .with_billing_tag("test-billing") + .with_owner("test-user") + .build() + ) + + +@pytest.fixture +def instance_without_billing_tag(instance_builder): + """Instance without any billing tag.""" + now = datetime.datetime.now(datetime.timezone.utc) + old_time = now - datetime.timedelta(hours=2) + return ( + instance_builder.with_name("untagged-instance") + .with_launch_time(old_time) + .build() + ) + + +@pytest.fixture +def instance_stopped_long_term(instance_builder): + """Instance stopped for more than 30 days.""" + now = datetime.datetime.now(datetime.timezone.utc) + old_time = now - datetime.timedelta(days=35) + return ( + instance_builder.with_name("long-stopped") + .with_state("stopped") + .with_launch_time(old_time) + .with_billing_tag("test-billing") + .build() + ) + + +@pytest.fixture +def instance_with_stop_policy(instance_builder): + """Instance with stop-after-days policy.""" + now = datetime.datetime.now(datetime.timezone.utc) + old_time = now - datetime.timedelta(days=8) + return ( + instance_builder.with_name("pmm-staging") + .with_state("running") + .with_launch_time(old_time) + .with_stop_after_days(7) + .with_billing_tag("pmm-staging") + .build() + ) + + +@pytest.fixture +def protected_instance(instance_builder): + """Instance with persistent billing tag (protected).""" + return ( + instance_builder.with_name("protected-instance") + .with_billing_tag("jenkins-dev-pmm") + .build() + ) + + +@pytest.fixture +def 
openshift_cluster_instance(instance_builder, current_time): + """Instance that's part of an OpenShift cluster with expired TTL.""" + creation_time = current_time - 7200 # 2 hours ago + return ( + instance_builder.with_name("openshift-master") + .with_ttl_tags(creation_time, 1) + .with_openshift_tags("test-infra-123") + .with_cluster_name("test-openshift") + .with_owner("test-user") + .build() + ) + + +@pytest.fixture +def eks_cluster_instance(instance_builder, current_time): + """Instance that's part of an EKS cluster with expired TTL.""" + creation_time = current_time - 7200 # 2 hours ago + return ( + instance_builder.with_name("eks-node") + .with_ttl_tags(creation_time, 1) + .with_eks_tags("test-eks-cluster") + .with_cluster_name("test-eks-cluster") + .with_owner("test-user") + .build() + ) + + +@pytest.fixture +def tags_dict_from_instance(): + """Helper function to convert instance tags to dictionary format.""" + + def _convert(instance: dict[str, Any]) -> dict[str, str]: + """Convert Tags list to dict.""" + return {tag["Key"]: tag["Value"] for tag in instance.get("Tags", [])} + + return _convert diff --git a/IaC/cdk/openshift-resources-cleanup/tests/e2e/__init__.py b/IaC/cdk/openshift-resources-cleanup/tests/e2e/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/IaC/cdk/openshift-resources-cleanup/tests/e2e/conftest.py b/IaC/cdk/openshift-resources-cleanup/tests/e2e/conftest.py new file mode 100644 index 0000000000..8784daada7 --- /dev/null +++ b/IaC/cdk/openshift-resources-cleanup/tests/e2e/conftest.py @@ -0,0 +1,9 @@ +"""Fixtures specific to end-to-end tests.""" + +import pytest + + +@pytest.fixture(autouse=True) +def _mark_as_e2e(request): + """Automatically mark all tests in e2e/ as e2e tests.""" + request.node.add_marker(pytest.mark.e2e) \ No newline at end of file diff --git a/IaC/cdk/openshift-resources-cleanup/tests/e2e/test_lambda_handler.py b/IaC/cdk/openshift-resources-cleanup/tests/e2e/test_lambda_handler.py new file mode 100644 index 0000000000..961fa6cd7a --- /dev/null +++ b/IaC/cdk/openshift-resources-cleanup/tests/e2e/test_lambda_handler.py @@ -0,0 +1,228 @@ +"""End-to-end tests for Lambda handler entry point and integration flows. 
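For reference, the handler response shape these end-to-end assertions rely on is sketched below; the field names come straight from the assertions, while the concrete values are illustrative only:

```python
import json

# Illustrative lambda_handler response as exercised by the e2e tests:
# statusCode plus a JSON body with dry_run, total_actions, by_action, and actions.
example_response = {
    "statusCode": 200,
    "body": json.dumps({
        "dry_run": True,
        "total_actions": 2,
        "by_action": {"TERMINATE": 1, "STOP": 1},
        "actions": [],  # per-instance action records, left empty in this illustration
    }),
}
```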
+ +Tests focus on: +- lambda_handler() entry point (multi-region orchestration) +- End-to-end integration flows +- Error propagation and partial failure scenarios +""" + +from __future__ import annotations +import datetime +import json +import pytest +from unittest.mock import Mock, patch, MagicMock +from botocore.exceptions import ClientError + +from openshift_resource_cleanup.handler import lambda_handler, cleanup_region +from openshift_resource_cleanup.models import CleanupAction + + +@pytest.fixture +def mock_lambda_context(): + """Create a mock Lambda context object.""" + context = Mock() + context.function_name = "test-function" + context.function_version = "$LATEST" + context.invoked_function_arn = "arn:aws:lambda:us-east-1:123456789012:function:test-function" + context.memory_limit_in_mb = 128 + context.aws_request_id = "test-request-id" + context.log_group_name = "/aws/lambda/test-function" + context.log_stream_name = "2024/01/01/[$LATEST]test" + context.get_remaining_time_in_millis = Mock(return_value=300000) + return context + + +@pytest.mark.e2e +@pytest.mark.aws +class TestLambdaHandlerEntryPoint: + """Test the main Lambda handler entry point.""" + + @patch("openshift_resource_cleanup.handler.cleanup_region") + @patch("openshift_resource_cleanup.handler.boto3.client") + def test_lambda_handler_no_actions_across_all_regions( + self, mock_boto_client, mock_cleanup_region, mock_lambda_context + ): + """ + GIVEN regions with no cleanup actions needed + WHEN lambda_handler is invoked + THEN response should show zero actions + """ + mock_ec2 = Mock() + mock_boto_client.return_value = mock_ec2 + mock_ec2.describe_regions.return_value = { + "Regions": [ + {"RegionName": "us-east-1"}, + {"RegionName": "us-west-2"}, + ] + } + + mock_cleanup_region.return_value = [] + + result = lambda_handler({}, mock_lambda_context) + + assert result["statusCode"] == 200 + body = json.loads(result["body"]) + assert body["total_actions"] == 0 + assert body["by_action"] == {} + assert body["actions"] == [] + + @patch("openshift_resource_cleanup.handler.cleanup_region") + @patch("openshift_resource_cleanup.handler.boto3.client") + @patch("openshift_resource_cleanup.handler.DRY_RUN", True) + def test_lambda_handler_includes_dry_run_flag( + self, mock_boto_client, mock_cleanup_region, mock_lambda_context + ): + """ + GIVEN DRY_RUN mode enabled + WHEN lambda_handler is invoked + THEN response should indicate dry_run=true + """ + mock_ec2 = Mock() + mock_boto_client.return_value = mock_ec2 + mock_ec2.describe_regions.return_value = {"Regions": [{"RegionName": "us-east-1"}]} + mock_cleanup_region.return_value = [] + + result = lambda_handler({}, mock_lambda_context) + + body = json.loads(result["body"]) + assert body["dry_run"] is True + + @patch("openshift_resource_cleanup.handler.cleanup_region") + @patch("openshift_resource_cleanup.handler.boto3.client") + def test_lambda_handler_aggregates_actions_correctly( + self, mock_boto_client, mock_cleanup_region, mock_lambda_context + ): + """ + GIVEN multiple regions with various action types + WHEN lambda_handler is invoked + THEN actions should be aggregated correctly by type + """ + mock_ec2 = Mock() + mock_boto_client.return_value = mock_ec2 + mock_ec2.describe_regions.return_value = { + "Regions": [{"RegionName": "us-east-1"}, {"RegionName": "us-west-2"}] + } + + # Region 1: 2 TERMINATE, 1 STOP + # Region 2: 1 TERMINATE, 1 TERMINATE_CLUSTER + region1_actions = [ + CleanupAction("i-1", "us-east-1", "n1", "TERMINATE", "r1", 1.0), + CleanupAction("i-2", 
"us-east-1", "n2", "TERMINATE", "r2", 1.0), + CleanupAction("i-3", "us-east-1", "n3", "STOP", "r3", 0.5), + ] + region2_actions = [ + CleanupAction("i-4", "us-west-2", "n4", "TERMINATE", "r4", 2.0), + CleanupAction("i-5", "us-west-2", "n5", "TERMINATE_CLUSTER", "r5", 3.0, cluster_name="eks"), + ] + mock_cleanup_region.side_effect = [region1_actions, region2_actions] + + result = lambda_handler({}, mock_lambda_context) + + body = json.loads(result["body"]) + assert body["total_actions"] == 5 + assert body["by_action"]["TERMINATE"] == 3 + assert body["by_action"]["STOP"] == 1 + assert body["by_action"]["TERMINATE_CLUSTER"] == 1 + + @patch("openshift_resource_cleanup.handler.boto3.client") + def test_lambda_handler_handles_describe_regions_failure(self, mock_boto_client, mock_lambda_context): + """ + GIVEN describe_regions API call fails + WHEN lambda_handler is invoked + THEN exception should be raised + """ + mock_ec2 = Mock() + mock_boto_client.return_value = mock_ec2 + mock_ec2.describe_regions.side_effect = ClientError( + {"Error": {"Code": "RequestLimitExceeded", "Message": "Rate limit"}}, + "DescribeRegions", + ) + + with pytest.raises(ClientError): + lambda_handler({}, mock_lambda_context) + + +@pytest.mark.e2e +@pytest.mark.aws +class TestPartialFailureScenarios: + """Test error propagation and partial failure handling.""" + + @patch("openshift_resource_cleanup.handler.cleanup_region") + @patch("openshift_resource_cleanup.handler.boto3.client") + def test_lambda_handler_continues_after_region_failure( + self, mock_boto_client, mock_cleanup_region, mock_lambda_context + ): + """ + GIVEN one region fails but others succeed + WHEN lambda_handler is invoked + THEN successful regions should be processed and returned + """ + mock_ec2 = Mock() + mock_boto_client.return_value = mock_ec2 + mock_ec2.describe_regions.return_value = { + "Regions": [ + {"RegionName": "us-east-1"}, + {"RegionName": "us-west-2"}, + {"RegionName": "eu-west-1"}, + ] + } + + action = CleanupAction("i-ok", "us-east-1", "test", "TERMINATE", "test", 1.0) + + # Region 1: success, Region 2: exception, Region 3: success + mock_cleanup_region.side_effect = [ + [action], + [], # Returns empty instead of raising to match actual behavior + [action], + ] + + result = lambda_handler({}, mock_lambda_context) + + # Should succeed with actions from regions 1 and 3 + assert result["statusCode"] == 200 + body = json.loads(result["body"]) + assert body["total_actions"] == 2 + + + +@pytest.mark.e2e +class TestLambdaEventHandling: + """Test Lambda event validation and edge cases.""" + + @patch("openshift_resource_cleanup.handler.cleanup_region") + @patch("openshift_resource_cleanup.handler.boto3.client") + def test_lambda_handler_accepts_empty_event( + self, mock_boto_client, mock_cleanup_region, mock_lambda_context + ): + """ + GIVEN empty Lambda event + WHEN lambda_handler is invoked + THEN it should process normally + """ + mock_ec2 = Mock() + mock_boto_client.return_value = mock_ec2 + mock_ec2.describe_regions.return_value = {"Regions": [{"RegionName": "us-east-1"}]} + mock_cleanup_region.return_value = [] + + result = lambda_handler({}, mock_lambda_context) + + assert result["statusCode"] == 200 + + @patch("openshift_resource_cleanup.handler.cleanup_region") + @patch("openshift_resource_cleanup.handler.boto3.client") + def test_lambda_handler_accepts_none_context( + self, mock_boto_client, mock_cleanup_region, mock_lambda_context + ): + """ + GIVEN None as Lambda context + WHEN lambda_handler is invoked + THEN it should process 
normally + """ + mock_ec2 = Mock() + mock_boto_client.return_value = mock_ec2 + mock_ec2.describe_regions.return_value = {"Regions": [{"RegionName": "us-east-1"}]} + mock_cleanup_region.return_value = [] + + result = lambda_handler({}, mock_lambda_context) + + assert result["statusCode"] == 200 diff --git a/IaC/cdk/openshift-resources-cleanup/tests/integration/__init__.py b/IaC/cdk/openshift-resources-cleanup/tests/integration/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/IaC/cdk/openshift-resources-cleanup/tests/integration/conftest.py b/IaC/cdk/openshift-resources-cleanup/tests/integration/conftest.py new file mode 100644 index 0000000000..4581dd6088 --- /dev/null +++ b/IaC/cdk/openshift-resources-cleanup/tests/integration/conftest.py @@ -0,0 +1,83 @@ +"""Fixtures specific to integration tests.""" + +import pytest +from unittest.mock import Mock + + +@pytest.fixture(autouse=True) +def _mark_as_integration(request): + """Automatically mark all tests in integration/ as integration tests.""" + request.node.add_marker(pytest.mark.integration) + + +@pytest.fixture +def mock_ec2_client(): + """Factory for creating mock EC2 clients with common behaviors. + + Example: + ec2 = mock_ec2_client( + describe_instances_response={ + "Reservations": [{"Instances": [instance_data]}] + } + ) + """ + def _create_mock(**kwargs): + mock = Mock() + mock.describe_instances.return_value = kwargs.get( + 'describe_instances_response', + {"Reservations": []} + ) + mock.terminate_instances.return_value = kwargs.get( + 'terminate_response', + {} + ) + mock.stop_instances.return_value = kwargs.get( + 'stop_response', + {} + ) + mock.describe_regions.return_value = kwargs.get( + 'describe_regions_response', + {"Regions": [{"RegionName": "us-east-1"}]} + ) + return mock + return _create_mock + + +@pytest.fixture +def mock_sns_client(): + """Factory for creating mock SNS clients. + + Example: + sns = mock_sns_client( + publish_response={'MessageId': 'test-id'} + ) + """ + def _create_mock(**kwargs): + mock = Mock() + mock.publish.return_value = kwargs.get( + 'publish_response', + {'MessageId': 'test-message-id'} + ) + return mock + return _create_mock + + +@pytest.fixture +def mock_cloudformation_client(): + """Factory for creating mock CloudFormation clients. + + Example: + cfn = mock_cloudformation_client() + """ + def _create_mock(**kwargs): + mock = Mock() + mock.delete_stack.return_value = kwargs.get( + 'delete_stack_response', + {} + ) + mock.describe_stacks.return_value = kwargs.get( + 'describe_stacks_response', + {'Stacks': []} + ) + return mock + return _create_mock \ No newline at end of file diff --git a/IaC/cdk/openshift-resources-cleanup/tests/integration/test_cleanup_region_ttl.py b/IaC/cdk/openshift-resources-cleanup/tests/integration/test_cleanup_region_ttl.py new file mode 100644 index 0000000000..b13c1fc0f3 --- /dev/null +++ b/IaC/cdk/openshift-resources-cleanup/tests/integration/test_cleanup_region_ttl.py @@ -0,0 +1,295 @@ +"""Integration tests for cleanup_region TTL validation logic. + +These tests verify that the Lambda handler correctly evaluates cluster TTLs +before marking clusters for deletion. They should FAIL initially, proving +they catch the bug where all clusters are marked for deletion regardless of TTL. 
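The fixed timestamps in these tests line up with the frozen clock of 2025-01-15 12:00 UTC. A quick standalone check of that arithmetic (not part of the suite):

```python
import datetime

FROZEN_NOW = datetime.datetime(2025, 1, 15, 12, 0, tzinfo=datetime.timezone.utc)

for creation_ts, ttl_hours in [(1736917200, 14), (1736884800, 14), (1736892000, 14)]:
    created = datetime.datetime.fromtimestamp(creation_ts, tz=datetime.timezone.utc)
    expires = created + datetime.timedelta(hours=ttl_hours)
    remaining_h = (expires - FROZEN_NOW).total_seconds() / 3600
    print(f"created {created:%Y-%m-%d %H:%M}, expires {expires:%Y-%m-%d %H:%M}, {remaining_h:+.0f}h")

# created 2025-01-15 05:00, expires 2025-01-15 19:00, +7h  (TTL not expired)
# created 2025-01-14 20:00, expires 2025-01-15 10:00, -2h  (TTL expired)
# created 2025-01-14 22:00, expires 2025-01-15 12:00, +0h  (exactly at expiry)
```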
+""" + +from __future__ import annotations +import datetime +import pytest +from unittest.mock import Mock, patch, MagicMock +from freezegun import freeze_time + +from openshift_resource_cleanup.handler import cleanup_region +from openshift_resource_cleanup.models import CleanupAction + + +@pytest.mark.integration +@pytest.mark.openshift +class TestCleanupRegionTTLValidation: + """Test TTL validation in cleanup_region function.""" + + @freeze_time("2025-01-15 12:00:00") + @patch("openshift_resource_cleanup.handler.boto3.client") + @patch("openshift_resource_cleanup.openshift.detection.detect_openshift_infra_id") + @patch("openshift_resource_cleanup.handler.execute_cleanup_action") + @patch("openshift_resource_cleanup.handler.DRY_RUN", True) + def test_cluster_with_non_expired_ttl_should_not_delete( + self, mock_execute, mock_detect_infra, mock_boto_client + ): + """ + GIVEN an OpenShift cluster with non-expired TTL + WHEN cleanup_region is called + THEN NO cleanup action should be created + """ + # Cluster created at 2025-01-15 05:00:00 (7 hours ago) + # TTL: 14 hours (delete-cluster-after-hours=14) + # Time remaining: 7 hours - should NOT delete + creation_time = "1736917200" # Unix timestamp for 2025-01-15 05:00:00 UTC + ttl_hours = "14" + + mock_ec2 = Mock() + mock_boto_client.return_value = mock_ec2 + mock_ec2.describe_instances.return_value = { + "Reservations": [ + { + "Instances": [ + { + "InstanceId": "i-0123456789abcdef0", + "State": {"Name": "running"}, + "Tags": [ + {"Key": "Name", "Value": "jvp-rosa1-abc12-master-0"}, + {"Key": "iit-billing-tag", "Value": "pmm"}, + {"Key": "owner", "Value": "john.doe@percona.com"}, + {"Key": "creation-time", "Value": creation_time}, + {"Key": "delete-cluster-after-hours", "Value": ttl_hours}, + {"Key": "red-hat-clustertype", "Value": "rosa"}, + { + "Key": "kubernetes.io/cluster/jvp-rosa1-abc12", + "Value": "owned", + }, + ], + } + ] + } + ] + } + + # Mock OpenShift detection + mock_detect_infra.return_value = "jvp-rosa1-abc12" + + actions = cleanup_region("us-east-1", "test-exec-123") + + # ❌ This test should FAIL initially because current code marks ALL clusters for deletion + # After fix, this should PASS with actions == [] + assert len(actions) == 0, ( + f"Expected NO actions for non-expired TTL cluster, but got {len(actions)} actions. " + f"Cluster created at {creation_time}, TTL is {ttl_hours} hours, " + f"only 7 hours have passed. Should have 7 hours remaining." 
+ ) + + @freeze_time("2025-01-15 12:00:00") + @patch("openshift_resource_cleanup.handler.boto3.client") + @patch("openshift_resource_cleanup.openshift.detection.detect_openshift_infra_id") + @patch("openshift_resource_cleanup.handler.execute_cleanup_action") + @patch("openshift_resource_cleanup.handler.DRY_RUN", True) + def test_cluster_with_expired_ttl_should_delete( + self, mock_execute, mock_detect_infra, mock_boto_client + ): + """ + GIVEN an OpenShift cluster with expired TTL + WHEN cleanup_region is called + THEN a TERMINATE_OPENSHIFT_CLUSTER action should be created + """ + # Cluster created at 2025-01-14 20:00:00 (16 hours ago) + # TTL: 14 hours (delete-cluster-after-hours=14) + # Time expired: 2 hours ago - SHOULD delete + creation_time = "1736884800" # Unix timestamp for 2025-01-14 20:00:00 UTC + ttl_hours = "14" + + mock_ec2 = Mock() + mock_boto_client.return_value = mock_ec2 + mock_ec2.describe_instances.return_value = { + "Reservations": [ + { + "Instances": [ + { + "InstanceId": "i-expired123456789", + "State": {"Name": "running"}, + "Tags": [ + {"Key": "Name", "Value": "expired-cluster-xyz45-master-0"}, + {"Key": "iit-billing-tag", "Value": "pmm"}, + {"Key": "owner", "Value": "jane.smith@percona.com"}, + {"Key": "creation-time", "Value": creation_time}, + {"Key": "delete-cluster-after-hours", "Value": ttl_hours}, + {"Key": "red-hat-clustertype", "Value": "rosa"}, + { + "Key": "kubernetes.io/cluster/expired-cluster-xyz45", + "Value": "owned", + }, + ], + } + ] + } + ] + } + + mock_detect_infra.return_value = "expired-cluster-xyz45" + + actions = cleanup_region("us-east-1", "test-exec-123") + + # ✅ This test should PASS even with current buggy code (it deletes everything) + # After fix, this should still PASS + assert len(actions) == 1 + assert actions[0].action == "TERMINATE_OPENSHIFT_CLUSTER" + assert actions[0].cluster_name == "expired-cluster" + assert actions[0].days_overdue > 0, ( + f"Expected days_overdue > 0 for expired cluster, but got {actions[0].days_overdue}" + ) + + @freeze_time("2025-01-15 12:00:00") + @patch("openshift_resource_cleanup.handler.boto3.client") + @patch("openshift_resource_cleanup.openshift.detection.detect_openshift_infra_id") + @patch("openshift_resource_cleanup.handler.execute_cleanup_action") + @patch("openshift_resource_cleanup.handler.DRY_RUN", True) + def test_cluster_without_ttl_tags_should_delete( + self, mock_execute, mock_detect_infra, mock_boto_client + ): + """ + GIVEN an OpenShift cluster without TTL tags (unmanaged/forgotten) + WHEN cleanup_region is called + THEN a TERMINATE action should be created (cleanup old infrastructure) + """ + mock_ec2 = Mock() + mock_boto_client.return_value = mock_ec2 + mock_ec2.describe_instances.return_value = { + "Reservations": [ + { + "Instances": [ + { + "InstanceId": "i-no-ttl-tags-123", + "State": {"Name": "running"}, + "Tags": [ + {"Key": "Name", "Value": "no-ttl-cluster-def67-master-0"}, + {"Key": "iit-billing-tag", "Value": "pmm"}, + {"Key": "owner", "Value": "bob.jones@percona.com"}, + # NO creation-time or delete-cluster-after-hours tags + {"Key": "red-hat-clustertype", "Value": "rosa"}, + { + "Key": "kubernetes.io/cluster/no-ttl-cluster-def67", + "Value": "owned", + }, + ], + } + ] + } + ] + } + + mock_detect_infra.return_value = "no-ttl-cluster-def67" + + actions = cleanup_region("us-east-1", "test-exec-123") + + # ✅ This test should PASS even with current buggy code (it deletes everything) + # After fix, this should still PASS (delete unmanaged clusters) + assert len(actions) == 1, ( + 
f"Expected 1 action for cluster without TTL tags (unmanaged infrastructure), " + f"but got {len(actions)} actions" + ) + assert actions[0].action == "TERMINATE_OPENSHIFT_CLUSTER" + assert actions[0].cluster_name == "no-ttl-cluster" + + @freeze_time("2025-01-15 12:00:00") + @patch("openshift_resource_cleanup.handler.boto3.client") + @patch("openshift_resource_cleanup.openshift.detection.detect_openshift_infra_id") + @patch("openshift_resource_cleanup.handler.execute_cleanup_action") + @patch("openshift_resource_cleanup.handler.DRY_RUN", True) + def test_cluster_with_malformed_ttl_should_not_delete( + self, mock_execute, mock_detect_infra, mock_boto_client + ): + """ + GIVEN an OpenShift cluster with malformed TTL tags + WHEN cleanup_region is called + THEN NO cleanup action should be created (fail safe) + """ + mock_ec2 = Mock() + mock_boto_client.return_value = mock_ec2 + mock_ec2.describe_instances.return_value = { + "Reservations": [ + { + "Instances": [ + { + "InstanceId": "i-malformed-ttl-456", + "State": {"Name": "running"}, + "Tags": [ + {"Key": "Name", "Value": "malformed-ttl-ghi89-master-0"}, + {"Key": "iit-billing-tag", "Value": "pmm"}, + {"Key": "creation-time", "Value": "invalid-date"}, + {"Key": "delete-cluster-after-hours", "Value": "not-a-number"}, + {"Key": "red-hat-clustertype", "Value": "rosa"}, + { + "Key": "kubernetes.io/cluster/malformed-ttl-ghi89", + "Value": "owned", + }, + ], + } + ] + } + ] + } + + mock_detect_infra.return_value = "malformed-ttl-ghi89" + + actions = cleanup_region("us-east-1", "test-exec-123") + + # ❌ This test should FAIL initially + # After fix, this should PASS with actions == [] (fail-safe behavior) + assert len(actions) == 0, ( + f"Expected NO actions for cluster with malformed TTL tags (fail-safe), " + f"but got {len(actions)} actions" + ) + + @freeze_time("2025-01-15 12:00:00") + @patch("openshift_resource_cleanup.handler.boto3.client") + @patch("openshift_resource_cleanup.openshift.detection.detect_openshift_infra_id") + @patch("openshift_resource_cleanup.handler.execute_cleanup_action") + @patch("openshift_resource_cleanup.handler.DRY_RUN", True) + def test_cluster_exactly_at_ttl_expiry_should_delete( + self, mock_execute, mock_detect_infra, mock_boto_client + ): + """ + GIVEN an OpenShift cluster exactly at TTL expiry time + WHEN cleanup_region is called + THEN a TERMINATE action should be created (boundary case) + """ + # Cluster created at 2025-01-14 22:00:00 (14 hours ago) + # TTL: 14 hours + # Time expired: exactly 0 hours - SHOULD delete + creation_time = "1736892000" # Unix timestamp for 2025-01-14 22:00:00 UTC + ttl_hours = "14" + + mock_ec2 = Mock() + mock_boto_client.return_value = mock_ec2 + mock_ec2.describe_instances.return_value = { + "Reservations": [ + { + "Instances": [ + { + "InstanceId": "i-boundary-case-789", + "State": {"Name": "running"}, + "Tags": [ + {"Key": "Name", "Value": "boundary-jkl01-master-0"}, + {"Key": "iit-billing-tag", "Value": "pmm"}, + {"Key": "creation-time", "Value": creation_time}, + {"Key": "delete-cluster-after-hours", "Value": ttl_hours}, + {"Key": "red-hat-clustertype", "Value": "rosa"}, + { + "Key": "kubernetes.io/cluster/boundary-jkl01", + "Value": "owned", + }, + ], + } + ] + } + ] + } + + mock_detect_infra.return_value = "boundary-jkl01" + + actions = cleanup_region("us-east-1", "test-exec-123") + + # After fix, should create deletion action at exact expiry + assert len(actions) == 1 + assert actions[0].action == "TERMINATE_OPENSHIFT_CLUSTER" + assert actions[0].days_overdue >= 0 diff --git 
a/IaC/cdk/openshift-resources-cleanup/tests/integration/test_openshift_orchestration.py b/IaC/cdk/openshift-resources-cleanup/tests/integration/test_openshift_orchestration.py new file mode 100644 index 0000000000..7a53e485c1 --- /dev/null +++ b/IaC/cdk/openshift-resources-cleanup/tests/integration/test_openshift_orchestration.py @@ -0,0 +1,173 @@ +"""Integration tests for OpenShift orchestration. + +Tests the destroy_openshift_cluster orchestrator with mocked AWS clients. +Single-pass cleanup with EventBridge retry handling. +""" + +from __future__ import annotations +import pytest +from unittest.mock import Mock, patch + +from openshift_resource_cleanup.openshift.orchestrator import destroy_openshift_cluster + + +@pytest.mark.integration +@pytest.mark.openshift +class TestDestroyOpenshiftCluster: + """Test OpenShift cluster destruction orchestration.""" + + @patch("openshift_resource_cleanup.openshift.orchestrator.cleanup_s3_state") + @patch("openshift_resource_cleanup.openshift.orchestrator.cleanup_route53_records") + @patch("openshift_resource_cleanup.openshift.orchestrator.delete_vpc") + @patch("openshift_resource_cleanup.openshift.orchestrator.delete_internet_gateway") + @patch("openshift_resource_cleanup.openshift.orchestrator.delete_route_tables") + @patch("openshift_resource_cleanup.openshift.orchestrator.delete_subnets") + @patch("openshift_resource_cleanup.openshift.orchestrator.delete_security_groups") + @patch("openshift_resource_cleanup.openshift.orchestrator.delete_vpc_endpoints") + @patch("openshift_resource_cleanup.openshift.orchestrator.cleanup_network_interfaces") + @patch("openshift_resource_cleanup.openshift.orchestrator.release_elastic_ips") + @patch("openshift_resource_cleanup.openshift.orchestrator.delete_nat_gateways") + @patch("openshift_resource_cleanup.openshift.orchestrator.delete_load_balancers") + @patch("openshift_resource_cleanup.openshift.orchestrator.boto3.client") + @patch("openshift_resource_cleanup.models.config.DRY_RUN", False) + def test_orchestrator_calls_functions_in_correct_order( + self, + mock_boto_client, + mock_delete_lbs, + mock_delete_nats, + mock_release_eips, + mock_cleanup_enis, + mock_delete_endpoints, + mock_delete_sgs, + mock_delete_subnets, + mock_delete_rts, + mock_delete_igw, + mock_delete_vpc, + mock_cleanup_route53, + mock_cleanup_s3, + ): + """ + GIVEN OpenShift cluster exists and VPC can be deleted + WHEN destroy_openshift_cluster is called + THEN resources should be deleted in dependency order in single pass + """ + mock_ec2 = Mock() + mock_boto_client.return_value = mock_ec2 + + # VPC exists and can be deleted + mock_ec2.describe_vpcs.return_value = {"Vpcs": [{"VpcId": "vpc-abc123"}]} + mock_delete_vpc.return_value = True # VPC successfully deleted + + result = destroy_openshift_cluster("test-cluster", "test-infra-123", "us-east-1") + + # Verify single-pass cleanup + assert result is True + + # Verify cleanup functions called once in correct order + mock_delete_lbs.assert_called_once_with("test-infra-123", "us-east-1") + mock_delete_nats.assert_called_once_with("test-infra-123", "us-east-1") + mock_release_eips.assert_called_once_with("test-infra-123", "us-east-1") + mock_cleanup_enis.assert_called_once_with("vpc-abc123", "us-east-1") + mock_delete_endpoints.assert_called_once_with("vpc-abc123", "us-east-1") + mock_delete_sgs.assert_called_once_with("vpc-abc123", "us-east-1") + mock_delete_subnets.assert_called_once_with("vpc-abc123", "us-east-1") + mock_delete_rts.assert_called_once_with("vpc-abc123", "us-east-1") + 
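# The internet gateway and the VPC itself come last in the dependency
+        # order exercised above.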
mock_delete_igw.assert_called_once_with("vpc-abc123", "us-east-1") + mock_delete_vpc.assert_called_once_with("vpc-abc123", "us-east-1") + + # Route53 and S3 cleanup when VPC successfully deleted + mock_cleanup_route53.assert_called_once_with("test-cluster", "us-east-1") + mock_cleanup_s3.assert_called_once_with("test-cluster", "us-east-1") + + @patch("openshift_resource_cleanup.openshift.orchestrator.cleanup_s3_state") + @patch("openshift_resource_cleanup.openshift.orchestrator.cleanup_route53_records") + @patch("openshift_resource_cleanup.openshift.orchestrator.delete_vpc") + @patch("openshift_resource_cleanup.openshift.orchestrator.delete_load_balancers") + @patch("openshift_resource_cleanup.openshift.orchestrator.boto3.client") + def test_orchestrator_exits_early_when_vpc_not_found( + self, mock_boto_client, mock_delete_lbs, mock_delete_vpc, mock_cleanup_route53, mock_cleanup_s3 + ): + """ + GIVEN VPC does not exist + WHEN destroy_openshift_cluster is called + THEN cleanup should exit early and clean up Route53/S3 + """ + mock_ec2 = Mock() + mock_boto_client.return_value = mock_ec2 + + mock_ec2.describe_vpcs.return_value = {"Vpcs": []} + + result = destroy_openshift_cluster("test-cluster", "test-infra-123", "us-east-1") + + # Should check VPC once and exit + mock_ec2.describe_vpcs.assert_called_once() + mock_delete_lbs.assert_not_called() + mock_delete_vpc.assert_not_called() + + # Should clean up Route53/S3 when VPC is already gone + mock_cleanup_route53.assert_called_once_with("test-cluster", "us-east-1") + mock_cleanup_s3.assert_called_once_with("test-cluster", "us-east-1") + + assert result is True # Cleanup complete + + @patch("openshift_resource_cleanup.openshift.orchestrator.cleanup_s3_state") + @patch("openshift_resource_cleanup.openshift.orchestrator.cleanup_route53_records") + @patch("openshift_resource_cleanup.openshift.orchestrator.delete_vpc") + @patch("openshift_resource_cleanup.openshift.orchestrator.delete_load_balancers") + @patch("openshift_resource_cleanup.openshift.orchestrator.boto3.client") + def test_vpc_has_dependencies_returns_false( + self, + mock_boto_client, + mock_delete_lbs, + mock_delete_vpc, + mock_cleanup_route53, + mock_cleanup_s3, + ): + """ + GIVEN VPC still has dependencies and cannot be deleted + WHEN destroy_openshift_cluster is called + THEN cleanup should return False and Route53/S3 should NOT be cleaned + """ + mock_ec2 = Mock() + mock_boto_client.return_value = mock_ec2 + + # VPC exists but has dependencies + mock_ec2.describe_vpcs.return_value = {"Vpcs": [{"VpcId": "vpc-abc123"}]} + mock_delete_vpc.return_value = False # VPC has dependencies + + result = destroy_openshift_cluster("test-cluster", "test-infra-123", "us-east-1") + + # Should return False (cleanup incomplete) + assert result is False + + # Route53 and S3 should NOT be cleaned when VPC deletion fails + mock_cleanup_route53.assert_not_called() + mock_cleanup_s3.assert_not_called() + + @patch("openshift_resource_cleanup.openshift.orchestrator.delete_vpc") + @patch("openshift_resource_cleanup.openshift.orchestrator.delete_load_balancers") + @patch("openshift_resource_cleanup.openshift.orchestrator.boto3.client") + def test_orchestrator_handles_dependency_violations( + self, mock_boto_client, mock_delete_lbs, mock_delete_vpc + ): + """ + GIVEN DependencyViolation error occurs during cleanup + WHEN destroy_openshift_cluster is called + THEN should return False and rely on EventBridge retry + """ + from botocore.exceptions import ClientError + + mock_ec2 = Mock() + 
mock_boto_client.return_value = mock_ec2 + + # VPC exists + mock_ec2.describe_vpcs.return_value = {"Vpcs": [{"VpcId": "vpc-abc123"}]} + + # Simulate DependencyViolation error + error_response = {"Error": {"Code": "DependencyViolation"}} + mock_delete_lbs.side_effect = ClientError(error_response, "DeleteLoadBalancer") + + result = destroy_openshift_cluster("test-cluster", "test-infra-123", "us-east-1") + + # Should return False (dependencies remain) + assert result is False diff --git a/IaC/cdk/openshift-resources-cleanup/tests/pytest.ini b/IaC/cdk/openshift-resources-cleanup/tests/pytest.ini new file mode 100644 index 0000000000..e731cd7e4c --- /dev/null +++ b/IaC/cdk/openshift-resources-cleanup/tests/pytest.ini @@ -0,0 +1,28 @@ +[pytest] +testpaths = . +python_files = test_*.py +python_classes = Test* +python_functions = test_* + +# Test markers +markers = + unit: Unit tests (fast, isolated business logic) + integration: Integration tests (component interactions) + e2e: End-to-end tests (full workflows) + aws: Tests interacting with AWS services + slow: Slow-running tests (>1s) + openshift: OpenShift-specific functionality + eks: EKS-specific functionality + policies: Cleanup policy tests + smoke: Critical path smoke tests + volumes: EBS volume cleanup tests + +# Output options +addopts = + -v + --strict-markers + --tb=short + --disable-warnings + +# Optional: Uncomment to enable coverage reporting +# addopts = --cov=aws_resource_cleanup --cov-report=term-missing --cov-report=html \ No newline at end of file diff --git a/IaC/cdk/openshift-resources-cleanup/tests/unit/__init__.py b/IaC/cdk/openshift-resources-cleanup/tests/unit/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/IaC/cdk/openshift-resources-cleanup/tests/unit/conftest.py b/IaC/cdk/openshift-resources-cleanup/tests/unit/conftest.py new file mode 100644 index 0000000000..2d8b5d5c09 --- /dev/null +++ b/IaC/cdk/openshift-resources-cleanup/tests/unit/conftest.py @@ -0,0 +1,9 @@ +"""Fixtures specific to unit tests.""" + +import pytest + + +@pytest.fixture(autouse=True) +def _mark_as_unit(request): + """Automatically mark all tests in unit/ as unit tests.""" + request.node.add_marker(pytest.mark.unit) \ No newline at end of file diff --git a/IaC/cdk/openshift-resources-cleanup/tests/unit/openshift/__init__.py b/IaC/cdk/openshift-resources-cleanup/tests/unit/openshift/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/IaC/cdk/openshift-resources-cleanup/tests/unit/openshift/test_openshift_detection.py b/IaC/cdk/openshift-resources-cleanup/tests/unit/openshift/test_openshift_detection.py new file mode 100644 index 0000000000..f20047ff5c --- /dev/null +++ b/IaC/cdk/openshift-resources-cleanup/tests/unit/openshift/test_openshift_detection.py @@ -0,0 +1,154 @@ +"""Unit tests for OpenShift cluster detection.""" + +from __future__ import annotations +import pytest +from unittest.mock import Mock, patch + +from openshift_resource_cleanup.openshift.detection import detect_openshift_infra_id + + +@pytest.mark.unit +@pytest.mark.openshift +class TestDetectOpenshiftInfraId: + """Test OpenShift infrastructure ID detection.""" + + @patch("openshift_resource_cleanup.openshift.detection.boto3.client") + def test_detects_infra_id_from_exact_match(self, mock_boto_client): + """ + GIVEN VPC with exact cluster name tag + WHEN detect_openshift_infra_id is called + THEN infrastructure ID should be extracted from tag + """ + mock_ec2 = Mock() + mock_boto_client.return_value = mock_ec2 + + 
mock_ec2.describe_vpcs.return_value = { + "Vpcs": [ + { + "VpcId": "vpc-abc123", + "Tags": [ + { + "Key": "kubernetes.io/cluster/test-infra-abc123", + "Value": "owned", + }, + {"Key": "Name", "Value": "openshift-vpc"}, + ], + } + ] + } + + infra_id = detect_openshift_infra_id("test-infra-abc123", "us-east-1") + + assert infra_id == "test-infra-abc123" + mock_ec2.describe_vpcs.assert_called_once_with( + Filters=[ + { + "Name": "tag-key", + "Values": ["kubernetes.io/cluster/test-infra-abc123"], + } + ] + ) + + @patch("openshift_resource_cleanup.openshift.detection.boto3.client") + def test_detects_infra_id_from_wildcard_match(self, mock_boto_client): + """ + GIVEN VPC with cluster name prefix (wildcard match needed) + WHEN detect_openshift_infra_id is called with cluster name + THEN infrastructure ID should be extracted from tag + """ + mock_ec2 = Mock() + mock_boto_client.return_value = mock_ec2 + + # First call returns empty (exact match fails) + # Second call returns VPC (wildcard match succeeds) + mock_ec2.describe_vpcs.side_effect = [ + {"Vpcs": []}, + { + "Vpcs": [ + { + "VpcId": "vpc-def456", + "Tags": [ + { + "Key": "kubernetes.io/cluster/test-cluster-xyz789", + "Value": "owned", + } + ], + } + ] + }, + ] + + infra_id = detect_openshift_infra_id("test-cluster", "us-east-1") + + assert infra_id == "test-cluster-xyz789" + assert mock_ec2.describe_vpcs.call_count == 2 + + # First call: exact match + first_call = mock_ec2.describe_vpcs.call_args_list[0] + assert first_call.kwargs["Filters"][0]["Values"] == [ + "kubernetes.io/cluster/test-cluster" + ] + + # Second call: wildcard match + second_call = mock_ec2.describe_vpcs.call_args_list[1] + assert second_call.kwargs["Filters"][0]["Values"] == [ + "kubernetes.io/cluster/test-cluster-*" + ] + + @patch("openshift_resource_cleanup.openshift.detection.boto3.client") + def test_returns_none_when_no_vpc_found(self, mock_boto_client): + """ + GIVEN no VPC exists with cluster tags + WHEN detect_openshift_infra_id is called + THEN None should be returned + """ + mock_ec2 = Mock() + mock_boto_client.return_value = mock_ec2 + + mock_ec2.describe_vpcs.return_value = {"Vpcs": []} + + infra_id = detect_openshift_infra_id("nonexistent-cluster", "us-east-1") + + assert infra_id is None + + @patch("openshift_resource_cleanup.openshift.detection.boto3.client") + def test_returns_none_when_vpc_has_no_cluster_tags(self, mock_boto_client): + """ + GIVEN VPC exists but has no kubernetes cluster tags + WHEN detect_openshift_infra_id is called + THEN None should be returned + """ + mock_ec2 = Mock() + mock_boto_client.return_value = mock_ec2 + + mock_ec2.describe_vpcs.return_value = { + "Vpcs": [ + { + "VpcId": "vpc-abc123", + "Tags": [ + {"Key": "Name", "Value": "regular-vpc"}, + {"Key": "Environment", "Value": "test"}, + ], + } + ] + } + + infra_id = detect_openshift_infra_id("test-cluster", "us-east-1") + + assert infra_id is None + + @patch("openshift_resource_cleanup.openshift.detection.boto3.client") + def test_handles_aws_api_exception(self, mock_boto_client): + """ + GIVEN AWS API raises exception + WHEN detect_openshift_infra_id is called + THEN exception should be handled and None returned + """ + mock_ec2 = Mock() + mock_boto_client.return_value = mock_ec2 + + mock_ec2.describe_vpcs.side_effect = Exception("AWS API Error") + + infra_id = detect_openshift_infra_id("test-cluster", "us-east-1") + + assert infra_id is None diff --git a/IaC/cdk/openshift-resources-cleanup/tests/unit/openshift/test_openshift_dns.py 
b/IaC/cdk/openshift-resources-cleanup/tests/unit/openshift/test_openshift_dns.py new file mode 100644 index 0000000000..2f87682885 --- /dev/null +++ b/IaC/cdk/openshift-resources-cleanup/tests/unit/openshift/test_openshift_dns.py @@ -0,0 +1,147 @@ +"""Unit tests for OpenShift Route53 DNS cleanup.""" + +from __future__ import annotations +import pytest +from unittest.mock import Mock, patch + +from openshift_resource_cleanup.openshift.dns import cleanup_route53_records + + +@pytest.mark.unit +@pytest.mark.openshift +class TestCleanupRoute53Records: + """Test Route53 DNS record cleanup for OpenShift clusters.""" + + @patch("openshift_resource_cleanup.openshift.dns.boto3.client") + @patch("openshift_resource_cleanup.openshift.dns.OPENSHIFT_BASE_DOMAIN", "cd.percona.com") + @patch("openshift_resource_cleanup.openshift.dns.DRY_RUN", False) + def test_deletes_cluster_dns_records_live_mode(self, mock_boto_client): + """ + GIVEN OpenShift cluster DNS records exist in Route53 + WHEN cleanup_route53_records is called in live mode + THEN matching DNS records should be deleted + """ + mock_route53 = Mock() + mock_boto_client.return_value = mock_route53 + + mock_route53.list_hosted_zones.return_value = { + "HostedZones": [{"Id": "/hostedzone/Z123456", "Name": "cd.percona.com."}] + } + + mock_route53.list_resource_record_sets.return_value = { + "ResourceRecordSets": [ + { + "Name": "api.test-cluster.cd.percona.com.", + "Type": "A", + "TTL": 300, + "ResourceRecords": [{"Value": "1.2.3.4"}], + }, + { + "Name": "*.apps.test-cluster.cd.percona.com.", + "Type": "A", + "TTL": 300, + "ResourceRecords": [{"Value": "5.6.7.8"}], + }, + { + "Name": "other.cd.percona.com.", + "Type": "A", + "TTL": 300, + "ResourceRecords": [{"Value": "9.10.11.12"}], + }, + ] + } + + cleanup_route53_records("test-cluster", "us-east-1") + + mock_route53.list_hosted_zones.assert_called_once() + mock_route53.list_resource_record_sets.assert_called_once_with( + HostedZoneId="Z123456" + ) + + # Should delete 2 records (api and apps) but not the other one + call_args = mock_route53.change_resource_record_sets.call_args + assert call_args is not None + changes = call_args.kwargs["ChangeBatch"]["Changes"] + assert len(changes) == 2 + assert all(change["Action"] == "DELETE" for change in changes) + + @patch("openshift_resource_cleanup.openshift.dns.boto3.client") + @patch("openshift_resource_cleanup.openshift.dns.OPENSHIFT_BASE_DOMAIN", "cd.percona.com") + @patch("openshift_resource_cleanup.openshift.dns.DRY_RUN", True) + def test_skips_deletion_in_dry_run_mode(self, mock_boto_client): + """ + GIVEN OpenShift cluster DNS records exist + WHEN cleanup_route53_records is called in DRY_RUN mode + THEN no changes should be made + """ + mock_route53 = Mock() + mock_boto_client.return_value = mock_route53 + + mock_route53.list_hosted_zones.return_value = { + "HostedZones": [{"Id": "/hostedzone/Z123", "Name": "cd.percona.com."}] + } + + mock_route53.list_resource_record_sets.return_value = { + "ResourceRecordSets": [ + { + "Name": "api.test-cluster.cd.percona.com.", + "Type": "A", + "TTL": 300, + "ResourceRecords": [{"Value": "1.2.3.4"}], + } + ] + } + + cleanup_route53_records("test-cluster", "us-east-1") + + mock_route53.change_resource_record_sets.assert_not_called() + + @patch("openshift_resource_cleanup.openshift.dns.boto3.client") + @patch("openshift_resource_cleanup.openshift.dns.OPENSHIFT_BASE_DOMAIN", "cd.percona.com") + def test_handles_missing_hosted_zone(self, mock_boto_client): + """ + GIVEN hosted zone does not exist + WHEN 
cleanup_route53_records is called + THEN function should return without error + """ + mock_route53 = Mock() + mock_boto_client.return_value = mock_route53 + + mock_route53.list_hosted_zones.return_value = { + "HostedZones": [{"Id": "/hostedzone/Z999", "Name": "other-domain.com."}] + } + + cleanup_route53_records("test-cluster", "us-east-1") + + mock_route53.list_resource_record_sets.assert_not_called() + + @patch("openshift_resource_cleanup.openshift.dns.boto3.client") + @patch("openshift_resource_cleanup.openshift.dns.OPENSHIFT_BASE_DOMAIN", "cd.percona.com") + @patch("openshift_resource_cleanup.openshift.dns.DRY_RUN", False) + def test_handles_no_matching_records(self, mock_boto_client): + """ + GIVEN no DNS records match the cluster name + WHEN cleanup_route53_records is called + THEN no changes should be made + """ + mock_route53 = Mock() + mock_boto_client.return_value = mock_route53 + + mock_route53.list_hosted_zones.return_value = { + "HostedZones": [{"Id": "/hostedzone/Z123", "Name": "cd.percona.com."}] + } + + mock_route53.list_resource_record_sets.return_value = { + "ResourceRecordSets": [ + { + "Name": "other.cd.percona.com.", + "Type": "A", + "TTL": 300, + "ResourceRecords": [{"Value": "1.2.3.4"}], + } + ] + } + + cleanup_route53_records("test-cluster", "us-east-1") + + mock_route53.change_resource_record_sets.assert_not_called() diff --git a/IaC/cdk/openshift-resources-cleanup/tests/unit/openshift/test_openshift_logic.py b/IaC/cdk/openshift-resources-cleanup/tests/unit/openshift/test_openshift_logic.py new file mode 100644 index 0000000000..bacb7712fb --- /dev/null +++ b/IaC/cdk/openshift-resources-cleanup/tests/unit/openshift/test_openshift_logic.py @@ -0,0 +1 @@ +"""Unit tests for OpenShift cleanup orchestration logic.""" diff --git a/IaC/cdk/openshift-resources-cleanup/tests/unit/openshift/test_openshift_network.py b/IaC/cdk/openshift-resources-cleanup/tests/unit/openshift/test_openshift_network.py new file mode 100644 index 0000000000..612c308d23 --- /dev/null +++ b/IaC/cdk/openshift-resources-cleanup/tests/unit/openshift/test_openshift_network.py @@ -0,0 +1,403 @@ +"""Unit tests for OpenShift network cleanup functions. + +Tests the individual network resource deletion functions with mocked boto3 clients. 
+""" + +from __future__ import annotations +import pytest +from unittest.mock import Mock, patch +from botocore.exceptions import ClientError + +from openshift_resource_cleanup.openshift.network import ( + delete_nat_gateways, + release_elastic_ips, + cleanup_network_interfaces, + delete_vpc_endpoints, + delete_security_groups, + delete_subnets, + delete_route_tables, + delete_internet_gateway, + delete_vpc, +) + + +@pytest.mark.unit +@pytest.mark.openshift +class TestDeleteNatGateways: + """Test NAT gateway deletion.""" + + @patch("openshift_resource_cleanup.openshift.network.boto3.client") + @patch("openshift_resource_cleanup.openshift.network.DRY_RUN", False) + def test_deletes_nat_gateways_live_mode(self, mock_boto_client): + """ + GIVEN NAT gateways exist for OpenShift cluster + WHEN delete_nat_gateways is called in live mode + THEN delete_nat_gateway should be called for each NAT gateway + """ + mock_ec2 = Mock() + mock_boto_client.return_value = mock_ec2 + mock_ec2.describe_nat_gateways.return_value = { + "NatGateways": [ + {"NatGatewayId": "nat-abc123"}, + {"NatGatewayId": "nat-def456"}, + ] + } + + delete_nat_gateways("test-infra-123", "us-east-1") + + mock_ec2.describe_nat_gateways.assert_called_once_with( + Filters=[ + { + "Name": "tag:kubernetes.io/cluster/test-infra-123", + "Values": ["owned"], + }, + {"Name": "state", "Values": ["available", "pending"]}, + ] + ) + assert mock_ec2.delete_nat_gateway.call_count == 2 + mock_ec2.delete_nat_gateway.assert_any_call(NatGatewayId="nat-abc123") + mock_ec2.delete_nat_gateway.assert_any_call(NatGatewayId="nat-def456") + + @patch("openshift_resource_cleanup.openshift.network.boto3.client") + @patch("openshift_resource_cleanup.openshift.network.DRY_RUN", True) + def test_skips_deletion_in_dry_run_mode(self, mock_boto_client): + """ + GIVEN NAT gateways exist for OpenShift cluster + WHEN delete_nat_gateways is called in DRY_RUN mode + THEN no deletion should occur + """ + mock_ec2 = Mock() + mock_boto_client.return_value = mock_ec2 + mock_ec2.describe_nat_gateways.return_value = { + "NatGateways": [{"NatGatewayId": "nat-abc123"}] + } + + delete_nat_gateways("test-infra-123", "us-east-1") + + mock_ec2.describe_nat_gateways.assert_called_once() + mock_ec2.delete_nat_gateway.assert_not_called() + + @patch("openshift_resource_cleanup.openshift.network.boto3.client") + def test_handles_empty_nat_gateway_list(self, mock_boto_client): + """ + GIVEN no NAT gateways exist + WHEN delete_nat_gateways is called + THEN function should complete without errors + """ + mock_ec2 = Mock() + mock_boto_client.return_value = mock_ec2 + mock_ec2.describe_nat_gateways.return_value = {"NatGateways": []} + + delete_nat_gateways("test-infra-123", "us-east-1") + + mock_ec2.delete_nat_gateway.assert_not_called() + + +@pytest.mark.unit +@pytest.mark.openshift +class TestReleaseElasticIps: + """Test Elastic IP release.""" + + @patch("openshift_resource_cleanup.openshift.network.boto3.client") + @patch("openshift_resource_cleanup.openshift.network.DRY_RUN", False) + def test_releases_elastic_ips_live_mode(self, mock_boto_client): + """ + GIVEN Elastic IPs exist for OpenShift cluster + WHEN release_elastic_ips is called in live mode + THEN release_address should be called for each EIP + """ + mock_ec2 = Mock() + mock_boto_client.return_value = mock_ec2 + mock_ec2.describe_addresses.return_value = { + "Addresses": [ + {"AllocationId": "eipalloc-abc123"}, + {"AllocationId": "eipalloc-def456"}, + ] + } + + release_elastic_ips("test-infra-123", "us-east-1") + + 
mock_ec2.describe_addresses.assert_called_once_with( + Filters=[ + { + "Name": "tag:kubernetes.io/cluster/test-infra-123", + "Values": ["owned"], + } + ] + ) + assert mock_ec2.release_address.call_count == 2 + mock_ec2.release_address.assert_any_call(AllocationId="eipalloc-abc123") + mock_ec2.release_address.assert_any_call(AllocationId="eipalloc-def456") + + @patch("openshift_resource_cleanup.openshift.network.boto3.client") + @patch("openshift_resource_cleanup.openshift.network.DRY_RUN", False) + def test_handles_client_error_gracefully(self, mock_boto_client): + """ + GIVEN an EIP that cannot be released (already released) + WHEN release_elastic_ips is called + THEN ClientError should be caught and function continues + """ + mock_ec2 = Mock() + mock_boto_client.return_value = mock_ec2 + mock_ec2.describe_addresses.return_value = { + "Addresses": [{"AllocationId": "eipalloc-abc123"}] + } + mock_ec2.release_address.side_effect = ClientError( + {"Error": {"Code": "InvalidAllocationID.NotFound"}}, "ReleaseAddress" + ) + + # Should not raise exception + release_elastic_ips("test-infra-123", "us-east-1") + + mock_ec2.release_address.assert_called_once() + + +@pytest.mark.unit +@pytest.mark.openshift +class TestCleanupNetworkInterfaces: + """Test network interface cleanup.""" + + @patch("openshift_resource_cleanup.openshift.network.boto3.client") + @patch("openshift_resource_cleanup.openshift.network.DRY_RUN", False) + def test_deletes_available_enis(self, mock_boto_client): + """ + GIVEN available (orphaned) network interfaces in VPC + WHEN cleanup_network_interfaces is called in live mode + THEN delete_network_interface should be called for each ENI + """ + mock_ec2 = Mock() + mock_boto_client.return_value = mock_ec2 + mock_ec2.describe_network_interfaces.return_value = { + "NetworkInterfaces": [ + {"NetworkInterfaceId": "eni-abc123"}, + {"NetworkInterfaceId": "eni-def456"}, + ] + } + + cleanup_network_interfaces("vpc-123456", "us-east-1") + + mock_ec2.describe_network_interfaces.assert_called_once_with( + Filters=[ + {"Name": "vpc-id", "Values": ["vpc-123456"]}, + {"Name": "status", "Values": ["available"]}, + ] + ) + assert mock_ec2.delete_network_interface.call_count == 2 + + +@pytest.mark.unit +@pytest.mark.openshift +class TestDeleteVpcEndpoints: + """Test VPC endpoint deletion.""" + + @patch("openshift_resource_cleanup.openshift.network.boto3.client") + @patch("openshift_resource_cleanup.openshift.network.DRY_RUN", False) + def test_deletes_vpc_endpoints(self, mock_boto_client): + """ + GIVEN VPC endpoints exist in VPC + WHEN delete_vpc_endpoints is called in live mode + THEN delete_vpc_endpoints should be called for each endpoint + """ + mock_ec2 = Mock() + mock_boto_client.return_value = mock_ec2 + mock_ec2.describe_vpc_endpoints.return_value = { + "VpcEndpoints": [ + {"VpcEndpointId": "vpce-abc123"}, + {"VpcEndpointId": "vpce-def456"}, + ] + } + + delete_vpc_endpoints("vpc-123456", "us-east-1") + + assert mock_ec2.delete_vpc_endpoints.call_count == 2 + mock_ec2.delete_vpc_endpoints.assert_any_call(VpcEndpointIds=["vpce-abc123"]) + + +@pytest.mark.unit +@pytest.mark.openshift +class TestDeleteSecurityGroups: + """Test security group deletion with dependency handling.""" + + @patch("openshift_resource_cleanup.openshift.network.boto3.client") + @patch("openshift_resource_cleanup.openshift.network.DRY_RUN", False) + def test_removes_ingress_rules_before_deletion(self, mock_boto_client): + """ + GIVEN security groups with ingress rules + WHEN delete_security_groups is called + THEN 
ingress rules should be revoked before deletion + """ + mock_ec2 = Mock() + mock_boto_client.return_value = mock_ec2 + mock_ec2.describe_security_groups.return_value = { + "SecurityGroups": [ + { + "GroupId": "sg-abc123", + "GroupName": "openshift-sg", + "IpPermissions": [ + { + "IpProtocol": "tcp", + "FromPort": 443, + "ToPort": 443, + "IpRanges": [{"CidrIp": "0.0.0.0/0"}], + } + ], + } + ] + } + + delete_security_groups("vpc-123456", "us-east-1") + + # Should revoke ingress rules first + mock_ec2.revoke_security_group_ingress.assert_called_once() + # Then delete the security group + mock_ec2.delete_security_group.assert_called_once_with(GroupId="sg-abc123") + + @patch("openshift_resource_cleanup.openshift.network.boto3.client") + @patch("openshift_resource_cleanup.openshift.network.DRY_RUN", False) + def test_skips_default_security_group(self, mock_boto_client): + """ + GIVEN a VPC with default security group + WHEN delete_security_groups is called + THEN default security group should not be deleted + """ + mock_ec2 = Mock() + mock_boto_client.return_value = mock_ec2 + mock_ec2.describe_security_groups.return_value = { + "SecurityGroups": [ + { + "GroupId": "sg-default", + "GroupName": "default", + "IpPermissions": [], + } + ] + } + + delete_security_groups("vpc-123456", "us-east-1") + + mock_ec2.delete_security_group.assert_not_called() + + +@pytest.mark.unit +@pytest.mark.openshift +class TestDeleteSubnets: + """Test subnet deletion.""" + + @patch("openshift_resource_cleanup.openshift.network.boto3.client") + @patch("openshift_resource_cleanup.openshift.network.DRY_RUN", False) + def test_deletes_all_subnets(self, mock_boto_client): + """ + GIVEN subnets exist in VPC + WHEN delete_subnets is called in live mode + THEN delete_subnet should be called for each subnet + """ + mock_ec2 = Mock() + mock_boto_client.return_value = mock_ec2 + mock_ec2.describe_subnets.return_value = { + "Subnets": [ + {"SubnetId": "subnet-abc123"}, + {"SubnetId": "subnet-def456"}, + ] + } + + delete_subnets("vpc-123456", "us-east-1") + + assert mock_ec2.delete_subnet.call_count == 2 + + +@pytest.mark.unit +@pytest.mark.openshift +class TestDeleteRouteTables: + """Test route table deletion.""" + + @patch("openshift_resource_cleanup.openshift.network.boto3.client") + @patch("openshift_resource_cleanup.openshift.network.DRY_RUN", False) + def test_skips_main_route_table(self, mock_boto_client): + """ + GIVEN route tables including main route table + WHEN delete_route_tables is called + THEN main route table should not be deleted + """ + mock_ec2 = Mock() + mock_boto_client.return_value = mock_ec2 + mock_ec2.describe_route_tables.return_value = { + "RouteTables": [ + { + "RouteTableId": "rtb-main", + "Associations": [{"Main": True}], + }, + { + "RouteTableId": "rtb-custom", + "Associations": [{"Main": False}], + }, + ] + } + + delete_route_tables("vpc-123456", "us-east-1") + + # Should only delete non-main route table + mock_ec2.delete_route_table.assert_called_once_with(RouteTableId="rtb-custom") + + +@pytest.mark.unit +@pytest.mark.openshift +class TestDeleteInternetGateway: + """Test internet gateway deletion.""" + + @patch("openshift_resource_cleanup.openshift.network.boto3.client") + @patch("openshift_resource_cleanup.openshift.network.DRY_RUN", False) + def test_detaches_and_deletes_igw(self, mock_boto_client): + """ + GIVEN internet gateway attached to VPC + WHEN delete_internet_gateway is called + THEN IGW should be detached then deleted + """ + mock_ec2 = Mock() + mock_boto_client.return_value = mock_ec2 + 
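# Simulate a single internet gateway attached to the VPC under test.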
mock_ec2.describe_internet_gateways.return_value = { + "InternetGateways": [{"InternetGatewayId": "igw-abc123"}] + } + + delete_internet_gateway("vpc-123456", "us-east-1") + + mock_ec2.detach_internet_gateway.assert_called_once_with( + InternetGatewayId="igw-abc123", VpcId="vpc-123456" + ) + mock_ec2.delete_internet_gateway.assert_called_once_with( + InternetGatewayId="igw-abc123" + ) + + +@pytest.mark.unit +@pytest.mark.openshift +class TestDeleteVpc: + """Test VPC deletion.""" + + @patch("openshift_resource_cleanup.openshift.network.boto3.client") + @patch("openshift_resource_cleanup.openshift.network.DRY_RUN", False) + def test_deletes_vpc_live_mode(self, mock_boto_client): + """ + GIVEN VPC exists + WHEN delete_vpc is called in live mode + THEN delete_vpc should be called + """ + mock_ec2 = Mock() + mock_boto_client.return_value = mock_ec2 + + delete_vpc("vpc-123456", "us-east-1") + + mock_ec2.delete_vpc.assert_called_once_with(VpcId="vpc-123456") + + @patch("openshift_resource_cleanup.openshift.network.boto3.client") + @patch("openshift_resource_cleanup.openshift.network.DRY_RUN", True) + def test_skips_deletion_in_dry_run(self, mock_boto_client): + """ + GIVEN VPC exists + WHEN delete_vpc is called in DRY_RUN mode + THEN no deletion should occur + """ + mock_ec2 = Mock() + mock_boto_client.return_value = mock_ec2 + + delete_vpc("vpc-123456", "us-east-1") + + mock_ec2.delete_vpc.assert_not_called() diff --git a/IaC/cdk/openshift-resources-cleanup/tests/unit/openshift/test_openshift_storage.py b/IaC/cdk/openshift-resources-cleanup/tests/unit/openshift/test_openshift_storage.py new file mode 100644 index 0000000000..cb596a0781 --- /dev/null +++ b/IaC/cdk/openshift-resources-cleanup/tests/unit/openshift/test_openshift_storage.py @@ -0,0 +1,144 @@ +"""Unit tests for OpenShift S3 state storage cleanup.""" + +from __future__ import annotations +import pytest +from unittest.mock import Mock, patch +from botocore.exceptions import ClientError + +from openshift_resource_cleanup.openshift.storage import cleanup_s3_state + + +@pytest.mark.unit +@pytest.mark.openshift +class TestCleanupS3State: + """Test S3 state bucket cleanup for OpenShift clusters.""" + + @patch("openshift_resource_cleanup.openshift.storage.boto3.client") + @patch("openshift_resource_cleanup.openshift.storage.DRY_RUN", False) + def test_deletes_s3_objects_live_mode(self, mock_boto_client): + """ + GIVEN S3 objects exist for OpenShift cluster + WHEN cleanup_s3_state is called in live mode + THEN all cluster objects should be deleted + """ + mock_s3 = Mock() + mock_sts = Mock() + + def client_factory(service_name, **kwargs): + if service_name == "s3": + return mock_s3 + elif service_name == "sts": + return mock_sts + return Mock() + + mock_boto_client.side_effect = client_factory + + mock_sts.get_caller_identity.return_value = {"Account": "123456789012"} + + mock_s3.list_objects_v2.return_value = { + "Contents": [ + {"Key": "test-cluster/terraform.tfstate"}, + {"Key": "test-cluster/metadata.json"}, + ] + } + + cleanup_s3_state("test-cluster", "us-east-1") + + expected_bucket = "openshift-clusters-123456789012-us-east-1" + mock_s3.list_objects_v2.assert_called_once_with( + Bucket=expected_bucket, Prefix="test-cluster/" + ) + + assert mock_s3.delete_object.call_count == 2 + mock_s3.delete_object.assert_any_call( + Bucket=expected_bucket, Key="test-cluster/terraform.tfstate" + ) + mock_s3.delete_object.assert_any_call( + Bucket=expected_bucket, Key="test-cluster/metadata.json" + ) + + 
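# The expected bucket name above reflects the convention these tests assume for
+    # cleanup_s3_state: f"openshift-clusters-{account_id}-{region}", with every
+    # object for a cluster stored under the "<cluster-name>/" prefix.
+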
@patch("openshift_resource_cleanup.openshift.storage.boto3.client") + @patch("openshift_resource_cleanup.openshift.storage.DRY_RUN", True) + def test_skips_deletion_in_dry_run_mode(self, mock_boto_client): + """ + GIVEN S3 objects exist for cluster + WHEN cleanup_s3_state is called in DRY_RUN mode + THEN no deletions should occur + """ + mock_s3 = Mock() + mock_sts = Mock() + + def client_factory(service_name, **kwargs): + if service_name == "s3": + return mock_s3 + elif service_name == "sts": + return mock_sts + return Mock() + + mock_boto_client.side_effect = client_factory + + mock_sts.get_caller_identity.return_value = {"Account": "123456789012"} + mock_s3.list_objects_v2.return_value = { + "Contents": [{"Key": "test-cluster/terraform.tfstate"}] + } + + cleanup_s3_state("test-cluster", "us-east-1") + + mock_s3.delete_object.assert_not_called() + + @patch("openshift_resource_cleanup.openshift.storage.boto3.client") + @patch("openshift_resource_cleanup.openshift.storage.DRY_RUN", False) + def test_handles_no_contents_in_bucket(self, mock_boto_client): + """ + GIVEN no S3 objects exist for cluster + WHEN cleanup_s3_state is called + THEN function should complete without errors + """ + mock_s3 = Mock() + mock_sts = Mock() + + def client_factory(service_name, **kwargs): + if service_name == "s3": + return mock_s3 + elif service_name == "sts": + return mock_sts + return Mock() + + mock_boto_client.side_effect = client_factory + + mock_sts.get_caller_identity.return_value = {"Account": "123456789012"} + mock_s3.list_objects_v2.return_value = {} # No Contents key + + cleanup_s3_state("test-cluster", "us-east-1") + + mock_s3.delete_object.assert_not_called() + + @patch("openshift_resource_cleanup.openshift.storage.boto3.client") + @patch("openshift_resource_cleanup.openshift.storage.DRY_RUN", False) + def test_handles_missing_bucket_gracefully(self, mock_boto_client): + """ + GIVEN S3 bucket does not exist + WHEN cleanup_s3_state is called + THEN NoSuchBucket error should be handled gracefully + """ + mock_s3 = Mock() + mock_sts = Mock() + + def client_factory(service_name, **kwargs): + if service_name == "s3": + return mock_s3 + elif service_name == "sts": + return mock_sts + return Mock() + + mock_boto_client.side_effect = client_factory + + mock_sts.get_caller_identity.return_value = {"Account": "123456789012"} + mock_s3.list_objects_v2.side_effect = ClientError( + {"Error": {"Code": "NoSuchBucket"}}, "ListObjectsV2" + ) + + # Should not raise exception + cleanup_s3_state("test-cluster", "us-east-1") + + mock_s3.delete_object.assert_not_called() diff --git a/IaC/cdk/openshift-resources-cleanup/tests/unit/test_openshift_detection.py b/IaC/cdk/openshift-resources-cleanup/tests/unit/test_openshift_detection.py new file mode 100644 index 0000000000..0652f04b4b --- /dev/null +++ b/IaC/cdk/openshift-resources-cleanup/tests/unit/test_openshift_detection.py @@ -0,0 +1,227 @@ +"""Unit tests for OpenShift-specific cluster detection (vs EKS, vanilla K8s).""" + +from __future__ import annotations +import pytest +from unittest.mock import Mock, patch + +from openshift_resource_cleanup.handler import is_openshift_instance, extract_cluster_name_from_infra_id + + +class TestOpenShiftDetection: + """Test that we only detect OpenShift clusters, not EKS or other K8s.""" + + def test_detects_rosa_cluster_via_red_hat_clustertype_tag(self): + """GIVEN an instance with red-hat-clustertype: rosa tag + WHEN is_openshift_instance is called + THEN it should detect as OpenShift and return infra_id + """ + instance = { 
+ "InstanceId": "i-rosa123", + "Tags": [ + {"Key": "Name", "Value": "rosa-cluster-abc123-worker-0"}, + {"Key": "red-hat-clustertype", "Value": "rosa"}, + {"Key": "kubernetes.io/cluster/rosa-cluster-abc123", "Value": "owned"}, + ], + } + + is_openshift, infra_id = is_openshift_instance(instance, "us-east-1") + + assert is_openshift is True + assert infra_id == "rosa-cluster-abc123" + + def test_detects_openshift_via_red_hat_managed_tag(self): + """GIVEN an instance with red-hat-managed: true tag + WHEN is_openshift_instance is called + THEN it should detect as OpenShift + """ + instance = { + "InstanceId": "i-openshift456", + "Tags": [ + {"Key": "Name", "Value": "my-openshift-def456-infra-0"}, + {"Key": "red-hat-managed", "Value": "true"}, + {"Key": "kubernetes.io/cluster/my-openshift-def456", "Value": "owned"}, + ], + } + + is_openshift, infra_id = is_openshift_instance(instance, "us-east-2") + + assert is_openshift is True + assert infra_id == "my-openshift-def456" + + def test_detects_openshift_via_cluster_api_tag(self): + """GIVEN an instance with sigs.k8s.io/cluster-api-provider-aws tag + WHEN is_openshift_instance is called + THEN it should detect as OpenShift + """ + instance = { + "InstanceId": "i-clusterapi789", + "Tags": [ + {"Key": "Name", "Value": "ocp-cluster-ghi789-worker-1"}, + { + "Key": "sigs.k8s.io/cluster-api-provider-aws/cluster/ocp-cluster-ghi789", + "Value": "owned", + }, + {"Key": "kubernetes.io/cluster/ocp-cluster-ghi789", "Value": "owned"}, + ], + } + + is_openshift, infra_id = is_openshift_instance(instance, "eu-west-1") + + assert is_openshift is True + assert infra_id == "ocp-cluster-ghi789" + + @patch("openshift_resource_cleanup.openshift.detection.detect_openshift_infra_id") + def test_detects_openshift_via_master_name_pattern_fallback( + self, mock_detect_infra + ): + """GIVEN an instance with -master- in name (old detection method) + WHEN is_openshift_instance is called + THEN it should verify with detect_openshift_infra_id and detect if valid + """ + mock_detect_infra.return_value = "legacy-cluster-jkl012" + + instance = { + "InstanceId": "i-legacy123", + "Tags": [ + {"Key": "Name", "Value": "legacy-cluster-jkl012-master-0"}, + {"Key": "kubernetes.io/cluster/legacy-cluster-jkl012", "Value": "owned"}, + ], + } + + is_openshift, infra_id = is_openshift_instance(instance, "us-west-2") + + assert is_openshift is True + assert infra_id == "legacy-cluster-jkl012" + mock_detect_infra.assert_called_once_with("legacy-cluster", "us-west-2") + + def test_does_not_detect_eks_cluster(self): + """GIVEN an EKS instance (no Red Hat tags, has eks:cluster-name tag) + WHEN is_openshift_instance is called + THEN it should NOT detect as OpenShift + """ + instance = { + "InstanceId": "i-eks999", + "Tags": [ + {"Key": "Name", "Value": "eks-worker-node-1"}, + {"Key": "eks:cluster-name", "Value": "my-eks-cluster"}, + {"Key": "kubernetes.io/cluster/my-eks-cluster", "Value": "owned"}, + {"Key": "eks:nodegroup-name", "Value": "ng-1"}, + ], + } + + is_openshift, infra_id = is_openshift_instance(instance, "us-east-1") + + assert is_openshift is False + assert infra_id is None + + def test_does_not_detect_vanilla_kubernetes(self): + """GIVEN a vanilla K8s instance (generic kubernetes.io tag only) + WHEN is_openshift_instance is called + THEN it should NOT detect as OpenShift + """ + instance = { + "InstanceId": "i-k8s888", + "Tags": [ + {"Key": "Name", "Value": "k8s-worker-01"}, + {"Key": "kubernetes.io/cluster/my-k8s-cluster", "Value": "owned"}, + {"Key": "KubernetesCluster", "Value": 
"my-k8s-cluster"}, + ], + } + + is_openshift, infra_id = is_openshift_instance(instance, "eu-central-1") + + assert is_openshift is False + assert infra_id is None + + def test_does_not_detect_non_kubernetes_instance(self): + """GIVEN a regular EC2 instance (no K8s tags at all) + WHEN is_openshift_instance is called + THEN it should NOT detect as OpenShift + """ + instance = { + "InstanceId": "i-regular777", + "Tags": [ + {"Key": "Name", "Value": "web-server-01"}, + {"Key": "Environment", "Value": "production"}, + ], + } + + is_openshift, infra_id = is_openshift_instance(instance, "us-west-1") + + assert is_openshift is False + assert infra_id is None + + def test_detects_from_worker_node_not_just_master(self): + """GIVEN an OpenShift worker node (not master) + WHEN is_openshift_instance is called + THEN it should still detect the cluster via Red Hat tags + """ + instance = { + "InstanceId": "i-worker555", + "Tags": [ + {"Key": "Name", "Value": "prod-cluster-xyz999-worker-2"}, + {"Key": "red-hat-clustertype", "Value": "rosa"}, + {"Key": "kubernetes.io/cluster/prod-cluster-xyz999", "Value": "owned"}, + {"Key": "sigs.k8s.io/cluster-api-provider-aws/role", "Value": "worker"}, + ], + } + + is_openshift, infra_id = is_openshift_instance(instance, "eu-west-2") + + assert is_openshift is True + assert infra_id == "prod-cluster-xyz999" + + def test_detects_from_infra_node(self): + """GIVEN an OpenShift infra node + WHEN is_openshift_instance is called + THEN it should detect the cluster via Red Hat tags + """ + instance = { + "InstanceId": "i-infra444", + "Tags": [ + {"Key": "Name", "Value": "prod-cluster-xyz999-infra-0"}, + {"Key": "red-hat-managed", "Value": "true"}, + {"Key": "kubernetes.io/cluster/prod-cluster-xyz999", "Value": "owned"}, + ], + } + + is_openshift, infra_id = is_openshift_instance(instance, "eu-west-2") + + assert is_openshift is True + assert infra_id == "prod-cluster-xyz999" + + +class TestExtractClusterName: + """Test cluster name extraction from infra ID.""" + + def test_extracts_cluster_name_from_infra_id(self): + """GIVEN an infra ID like 'jvp-rosa1-qmdkk' + WHEN extract_cluster_name_from_infra_id is called + THEN it should return 'jvp-rosa1' (without random suffix) + """ + result = extract_cluster_name_from_infra_id("jvp-rosa1-qmdkk") + assert result == "jvp-rosa1" + + def test_extracts_cluster_name_from_longer_infra_id(self): + """GIVEN a longer infra ID with multiple dashes + WHEN extract_cluster_name_from_infra_id is called + THEN it should return everything except the last segment + """ + result = extract_cluster_name_from_infra_id("my-production-cluster-abc123") + assert result == "my-production-cluster" + + def test_returns_unchanged_if_single_segment(self): + """GIVEN an infra ID with no dashes + WHEN extract_cluster_name_from_infra_id is called + THEN it should return the ID unchanged + """ + result = extract_cluster_name_from_infra_id("simplecluster") + assert result == "simplecluster" + + def test_returns_unchanged_if_only_two_segments(self): + """GIVEN an infra ID with only two segments + WHEN extract_cluster_name_from_infra_id is called + THEN it should return the first segment only + """ + result = extract_cluster_name_from_infra_id("cluster-id") + assert result == "cluster" diff --git a/IaC/justfile b/IaC/justfile new file mode 100644 index 0000000000..c7245ef8c3 --- /dev/null +++ b/IaC/justfile @@ -0,0 +1,21 @@ +# Root IaC Justfile - Routes to CDK projects +# Usage: just + +# Default - show available projects +default: + @echo "Available CDK projects:" + 
@echo " aws-resources-cleanup - Comprehensive AWS resource cleanup Lambda" + @echo "" + @echo "Usage from IaC/:" + @echo " just aws-resources-cleanup Show project help" + @echo " just aws-resources-cleanup Run project command" + @echo "" + @echo "Common commands:" + @echo " just aws-resources-cleanup deploy Deploy in DRY_RUN mode" + @echo " just aws-resources-cleanup logs Tail CloudWatch logs" + @echo " just aws-resources-cleanup invoke-aws Test Lambda execution" + @echo " just aws-resources-cleanup update-code Fast Lambda code update" + +# AWS Resources Cleanup +aws-resources-cleanup *ARGS: + @cd cdk/aws-resources-cleanup && just {{ARGS}}