From c92d6e25b0e6eead6427c788cfef4c3816cb689b Mon Sep 17 00:00:00 2001
From: Secretions
Date: Thu, 2 Mar 2023 12:12:30 -0800
Subject: [PATCH] PLAT-5668: TF converter integration test / misc. follow-up (#131)

* Automatically destroy ASG instances before ASGs
* Deal with optionality of bastion
* Same with monitoring bucket
* Stack name is sanitized to `\w` or something similar, deal with that
* Public API setting unsupported, notify user
* Don't hardcode S3 force destroy variable
* Get route53 zone name from api; agent config going away
* Maybe get EKS version from api?
* Support flowlog import into eks module if/when added to that
* Determine if we should do something more clever for the ASG node draining
* Optionality between managed/unmanaged nodegroups, efs backups, comb for other things
* Track the upstream AZ/Subnet changes and change input variable accordingly (if relevant)
* (barely) Manage the default auto-generated EKS SG because its destruction is unreliable
* Integration test
---
 .github/workflows/test.yml                    | 177 +++++++++++
 .github/workflows/tests.yml                   | 129 --------
 cdk/domino_cdk/config/eks.py                  |   6 +-
 convert/README.md                             |   4 +
 convert/cloudformation-only/main.tf           |   4 +-
 convert/cloudformation-only/variables.tf      |   6 +
 convert/data/bastion_resources.yaml           |  21 ++
 convert/data/efs_backup_resources.yaml        |  12 +
 convert/data/monitoring_bucket_resources.yaml |   6 +
 convert/data/per_az.yaml                      |  35 +++
 convert/data/resource_template.yaml           |  46 +++
 convert/data/route53_resources.yaml           |   6 +
 convert/data/unmanaged_nodegroup.yaml         |  21 ++
 convert/lib/convert.py                        | 279 +++++++++++++-----
 convert/lib/meta.py                           | 167 +----------
 convert/lib/nuke.py                           | 112 +++++--
 convert/requirements.txt                      |   1 +
 convert/terraform/main.tf                     |   6 +-
 convert/terraform/variables.tf                |  27 ++
 convert/terraform/vpc.tf                      |  18 ++
 20 files changed, 693 insertions(+), 390 deletions(-)
 create mode 100644 .github/workflows/test.yml
 delete mode 100644 .github/workflows/tests.yml
 create mode 100644 convert/data/bastion_resources.yaml
 create mode 100644 convert/data/efs_backup_resources.yaml
 create mode 100644 convert/data/monitoring_bucket_resources.yaml
 create mode 100644 convert/data/per_az.yaml
 create mode 100644 convert/data/resource_template.yaml
 create mode 100644 convert/data/route53_resources.yaml
 create mode 100644 convert/data/unmanaged_nodegroup.yaml

diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
new file mode 100644
index 00000000..3de94f60
--- /dev/null
+++ b/.github/workflows/test.yml
@@ -0,0 +1,177 @@
+# This workflow will install Python dependencies, run tests and lint with a single version of Python
+# For more information see: https://help.github.com/actions/language-and-framework-guides/using-python-with-github-actions
+
+name: Run tests / create artifacts
+
+on:
+  push:
+    branches: [ master ]
+  pull_request:
+    types: [opened, synchronize, reopened, labeled, ready_for_review]
+    branches: [ master ]
+
+jobs:
+  test-build-deploy:
+    runs-on: ubuntu-latest
+    env:
+      AWS_ACCESS_KEY_ID: ${{ secrets.DELTA_KEY_ID }}
+      AWS_SECRET_ACCESS_KEY: ${{ secrets.DELTA_ACCESS_KEY }}
+      AWS_REGION: us-west-2
+    defaults:
+      run:
+        working-directory: ./cdk
+    steps:
+    - uses: actions/checkout@v2
+    - name: Set up Python 3.9
+      uses: actions/setup-python@v2
+      with:
+        python-version: 3.9
+    - name: Setup nodejs
+      uses: actions/setup-node@v2
+      with:
+        node-version: 16
+    - name: Install dependencies
+      run: |
+        pip install -r requirements.txt
+        pip install awscli==1.25.57 build
+    - name: Install aws-cdk
+      run: npm install -g aws-cdk@$(pip
freeze | grep aws-cdk.core | sed -e 's/.*==//') + - name: Lint with flake8/black/isort + run: | + set -x + export FILES=(*.py domino_cdk tests ../convert/) + for f in ${FILES[@]}; do + # stop the build if there are Python syntax errors or undefined names + flake8 $f --count --select=E9,F63,F7,F82 --show-source --statistics + # exit-zero treats all errors as warnings. + flake8 $f --count --ignore=E501,W503 --exit-zero --statistics + black $f --check + isort $f --check + done + - name: Test with pytest + run: | + coverage run -m pytest tests + + - name: Coverage report + run: | + coverage report + - name: Create/lint default config + env: + AWS_ACCOUNT_ID: ${{ secrets.DELTA_ACCOUNT_ID }} + GITHUB_SHA: ${{ github.sha }} + REGISTRY_USERNAME: ${{ secrets.REGISTRY_USERNAME }} + REGISTRY_PASSWORD: ${{ secrets.REGISTRY_PASSWORD }} + ACM_CERT_ARN: ${{ secrets.DELTA_ACM_CERT_ARN }} + BASE_DOMAIN: ${{ secrets.DELTA_BASE_DOMAIN }} + run: | + export NAME=cdk-${GITHUB_SHA:0:6}-$(date +%s) + echo "NAME=$NAME" >> $GITHUB_ENV + ./util.py generate_config_template --name $NAME --aws-region=$AWS_REGION --aws-account-id=$AWS_ACCOUNT_ID --dev --platform-nodegroups 2 --registry-username $REGISTRY_USERNAME --registry-password $REGISTRY_PASSWORD --hostname $NAME.$BASE_DOMAIN --acm-cert-arn $ACM_CERT_ARN --disable-flow-logs > config.yaml + ./util.py load_config -f ./config.yaml + - name: Test default config (single and nested stacks) + run: | + cdk synth --context singlestack=true -q + cdk synth -q + - name: Upload distribution artifacts + env: + DOMINO_CDK_VERSION: "0.0.0+${{ github.sha }}" + DATEDIR: "date +%Y%m%d" + run: | + cd .. + make clean && make dist + for suffix in "" "-terraform"; do + filename="domino-cdk$suffix-$DOMINO_CDK_VERSION.tar.gz" + aws s3 cp --acl=public-read ./dist/$filename s3://domino-artifacts/cdk/$($DATEDIR)/$filename + urlfile=$(python -c 'import sys, urllib.parse; print(urllib.parse.quote(sys.stdin.read().strip()))' <<< "$filename") + echo "Artifact url: https://domino-artifacts.s3.amazonaws.com/cdk/$($DATEDIR)/$urlfile" + done + - name: Deploy CDK + if: contains(github.event.pull_request.labels.*.name, 'deploy-test') || github.ref == 'refs/heads/master' + env: + REGISTRY_USERNAME: ${{ secrets.REGISTRY_USERNAME }} + REGISTRY_PASSWORD: ${{ secrets.REGISTRY_PASSWORD }} + run: | + docker login -u $REGISTRY_USERNAME -p $REGISTRY_PASSWORD quay.io + cdk deploy --require-approval never --outputs-file outputs.json + $(jq -r ".[].ekskubeconfigcmd" outputs.json) --kubeconfig ./kubeconfig + - name: Collect diagnostic data + if: always() && (contains(github.event.pull_request.labels.*.name, 'deploy-test') || github.ref == 'refs/heads/master') + env: + KUBECONFIG: ./kubeconfig + LOG_DIR: /tmp/k8s-cluster-state/cdk + run: | + set +e + curl -Lo /usr/local/bin/kubectl "https://dl.k8s.io/release/v1.23.6/bin/linux/amd64/kubectl" + for ns in domino-platform domino-compute domino-system kube-system; do + mkdir -p $LOG_DIR/$ns + kubectl -n $ns get ing -o yaml > $LOG_DIR/$ns/ingress.txt + kubectl -n $ns get po -o yaml > $LOG_DIR/$ns/pods.txt + kubectl -n $ns describe po > $LOG_DIR/$ns/pods-described.txt + kubectl -n $ns get pvc -o yaml > $LOG_DIR/$ns/pvcs.txt + kubectl -n $ns get svc -o yaml > $LOG_DIR/$ns/svcs.txt + kubectl -n $ns describe svc > $LOG_DIR/$ns/svcs-described.txt + kubectl -n $ns get events > $LOG_DIR/$ns/events.txt + done + kubectl get pv -o yaml > $LOG_DIR/pvs.txt + kubectl get no -o yaml > $LOG_DIR/nodes.txt + kubectl describe no > $LOG_DIR/nodes-described.txt + - name: Setup 
cloudformation-only user for safe stack deletion + if: (contains(github.event.pull_request.labels.*.name, 'deploy-test') || github.ref == 'refs/heads/master') + working-directory: ./convert/cloudformation-only + run: | + echo "{\"region\":\"us-west-2\",\"tags\":{},\"suffix\":\"$NAME\"}" > terraform.tfvars.json + terraform init + terraform plan -out=terraform.plan + terraform apply -auto-approve terraform.plan + - name: Setup terraform-aws-eks conversion process + if: (contains(github.event.pull_request.labels.*.name, 'deploy-test') || github.ref == 'refs/heads/master') + working-directory: ./convert + run: | + pip install -r requirements.txt + ssh-keygen -t rsa -f dummy.pem -N '' + ./convert.py print-stack --stack-name $NAME --region $AWS_REGION --verbose --yaml > stack-data.yaml + ./convert.py create-tfvars --stack-name $NAME --region $AWS_REGION --ssh-key-path ./dummy.pem > terraform/terraform.tfvars.json + ./convert.py get-imports --stack-name $NAME --region $AWS_REGION > terraform/imports.sh + - name: Run terraform + if: (contains(github.event.pull_request.labels.*.name, 'deploy-test') || github.ref == 'refs/heads/master') + working-directory: ./convert/terraform + run: | + terraform init + bash imports.sh + terraform plan -out=terraform.plan + terraform show -json terraform.plan | jq '.resource_changes[] | select(.change.actions[]=="delete")' | tee terraform.plan.json + test -s terraform.plan.json && echo "Detected deletions, bailing..." && exit 1 + terraform apply -auto-approve terraform.plan + - name: Clean and delete stack + if: (contains(github.event.pull_request.labels.*.name, 'deploy-test') || github.ref == 'refs/heads/master') + working-directory: ./convert + run: | + ./convert.py clean-stack --stack-name $NAME --region $AWS_REGION --remove-security-group-references --delete + ./convert.py delete-stack --stack-name $NAME --region $AWS_REGION --delete + - name: Upload diagnostic data + if: always() && (contains(github.event.pull_request.labels.*.name, 'deploy-test') || github.ref == 'refs/heads/master') + uses: actions/upload-artifact@v2 + with: + name: Diagnostic Data + path: ./ + retention-days: 14 + - name: Delete stack w/CDK + if: always() && (contains(github.event.pull_request.labels.*.name, 'deploy-test') || github.ref == 'refs/heads/master') + working-directory: ./cdk + run: | + cdk destroy --force + - name: Destroy Infrastructure + if: always() && (contains(github.event.pull_request.labels.*.name, 'deploy-test') || github.ref == 'refs/heads/master') + working-directory: ./convert/terraform + run: | + terraform destroy -auto-approve + - name: Destroy Infrastructure + if: always() && (contains(github.event.pull_request.labels.*.name, 'deploy-test') || github.ref == 'refs/heads/master') + working-directory: ./convert/cloudformation-only + run: | + terraform destroy -auto-approve + - name: Fail without deploy + if: ${{ github.event.pull_request.draft == false && ! (contains(github.event.pull_request.labels.*.name, 'deploy-test') || contains(github.event.pull_request.labels.*.name, 'no-deploy-needed') || github.ref == 'refs/heads/master') }} + run: | + echo "Deploy tests required on non-draft PRs. Please add 'deploy-test' label". 
+ exit 1 diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml deleted file mode 100644 index e6be64c4..00000000 --- a/.github/workflows/tests.yml +++ /dev/null @@ -1,129 +0,0 @@ -# This workflow will install Python dependencies, run tests and lint with a single version of Python -# For more information see: https://help.github.com/actions/language-and-framework-guides/using-python-with-github-actions - -name: Run tests / create artifacts - -on: - push: - branches: [ master ] - pull_request: - types: [opened, synchronize, reopened, labeled, ready_for_review] - branches: [ master ] - -jobs: - test-build-deploy: - runs-on: ubuntu-latest - env: - DEPLOYER_IMAGE: quay.io/domino/deployer:develop.f72b81d5db7e04cc48478d310eafa4abb927ce7f - defaults: - run: - working-directory: ./cdk - steps: - - uses: actions/checkout@v2 - - name: Set up Python 3.9 - uses: actions/setup-python@v2 - with: - python-version: 3.9 - - name: Setup nodejs - uses: actions/setup-node@v2 - with: - node-version: 16 - - name: Determine deployer image - env: - PR_BODY: ${{ github.event.pull_request.body }} - run: | - export PR_DEPLOYER_IMAGE=$(echo $PR_BODY | grep -oP "deployer_image: \K\S+") - export DEPLOYER_IMAGE=${PR_DEPLOYER_IMAGE:-$DEPLOYER_IMAGE} - echo Using deployer image: $DEPLOYER_IMAGE - echo "DEPLOYER_IMAGE=$DEPLOYER_IMAGE" >> $GITHUB_ENV - - name: Install dependencies - run: | - pip install -r requirements.txt - pip install awscli==1.25.57 build - - name: Install aws-cdk - run: npm install -g aws-cdk@$(pip freeze | grep aws-cdk.core | sed -e 's/.*==//') - - name: Lint with flake8/black/isort - run: | - set -x - export FILES=(*.py domino_cdk tests ../convert/) - for f in ${FILES[@]}; do - # stop the build if there are Python syntax errors or undefined names - flake8 $f --count --select=E9,F63,F7,F82 --show-source --statistics - # exit-zero treats all errors as warnings. - flake8 $f --count --ignore=E501,W503 --exit-zero --statistics - black $f --check - isort $f --check - done - - name: Test with pytest - run: | - coverage run -m pytest tests - - - name: Coverage report - run: | - coverage report - - name: Create/lint default config - env: - AWS_ACCOUNT_ID: ${{ secrets.DELTA_ACCOUNT_ID }} - AWS_ACCESS_KEY_ID: ${{ secrets.DELTA_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.DELTA_ACCESS_KEY }} - GITHUB_SHA: ${{ github.sha }} - REGISTRY_USERNAME: ${{ secrets.REGISTRY_USERNAME }} - REGISTRY_PASSWORD: ${{ secrets.REGISTRY_PASSWORD }} - ACM_CERT_ARN: ${{ secrets.DELTA_ACM_CERT_ARN }} - BASE_DOMAIN: ${{ secrets.DELTA_BASE_DOMAIN }} - run: | - export NAME=cdk-deploy-${GITHUB_SHA:0:6} - ./util.py generate_config_template --name $NAME --aws-region=us-west-2 --aws-account-id=$AWS_ACCOUNT_ID --dev --platform-nodegroups 2 --registry-username $REGISTRY_USERNAME --registry-password $REGISTRY_PASSWORD --hostname $NAME.$BASE_DOMAIN --acm-cert-arn $ACM_CERT_ARN --disable-flow-logs > config.yaml - ./util.py load_config -f ./config.yaml - - name: Test default config (single and nested stacks) - env: - AWS_ACCESS_KEY_ID: ${{ secrets.DELTA_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.DELTA_ACCESS_KEY }} - run: | - cdk synth --context singlestack=true -q - cdk synth -q - - name: Upload distribution artifacts - env: - AWS_ACCESS_KEY_ID: ${{ secrets.DOMINO_ARTIFACTS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.DOMINO_ARTIFACTS_ACCESS_KEY }} - DOMINO_CDK_VERSION: "0.0.0+${{ github.sha }}" - DATEDIR: "date +%Y%m%d" - run: | - cd .. 
- make clean && make dist - for suffix in "" "-terraform"; do - filename="domino-cdk$suffix-$DOMINO_CDK_VERSION.tar.gz" - aws s3 cp --acl=public-read ./dist/$filename s3://domino-artifacts/cdk/$($DATEDIR)/$filename - urlfile=$(python -c 'import sys, urllib.parse; print(urllib.parse.quote(sys.stdin.read().strip()))' <<< "$filename") - echo "Artifact url: https://domino-artifacts.s3.amazonaws.com/cdk/$($DATEDIR)/$urlfile" - done - - name: Deploy CDK - if: contains(github.event.pull_request.labels.*.name, 'deploy-test') || github.ref == 'refs/heads/master' - env: - AWS_ACCESS_KEY_ID: ${{ secrets.DELTA_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.DELTA_ACCESS_KEY }} - REGISTRY_USERNAME: ${{ secrets.REGISTRY_USERNAME }} - REGISTRY_PASSWORD: ${{ secrets.REGISTRY_PASSWORD }} - run: | - docker login -u $REGISTRY_USERNAME -p $REGISTRY_PASSWORD quay.io - cdk deploy --require-approval never --outputs-file outputs.json - - name: Upload diagnostic data - if: always() && (contains(github.event.pull_request.labels.*.name, 'deploy-test') || github.ref == 'refs/heads/master') - uses: actions/upload-artifact@v2 - with: - name: Diagnostic Data - path: ./ - retention-days: 14 - - name: Destroy CDK - if: always() && (contains(github.event.pull_request.labels.*.name, 'deploy-test') || github.ref == 'refs/heads/master') - env: - AWS_ACCESS_KEY_ID: ${{ secrets.DELTA_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.DELTA_ACCESS_KEY }} - run: | - sleep 120 # Immediate destroy after cdk deploy causes race conditions + give k8s time to deprovision after domino Uninstall - cdk destroy --force - - name: Fail without deploy - if: ${{ github.event.pull_request.draft == false && ! (contains(github.event.pull_request.labels.*.name, 'deploy-test') || contains(github.event.pull_request.labels.*.name, 'no-deploy-needed') || github.ref == 'refs/heads/master') }} - run: | - echo "Deploy tests required on non-draft PRs. Please add 'deploy-test' label". - exit 1 diff --git a/cdk/domino_cdk/config/eks.py b/cdk/domino_cdk/config/eks.py index dfc615d1..c8cf9914 100644 --- a/cdk/domino_cdk/config/eks.py +++ b/cdk/domino_cdk/config/eks.py @@ -45,6 +45,7 @@ class NodegroupBase: instance_types: ["m5.2xlarge", "m5.4xlarge"] - Instance types available to nodegroup labels: some-label: "true" - Labels to apply to all nodes in nodegroup tags: some-tag: "true" - Tags to apply to all nodes in nodegroup + ssm_agent: true/false - Install SSM agent (ie for console access via aws web ui) ... Managed nodegroup-specific options: spot: true/false - Use spot instances, may affect reliability/availability of nodegroup @@ -52,7 +53,6 @@ class NodegroupBase: ... Unmanaged nodegroup-specific options: gpu: true/false - Setup GPU instance support - ssm_agent: true/false - Install SSM agent (ie for console access via aws web ui) taints: some-taint: "true" - Taints to apply to all nodes in nodegroup ie to taint gpu nodes, etc.) """ @@ -146,6 +146,10 @@ def check_ami_exceptions(ng_name: str, ami_id: str, user_data: str, incompatible errors.append( f"Error: {error_name} has min_size of 0. Only unmanaged nodegroups support min_size of 0." ) + if ng.min_size > ng.desired_size: + errors.append( + f"Error: {error_name} has a desired_size of {ng.desired_size}, which can't be less than the min_size (currently: {ng.min_size})." 
+            )
         for name, ng in self.unmanaged_nodegroups.items():
             error_name = f"Unmanaged nodegroup [{name}]"
             check_ami_exceptions(error_name, ng.ami_id, ng.user_data, ["ssm_agent", "labels", "taints", "disk_size"])
diff --git a/convert/README.md b/convert/README.md
index f7b014ca..73158409 100644
--- a/convert/README.md
+++ b/convert/README.md
@@ -78,6 +78,10 @@ Change to the `terraform/` directory, and run the following two commands:
 
 `imports.sh` will run *several* `terraform import` commands in a row. This is normal.
 
+### Review and Configure Node Groups
+
+This conversion process will create a tfvars file with the default nodegroups for the terraform-aws-eks module. Please review the instructions for that module and configure the nodegroups as desired (an illustrative snippet is included at the end of this patch). Your old nodegroups will remain functional after running the conversion (until the "clean-stack" step).
+
 ### Evaluate the Terraform plan
 
 Once everything has been imported, you can run `terraform plan` and evaluate what terraform will do:
diff --git a/convert/cloudformation-only/main.tf b/convert/cloudformation-only/main.tf
index 5d2b7720..c458acce 100644
--- a/convert/cloudformation-only/main.tf
+++ b/convert/cloudformation-only/main.tf
@@ -1,5 +1,5 @@
 resource "aws_iam_policy" "cloudformation_only" {
-  name = "cloudformation-only"
+  name = "cloudformation-only${var.suffix}"
 
   policy = jsonencode({
     Version = "2012-10-17"
@@ -14,7 +14,7 @@ resource "aws_iam_policy" "cloudformation_only" {
 }
 
 resource "aws_iam_role" "cloudformation_only" {
-  name        = "cloudformation-only"
+  name        = "cloudformation-only${var.suffix}"
   description = "Allows CloudFormation to create and manage AWS stacks and resources on your behalf, but nothing else"
 
   assume_role_policy = jsonencode({
diff --git a/convert/cloudformation-only/variables.tf b/convert/cloudformation-only/variables.tf
index 09325b55..fe1a9ab9 100644
--- a/convert/cloudformation-only/variables.tf
+++ b/convert/cloudformation-only/variables.tf
@@ -7,3 +7,9 @@ variable "tags" {
   type        = map(string)
   description = "Deployment tags."
} + +variable "suffix" { + type = string + description = "Optional suffix for role/policy names" + default = "" +} diff --git a/convert/data/bastion_resources.yaml b/convert/data/bastion_resources.yaml new file mode 100644 index 00000000..3bd4efdb --- /dev/null +++ b/convert/data/bastion_resources.yaml @@ -0,0 +1,21 @@ +name: bastion +type: optional +resources: + eks_stack: + - cf_sgr: + rule: _ingress_tcp_22_22_ + rule_sg: bastionsg + rule_sg_stack: vpc_stack + sg: UnmanagedSG + tf: module.domino_eks.module.eks.aws_security_group_rule.bastion_eks["eks_nodes_ssh_from_bastion"] + vpc_stack: + - cf: bastionsg + tf: module.domino_eks.module.bastion[0].aws_security_group.bastion + - cf_sgr: + rule: _egress_all_0_0_0.0.0.0/0 + sg: bastionsg + tf: module.domino_eks.module.bastion[0].aws_security_group_rule.bastion_outbound + - cf_sgr: + rule: _ingress_tcp_22_22_0.0.0.0/0 + sg: bastionsg + tf: module.domino_eks.module.bastion[0].aws_security_group_rule.bastion["bastion_inbound_ssh"] diff --git a/convert/data/efs_backup_resources.yaml b/convert/data/efs_backup_resources.yaml new file mode 100644 index 00000000..f5cca0c9 --- /dev/null +++ b/convert/data/efs_backup_resources.yaml @@ -0,0 +1,12 @@ +name: efs_backup +type: optional +resources: + efs_stack: + - cf: efsbackup + tf: module.domino_eks.module.storage.aws_backup_vault.efs[0] + - cf: efsbackupplan + tf: module.domino_eks.module.storage.aws_backup_plan.efs[0] + - cf: efsbackuprole + tf: module.domino_eks.module.storage.aws_iam_role.efs_backup_role[0] + - cf_backupselection: efsbackupselection + tf: module.domino_eks.module.storage.aws_backup_selection.efs[0] diff --git a/convert/data/monitoring_bucket_resources.yaml b/convert/data/monitoring_bucket_resources.yaml new file mode 100644 index 00000000..3edd16c3 --- /dev/null +++ b/convert/data/monitoring_bucket_resources.yaml @@ -0,0 +1,6 @@ +name: monitoring_bucket +type: optional +resources: + s3_stack: + - cf: monitoring + tf: module.domino_eks.module.storage.aws_s3_bucket.monitoring diff --git a/convert/data/per_az.yaml b/convert/data/per_az.yaml new file mode 100644 index 00000000..c8417500 --- /dev/null +++ b/convert/data/per_az.yaml @@ -0,0 +1,35 @@ +name: per_az +type: availability_zone +resources: + efs_stack: + - cf: EfsEfsMountTarget%az_count_plus% + tf: module.domino_eks.module.storage.aws_efs_mount_target.eks[%az_count%] + vpc_stack: + - cf: VPC%cf_stack_key%PublicSubnet%az_count_plus%Subnet + tf: aws_subnet.public[%az_count%] + - cf: VPC%cf_stack_key%PublicSubnet%az_count_plus%RouteTable + tf: aws_route_table.public[%az_count%] + - cf_rtassoc: + route_table: VPC%cf_stack_key%PublicSubnet%az_count_plus%RouteTable + subnet: VPC%cf_stack_key%PublicSubnet%az_count_plus%Subnet + tf: aws_route_table_association.public[%az_count%] + - cf: VPC%cf_stack_key%PrivateSubnet%az_count_plus%Subnet + tf: aws_subnet.private[%az_count%] + - cf: VPC%cf_stack_key%PrivateSubnet%az_count_plus%RouteTable + tf: aws_route_table.private[%az_count%] + - cf_rtassoc: + route_table: VPC%cf_stack_key%PrivateSubnet%az_count_plus%RouteTable + subnet: VPC%cf_stack_key%PrivateSubnet%az_count_plus%Subnet + tf: aws_route_table_association.private[%az_count%] + - cf: '%cf_stack_key%PodSubnet%az_count_plus%Subnet' + tf: aws_subnet.pod[%az_count%] + - cf: '%cf_stack_key%PodSubnet%az_count_plus%RouteTable' + tf: aws_route_table.pod[%az_count%] + - cf_rtassoc: + route_table: '%cf_stack_key%PodSubnet%az_count_plus%RouteTable' + subnet: '%cf_stack_key%PodSubnet%az_count_plus%Subnet' + tf: 
aws_route_table_association.pod[%az_count%] + - cf: VPC%cf_stack_key%PublicSubnet%az_count_plus%EIP + tf: aws_eip.nat_gateway[%az_count%] + - cf: VPC%cf_stack_key%PublicSubnet%az_count_plus%NATGateway + tf: aws_nat_gateway.public[%az_count%] diff --git a/convert/data/resource_template.yaml b/convert/data/resource_template.yaml new file mode 100644 index 00000000..fee71cba --- /dev/null +++ b/convert/data/resource_template.yaml @@ -0,0 +1,46 @@ +name: resource_template +resources: + efs_stack: + - cf: Efs + tf: module.domino_eks.module.storage.aws_efs_file_system.eks + - cf: Efsaccesspoint + tf: module.domino_eks.module.storage.aws_efs_access_point.eks + eks_stack: + - cf: eks + tf: module.domino_eks.module.eks.aws_eks_cluster.this + - cf: EKSSG + tf: module.domino_eks.module.eks.aws_security_group.eks_cluster + - cf: eksRole + tf: module.domino_eks.module.eks.aws_iam_role.eks_cluster + - tf: module.domino_eks.module.eks.aws_cloudwatch_log_group.eks_cluster + value: /aws/eks/%stack_name%/cluster + - cf: S3 + tf: module.domino_eks.module.storage.aws_iam_policy.s3 + - tf: module.domino_eks.module.eks.aws_eks_addon.this["coredns"] + value: '%stack_name%:coredns' + - tf: module.domino_eks.module.eks.aws_eks_addon.vpc_cni + value: '%stack_name%:vpc-cni' + - tf: module.domino_eks.module.eks.aws_eks_addon.this["kube-proxy"] + value: '%stack_name%:kube-proxy' + - cf: eksCreationRole + tf: aws_iam_role.grandfathered_creation_role + - cf: '%cf_stack_key%kubernetessecretsenvelopekey' + tf: module.domino_eks.module.eks.aws_kms_key.eks_cluster + s3_stack: + - cf: backups + tf: module.domino_eks.module.storage.aws_s3_bucket.backups + - cf: blobs + tf: module.domino_eks.module.storage.aws_s3_bucket.blobs + - cf: logs + tf: module.domino_eks.module.storage.aws_s3_bucket.logs + - cf: registry + tf: module.domino_eks.module.storage.aws_s3_bucket.registry + vpc_stack: + - cf: VPC + tf: aws_vpc.cdk_vpc + - cf: VPCIGW + tf: aws_internet_gateway.cdk_vpc + - cf_igw_attachment: + igw: VPCIGW + vpc: VPC + tf: aws_internet_gateway_attachment.cdk_vpc diff --git a/convert/data/route53_resources.yaml b/convert/data/route53_resources.yaml new file mode 100644 index 00000000..a9f5b43b --- /dev/null +++ b/convert/data/route53_resources.yaml @@ -0,0 +1,6 @@ +name: route53 +type: optional +resources: + eks_stack: + - cf: route53 + tf: module.domino_eks.aws_iam_policy.route53[0] diff --git a/convert/data/unmanaged_nodegroup.yaml b/convert/data/unmanaged_nodegroup.yaml new file mode 100644 index 00000000..82d08b1f --- /dev/null +++ b/convert/data/unmanaged_nodegroup.yaml @@ -0,0 +1,21 @@ +name: unmanaged_nodegroup +type: optional +resources: + eks_stack: + - cf_sgr: + rule: _egress_tcp_443_443_ + rule_sg: UnmanagedSG + sg: EKSSG + tf: module.domino_eks.module.eks.aws_security_group_rule.eks_cluster["egress_nodes_443"] + - cf_sgr: + rule: _ingress_tcp_443_443_ + rule_sg: UnmanagedSG + sg: EKSSG + tf: module.domino_eks.module.eks.aws_security_group_rule.eks_cluster["ingress_nodes_443"] + - cf: UnmanagedSG + tf: module.domino_eks.module.eks.aws_security_group.eks_nodes + - cf_sgr: + rule: _ingress_tcp_443_443_ + rule_sg: EKSSG + sg: UnmanagedSG + tf: module.domino_eks.module.eks.aws_security_group_rule.node["ingress_cluster_443"] diff --git a/convert/lib/convert.py b/convert/lib/convert.py index 97c50330..254aacbb 100755 --- a/convert/lib/convert.py +++ b/convert/lib/convert.py @@ -3,24 +3,27 @@ import json import re from copy import deepcopy -from functools import cached_property, reduce +from functools import cached_property 
+from os import listdir +from os.path import abspath, join from pprint import pprint from subprocess import run +from textwrap import dedent from time import sleep import boto3 import yaml -from .meta import ( - cdk_ids, - cf_status, - efs_backup_resources, - resource_template, - route53_resource, - stack_map, -) +from .meta import cdk_ids, cf_status, stack_map from .nuke import nuke +resources = {} + +for filename in listdir("data"): + with open(join("data", filename)) as f: + r = yaml.safe_load(f.read()) + resources[r.pop("name")] = r + clean_categories = [x.name for x in cdk_ids if x.name != "cloudformation_stack"] @@ -41,7 +44,13 @@ def get_stacks(self, stack: str = None, full: bool = False): if r["ResourceType"] == cdk_ids.cloudformation_stack.value: for mapped_logical_id, name in stack_map.items(): if logical_id.startswith(mapped_logical_id): - stacks[name] = self.get_stacks(physical_id, full) + try: + stacks[name] = self.get_stacks(physical_id, full) + except self.cf.exceptions.ClientError as e: + if "does not exist" in e.response["Error"]["Message"]: + stacks[name] = None + else: + raise break else: raise Exception(f"Nothing to map stack {r} to!") @@ -53,6 +62,7 @@ def get_stacks(self, stack: str = None, full: bool = False): def setup(self, full: bool = False, no_stacks: bool = False): self.region = self.args.region self.stack_name = self.args.stack_name + self.cf_stack_key = re.sub(r"\W", "", self.stack_name) self.cf = boto3.client("cloudformation", self.region) @@ -112,6 +122,21 @@ def parse_args(self): resource_map_parser.add_argument( "--route53", help="Whether or not to import route53 zones", default=False, action="store_true" ) + resource_map_parser.add_argument( + "--bastion", help="Whether or not to import bastion security group", default=False, action="store_true" + ) + resource_map_parser.add_argument( + "--monitoring", help="Whether or not to import monitoring bucket", default=False, action="store_true" + ) + resource_map_parser.add_argument( + "--unmanaged-nodegroups", + help="Whether or not unmanaged nodegroups are in use", + default=False, + action="store_true", + ) + resource_map_parser.add_argument( + "--flow-logging", help="Whether or not flow logging is configured", default=False, action="store_true" + ) resource_map_parser.add_argument( "--efs-backups", help="Whether or not to import efs backup vault", @@ -198,58 +223,68 @@ def print_stack(self): else: pprint(out) - def generate_resource_map(self, availability_zones: int, efs_backups: bool, route53: bool) -> dict: - template = deepcopy(resource_template) + def generate_resource_map( + self, + availability_zones: int, + efs_backups: bool, + route53: bool, + bastion: bool, + monitoring: bool, + unmanaged_nodegroups: bool, + flow_logging: bool, + ) -> dict: + template = resources["resource_template"]["resources"] + + def nested_az_replace(d: dict, count: int): + for k, v in d.items(): + if isinstance(v, str): + d[k] = re.sub("%az_count%", str(count), d[k]) + d[k] = re.sub("%az_count_plus%", str(count + 1), d[k]) + elif isinstance(v, list): + for i, entry in enumerate(v): + if isinstance(entry, dict): + v[i] = nested_az_replace(entry, count) + else: + raise Exception(f"Unexpected resource map entry {k}: {v}") + elif isinstance(v, dict): + d[k] = nested_az_replace(v, count) + else: + raise Exception(f"Unexpected resource map entry {k}: {v}") + return d + + optional_resources = [] for count in range(availability_zones): - template["efs_stack"].append( - { - "cf": f"EfsEfsMountTarget{count+1}", - "tf": 
f"module.domino_eks.module.storage.aws_efs_mount_target.eks[{count}]", - } - ) - for s_type in ["Public", "Private", "Pod"]: - vpc_prefix = "" if s_type == "Pod" else "VPC" - prefix = f"{vpc_prefix}%stack_name%{s_type}Subnet{count+1}" - template["vpc_stack"].extend( - [ - { - "cf": f"{prefix}Subnet", - "tf": f"aws_subnet.{s_type.lower()}[{count}]", - }, - { - "cf": f"{prefix}RouteTable", - "tf": f"aws_route_table.{s_type.lower()}[{count}]", - }, - { - "cf_rtassoc": { - "subnet": f"{prefix}Subnet", - "route_table": f"{prefix}RouteTable", - }, - "tf": f"aws_route_table_association.{s_type.lower()}[{count}]", - }, - ] - ) - template["vpc_stack"].extend( - [ - { - "cf": f"VPC%stack_name%PublicSubnet{count+1}EIP", - "tf": f"aws_eip.nat_gateway[{count}]", - }, - {"cf": f"VPC%stack_name%PublicSubnet{count+1}NATGateway", "tf": f"aws_nat_gateway.public[{count}]"}, - ] - ) + az_template = nested_az_replace(deepcopy(resources["per_az"]), count) + optional_resources.append(az_template) if efs_backups: - template["efs_stack"].extend(efs_backup_resources) - + optional_resources.append(resources["efs_backup"]) if route53: - template["eks_stack"].append(route53_resource) + optional_resources.append(resources["route53"]) + if flow_logging: + optional_resources.append(resources["flow_logging"]) + if monitoring: + optional_resources.append(resources["monitoring_bucket"]) + if bastion: + optional_resources.append(resources["bastion"]) + if unmanaged_nodegroups: + optional_resources.append(resources["unmanaged_nodegroup"]) + + for resource in optional_resources: + for key in resource["resources"].keys(): + template[key].extend(resource["resources"][key]) return template def resource_map(self): resource_map = self.generate_resource_map( - self.args.availability_zones, self.args.efs_backups, self.args.route53 + self.args.availability_zones, + self.args.efs_backups, + self.args.route53, + self.args.bastion, + self.args.monitoring, + self.args.unmanaged_nodegroups, + self.args.flow_logging, ) print(yaml.safe_dump(resource_map)) @@ -264,10 +299,15 @@ def get_imports(self): availability_zones=self.args.availability_zones or self.cdkconfig["vpc"]["max_azs"], efs_backups=self.cdkconfig["efs"]["backup"]["enable"], route53=bool(self.cdkconfig["route53"]["zone_ids"]), + bastion=self.cdkconfig["vpc"]["bastion"]["enabled"], + monitoring=self.cdkconfig["s3"]["buckets"].get("monitoring"), + unmanaged_nodegroups=self.cdkconfig["eks"]["unmanaged_nodegroups"], + flow_logging=self.cdkconfig["vpc"]["flow_logging"], ) def t(val: str) -> str: val = re.sub(r"%stack_name%", self.stack_name, val) + val = re.sub(r"%cf_stack_key%", self.cf_stack_key, val) return val imports = [] @@ -298,9 +338,27 @@ def t(val: str) -> str: resource_id = f"{plan_id}|{selection_id}" else: resource_id = resources[t(item["cf"])] - imports.append(f"terraform import '{tf_import_path}' '{resource_id}'") + imports.append(f"tf_import '{tf_import_path}' '{resource_id}'") - print("#!/bin/bash\nset -ex") + eks = boto3.client("eks", self.region) + eks_cluster_result = eks.describe_cluster(name=self.cdkconfig["name"]) + eks_cluster_auto_sg = eks_cluster_result["cluster"]["resourcesVpcConfig"]["clusterSecurityGroupId"] + import_path = "aws_security_group.eks_cluster_auto" + imports.append(f"tf_import '{import_path}' '{eks_cluster_auto_sg}'") + + print( + dedent( + """\ + #!/bin/bash + set -ex + + tf_import() { + terraform import "$1" "$2" + terraform state show "$1" || (echo "$1 not in terraform state, import may have failed" && exit 1) + } + """ + ) + ) 
print("\n".join(imports)) def create_tfvars(self): @@ -310,12 +368,16 @@ def get_subnet_ids(subnet_type: str, prefix: str = "VPC"): return [ v for k, v in self.stacks["vpc_stack"]["resources"].items() - if re.match(f"{prefix}{self.stack_name}{subnet_type}Subnet\\d+Subnet", k) + if re.match(f"{prefix}{self.cf_stack_key}{subnet_type}Subnet\\d+Subnet", k) ] - ng_role_name = self.stacks["eks_stack"]["resources"][f"{self.stack_name}NG"] - client = boto3.client("iam", self.region) - ng_role_arn = client.get_role(RoleName=ng_role_name)["Role"]["Arn"] + ec2 = boto3.client("ec2", self.region) + eks = boto3.client("eks", self.region) + iam = boto3.client("iam", self.region) + r53 = boto3.client("route53", self.region) + + ng_role_name = self.stacks["eks_stack"]["resources"][f"{self.cf_stack_key}NG"] + ng_role_arn = iam.get_role(RoleName=ng_role_name)["Role"]["Arn"] eks_custom_role_maps = [ { "rolearn": ng_role_arn, @@ -327,6 +389,22 @@ def get_subnet_ids(subnet_type: str, prefix: str = "VPC"): ], } ] + # CDK force destroy is individually configurable + # If any of them at all are not set to force destroy, turn the feature off + s3_force_destroy = not [ + b for b in self.cdkconfig["s3"]["buckets"].values() if b and not b["auto_delete_objects"] + ] + + eks_cluster_result = eks.describe_cluster(name=self.cdkconfig["name"]) + eks_k8s_version = eks_cluster_result["cluster"]["version"] + eks_cluster_auto_sg = eks_cluster_result["cluster"]["resourcesVpcConfig"]["clusterSecurityGroupId"] + + route53_hosted_zone_name = "" + if r53_zone_ids := self.cdkconfig["route53"]["zone_ids"]: + route53_hosted_zone_name = r53.get_hosted_zone(Id=r53_zone_ids[0])["HostedZone"]["Name"] + + subnet_result = ec2.describe_subnets(SubnetIds=get_subnet_ids("Private")) + az_zone_ids = [s["AvailabilityZoneId"] for s in subnet_result["Subnets"]] tfvars = { "deploy_id": self.cdkconfig["name"], @@ -338,24 +416,50 @@ def get_subnet_ids(subnet_type: str, prefix: str = "VPC"): else self.stacks["vpc_stack"]["resources"]["VPC"], "public_subnet_ids": get_subnet_ids("Public"), "private_subnet_ids": get_subnet_ids("Private"), + "default_node_groups": { + "platform": { + "availability_zone_ids": az_zone_ids, + }, + "compute": { + "availability_zone_ids": az_zone_ids, + }, + "gpu": { + "availability_zone_ids": az_zone_ids, + }, + }, "pod_subnet_ids": get_subnet_ids("Pod", ""), - "k8s_version": self.cdkconfig["eks"]["version"], # We're trusting this is accurate - "ssh_key_path": self.args.ssh_key_path, + "k8s_version": eks_k8s_version, + "ssh_key_path": abspath(self.args.ssh_key_path), "number_of_azs": self.cdkconfig["vpc"]["max_azs"], - "route53_hosted_zone_name": reduce( - lambda cfg, k: cfg[k] if k in cfg else [""], - ["install", "overrides", "external_dns", "domain_filters"], - self.cdkconfig, - )[0], + "route53_hosted_zone_name": route53_hosted_zone_name, "efs_backups": self.cdkconfig["efs"]["backup"]["enable"], "efs_backup_schedule": self.cdkconfig["efs"]["backup"]["schedule"], "efs_backup_cold_storage_after": self.cdkconfig["efs"]["backup"]["move_to_cold_storage_after"], "efs_backup_delete_after": self.cdkconfig["efs"]["backup"]["delete_after"], "efs_backup_force_destroy": self.cdkconfig["efs"]["backup"]["removal_policy"] == "DESTROY", "eks_custom_role_maps": eks_custom_role_maps, + "s3_force_destroy_on_deletion": s3_force_destroy, + "flow_logging": self.cdkconfig["vpc"]["flow_logging"], + "eks_cluster_auto_sg": eks_cluster_auto_sg, } - print(json.dumps(tfvars)) + print(json.dumps(tfvars, indent=4)) + + notes = "" + if not 
self.cdkconfig["eks"]["private_api"]: + notes += "\n* Your CDK EKS is configured for public API access.\n Your cluster's setting will be changed to *PRIVATE*, as the terraform module does not support public EKS endpoints." + + if len(r53_zone_ids) > 1: + notes += f"\n* You have multiple hosted zones, only the first ({r53_zone_ids[0]} [{route53_hosted_zone_name}]) will be used." + + notes += ( + "\n* Nodegroup settings do not carry over. Please examine tfvars if you want to make any customizations." + ) + + from sys import stderr + + if notes: + print(f"*** IMPORTANT ***: {notes}", file=stderr) def clean_stack(self): self.setup(full=True, no_stacks=self.args.resource_file) @@ -387,26 +491,27 @@ def clean_stack(self): "(Handler|KubectlLayer|ProviderframeworkonEvent).*": lambda_safe, }, "eks_stack": { - f"(snapshot|{self.stack_name}ebscsi|{self.stack_name}DominoEcrRestricted|autoscaler)": [ + f"(snapshot|{self.cf_stack_key}ebscsi|{self.cf_stack_key}DominoEcrRestricted|autoscaler)": [ cdk_ids.iam_policy.value ], - f"(eksMastersRole|{self.stack_name}NG)$": [cdk_ids.iam_role.value], + f"(eksMastersRole|{self.cf_stack_key}NG)$": [cdk_ids.iam_role.value], "eksKubectlReadyBarrier": [cdk_ids.ssm_parameter.value], "(clusterpost(creation|deletion)tasks|LogRetention)": lambda_safe, "Unmanaged": [cdk_ids.instance_profile.value, cdk_ids.asg.value, cdk_ids.launch_template.value], + "eksNodegroup": [cdk_ids.eks_nodegroup.value], }, "s3_stack": { "CustomS3AutoDeleteObjectsCustomResourceProvider": lambda_safe, }, "vpc_stack": { "endpointssg": [cdk_ids.security_group.value], + "(.*ENDPOINT|VPCS3)": [cdk_ids.endpoint.value], "bastion": [ cdk_ids.instance.value, cdk_ids.instance_profile.value, cdk_ids.iam_role.value, cdk_ids.eip.value, ], - "VPCrejectFlowLogsFlowLog": [cdk_ids.flowlog.value], # TODO: should this be in the module? "(LogRetention|AWS)": lambda_safe, }, "core_stack": { @@ -438,17 +543,27 @@ def get_nukes(stack_name, stack_resources): # but the eks cluster security group isn't gettable from cloudformation... 
ec2 = boto3.client("ec2", self.region) eks = boto3.client("eks", self.region) - eks_cluster_sg = eks.describe_cluster(name=self.stack_name)["cluster"]["resourcesVpcConfig"][ - "clusterSecurityGroupId" - ] - unmanaged_sg = self.stacks["eks_stack"]["resources"]["UnmanagedSG"]["PhysicalResourceId"] + + empty_sg_rules = {"egress": [], "ingress": []} + try: + eks_cluster_sg = { + eks.describe_cluster(name=self.stack_name)["cluster"]["resourcesVpcConfig"][ + "clusterSecurityGroupId" + ]: empty_sg_rules + } + except eks.exceptions.ResourceNotFoundException: + eks_cluster_sg = {} + + unmanaged_sg = self.stacks["eks_stack"]["resources"].get("UnmanagedSG") + eks_sg = self.stacks["eks_stack"]["resources"]["EKSSG"]["PhysicalResourceId"] rule_ids_to_nuke = { - eks_cluster_sg: {"egress": [], "ingress": []}, - unmanaged_sg: {"egress": [], "ingress": []}, + **eks_cluster_sg, eks_sg: {"egress": [], "ingress": []}, } + if unmanaged_sg: + rule_ids_to_nuke[unmanaged_sg["PhysicalResourceId"]] = {"egress": [], "ingress": []} for group in rule_ids_to_nuke.keys(): rules = [ @@ -456,7 +571,7 @@ def get_nukes(stack_name, stack_resources): for r in ec2.describe_security_group_rules(Filters=[{"Name": "group-id", "Values": [group]}])[ "SecurityGroupRules" ] - if re.match(f"(from|to) {self.stack_name}", r.get("Description", "")) + if re.match(f"(from|to) {self.cf_stack_key}", r.get("Description", "")) ] rule_ids_to_nuke[group]["ingress"].extend([r["SecurityGroupRuleId"] for r in rules if not r["IsEgress"]]) rule_ids_to_nuke[group]["egress"].extend([r["SecurityGroupRuleId"] for r in rules if r["IsEgress"]]) @@ -505,8 +620,14 @@ def delete_stack(self): def get_stack_resources(s) -> dict: child_id = s["PhysicalResourceId"] child_name = re.search(r":stack/(.*)/", child_id).group(1) - if self.cf.describe_stacks(StackName=child_id)["Stacks"][0]["StackStatus"] == "DELETE_COMPLETE": - return + try: + if self.cf.describe_stacks(StackName=child_id)["Stacks"][0]["StackStatus"] == "DELETE_COMPLETE": + return + except self.cf.exceptions.ClientError as e: + if "does not exist" in e.response["Error"]["Message"]: + return + raise + child_resources = self.cf.describe_stack_resources(StackName=child_name)["StackResources"] stacks[child_name] = [ r["LogicalResourceId"] for r in child_resources if r["ResourceStatus"] != "DELETE_COMPLETE" diff --git a/convert/lib/meta.py b/convert/lib/meta.py index 872b42af..f10b473e 100755 --- a/convert/lib/meta.py +++ b/convert/lib/meta.py @@ -1,158 +1,6 @@ #!/usr/bin/env python3 from enum import Enum -resource_template = { - "efs_stack": [ - {"cf": "Efs", "tf": "module.domino_eks.module.storage.aws_efs_file_system.eks"}, - { - "cf": "Efsaccesspoint", - "tf": "module.domino_eks.module.storage.aws_efs_access_point.eks", - }, - ], - "eks_stack": [ - {"cf": "eks", "tf": "module.domino_eks.module.eks.aws_eks_cluster.this"}, - { - "cf": "EKSSG", - "tf": "module.domino_eks.module.eks.aws_security_group.eks_cluster", - }, - { - "cf_sgr": { - "sg": "EKSSG", - "rule": "_egress_tcp_443_443_", - "rule_sg": "UnmanagedSG", - }, - "tf": 'module.domino_eks.module.eks.aws_security_group_rule.eks_cluster["egress_nodes_443"]', - }, - { - "cf_sgr": { - "sg": "EKSSG", - "rule": "_ingress_tcp_443_443_", - "rule_sg": "UnmanagedSG", - }, - "tf": 'module.domino_eks.module.eks.aws_security_group_rule.eks_cluster["ingress_nodes_443"]', - }, - { - "cf": "UnmanagedSG", - "tf": "module.domino_eks.module.eks.aws_security_group.eks_nodes", - }, - { - "cf_sgr": { - "sg": "UnmanagedSG", - "rule": "_ingress_tcp_443_443_", - "rule_sg": 
"EKSSG", - }, - "tf": 'module.domino_eks.module.eks.aws_security_group_rule.node["ingress_cluster_443"]', - }, - { - "cf_sgr": { - "sg": "UnmanagedSG", - "rule": "_ingress_tcp_22_22_", - "rule_sg": "bastionsg", - "rule_sg_stack": "vpc_stack", - }, - "tf": 'module.domino_eks.module.eks.aws_security_group_rule.bastion_eks["eks_nodes_ssh_from_bastion"]', - }, - { - "cf": "eksRole", - "tf": "module.domino_eks.module.eks.aws_iam_role.eks_cluster", - }, - { - "tf": "module.domino_eks.module.eks.aws_cloudwatch_log_group.eks_cluster", - "value": "/aws/eks/%stack_name%/cluster", - }, - {"cf": "S3", "tf": "module.domino_eks.module.storage.aws_iam_policy.s3"}, - { - "tf": 'module.domino_eks.module.eks.aws_eks_addon.this["coredns"]', - "value": "%stack_name%:coredns", - }, - { - "tf": "module.domino_eks.module.eks.aws_eks_addon.vpc_cni", - "value": "%stack_name%:vpc-cni", - }, - { - "tf": 'module.domino_eks.module.eks.aws_eks_addon.this["kube-proxy"]', - "value": "%stack_name%:kube-proxy", - }, - { - "cf": "eksCreationRole", - "tf": "aws_iam_role.grandfathered_creation_role", - }, - { - "cf": "%stack_name%kubernetessecretsenvelopekey", - "tf": "module.domino_eks.module.eks.aws_kms_key.eks_cluster", - }, - ], - "s3_stack": [ - { - "cf": "backups", - "tf": "module.domino_eks.module.storage.aws_s3_bucket.backups", - }, - {"cf": "blobs", "tf": "module.domino_eks.module.storage.aws_s3_bucket.blobs"}, - {"cf": "logs", "tf": "module.domino_eks.module.storage.aws_s3_bucket.logs"}, - { - "cf": "registry", - "tf": "module.domino_eks.module.storage.aws_s3_bucket.registry", - }, - { - "cf": "monitoring", - "tf": "module.domino_eks.module.storage.aws_s3_bucket.monitoring", - }, - ], - "vpc_stack": [ - { - "cf": "bastionsg", - "tf": "module.domino_eks.module.bastion[0].aws_security_group.bastion", - }, - { - "cf_sgr": { - "sg": "bastionsg", - "rule": "_egress_all_0_0_0.0.0.0/0", - }, - "tf": "module.domino_eks.module.bastion[0].aws_security_group_rule.bastion_outbound", - }, - { - "cf_sgr": { - "sg": "bastionsg", - "rule": "_ingress_tcp_22_22_0.0.0.0/0", - }, - "tf": 'module.domino_eks.module.bastion[0].aws_security_group_rule.bastion["bastion_inbound_ssh"]', - }, - { - "cf": "VPC", - "tf": "aws_vpc.cdk_vpc", - }, - { - "cf": "VPCIGW", - "tf": "aws_internet_gateway.cdk_vpc", - }, - { - "cf_igw_attachment": { - "igw": "VPCIGW", - "vpc": "VPC", - }, - "tf": "aws_internet_gateway_attachment.cdk_vpc", - }, - ], -} - -efs_backup_resources = [ - { - "cf": "efsbackup", - "tf": "module.domino_eks.module.storage.aws_backup_vault.efs[0]", - }, - { - "cf": "efsbackupplan", - "tf": "module.domino_eks.module.storage.aws_backup_plan.efs[0]", - }, - { - "cf": "efsbackuprole", - "tf": "module.domino_eks.module.storage.aws_iam_role.efs_backup_role[0]", - }, - {"cf_backupselection": "efsbackupselection", "tf": "module.domino_eks.module.storage.aws_backup_selection.efs[0]"}, -] - -route53_resource = {"cf": "route53", "tf": "module.domino_eks.aws_iam_policy.route53[0]"} - stack_map = { "EfsStackNestedStackEfsStackNestedStackResource": "efs_stack", "EksStackNestedStackEksStackNestedStackResource": "eks_stack", @@ -193,15 +41,16 @@ class cdk_ids(Enum): asg = "AWS::AutoScaling::AutoScalingGroup" cloudformation_stack = "AWS::CloudFormation::Stack" eip = "AWS::EC2::EIP" - flowlog = "AWS::EC2::FlowLog" - instance = "AWS::EC2::Instance" - launch_template = "AWS::EC2::LaunchTemplate" - security_group = "AWS::EC2::SecurityGroup" - instance_profile = "AWS::IAM::InstanceProfile" + eks_nodegroup = "AWS::EKS::Nodegroup" + endpoint = 
"AWS::EC2::VPCEndpoint" iam_policy = "AWS::IAM::ManagedPolicy" iam_role = "AWS::IAM::Role" + instance = "AWS::EC2::Instance" + instance_profile = "AWS::IAM::InstanceProfile" lambda_function = "AWS::Lambda::Function" - stepfunctions_statemachine = "AWS::StepFunctions::StateMachine" lambda_layerversion = "AWS::Lambda::LayerVersion" - ssm_parameter = "AWS::SSM::Parameter" + launch_template = "AWS::EC2::LaunchTemplate" + security_group = "AWS::EC2::SecurityGroup" security_group_rule_ids = "security_group_rule_ids" # special + ssm_parameter = "AWS::SSM::Parameter" + stepfunctions_statemachine = "AWS::StepFunctions::StateMachine" diff --git a/convert/lib/nuke.py b/convert/lib/nuke.py index be6996f9..a4115dd3 100644 --- a/convert/lib/nuke.py +++ b/convert/lib/nuke.py @@ -2,8 +2,10 @@ import re from functools import cached_property from pprint import pprint +from time import sleep import boto3 +from retry import retry from .meta import cdk_ids @@ -22,6 +24,10 @@ def autoscaling(self): def ec2(self): return boto3.client("ec2", self.region) + @cached_property + def eks(self): + return boto3.client("eks", self.region) + @cached_property def iam(self): return boto3.client("iam", self.region) @@ -38,6 +44,28 @@ def ssm(self): def stepfunctions(self): return boto3.client("stepfunctions", self.region) + # TODO: Possibly need to nuke 0.0.0.0/0 SG rule on cluster SG when using eks nodegroup? + def eks_nodegroup(self, group_names: list[str]): + if not group_names: + return + + eks_ng_regex = r"([0-9A-Za-z][A-Za-z0-9\-_]+)\/([0-9A-Za-z][A-Za-z0-9\-_]+)" + + cluster_name = re.match(eks_ng_regex, group_names[0]).group(1) + group_names = [re.match(eks_ng_regex, g).group(2) for g in group_names] + + p = self.eks.get_paginator("list_nodegroups") + existing_groups = [ + ng for i in p.paginate(clusterName=cluster_name) for ng in i["nodegroups"] if ng in group_names + ] + + if existing_groups: + pprint(existing_groups) + + if self.delete: + for group in group_names: + print(self.eks.delete_nodegroup(clusterName=cluster_name, nodegroupName=group)) + def asg(self, group_names: list[str]): if not group_names: return @@ -54,8 +82,33 @@ def asg(self, group_names: list[str]): if self.delete: for group in existing_groups: + if ( + self.autoscaling.describe_auto_scaling_groups(AutoScalingGroupNames=[group])[ + "AutoScalingGroups" + ][0]["DesiredCapacity"] + != 0 + ): + print( + self.autoscaling.update_auto_scaling_group( + AutoScalingGroupName=group, DesiredCapacity=0, MinSize=0, MaxSize=0 + ) + ) + print(f"Auto scaling group {group} scaled to 0, must wait for scaling to finish to delete") + + @retry( + ( + self.autoscaling.exceptions.ResourceInUseFault, + self.autoscaling.exceptions.ScalingActivityInProgressFault, + ), + delay=5, + tries=60, + ) + def delete_asg(group: str): print(self.autoscaling.delete_auto_scaling_group(AutoScalingGroupName=group)) + for group in existing_groups: + delete_asg(group) + def eip(self, eip_addresses: list[str]): if not eip_addresses: return @@ -73,18 +126,6 @@ def eip(self, eip_addresses: list[str]): print(self.ec2.disassociate_address(AssociationId=association_id)) print(self.ec2.release_address(AllocationId=allocation_id)) - def flowlog(self, flow_logs: list[str]): - if not flow_logs: - return - result = self.ec2.describe_flow_logs(FlowLogIds=flow_logs) - existing_flow_logs = [i["FlowLogId"] for i in result["FlowLogs"]] - - if existing_flow_logs: - pprint({"Flow Log IDs to delete": existing_flow_logs}) - - if self.delete: - self.ec2.delete_flow_logs(FlowLogIds=existing_flow_logs) - def 
instance(self, instance_ids: list[str]): if not instance_ids: return @@ -268,6 +309,37 @@ def ssm_parameter(self, parameters: list[str]): for p in existing_parameters: self.ssm.delete_parameter(Name=p) + def endpoint(self, endpoints: list[str]): + if not endpoints: + return + + p = self.ec2.get_paginator("describe_vpc_endpoints") + existing_endpoints = [ + e["VpcEndpointId"] for i in p.paginate() for e in i["VpcEndpoints"] if e["VpcEndpointId"] in endpoints + ] + + if existing_endpoints: + pprint({"VPC Endpoints to delete": existing_endpoints}) + + if self.delete: + self.ec2.delete_vpc_endpoints(VpcEndpointIds=existing_endpoints) + print("Waiting for endpoints to finish deleting...") + while True: + sleep(5) + deleted_endpoints = 0 + for e in existing_endpoints: + try: + r = self.ec2.describe_vpc_endpoints(VpcEndpointIds=[e]) + if (state := r["VpcEndpoints"][0]["State"]) and state in ["available", "pending"]: + raise Exception( + f"VPC Endpoint {e} in unexpected state {state} after delete_endpoint call" + ) + except self.ec2.exceptions.ClientError as e: + if e.response["Error"]["Code"] == "InvalidVpcEndpointId.NotFound": + deleted_endpoints += 1 + if deleted_endpoints == len(existing_endpoints): + break + def security_group_rule_ids(self, rulemap: dict[str, list[str]]): if not rulemap: return @@ -276,10 +348,15 @@ def security_group_rule_ids(self, rulemap: dict[str, list[str]]): if self.delete: for group_id, rules in rulemap.items(): - if rules["egress"]: - self.ec2.revoke_security_group_egress(GroupId=group_id, SecurityGroupRuleIds=rules["egress"]) - if rules["ingress"]: - self.ec2.revoke_security_group_ingress(GroupId=group_id, SecurityGroupRuleIds=rules["ingress"]) + try: + if rules["egress"]: + self.ec2.revoke_security_group_egress(GroupId=group_id, SecurityGroupRuleIds=rules["egress"]) + if rules["ingress"]: + self.ec2.revoke_security_group_ingress(GroupId=group_id, SecurityGroupRuleIds=rules["ingress"]) + except self.ec2.exceptions.ClientError as e: + if e.response["Error"]["Code"] != "InvalidSecurityGroupRuleId.NotFound": + raise + print(e) def nuke(self, nuke_queue: dict[str, list[str]], remove_security_group_references: bool = False): all_referenced_groups = {} @@ -316,10 +393,11 @@ def nuke(self, nuke_queue: dict[str, list[str]], remove_security_group_reference local_queue = [] order = [ + cdk_ids.endpoint, + cdk_ids.eks_nodegroup, cdk_ids.asg, cdk_ids.instance, cdk_ids.eip, - cdk_ids.flowlog, cdk_ids.launch_template, cdk_ids.security_group, cdk_ids.stepfunctions_statemachine, diff --git a/convert/requirements.txt b/convert/requirements.txt index f0942ba2..3f4a68d1 100644 --- a/convert/requirements.txt +++ b/convert/requirements.txt @@ -1,2 +1,3 @@ boto3~=1.26.22 PyYAML~=6.0 +retry~=0.9.2 diff --git a/convert/terraform/main.tf b/convert/terraform/main.tf index cea233c9..6f577f9a 100755 --- a/convert/terraform/main.tf +++ b/convert/terraform/main.tf @@ -24,14 +24,14 @@ resource "aws_iam_role" "grandfathered_creation_role" { } module "domino_eks" { - source = "github.com/dominodatalab/terraform-aws-eks.git?ref=v1.2.2" + source = "github.com/dominodatalab/terraform-aws-eks.git?ref=v1.3.0" deploy_id = var.deploy_id region = var.region - number_of_azs = var.number_of_azs + default_node_groups = var.default_node_groups k8s_version = var.k8s_version route53_hosted_zone_name = var.route53_hosted_zone_name eks_master_role_names = var.eks_master_role_names - s3_force_destroy_on_deletion = true + s3_force_destroy_on_deletion = var.s3_force_destroy_on_deletion bastion = {} ssh_pvt_key_path = 
var.ssh_key_path
   tags                         = var.tags
diff --git a/convert/terraform/variables.tf b/convert/terraform/variables.tf
index 6401a171..42a2d0f0 100644
--- a/convert/terraform/variables.tf
+++ b/convert/terraform/variables.tf
@@ -41,6 +41,11 @@ variable "pod_subnet_ids" {
   description = "Pre-existing private subnets ids used with deployment"
 }
 
+variable "flow_logging" {
+  type        = bool
+  description = "Enable flow logging"
+}
+
 variable "k8s_version" {
   type        = string
   description = "EKS cluster k8s version (should match existing)"
@@ -57,6 +62,17 @@ variable "number_of_azs" {
   default     = 3
 }
 
+variable "default_node_groups" {
+  type        = map
+  description = "Default node groups"
+}
+
+variable "additional_node_groups" {
+  type        = map
+  description = "Additional EKS managed nodegroups"
+  default     = {}
+}
+
 variable "route53_hosted_zone_name" {
   type        = string
   description = "Name of route53 hosted zone (optional, for internal use)"
@@ -103,3 +119,14 @@ variable "eks_custom_role_maps" {
   description = "blah"
   default     = []
 }
+
+variable "eks_cluster_auto_sg" {
+  type        = string
+  description = "Automatically generated security group with name in the form of eks-cluster-sg-"
+}
+
+variable "s3_force_destroy_on_deletion" {
+  description = "Toggle to allow recursive deletion of all objects in the S3 buckets. If 'false', terraform will NOT be able to delete non-empty buckets."
+  type        = bool
+  default     = false
+}
diff --git a/convert/terraform/vpc.tf b/convert/terraform/vpc.tf
index 4d614d47..8cf757f4 100644
--- a/convert/terraform/vpc.tf
+++ b/convert/terraform/vpc.tf
@@ -4,6 +4,15 @@ resource "aws_vpc" "cdk_vpc" {
   }
 }
 
+resource "aws_flow_log" "flowlog" {
+  count                    = var.flow_logging ? 1 : 0
+  log_destination          = module.domino_eks.s3_buckets["monitoring"].arn
+  vpc_id                   = aws_vpc.cdk_vpc.id
+  max_aggregation_interval = 600
+  log_destination_type     = "s3"
+  traffic_type             = "REJECT"
+}
+
 resource "aws_eip" "nat_gateway" {
   count = var.number_of_azs
 
@@ -134,3 +143,12 @@ resource "aws_route_table_association" "pod" {
   subnet_id      = aws_subnet.pod[count.index].id
   route_table_id = aws_route_table.pod[count.index].id
 }
+
+resource "aws_security_group" "eks_cluster_auto" {
+  name                   = "eks-cluster-sg-${var.deploy_id}"
+  revoke_rules_on_delete = true
+
+  lifecycle {
+    ignore_changes = [name, description, ingress, egress, tags, tags_all, vpc_id, timeouts]
+  }
+}
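
For the "Review and Configure Node Groups" step added to `convert/README.md` above, the sketch below shows roughly what the node-group portion of the generated tfvars looks like. This is illustrative only: `create-tfvars` writes `terraform/terraform.tfvars.json` (JSON, not the HCL shown here) and fills `availability_zone_ids` for the `platform`, `compute`, and `gpu` groups from the private subnets it discovers. The zone IDs below are placeholders, and any further per-nodegroup settings follow the terraform-aws-eks module's own schema, which is not part of this patch.

```hcl
# Illustrative sketch only: the zone IDs are placeholders, not values from a
# real deployment. create-tfvars derives availability_zone_ids from the
# discovered private subnets; other per-nodegroup options come from the
# terraform-aws-eks module's documentation and are deliberately omitted here.
default_node_groups = {
  platform = { availability_zone_ids = ["usw2-az1", "usw2-az2", "usw2-az3"] }
  compute  = { availability_zone_ids = ["usw2-az1", "usw2-az2", "usw2-az3"] }
  gpu      = { availability_zone_ids = ["usw2-az1", "usw2-az2", "usw2-az3"] }
}
```

In practice, any customization would be made by editing the generated `terraform.tfvars.json` (same structure, JSON syntax) before running `terraform plan`, as described in the README section above.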