Skip to content

Commit

Permalink
PLAT-5668: TF convertor integration test / misc. follow-up (#131)
Browse files Browse the repository at this point in the history
* Automatically destroy ASG instances before ASGs
* Deal with optionality of bastion
* Same with monitoring bucket
* Stack name is sanitized to `\w` or something similar, deal with that
* Public API setting unsupported, notify user
* Don't hardcode S3 force destroy variable
* Get route53 zone name from api; agent config going away
* Maybe get EKS version from api?
* Support flowlog import into eks module if/when added to that
* Determine if we should do something more clever for the ASG node draining
* Optionality between managed/unmanaged nodegroups, efs backups, comb for other things
* Track the upstream AZ/Subnet changes and change input variable accordingly (if relevant)
* (barely) Manage the default auto-generated EKS SG because its destruction is unreliable
* Integration test
  • Loading branch information
Secretions authored Mar 2, 2023
1 parent d33ebd1 commit c92d6e2
Show file tree
Hide file tree
Showing 20 changed files with 693 additions and 390 deletions.
177 changes: 177 additions & 0 deletions .github/workflows/test.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,177 @@
# This workflow will install Python dependencies, run tests and lint with a single version of Python
# For more information see: https://help.github.com/actions/language-and-framework-guides/using-python-with-github-actions

name: Run tests / create artifacts

on:
push:
branches: [ master ]
pull_request:
types: [opened, synchronize, reopened, labeled, ready_for_review]
branches: [ master ]

jobs:
test-build-deploy:
runs-on: ubuntu-latest
env:
AWS_ACCESS_KEY_ID: ${{ secrets.DELTA_KEY_ID }}
AWS_SECRET_ACCESS_KEY: ${{ secrets.DELTA_ACCESS_KEY }}
AWS_REGION: us-west-2
defaults:
run:
working-directory: ./cdk
steps:
- uses: actions/checkout@v2
- name: Set up Python 3.9
uses: actions/setup-python@v2
with:
python-version: 3.9
- name: Setup nodejs
uses: actions/setup-node@v2
with:
node-version: 16
- name: Install dependencies
run: |
pip install -r requirements.txt
pip install awscli==1.25.57 build
- name: Install aws-cdk
run: npm install -g aws-cdk@$(pip freeze | grep aws-cdk.core | sed -e 's/.*==//')
- name: Lint with flake8/black/isort
run: |
set -x
export FILES=(*.py domino_cdk tests ../convert/)
for f in ${FILES[@]}; do
# stop the build if there are Python syntax errors or undefined names
flake8 $f --count --select=E9,F63,F7,F82 --show-source --statistics
# exit-zero treats all errors as warnings.
flake8 $f --count --ignore=E501,W503 --exit-zero --statistics
black $f --check
isort $f --check
done
- name: Test with pytest
run: |
coverage run -m pytest tests
- name: Coverage report
run: |
coverage report
- name: Create/lint default config
env:
AWS_ACCOUNT_ID: ${{ secrets.DELTA_ACCOUNT_ID }}
GITHUB_SHA: ${{ github.sha }}
REGISTRY_USERNAME: ${{ secrets.REGISTRY_USERNAME }}
REGISTRY_PASSWORD: ${{ secrets.REGISTRY_PASSWORD }}
ACM_CERT_ARN: ${{ secrets.DELTA_ACM_CERT_ARN }}
BASE_DOMAIN: ${{ secrets.DELTA_BASE_DOMAIN }}
run: |
export NAME=cdk-${GITHUB_SHA:0:6}-$(date +%s)
echo "NAME=$NAME" >> $GITHUB_ENV
./util.py generate_config_template --name $NAME --aws-region=$AWS_REGION --aws-account-id=$AWS_ACCOUNT_ID --dev --platform-nodegroups 2 --registry-username $REGISTRY_USERNAME --registry-password $REGISTRY_PASSWORD --hostname $NAME.$BASE_DOMAIN --acm-cert-arn $ACM_CERT_ARN --disable-flow-logs > config.yaml
./util.py load_config -f ./config.yaml
- name: Test default config (single and nested stacks)
run: |
cdk synth --context singlestack=true -q
cdk synth -q
- name: Upload distribution artifacts
env:
DOMINO_CDK_VERSION: "0.0.0+${{ github.sha }}"
DATEDIR: "date +%Y%m%d"
run: |
cd ..
make clean && make dist
for suffix in "" "-terraform"; do
filename="domino-cdk$suffix-$DOMINO_CDK_VERSION.tar.gz"
aws s3 cp --acl=public-read ./dist/$filename s3://domino-artifacts/cdk/$($DATEDIR)/$filename
urlfile=$(python -c 'import sys, urllib.parse; print(urllib.parse.quote(sys.stdin.read().strip()))' <<< "$filename")
echo "Artifact url: https://domino-artifacts.s3.amazonaws.com/cdk/$($DATEDIR)/$urlfile"
done
- name: Deploy CDK
if: contains(github.event.pull_request.labels.*.name, 'deploy-test') || github.ref == 'refs/heads/master'
env:
REGISTRY_USERNAME: ${{ secrets.REGISTRY_USERNAME }}
REGISTRY_PASSWORD: ${{ secrets.REGISTRY_PASSWORD }}
run: |
docker login -u $REGISTRY_USERNAME -p $REGISTRY_PASSWORD quay.io
cdk deploy --require-approval never --outputs-file outputs.json
$(jq -r ".[].ekskubeconfigcmd" outputs.json) --kubeconfig ./kubeconfig
- name: Collect diagnostic data
if: always() && (contains(github.event.pull_request.labels.*.name, 'deploy-test') || github.ref == 'refs/heads/master')
env:
KUBECONFIG: ./kubeconfig
LOG_DIR: /tmp/k8s-cluster-state/cdk
run: |
set +e
curl -Lo /usr/local/bin/kubectl "https://dl.k8s.io/release/v1.23.6/bin/linux/amd64/kubectl"
for ns in domino-platform domino-compute domino-system kube-system; do
mkdir -p $LOG_DIR/$ns
kubectl -n $ns get ing -o yaml > $LOG_DIR/$ns/ingress.txt
kubectl -n $ns get po -o yaml > $LOG_DIR/$ns/pods.txt
kubectl -n $ns describe po > $LOG_DIR/$ns/pods-described.txt
kubectl -n $ns get pvc -o yaml > $LOG_DIR/$ns/pvcs.txt
kubectl -n $ns get svc -o yaml > $LOG_DIR/$ns/svcs.txt
kubectl -n $ns describe svc > $LOG_DIR/$ns/svcs-described.txt
kubectl -n $ns get events > $LOG_DIR/$ns/events.txt
done
kubectl get pv -o yaml > $LOG_DIR/pvs.txt
kubectl get no -o yaml > $LOG_DIR/nodes.txt
kubectl describe no > $LOG_DIR/nodes-described.txt
- name: Setup cloudformation-only user for safe stack deletion
if: (contains(github.event.pull_request.labels.*.name, 'deploy-test') || github.ref == 'refs/heads/master')
working-directory: ./convert/cloudformation-only
run: |
echo "{\"region\":\"us-west-2\",\"tags\":{},\"suffix\":\"$NAME\"}" > terraform.tfvars.json
terraform init
terraform plan -out=terraform.plan
terraform apply -auto-approve terraform.plan
- name: Setup terraform-aws-eks conversion process
if: (contains(github.event.pull_request.labels.*.name, 'deploy-test') || github.ref == 'refs/heads/master')
working-directory: ./convert
run: |
pip install -r requirements.txt
ssh-keygen -t rsa -f dummy.pem -N ''
./convert.py print-stack --stack-name $NAME --region $AWS_REGION --verbose --yaml > stack-data.yaml
./convert.py create-tfvars --stack-name $NAME --region $AWS_REGION --ssh-key-path ./dummy.pem > terraform/terraform.tfvars.json
./convert.py get-imports --stack-name $NAME --region $AWS_REGION > terraform/imports.sh
- name: Run terraform
if: (contains(github.event.pull_request.labels.*.name, 'deploy-test') || github.ref == 'refs/heads/master')
working-directory: ./convert/terraform
run: |
terraform init
bash imports.sh
terraform plan -out=terraform.plan
terraform show -json terraform.plan | jq '.resource_changes[] | select(.change.actions[]=="delete")' | tee terraform.plan.json
test -s terraform.plan.json && echo "Detected deletions, bailing..." && exit 1
terraform apply -auto-approve terraform.plan
- name: Clean and delete stack
if: (contains(github.event.pull_request.labels.*.name, 'deploy-test') || github.ref == 'refs/heads/master')
working-directory: ./convert
run: |
./convert.py clean-stack --stack-name $NAME --region $AWS_REGION --remove-security-group-references --delete
./convert.py delete-stack --stack-name $NAME --region $AWS_REGION --delete
- name: Upload diagnostic data
if: always() && (contains(github.event.pull_request.labels.*.name, 'deploy-test') || github.ref == 'refs/heads/master')
uses: actions/upload-artifact@v2
with:
name: Diagnostic Data
path: ./
retention-days: 14
- name: Delete stack w/CDK
if: always() && (contains(github.event.pull_request.labels.*.name, 'deploy-test') || github.ref == 'refs/heads/master')
working-directory: ./cdk
run: |
cdk destroy --force
- name: Destroy Infrastructure
if: always() && (contains(github.event.pull_request.labels.*.name, 'deploy-test') || github.ref == 'refs/heads/master')
working-directory: ./convert/terraform
run: |
terraform destroy -auto-approve
- name: Destroy Infrastructure
if: always() && (contains(github.event.pull_request.labels.*.name, 'deploy-test') || github.ref == 'refs/heads/master')
working-directory: ./convert/cloudformation-only
run: |
terraform destroy -auto-approve
- name: Fail without deploy
if: ${{ github.event.pull_request.draft == false && ! (contains(github.event.pull_request.labels.*.name, 'deploy-test') || contains(github.event.pull_request.labels.*.name, 'no-deploy-needed') || github.ref == 'refs/heads/master') }}
run: |
echo "Deploy tests required on non-draft PRs. Please add 'deploy-test' label".
exit 1
129 changes: 0 additions & 129 deletions .github/workflows/tests.yml

This file was deleted.

6 changes: 5 additions & 1 deletion cdk/domino_cdk/config/eks.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,14 +45,14 @@ class NodegroupBase:
instance_types: ["m5.2xlarge", "m5.4xlarge"] - Instance types available to nodegroup
labels: some-label: "true" - Labels to apply to all nodes in nodegroup
tags: some-tag: "true" - Tags to apply to all nodes in nodegroup
ssm_agent: true/false - Install SSM agent (ie for console access via aws web ui)
...
Managed nodegroup-specific options:
spot: true/false - Use spot instances, may affect reliability/availability of nodegroup
desired_size: 1 - Preferred size of nodegroup
...
Unmanaged nodegroup-specific options:
gpu: true/false - Setup GPU instance support
ssm_agent: true/false - Install SSM agent (ie for console access via aws web ui)
taints: some-taint: "true" - Taints to apply to all nodes in nodegroup
ie to taint gpu nodes, etc.)
"""
Expand Down Expand Up @@ -146,6 +146,10 @@ def check_ami_exceptions(ng_name: str, ami_id: str, user_data: str, incompatible
errors.append(
f"Error: {error_name} has min_size of 0. Only unmanaged nodegroups support min_size of 0."
)
if ng.min_size > ng.desired_size:
errors.append(
f"Error: {error_name} has a desired_size of {ng.desired_size}, which can't be less than the min_size (currently: {ng.min_size})."
)
for name, ng in self.unmanaged_nodegroups.items():
error_name = f"Unmanaged nodegroup [{name}]"
check_ami_exceptions(error_name, ng.ami_id, ng.user_data, ["ssm_agent", "labels", "taints", "disk_size"])
Expand Down
4 changes: 4 additions & 0 deletions convert/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -78,6 +78,10 @@ Change to the `terraform/` directory, and run the following two commands:

`imports.sh` will run *several* `terraform import` commands in a row. This is normal.

### Review and Configure Node Groups

This conversion process will create a tvfars file with the default nodegroups for the terraform-aws-eks module. Please review the instructions for that module, and configure the nodegroups as desired. Your old nodegroups will remain functional after running the conversion (until the "clean-stack" step).

### Evaluate the Terraform plan

Once everything has been imported, you can run `terraform plan` and evaluate what terraform will do:
Expand Down
4 changes: 2 additions & 2 deletions convert/cloudformation-only/main.tf
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
resource "aws_iam_policy" "cloudformation_only" {
name = "cloudformation-only"
name = "cloudformation-only${var.suffix}"

policy = jsonencode({
Version = "2012-10-17"
Expand All @@ -14,7 +14,7 @@ resource "aws_iam_policy" "cloudformation_only" {
}

resource "aws_iam_role" "cloudformation_only" {
name = "cloudformation-only"
name = "cloudformation-only${var.suffix}"
description = "Allows CloudFormation to create and manage AWS stacks and resources on your behalf, but nothing else"

assume_role_policy = jsonencode({
Expand Down
6 changes: 6 additions & 0 deletions convert/cloudformation-only/variables.tf
Original file line number Diff line number Diff line change
Expand Up @@ -7,3 +7,9 @@ variable "tags" {
type = map(string)
description = "Deployment tags."
}

variable "suffix" {
type = string
description = "Optional suffix for role/policy names"
default = ""
}
21 changes: 21 additions & 0 deletions convert/data/bastion_resources.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
name: bastion
type: optional
resources:
eks_stack:
- cf_sgr:
rule: _ingress_tcp_22_22_
rule_sg: bastionsg
rule_sg_stack: vpc_stack
sg: UnmanagedSG
tf: module.domino_eks.module.eks.aws_security_group_rule.bastion_eks["eks_nodes_ssh_from_bastion"]
vpc_stack:
- cf: bastionsg
tf: module.domino_eks.module.bastion[0].aws_security_group.bastion
- cf_sgr:
rule: _egress_all_0_0_0.0.0.0/0
sg: bastionsg
tf: module.domino_eks.module.bastion[0].aws_security_group_rule.bastion_outbound
- cf_sgr:
rule: _ingress_tcp_22_22_0.0.0.0/0
sg: bastionsg
tf: module.domino_eks.module.bastion[0].aws_security_group_rule.bastion["bastion_inbound_ssh"]
Loading

0 comments on commit c92d6e2

Please sign in to comment.