diff --git a/.github/dependabot.yml b/.github/dependabot.yml
index fd402df3e..aef7fdd52 100644
--- a/.github/dependabot.yml
+++ b/.github/dependabot.yml
@@ -1,33 +1,33 @@
 version: 2
 updates:
-- package-ecosystem: "github-actions"
-  directory: "/"
-  schedule:
-    interval: daily
-- package-ecosystem: "docker"
-  directory: "/.github/workflows"
-  schedule:
-    interval: daily
-- package-ecosystem: npm
-  directory: "/"
-  schedule:
-    interval: daily
-  open-pull-requests-limit: 10
-  groups:
-    aws-sdk:
-      patterns:
-      - "@aws-sdk/*"
-    aws-cdk:
-      patterns:
-      - "@aws-cdk/*"
-      - "aws-cdk"
-      - "aws-cdk-lib"
-      - "cdk8s"
-      - "cdk8s-cli"
-      - "cdk8s-plus-*"
-      - "constructs"
-  ignore:
-  - dependency-name: "@aws-sdk/*"
-    update-types: ["version-update:semver-patch"]
-  - dependency-name: "@types/node"
-    update-types: ["version-update:semver-patch"]
+  - package-ecosystem: 'github-actions'
+    directory: '/'
+    schedule:
+      interval: daily
+  - package-ecosystem: 'docker'
+    directory: '/.github/workflows'
+    schedule:
+      interval: daily
+  - package-ecosystem: npm
+    directory: '/'
+    schedule:
+      interval: daily
+    open-pull-requests-limit: 10
+    groups:
+      aws-sdk:
+        patterns:
+          - '@aws-sdk/*'
+      aws-cdk:
+        patterns:
+          - '@aws-cdk/*'
+          - 'aws-cdk'
+          - 'aws-cdk-lib'
+          - 'cdk8s'
+          - 'cdk8s-cli'
+          - 'cdk8s-plus-*'
+          - 'constructs'
+    ignore:
+      - dependency-name: '@aws-sdk/*'
+        update-types: ['version-update:semver-patch']
+      - dependency-name: '@types/node'
+        update-types: ['version-update:semver-patch']
diff --git a/README.md b/README.md
index 49fdfdd26..c0ab12387 100644
--- a/README.md
+++ b/README.md
@@ -31,7 +31,6 @@ To connect to the EKS cluster you need to be [logged into AWS](https://toitutewh
 
 Then to setup the cluster, only the first time using the cluster you need to run this
 
-
 ```bash
 aws --region=ap-southeast-2 eks update-kubeconfig --name=Workflows
 ```
diff --git a/docs/infrastructure/components/karpenter.md b/docs/infrastructure/components/karpenter.md
index f0f6647ce..bc93aaeed 100644
--- a/docs/infrastructure/components/karpenter.md
+++ b/docs/infrastructure/components/karpenter.md
@@ -1 +1 @@
-# Karpenter
\ No newline at end of file
+# Karpenter
diff --git a/docs/infrastructure/destroy.md b/docs/infrastructure/destroy.md
new file mode 100644
index 000000000..5f3099d15
--- /dev/null
+++ b/docs/infrastructure/destroy.md
@@ -0,0 +1,25 @@
+# How to destroy an installation
+
+Destroying the cluster and stack is not easy, because we use some custom EKS resources to link the two together. Based on a teardown at the time of writing, the following sequence should work:
+
+1. Delete the cluster:
+
+   ```bash
+   aws eks delete-cluster --name=Workflows
+   aws eks wait cluster-deleted --name=Workflows
+   ```
+
+1. Attempt to delete the stack:
+
+   ```bash
+   aws cloudformation delete-stack --stack-name=Workflows
+   aws cloudformation wait stack-delete-complete --stack-name=Workflows
+   ```
+
+1. Wait for the above to fail.
+1. Go to the [stack in AWS console](https://ap-southeast-2.console.aws.amazon.com/cloudformation/home?region=ap-southeast-2#/stacks/?filteringText=Workflows&filteringStatus=active&viewNested=true).
+1. Delete the stack, retaining all the resources which could not be deleted.
+
+We don't use the CLI for the last step because the logical IDs of the resources which could not be deleted do not seem to match the ones which need to be retained. Why this happens is unclear, so for now deleting via the console is safer.
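+
+For reference, the CLI version of the retention step would look something like the sketch below; the logical IDs are placeholders, to be replaced with the IDs reported in the failed delete's stack events.
+
+```bash
+# Sketch only: --retain-resources applies to a stack whose delete has already failed.
+# Replace these placeholder logical IDs with the ones shown in the stack events.
+aws cloudformation delete-stack --stack-name=Workflows --retain-resources EksClusterCreationRole EksCustomResourceProvider
+```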
+ +[How do I troubleshoot custom resource failures in AWS CloudFormation?](https://repost.aws/knowledge-center/cfn-troubleshoot-custom-resource-failures) might be relevant for future issues like this. diff --git a/docs/infrastructure/helm.md b/docs/infrastructure/helm.md index 11c0dae50..a7d1ce1b4 100644 --- a/docs/infrastructure/helm.md +++ b/docs/infrastructure/helm.md @@ -12,4 +12,4 @@ However, some of the component Helm charts do not have a `values.schema.json`. A - [aws-for-fluent-bit](./components/fluentbit.md) () - [Karpenter](./components/karpenter.md) -- [Argo workflows](./components/argo.workflows.md) \ No newline at end of file +- [Argo workflows](./components/argo.workflows.md) diff --git a/docs/infrastructure/initial.deployment.md b/docs/infrastructure/initial.deployment.md index 33fec4a2d..6d13a61bb 100644 --- a/docs/infrastructure/initial.deployment.md +++ b/docs/infrastructure/initial.deployment.md @@ -12,5 +12,6 @@ The first time a cluster is deployed Custom Resource Definitions (CRD) will not This means that any resources that require a CRD will fail to deploy with a error similar to > resource mapping not found for name: "karpenter-template" namespace: "" from "dist/0003-karpenter-provisioner.k8s.yaml": no matches for kind "AWSNodeTemplate" in version "karpenter.k8s.aws/v1alpha1" +> ensure CRDs are installed first -To work around this problem the first deployment can be repeated, as the CRDs are deployed early in the deployment process. +To work around this problem, re-run the `kubectl apply` command. diff --git a/docs/infrastructure/kubernetes.version.md b/docs/infrastructure/kubernetes.version.md index 3b5c61e72..fc350dda0 100644 --- a/docs/infrastructure/kubernetes.version.md +++ b/docs/infrastructure/kubernetes.version.md @@ -17,6 +17,7 @@ If there is a version matching to the Kubernetes version to upgrade to, upgrade ```bash npm install --save-dev cdk8s-plus-27 ``` + 2. Remove the previous version ```bash @@ -34,12 +35,13 @@ Below is an example of upgrading from v1.27 to v1.28 ```bash npm install --save-dev @aws-cdk/lambda-layer-kubectl-v28 ``` - + While also removing the old lambda-layer version - + ```bash npm rm @aws-cdk/lambda-layer-kubectl-v27 ``` + 2. Set the new Kubernetes version in `LinzEksCluster` ```typescript @@ -50,9 +52,9 @@ Below is an example of upgrading from v1.27 to v1.28 ```typescript import { KubectlV28Layer } from '@aws-cdk/lambda-layer-kubectl-v28'; - + // ... - + kubectlLayer: new KubectlV28Layer(this, 'KubeCtlLayer'), ``` @@ -64,9 +66,9 @@ Below is an example of upgrading from v1.27 to v1.28 workflow_maintainer_role="$(aws cloudformation describe-stacks --stack-name=TopographicSharedResourcesProd | jq --raw-output .Stacks[0].Outputs[0].OutputValue)" npx cdk diff --context=maintainer-arns="${ci_role},${admin_role},${workflow_maintainer_role}" Workflows ``` - + The only changes should be Kubernetes version related. - + ``` Resources [~] AWS::Lambda::LayerVersion KubeCtlLayer KubeCtlLayer replace @@ -95,8 +97,9 @@ Below is an example of upgrading from v1.27 to v1.28 ## Cycle out EC2 Nodes to the new version + > **Are Amazon EKS managed node groups automatically updated along with the cluster control plane version?** -No. A managed node group creates Amazon EC2 instances in your account. These instances aren't automatically upgraded when you or Amazon EKS update your control plane. For more information, see Updating a managed node group. We recommend maintaining the same Kubernetes version on your control plane and nodes. +> No. 
A managed node group creates Amazon EC2 instances in your account. These instances aren't automatically upgraded when you or Amazon EKS update your control plane. For more information, see Updating a managed node group. We recommend maintaining the same Kubernetes version on your control plane and nodes. This process is necessary to avoid being blocked for a future Kubernetes version upgrade. For example, if Kubernetes get upgraded from `1.27` to `1.28` and the nodes remain in `1.27`, the next time Kubernetes will be upgraded to `1.29`, the upgrade will fail. @@ -105,10 +108,11 @@ This process is necessary to avoid being blocked for a future Kubernetes version ```bash node_group_name="$(aws eks list-nodegroups --cluster-name=Workflows | jq --raw-output '.nodegroups[]')" ``` + 2. Describe the nodegroup to validate the versions By describing the node group you can check the current version, or you can use `k get nodes` to see what version is currently running - + ```bash aws eks describe-nodegroup --cluster-name=Workflows --nodegroup-name="$node_group_name" | jq --raw-output .nodegroup.version ``` @@ -118,9 +122,9 @@ This process is necessary to avoid being blocked for a future Kubernetes version ```bash aws eks update-nodegroup-version --cluster-name=Workflows --nodegroup-name="$node_group_name" ``` - + This step takes some time to run. You can wait for it to finish with this command: - + ```bash aws eks wait nodegroup-active --cluster-name=Workflows --nodegroup-name="$node_group_name" ``` diff --git a/docs/labels.md b/docs/labels.md index a43ba2069..9b406b1f4 100644 --- a/docs/labels.md +++ b/docs/labels.md @@ -8,11 +8,11 @@ The following list of labels should be used in conjunction with Kubernetes [well ## Workflows -| Label | Description | Examples | -| --------------------- | ---------------------------------------- |--------------------------------------| -| `linz.govt.nz/ticket` | JIRA Ticket number | `TDE-912`, `BM-37` | -| `linz.govt.nz/region` | Geographic region that object relates to | "wellington", "auckland" | -| `linz.govt.nz/category` | The LINZ group that owns the workflow | "basemaps", "raster", "test", "util" | +| Label | Description | Examples | +| ----------------------- | ---------------------------------------- | ------------------------------------ | +| `linz.govt.nz/ticket` | JIRA Ticket number | `TDE-912`, `BM-37` | +| `linz.govt.nz/region` | Geographic region that object relates to | "wellington", "auckland" | +| `linz.govt.nz/category` | The LINZ group that owns the workflow | "basemaps", "raster", "test", "util" | For the type of data that is being processed @@ -25,12 +25,12 @@ For the type of data that is being processed Most other objects deployed via AWS-CDK and CDK8s should also include information about the CICD process that deployed it -| Label | Description | Examples | -| -------------------------- | ---------------------------------------- | ------------------------------------------ | -| `linz.govt.nz/git-hash` | git hash that deployed the object | "bb3dab2779922094d2b8ecd4c67f30c66b38613d" | -| `linz.govt.nz/git-version` | git version information | "v6.46.0", "v0.0.1-20-gbb3dab27" | -| `linz.govt.nz/git-repository` | git repository that the object came from | "linz\_\_topo-workflows" | -| `linz.govt.nz/build-id` | Unique ID of the build that deployed | "6806791032-1" | +| Label | Description | Examples | +| ----------------------------- | ---------------------------------------- | ------------------------------------------ | +| `linz.govt.nz/git-hash` 
| git hash that deployed the object | "bb3dab2779922094d2b8ecd4c67f30c66b38613d" | +| `linz.govt.nz/git-version` | git version information | "v6.46.0", "v0.0.1-20-gbb3dab27" | +| `linz.govt.nz/git-repository` | git repository that the object came from | "linz\_\_topo-workflows" | +| `linz.govt.nz/build-id` | Unique ID of the build that deployed | "6806791032-1" | ## Label Usage diff --git a/infra/README.md b/infra/README.md index c9ee43427..8072269eb 100644 --- a/infra/README.md +++ b/infra/README.md @@ -30,6 +30,7 @@ Main entry point: [app](./cdk8s.ts) ```shell npm install ``` + - Login to AWS ### Deploy CDK diff --git a/templates/argo-tasks/stac-validate.yml b/templates/argo-tasks/stac-validate.yml index bc6a87710..838535b7b 100644 --- a/templates/argo-tasks/stac-validate.yml +++ b/templates/argo-tasks/stac-validate.yml @@ -51,10 +51,10 @@ spec: - name: AWS_ROLE_CONFIG_PATH value: s3://linz-bucket-config/config.json args: - - 'stac' - - 'validate' - - '--concurrency={{inputs.parameters.concurrency}}' - - '--recursive={{inputs.parameters.recursive}}' - - '--checksum-assets={{inputs.parameters.checksum_assets}}' - - '--checksum-links={{inputs.parameters.checksum_links}}' - - '{{inputs.parameters.uri}}' + - 'stac' + - 'validate' + - '--concurrency={{inputs.parameters.concurrency}}' + - '--recursive={{inputs.parameters.recursive}}' + - '--checksum-assets={{inputs.parameters.checksum_assets}}' + - '--checksum-links={{inputs.parameters.checksum_links}}' + - '{{inputs.parameters.uri}}'