diff --git a/Makefile b/Makefile index 0d3eaae87..6359b31fd 100644 --- a/Makefile +++ b/Makefile @@ -110,7 +110,7 @@ test-go-units: .PHONY: test-go-units-crdb test-go-units-crdb: cleanup-test-go-units-crdb - @docker run -d --name dss-crdb-for-testing -p 26257:26257 -p 8080:8080 cockroachdb/cockroach:v21.2.7 start-single-node --listen-addr=0.0.0.0 --insecure > /dev/null + @docker run -d --name dss-crdb-for-testing -p 26257:26257 -p 8080:8080 cockroachdb/cockroach:v24.1.3 start-single-node --insecure > /dev/null @until [ -n "`docker logs dss-crdb-for-testing | grep 'nodeID'`" ]; do echo "Waiting for CRDB to be ready"; sleep 3; done; go run ./cmds/db-manager/main.go --schemas_dir ./build/db_schemas/rid --db_version latest --cockroach_host localhost go test -count=1 -v ./pkg/rid/store/cockroach --cockroach_host localhost --cockroach_port 26257 --cockroach_ssl_mode disable --cockroach_user root --cockroach_db_name rid diff --git a/build/dev/docker-compose_dss.yaml b/build/dev/docker-compose_dss.yaml index e84e4d078..3ce9dad1f 100644 --- a/build/dev/docker-compose_dss.yaml +++ b/build/dev/docker-compose_dss.yaml @@ -8,7 +8,7 @@ version: '3.8' services: local-dss-crdb: - image: cockroachdb/cockroach:v21.2.7 + image: cockroachdb/cockroach:v24.1.3 command: start-single-node --insecure ports: - "26257:26257" diff --git a/cmds/core-service/README.md b/cmds/core-service/README.md index 40e2573c4..bf58c629e 100644 --- a/cmds/core-service/README.md +++ b/cmds/core-service/README.md @@ -32,7 +32,7 @@ go run ./cmds/core-service \ To run correctly, core-service must be able to [access](../../pkg/cockroach/flags/flags.go) a CockroachDB cluster. Provision of this cluster is handled automatically for a local development environment if following [the instructions for a standalone instance](../../build/dev/standalone_instance.md). 
Or, a CockroachDB instance can be created manually with: ```bash -docker container run -p 26257:26257 -p 8080:8080 --rm cockroachdb/cockroach:v21.2.7 start-single-node --insecure +docker container run -p 26257:26257 -p 8080:8080 --rm cockroachdb/cockroach:v24.1.3 start-single-node --insecure ``` #### Database configuration diff --git a/deploy/MIGRATION.md b/deploy/MIGRATION.md index 72c3ee1cb..69cb6e993 100644 --- a/deploy/MIGRATION.md +++ b/deploy/MIGRATION.md @@ -1,55 +1,141 @@ -# Kubernetes version migration +# CockroachDB and Kubernetes version migration -This page provides information on how to upgrade your Kubernetes cluster deployed using the +This page provides information on how to upgrade your CockroachDB and Kubernetes cluster deployed using the tools from this repository. +## CockroachDB upgrades + +CockroachDB must be upgraded on all DSS instances of the pool one after the other. The rollout of the upgrades on +the whole CRDB cluster must be carefully performed in sequence to keep the majority of nodes healthy during that period +and prevent downtime. +For a Pooled deployment, one of the DSS instances must take the role of the upgrade "Leader" and coordinate the +upgrade with the other "Follower" DSS instances. +In general, a CockroachDB upgrade consists of: +1. Upgrade preparation: Verify that the cluster is in a nominal state ready for upgrade. +1. Decide how the upgrade will be finalized (for major upgrades only): Like CockroachDB, we recommend disabling auto-finalization. +1. Perform the rolling upgrade: This step should be performed first by the Leader and as quickly as possible by the Followers **one after the other**. Note that during this period, the performance of the cluster may be impacted since, as documented by CockroachDB, "a query that is sent to an upgraded node can be distributed only among other upgraded nodes. Data accesses that would otherwise be local may become remote, and the performance of these queries can suffer." +1. 
Roll back the upgrade (optional): Like the rolling upgrade, this step should be carefully coordinated with all DSS instances to guarantee the minimum number of healthy nodes to keep the cluster available. +1. Finish the upgrade: This step should be accomplished by the Leader. + +The following sections provide links to the CockroachDB migration documentation depending on your deployment type, which can +differ between DSS instances. + +**Important notes:** + +- Further work is required to test and evaluate the availability of the DSS during migrations. +- We recommend carefully reviewing the instructions provided by CockroachDB and rehearsing all migrations on a test + environment before applying them to production. + +### Terraform deployment + +If a DSS instance has been deployed with terraform, first upgrade the cluster using [Helm](MIGRATION.md#helm-deployment) +or [Tanka](MIGRATION.md#tanka-deployment). Then, update the variable `crdb_image_tag` in your `terraform.tfvars` to +align your configuration with the new state of the cluster. + +### Helm deployment + +If you deployed the DSS using the Helm chart and the instructions provided in this repository, follow the instructions +provided by CockroachDB `Cluster Upgrade with Helm` (see specific links below). Note that the CockroachDB documentation +suggests editing the values using `helm upgrade ... --set` commands. You will need to use the root key `cockroachdb` +since the cockroachdb Helm chart is a dependency of the dss chart. 
+For instance, setting the image tag and partition using the command line would look like this: +``` +helm upgrade [RELEASE_NAME] [PATH_TO_DSS_HELM] --set cockroachdb.image.tag=v24.1.3 --reuse-values +``` +``` +helm upgrade [RELEASE_NAME] [PATH_TO_DSS_HELM] --set cockroachdb.statefulset.updateStrategy.rollingUpdate.partition=0 --reuse-values +``` +Alternatively, you can update `helm_values.yml` in your deployment and set the new image tag and rollout partition like this: +```yaml +cockroachdb: + image: + # ... + tag: # version + statefulset: + updateStrategy: + rollingUpdate: + partition: 0 +``` +New values can then be applied using `helm upgrade [RELEASE_NAME] [PATH_TO_DSS_HELM] -f [helm_values.yml]`. +We recommend the second approach to keep your Helm values in sync with the cluster state. + +#### 21.2.7 to 24.1.3 + +CockroachDB requires upgrading one minor version at a time; therefore, the following migrations have to be performed: + +1. 21.2.7 to 22.1: see [CockroachDB Cluster upgrade for Helm](https://www.cockroachlabs.com/docs/v22.1/upgrade-cockroachdb-kubernetes?filters=helm). +1. 22.1 to 22.2: see [CockroachDB Cluster upgrade for Helm](https://www.cockroachlabs.com/docs/v22.2/upgrade-cockroachdb-kubernetes?filters=helm). +1. 22.2 to 23.1: see [CockroachDB Cluster upgrade for Helm](https://www.cockroachlabs.com/docs/v23.1/upgrade-cockroachdb-kubernetes?filters=helm). +1. 23.1 to 23.2: see [CockroachDB Cluster upgrade for Helm](https://www.cockroachlabs.com/docs/v23.2/upgrade-cockroachdb-kubernetes?filters=helm). +1. 23.2 to 24.1.3: see [CockroachDB Cluster upgrade for Helm](https://www.cockroachlabs.com/docs/v24.1/upgrade-cockroachdb-kubernetes?filters=helm). + +### Tanka deployment + +For deployments using Tanka configuration, since no instructions are provided for Tanka specifically, +we recommend following the manual steps documented by CockroachDB: `Cluster Upgrade with Manual configs`. 
+(See specific links below.) To apply the changes to your cluster, follow the manual steps and reflect the new +values in the *Leader* and *Follower* Tanka configurations, namely the new image version (see +[`VAR_CRDB_DOCKER_IMAGE_NAME`](../build/README.md)) to ensure the new configuration is aligned with the cluster state. + +#### 21.2.7 to 24.1.3 + +CockroachDB requires upgrading one minor version at a time; therefore, the following migrations have to be performed: + +1. 21.2.7 to 22.1: see [CockroachDB Cluster upgrade with Manual configs](https://www.cockroachlabs.com/docs/v22.1/upgrade-cockroachdb-kubernetes?filters=manual). +1. 22.1 to 22.2: see [CockroachDB Cluster upgrade with Manual configs](https://www.cockroachlabs.com/docs/v22.2/upgrade-cockroachdb-kubernetes?filters=manual). +1. 22.2 to 23.1: see [CockroachDB Cluster upgrade with Manual configs](https://www.cockroachlabs.com/docs/v23.1/upgrade-cockroachdb-kubernetes?filters=manual). +1. 23.1 to 23.2: see [CockroachDB Cluster upgrade with Manual configs](https://www.cockroachlabs.com/docs/v23.2/upgrade-cockroachdb-kubernetes?filters=manual). +1. 23.2 to 24.1.3: see [CockroachDB Cluster upgrade with Manual configs](https://www.cockroachlabs.com/docs/v24.1/upgrade-cockroachdb-kubernetes?filters=manual). + +## Kubernetes upgrades + **Important notes:** - The migration plan below has been tested with the deployment of services using [Helm](services/helm-charts) and [Tanka](../build/deploy) without Istio enabled. Note that this configuration flag has been decommissioned since [#995](https://github.com/interuss/dss/pull/995). - Further work is required to test and evaluate the availability of the DSS during migrations. - It is highly recommended to rehearse such operation on a test cluster before applying them to a production environment. -## Google - Google Kubernetes Engine +### Google - Google Kubernetes Engine Migrations of GKE clusters are managed using terraform. -### 1.27 to 1.28 +#### 1.27 to 1.28 1. 
Change your `terraform.tfvars` to use `1.28` by adding or updating the `kubernetes_version` variable: ```terraform kubernetes_version = 1.28 ``` -2. Run `terraform apply`. This operation may take more than 30min. -3. Monitor the upgrade of the nodes in the Google Cloud console. +1. Run `terraform apply`. This operation may take more than 30min. +1. Monitor the upgrade of the nodes in the Google Cloud console. -### 1.26 to 1.27 +#### 1.26 to 1.27 1. Change your `terraform.tfvars` to use `1.27` by adding or updating the `kubernetes_version` variable: ```terraform kubernetes_version = 1.27 ``` -2. Run `terraform apply`. This operation may take more than 30min. -3. Monitor the upgrade of the nodes in the Google Cloud console. +1. Run `terraform apply`. This operation may take more than 30min. +1. Monitor the upgrade of the nodes in the Google Cloud console. -### 1.25 to 1.26 +#### 1.25 to 1.26 1. Change your `terraform.tfvars` to use `1.26` by adding or updating the `kubernetes_version` variable: ```terraform kubernetes_version = 1.26 ``` -2. Run `terraform apply` -3. Monitor the upgrade of the nodes in the Google Cloud console. +1. Run `terraform apply` +1. Monitor the upgrade of the nodes in the Google Cloud console. -### 1.24 to 1.25 +#### 1.24 to 1.25 1. Change your `terraform.tfvars` to use `1.25` by adding or updating the `kubernetes_version` variable: ```terraform kubernetes_version = 1.25 ``` -2. Run `terraform apply`. This operation may take more than 30min. -3. Monitor the upgrade of the nodes in the Google Cloud console. +1. Run `terraform apply`. This operation may take more than 30min. +1. Monitor the upgrade of the nodes in the Google Cloud console. -## AWS - Elastic Kubernetes Service +### AWS - Elastic Kubernetes Service Currently, upgrades of EKS can't be achieved reliably with terraform directly. The recommended workaround is to use the web console of AWS Elastic Kubernetes Service (EKS) to upgrade the cluster. 
@@ -57,42 +143,42 @@ Before proceeding, always check on the cluster page the *Upgrade Insights* tab w availability of Kubernetes resources in each version. The following sections omit this check if no resource is expected to be reported in the context of a standard deployment performed with the tools in this repository. -### 1.27 to 1.28 +#### 1.27 to 1.28 1. Upgrade the cluster (control plane) using the AWS console. It should take ~15 minutes. -2. Update the *Node Group* in the *Compute* tab with *Rolling Update* strategy to upgrade the nodes using the AWS console. -3. Change your `terraform.tfvars` to use `1.28` by adding or updating the `kubernetes_version` variable: +1. Update the *Node Group* in the *Compute* tab with *Rolling Update* strategy to upgrade the nodes using the AWS console. +1. Change your `terraform.tfvars` to use `1.28` by adding or updating the `kubernetes_version` variable: ```terraform kubernetes_version = 1.28 ``` -### 1.26 to 1.27 +#### 1.26 to 1.27 1. Upgrade the cluster (control plane) using the AWS console. It should take ~15 minutes. -2. Update the *Node Group* in the *Compute* tab with *Rolling Update* strategy to upgrade the nodes using the AWS console. -3. Change your `terraform.tfvars` to use `1.27` by adding or updating the `kubernetes_version` variable: +1. Update the *Node Group* in the *Compute* tab with *Rolling Update* strategy to upgrade the nodes using the AWS console. +1. Change your `terraform.tfvars` to use `1.27` by adding or updating the `kubernetes_version` variable: ```terraform kubernetes_version = 1.27 ``` -### 1.25 to 1.26 +#### 1.25 to 1.26 1. Upgrade the cluster (control plane) using the AWS console. It should take ~15 minutes. -2. Update the *Node Group* in the *Compute* tab with *Rolling Update* strategy to upgrade the nodes using the AWS console. -3. Change your `terraform.tfvars` to use `1.26` by adding or updating the `kubernetes_version` variable: +1. 
Update the *Node Group* in the *Compute* tab with *Rolling Update* strategy to upgrade the nodes using the AWS console. +1. Change your `terraform.tfvars` to use `1.26` by adding or updating the `kubernetes_version` variable: ```terraform kubernetes_version = 1.26 ``` -### 1.24 to 1.25 +#### 1.24 to 1.25 1. Check for deprecated resources: - Click on the Upgrade Insights tab to see deprecation warnings on the cluster page. - Evaluate errors in Deprecated APIs removed in Kubernetes v1.25. Using `kubectl get podsecuritypolicies`, check if there is only one *Pod Security Policy* named `eks.privileged`. If it is the case, according to the [AWS documentation](https://docs.aws.amazon.com/eks/latest/userguide/pod-security-policy-removal-faq.html), you can proceed. -2. Upgrade the cluster using the AWS console. It should take ~15 minutes. -3. Change your `terraform.tfvars` to use `1.25` by adding or updating the `kubernetes_version` variable: +1. Upgrade the cluster using the AWS console. It should take ~15 minutes. +1. 
Change your `terraform.tfvars` to use `1.25` by adding or updating the `kubernetes_version` variable: ```terraform kubernetes_version = 1.25 ``` diff --git a/deploy/infrastructure/modules/terraform-aws-dss/terraform.dev.example.tfvars b/deploy/infrastructure/modules/terraform-aws-dss/terraform.dev.example.tfvars index d06e9bff3..d9cc764a3 100644 --- a/deploy/infrastructure/modules/terraform-aws-dss/terraform.dev.example.tfvars +++ b/deploy/infrastructure/modules/terraform-aws-dss/terraform.dev.example.tfvars @@ -26,7 +26,7 @@ authorization = { should_init = true # CockroachDB -crdb_image_tag = "v21.2.7" +crdb_image_tag = "v24.1.3" crdb_cluster_name = "interuss_example" crdb_locality = "interuss_dss-aws-ew1" crdb_external_nodes = [] diff --git a/deploy/infrastructure/modules/terraform-google-dss/terraform.dev.example.tfvars b/deploy/infrastructure/modules/terraform-google-dss/terraform.dev.example.tfvars index 9f192c0c6..70e21eff2 100644 --- a/deploy/infrastructure/modules/terraform-google-dss/terraform.dev.example.tfvars +++ b/deploy/infrastructure/modules/terraform-google-dss/terraform.dev.example.tfvars @@ -27,7 +27,7 @@ authorization = { should_init = true # CockroachDB -crdb_image_tag = "v21.2.7" +crdb_image_tag = "v24.1.3" crdb_cluster_name = "interuss_example" crdb_locality = "interuss_dss-dev-w6a" crdb_external_nodes = [] diff --git a/deploy/operations/Dockerfile b/deploy/operations/Dockerfile index 3d7eb0e2f..2e969f7ff 100644 --- a/deploy/operations/Dockerfile +++ b/deploy/operations/Dockerfile @@ -1,6 +1,6 @@ FROM ubuntu:22.04 -ENV COCKROACH_VERSION 21.2.7 +ENV COCKROACH_VERSION 24.1.3 RUN apt-get update \ && apt-get install -y unzip curl gnupg lsb-release apt-transport-https ca-certificates diff --git a/deploy/operations/ci/aws-1/terraform.tfvars b/deploy/operations/ci/aws-1/terraform.tfvars index c9c42c775..d4e007e02 100644 --- a/deploy/operations/ci/aws-1/terraform.tfvars +++ b/deploy/operations/ci/aws-1/terraform.tfvars @@ -23,7 +23,7 @@ authorization 
= { public_key_pem_path = "/test-certs/auth2.pem" } should_init = true -crdb_image_tag = "v21.2.7" +crdb_image_tag = "v24.1.3" crdb_cluster_name = "interuss-ci" crdb_locality = "interuss_dss-ci-aws-ue1" crdb_external_nodes = [] diff --git a/deploy/services/helm-charts/dss/values.example.yaml b/deploy/services/helm-charts/dss/values.example.yaml index e5b5094c2..c74d2c799 100644 --- a/deploy/services/helm-charts/dss/values.example.yaml +++ b/deploy/services/helm-charts/dss/values.example.yaml @@ -13,7 +13,7 @@ dss: cockroachdb: # See https://github.com/cockroachdb/helm-charts/blob/master/cockroachdb/values.yaml image: - tag: v21.2.7 + tag: v24.1.3 fullnameOverride: dss-cockroachdb conf: join: [] diff --git a/test/migrations/clear_db.sh b/test/migrations/clear_db.sh index 15d5194f9..abde3240d 100755 --- a/test/migrations/clear_db.sh +++ b/test/migrations/clear_db.sh @@ -8,5 +8,5 @@ echo "Starting CRDB container" docker run -d --rm --name dss-crdb-for-migration-testing \ -p 26257:26257 \ -p 8080:8080 \ - cockroachdb/cockroach:v21.2.7 start-single-node \ + cockroachdb/cockroach:v24.1.3 start-single-node \ --insecure > /dev/null