From c03e6c856cb612e7105baaca7cb4c2396a041e4e Mon Sep 17 00:00:00 2001 From: David Costa Date: Thu, 21 Dec 2023 15:35:27 +0000 Subject: [PATCH] feat!: autoscaler with scaling schedules Add the ability to use an autoscaler to scale down to zero outside the defined schedules. Only non-stateful MIGs can be used with autoscalers, so this commit also removes the responsibility of creating the home folder disk (atlantis-disk-0) from the MIG, effectively making it a stateless MIG. Nonetheless, destroying the group will not destroy the disk. Add resources for the disk and the autoscaler, and a usage example. Update the README. BREAKING CHANGE: the 50GB stateful disk is no longer created by the mig, which makes the mig no longer stateful. Additionally, if terraform destroy is executed, the disk is destroyed. --- README.md | 3 + examples/autoscaling/README.md | 35 +++++++++ examples/autoscaling/main.tf | 93 +++++++++++++++++++++++ examples/autoscaling/server-atlantis.yaml | 6 ++ main.tf | 77 +++++++++++++------ variables.tf | 14 ++++ 6 files changed, 204 insertions(+), 24 deletions(-) create mode 100644 examples/autoscaling/README.md create mode 100644 examples/autoscaling/main.tf create mode 100644 examples/autoscaling/server-atlantis.yaml diff --git a/README.md b/README.md index 2f4cbea..019d97f 100644 --- a/README.md +++ b/README.md @@ -47,6 +47,8 @@ This Terraform module deploys various resources to run Atlantis on Google Comput - **Confidential VM** - A Confidential VM is a type of Compute Engine VM that ensures that your data and applications stay private and encrypted even while in use. You can use a Confidential VM as part of your security strategy so you do not expose sensitive data or workloads during processing. Note that Confidential VM [does not support live migration](https://cloud.google.com/confidential-computing/confidential-vm/docs/error-messages#live_migration_isnt_supported), so if this feature is enabled, `onHostMaintenance` will be set to `TERMINATE`. 
+- **Scale to zero** - Use [scaling schedules](https://cloud.google.com/compute/docs/autoscaler/scaling-schedules#schedule_configuration_options) to allow scaling to zero outside of business hours. Useful to minimize costs. + ## Prerequisites This module expects that you already own or create the below resources yourself. @@ -66,6 +68,7 @@ Here are some examples to choose from. Look at the prerequisites above to find o - [Secure Environment Variables](https://github.com/runatlantis/terraform-gce-atlantis/tree/master/examples/secure-env-vars) - [Cloud Armor](https://github.com/runatlantis/terraform-gce-atlantis/tree/master/examples/cloud-armor) - [Shared VPC](https://github.com/runatlantis/terraform-gce-atlantis/tree/master/examples/shared-vpc) +- [Scale to zero](https://github.com/runatlantis/terraform-gce-atlantis/tree/master/examples/autoscaling) ```hcl module "atlantis" { diff --git a/examples/autoscaling/README.md b/examples/autoscaling/README.md new file mode 100644 index 0000000..c42c717 --- /dev/null +++ b/examples/autoscaling/README.md @@ -0,0 +1,35 @@ +# Example usage + +This example uses [scaling schedules](https://cloud.google.com/compute/docs/autoscaler/scaling-schedules#schedule_configuration_options) to deploy Atlantis only during business hours. + +The schedules follow the syntax [described in the documentation](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/compute_autoscaler#nested_scaling_schedules), but in short: + +- The time zone must be a name from the [tz database](https://en.wikipedia.org/wiki/List_of_tz_database_time_zones), e.g. `Europe/London` +- The schedule field uses the extended cron format + +> [!NOTE] +> It takes 2 to 3 minutes from the beginning of the scheduled time for the instance to be ready to serve requests. After the scheduled end time, it takes approximately 10 minutes for the instance to be destroyed. + +Read through the below before you deploy this module. 
+ +- [Prerequisites](#prerequisites) +- [How to deploy](#how-to-deploy) +- [After it's successfully deployed](#after-its-successfully-deployed) + +## Prerequisites + +This module expects that you already own or create the below resources yourself. + +- Google network, subnetwork and a Cloud NAT +- Service account, [specifics can be found here](../../README.md#service-account) +- Domain, [specifics can be found here](../../README.md#dns-record) + +If you prefer an example that includes the above resources, see [`complete example`](https://github.com/runatlantis/terraform-gce-atlantis/tree/master/examples/complete). + +## How to deploy + +See [`main.tf`](https://github.com/runatlantis/terraform-gce-atlantis/tree/master/examples/autoscaling/main.tf) and the [`server-atlantis.yaml`](https://github.com/runatlantis/terraform-gce-atlantis/tree/master/examples/autoscaling/server-atlantis.yaml). + +## After it's successfully deployed + +Once you're done, see [Configuring Webhooks for Atlantis](https://www.runatlantis.io/docs/configuring-webhooks.html#configuring-webhooks) diff --git a/examples/autoscaling/main.tf b/examples/autoscaling/main.tf new file mode 100644 index 0000000..6d0aa6a --- /dev/null +++ b/examples/autoscaling/main.tf @@ -0,0 +1,93 @@ +locals { + project_id = "" + network = "" + subnetwork = "" + region = "" + zone = "" + domain = "" + managed_zone = "" + + github_repo_allow_list = "github.com/example/*" + github_user = "" + github_token = "" + github_webhook_secret = "" +} + +# Create a service account and attach the required Cloud Logging permissions to it. 
+resource "google_service_account" "atlantis" { + account_id = "atlantis" + display_name = "Service Account for Atlantis" + project = local.project_id +} + +resource "google_project_iam_member" "atlantis_log_writer" { + role = "roles/logging.logWriter" + member = "serviceAccount:${google_service_account.atlantis.email}" + project = local.project_id +} + +resource "google_project_iam_member" "atlantis_metric_writer" { + role = "roles/monitoring.metricWriter" + member = "serviceAccount:${google_service_account.atlantis.email}" + project = local.project_id +} + +module "atlantis" { + source = "bschaatsbergen/atlantis/gce" + name = "atlantis" + network = local.network + subnetwork = local.subnetwork + region = local.region + zone = local.zone + service_account = { + email = google_service_account.atlantis.email + scopes = ["cloud-platform"] + } + # Note: environment variables are shown in the Google Cloud UI + # See the `examples/secure-env-vars` if you want to protect sensitive information + env_vars = { + ATLANTIS_GH_USER = local.github_user + ATLANTIS_GH_TOKEN = local.github_token + ATLANTIS_GH_WEBHOOK_SECRET = local.github_webhook_secret + ATLANTIS_REPO_ALLOWLIST = local.github_repo_allow_list + ATLANTIS_ATLANTIS_URL = "https://${local.domain}" + ATLANTIS_REPO_CONFIG_JSON = jsonencode(yamldecode(file("${path.module}/server-atlantis.yaml"))) + } + + autoscaling = { + schedules = [ + # Monday through Friday, between 7h30 and 19h30 + { + name = "business-hours" + description = "Deploy during business hours" + schedule = "30 07 * * 1-5" + time_zone = "Europe/London" + duration_sec = 12 * 60 * 60 + }, + # Monday through Friday, all day + # { + # name = "mon-fri" + # description = "Deploy during weekdays" + # schedule = "00 00 * * 1-5" + # time_zone = "Europe/London" + # duration_sec = 24 * 60 * 60 + # }, + ] + } + + domain = local.domain + project = local.project_id +} + +# As your DNS records might be managed at another registrar's site, we create the DNS record 
outside of the module. +# This record is mandatory in order to provision the managed SSL certificate successfully. +resource "google_dns_record_set" "default" { + name = "${local.domain}." + type = "A" + ttl = 60 + managed_zone = local.managed_zone + rrdatas = [ + module.atlantis.ip_address + ] + project = local.project_id +} diff --git a/examples/autoscaling/server-atlantis.yaml b/examples/autoscaling/server-atlantis.yaml new file mode 100644 index 0000000..71ec5f7 --- /dev/null +++ b/examples/autoscaling/server-atlantis.yaml @@ -0,0 +1,6 @@ +repos: +- id: /.*/ + apply_requirements: [mergeable] + allowed_overrides: [apply_requirements, workflow] + allow_custom_workflows: true + delete_source_branch_on_merge: true diff --git a/main.tf b/main.tf index 0c52930..38c61fc 100644 --- a/main.tf +++ b/main.tf @@ -170,24 +170,10 @@ resource "google_compute_instance_template" "default" { # Persistent disk for Atlantis disk { - device_name = "atlantis-disk-0" - disk_type = "pd-ssd" - mode = "READ_WRITE" - disk_size_gb = var.persistent_disk_size_gb - auto_delete = false - labels = merge( - local.atlantis_labels, - { - "disk-type" = "data" - }, - ) - - dynamic "disk_encryption_key" { - for_each = var.disk_kms_key_self_link != null ? [1] : [] - content { - kms_key_self_link = var.disk_kms_key_self_link - } - } + device_name = "atlantis-disk-0" + mode = "READ_WRITE" + source = google_compute_disk.persistent.name + auto_delete = false } network_interface { @@ -221,6 +207,27 @@ resource "google_compute_instance_template" "default" { } } +resource "google_compute_disk" "persistent" { + name = var.name + type = "pd-ssd" + size = var.persistent_disk_size_gb + zone = var.zone + labels = merge( + local.atlantis_labels, + { + "disk-type" = "data" + }, + ) + + dynamic "disk_encryption_key" { + for_each = var.disk_kms_key_self_link != null ? 
[1] : [] + content { + kms_key_self_link = var.disk_kms_key_self_link + } + } + +} + resource "google_compute_health_check" "default" { name = var.name check_interval_sec = 1 @@ -267,17 +274,13 @@ resource "google_compute_instance_group_manager" "default" { port = local.atlantis_port } - stateful_disk { - device_name = "atlantis-disk-0" - delete_rule = "NEVER" - } - auto_healing_policies { health_check = google_compute_health_check.default_instance_group_manager.id initial_delay_sec = 30 } - target_size = 1 + # We cannot set target_size when using an autoscaler + target_size = var.autoscaling == null ? 1 : null update_policy { type = "PROACTIVE" @@ -291,6 +294,32 @@ resource "google_compute_instance_group_manager" "default" { provider = google-beta } +resource "google_compute_autoscaler" "default" { + count = var.autoscaling == null ? 0 : 1 + + name = var.name + zone = var.zone + target = google_compute_instance_group_manager.default.id + + autoscaling_policy { + max_replicas = 1 # Allow at most one instance + min_replicas = 0 # Allow scaling down to zero + cooldown_period = 60 + + dynamic "scaling_schedules" { + for_each = var.autoscaling.schedules == null ? 
[] : var.autoscaling.schedules + content { + name = scaling_schedules.value.name + description = scaling_schedules.value.description + min_required_replicas = 1 + schedule = scaling_schedules.value.schedule + time_zone = scaling_schedules.value.time_zone + duration_sec = scaling_schedules.value.duration_sec + } + } + } +} + resource "google_compute_global_address" "default" { name = var.name project = var.project diff --git a/variables.tf b/variables.tf index 1ec7677..4a8db4a 100644 --- a/variables.tf +++ b/variables.tf @@ -195,3 +195,17 @@ variable "shared_vpc" { }) default = null } + +variable "autoscaling" { + description = "Allow the instance group to scale down to zero based on signals" + type = object({ + schedules = list(object({ + name = string + description = string + schedule = string + time_zone = string + duration_sec = number + })) + }) + default = null +}