Skip to content

Commit

Permalink
Full PRs of cost optimization topic (#144)
Browse files Browse the repository at this point in the history
* Allow to enable lifecycle for snapshot bucket, load balancer logs bucket and rds snapshot bucket (#111)

Default:
- enable lifecycle
- 7 days for fullnode snapshot, 31 days for logs and rds snapshot

* Allow to enable/disable multi_az option on master node and remove read_replica_2 instance on dev* (#113)

* Reduce fullnode, fullnode_snapshot, and fullnode_backup storage to 1TB (#114)

* Change root_block_device_delete_on_termination = true to avoid unattached volumes (#115)

* Set assign_public_ip = false as all the tasks are in private subnet (#116)

* Reduce msk storage of dev env to 500 (#117)

* Reduce socks memory to 8192 (#118)

* Allow changing the CPU architecture of ECS tasks and Lambda functions. Default value: (#119)

X86_64 to be ready with existing docker images

* Set default retention for s3_rds_snapshot and load_balancer_logs to 14 days (#121)

* Change full_node_root_block_device_size default value to 1000 GB (#122)

* Add new variable: msk_storage_size to set storage of kafka (#123)

* Allow to disable backup_full_node (#124)

* Fix image filter value to find image based on cpu architecture (#125)

* Change default value of create_backup_full_node to false (#127)

* Change msk storage to msk_storage_size

* Change default storage of full_node_snapshot to 1000

* Remove apne-1a az (#132)

* Revert "Remove apne-1a az (#132)" (#133)

This reverts commit 6c74332.

* [OTE-821] Add roundtable monitors for update affiliate info and update wallet total volume (#134)

* upgrade kafka version and reduce session timeout

* Add partition level logging for mainnet MSK

* Fix terraform apply resource conflict

* Add partition offset plot to vulcan dashboard

* dummy

* Add stale compliance data monitor for mainnet

* add roundtable monitors for update affiliate info and update wallet total volume

* Add AWS_REGION envvar to services that connect to kafka (#136)

* Add read replica storage variable (#137)

* add read replica storage var

* set default

* Remove Indexer dashboards from terraform (#139)

* Add variable for full_node root_block_device_size (#141)

* Update full_node_ap_northeast_1.tf

* Update variables.tf

* set log retention hours to 120 (#142)

* add default (#143)

---------

Co-authored-by: roy-dydx <[email protected]>
Co-authored-by: jerryfan01234 <[email protected]>
Co-authored-by: dydxwill <[email protected]>
  • Loading branch information
4 people authored Oct 16, 2024
1 parent 41a42b4 commit a51b44f
Show file tree
Hide file tree
Showing 17 changed files with 202 additions and 23 deletions.
10 changes: 10 additions & 0 deletions indexer/backup_full_node_ap_northeast_1.tf
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
module "backup_full_node_ap_northeast_1" {
source = "../modules/validator"
count = var.create_backup_full_node ? 1 : 0

environment = var.environment

Expand Down Expand Up @@ -37,7 +38,16 @@ module "backup_full_node_ap_northeast_1" {

use_persistent_docker_volume = var.full_node_use_persistent_docker_volume

root_block_device_size = var.full_node_root_block_device_size
root_block_device_delete_on_termination = true
ecs_task_cpu_architecture = var.fullnode_ecs_task_cpu_architecture

providers = {
aws = aws.ap_northeast_1
}
}

# The backup full node module now uses `count` (driven by var.create_backup_full_node),
# so its address gained an index. This block maps the previous un-indexed address
# to instance [0] of the module.
moved {
from = module.backup_full_node_ap_northeast_1
to = module.backup_full_node_ap_northeast_1[0]
}
3 changes: 2 additions & 1 deletion indexer/ecs.tf
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,7 @@ resource "aws_ecs_service" "main" {
aws_subnet.private_subnets[subnet_name].id
] : [for subnet in aws_subnet.private_subnets : subnet.id]
security_groups = [aws_security_group.services[each.key].id]
assign_public_ip = true
assign_public_ip = false
}

dynamic "load_balancer" {
Expand Down Expand Up @@ -162,6 +162,7 @@ resource "aws_ecs_task_definition" "main" {

runtime_platform {
operating_system_family = "LINUX"
cpu_architecture = var.indexer_ecs_task_cpu_architecture
}

tags = {
Expand Down
6 changes: 4 additions & 2 deletions indexer/full_node_ap_northeast_1.tf
Original file line number Diff line number Diff line change
Expand Up @@ -36,9 +36,11 @@ module "full_node_ap_northeast_1" {

use_persistent_docker_volume = var.full_node_use_persistent_docker_volume

root_block_device_size = var.full_node_root_block_device_size
root_block_device_delete_on_termination = true
ecs_task_cpu_architecture = var.fullnode_ecs_task_cpu_architecture

providers = {
aws = aws.ap_northeast_1
}

root_block_device_size = var.full_node_root_block_device_size
}
2 changes: 1 addition & 1 deletion indexer/lambda.tf
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ resource "aws_lambda_function" "main" {
package_type = "Image"
function_name = "${each.key}_lambda_function"
role = aws_iam_role.lambda_services[each.key].arn
architectures = ["x86_64"]
architectures = [lower(var.lambda_cpu_architecture)]
timeout = 300

environment {
Expand Down
2 changes: 1 addition & 1 deletion indexer/locals.tf
Original file line number Diff line number Diff line change
Expand Up @@ -102,7 +102,7 @@ locals {
},
"${local.service_names["socks"]}" : {
ecs_desired_count : var.socks_ecs_desired_count,
task_definition_memory : 20480,
task_definition_memory : 8192,
task_definition_cpu : 4096,
is_public_facing : true,
ports : [8080, 8000],
Expand Down
2 changes: 1 addition & 1 deletion indexer/msk.tf
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ resource "aws_msk_cluster" "main" {
instance_type = var.msk_instance_type
storage_info {
ebs_storage_info {
volume_size = var.environment == "mainnet" ? 4000 : 1000 # in GB
volume_size = var.msk_storage_size
}
}
client_subnets = [
Expand Down
3 changes: 2 additions & 1 deletion indexer/rds.tf
Original file line number Diff line number Diff line change
Expand Up @@ -215,7 +215,7 @@ resource "aws_db_instance" "main" {
performance_insights_enabled = true
performance_insights_retention_period = 31
auto_minor_version_upgrade = false
multi_az = true
multi_az = var.enable_rds_main_multiaz

tags = {
Name = local.aws_db_instance_main_name
Expand Down Expand Up @@ -251,6 +251,7 @@ resource "aws_db_instance" "read_replica" {

# Read replica 2
resource "aws_db_instance" "read_replica_2" {
count = var.create_read_replica_2 ? 1 : 0
identifier = "${local.aws_db_instance_main_name}-read-replica-2"
instance_class = var.rds_db_instance_class
# engine, engine_version, name, username, db_subnet_group_name, allocated_storage do not have to
Expand Down
3 changes: 2 additions & 1 deletion indexer/route53.tf
Original file line number Diff line number Diff line change
Expand Up @@ -19,11 +19,12 @@ resource "aws_route53_record" "read_replica_1" {
}

resource "aws_route53_record" "read_replica_2" {
count = var.create_read_replica_2 ? 1 : 0
zone_id = aws_route53_zone.main.zone_id
name = "postgres-main-rr.dydx-indexer.private"
type = "CNAME"
ttl = "30"
records = ["${aws_db_instance.read_replica_2.address}"]
records = ["${aws_db_instance.read_replica_2[count.index].address}"]
weighted_routing_policy {
weight = 1
}
Expand Down
9 changes: 5 additions & 4 deletions indexer/route_table.tf
Original file line number Diff line number Diff line change
Expand Up @@ -68,9 +68,10 @@ resource "aws_route" "full_node_route_to_indexer" {
# NOTE: This is not an individual AWS resource, but rather an attachment to the route table, and so
# no tags are added.
resource "aws_route" "backup_full_node_route_to_indexer" {
route_table_id = module.backup_full_node_ap_northeast_1.route_table_id
count = var.create_backup_full_node ? 1 : 0
route_table_id = module.backup_full_node_ap_northeast_1[0].route_table_id
destination_cidr_block = var.indexers[var.region].vpc_cidr_block
vpc_peering_connection_id = aws_vpc_peering_connection.backup_full_node_peer.id
vpc_peering_connection_id = aws_vpc_peering_connection.backup_full_node_peer[0].id
}

# Route from the Indexer's private subnets to the full node's VPC. Needed so that the full node can
Expand All @@ -88,9 +89,9 @@ resource "aws_route" "indexer_route_to_full_node" {
}

resource "aws_route" "indexer_route_to_backup_full_node" {
for_each = aws_route_table.private
for_each = var.create_backup_full_node ? aws_route_table.private : {}

route_table_id = each.value.id
destination_cidr_block = var.backup_full_node_cidr_vpc
vpc_peering_connection_id = aws_vpc_peering_connection.backup_full_node_peer.id
vpc_peering_connection_id = aws_vpc_peering_connection.backup_full_node_peer[0].id
}
43 changes: 43 additions & 0 deletions indexer/s3_bucket.tf
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,20 @@ resource "aws_s3_bucket" "load_balancer" {
}
}

# Lifecycle policy for the load_balancer bucket: objects are expired after
# var.s3_load_balancer_logs_expiration_days days. The resource is only created
# when var.enable_s3_load_balancer_logs_lifecycle is true.
resource "aws_s3_bucket_lifecycle_configuration" "load_balancer" {
  count  = var.enable_s3_load_balancer_logs_lifecycle ? 1 : 0
  bucket = aws_s3_bucket.load_balancer.id

  rule {
    id     = "expire-old-logs"
    status = "Enabled"

    # An explicit empty filter applies the rule to every object in the bucket.
    # The AWS provider expects each rule to declare either `filter` or `prefix`.
    filter {}

    expiration {
      days = var.s3_load_balancer_logs_expiration_days
    }
  }
}

# TODO: refactor snapshotting full node into a separate module
# AWS S3 bucket to store all Indexer full node snapshots
resource "aws_s3_bucket" "indexer_full_node_snapshots" {
Expand All @@ -22,6 +36,21 @@ resource "aws_s3_bucket" "indexer_full_node_snapshots" {
}
}

# Lifecycle policy for the indexer_full_node_snapshots bucket: objects are
# expired after var.s3_snapshot_expiration_days days. The resource is only
# created when var.enable_s3_snapshot_lifecycle is true.
resource "aws_s3_bucket_lifecycle_configuration" "indexer_full_node_snapshots" {
  count  = var.enable_s3_snapshot_lifecycle ? 1 : 0
  bucket = aws_s3_bucket.indexer_full_node_snapshots.id

  rule {
    id     = "expire-old-snapshots"
    status = "Enabled"

    # An explicit empty filter applies the rule to every object in the bucket.
    # The AWS provider expects each rule to declare either `filter` or `prefix`.
    filter {}

    expiration {
      days = var.s3_snapshot_expiration_days
    }
  }
}


# Enable S3 bucket metrics to be sent to Datadog for monitoring
resource "aws_s3_bucket_metric" "indexer_full_node_snapshots" {
bucket = aws_s3_bucket.indexer_full_node_snapshots.id
Expand Down Expand Up @@ -64,3 +93,17 @@ resource "aws_s3_bucket" "athena_rds_snapshots" {
Environment = var.environment
}
}

# Lifecycle policy for the athena_rds_snapshots bucket: objects are expired
# after var.s3_rds_snapshot_expiration_days days. The resource is only created
# when var.enable_s3_rds_snapshot_lifecycle is true.
resource "aws_s3_bucket_lifecycle_configuration" "athena_rds_snapshots" {
  count  = var.enable_s3_rds_snapshot_lifecycle ? 1 : 0
  bucket = aws_s3_bucket.athena_rds_snapshots.id

  rule {
    id     = "expire-old-snapshots"
    status = "Enabled"

    # An explicit empty filter applies the rule to every object in the bucket.
    # The AWS provider expects each rule to declare either `filter` or `prefix`.
    filter {}

    expiration {
      days = var.s3_rds_snapshot_expiration_days
    }
  }
}
2 changes: 1 addition & 1 deletion indexer/security_group.tf
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,7 @@ resource "aws_security_group" "msk" {
security_groups = flatten([
aws_security_group.devbox.id,
module.full_node_ap_northeast_1.aws_security_group_id,
module.backup_full_node_ap_northeast_1.aws_security_group_id,
var.create_backup_full_node ? [module.backup_full_node_ap_northeast_1[0].aws_security_group_id] : [],
# Lambda Services
[
for service in keys(local.lambda_services) :
Expand Down
4 changes: 3 additions & 1 deletion indexer/snapshot_full_node_ap_northeast_1.tf
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,9 @@ module "full_node_snapshot_ap_northeast_1" {

datadog_env = "snapshot-${var.environment}"

root_block_device_size = var.full_node_snapshot_ebs_volume_size
root_block_device_size = var.full_node_snapshot_ebs_volume_size
root_block_device_delete_on_termination = true
ecs_task_cpu_architecture = var.fullnode_ecs_task_cpu_architecture

entry_point = [
"sh",
Expand Down
113 changes: 106 additions & 7 deletions indexer/variables.tf
Original file line number Diff line number Diff line change
Expand Up @@ -105,6 +105,12 @@ variable "msk_instance_type" {
description = "Instance type for MSK brokers"
}

# Size of the EBS data volume attached to each MSK broker; consumed by
# aws_msk_cluster.main as ebs_storage_info.volume_size, which is numeric.
# NOTE(review): broker storage can presumably only be grown, not shrunk, so
# confirm every existing environment sets a value >= its current size.
variable "msk_storage_size" {
  type        = number
  description = "Storage size (in GiB) of each MSK broker node. Suggested value: 2000 for mainnet, 1000 for staging and testnet and 500 for dev."
  default     = 500
}

variable "rds_db_instance_class" {
type = string
description = "Instance class for the Postgres RDS DB"
Expand Down Expand Up @@ -163,6 +169,12 @@ variable "full_node_container_chain_home" {
description = "Full-node's home directory for the chain. Used to boot up the chain, and configure the `cmd` in ECS"
}

# Root volume size handed to the full node modules as root_block_device_size.
variable "full_node_root_block_device_size" {
  description = "Size of root block device in gigabytes"
  type        = number
  default     = 1000
}

variable "snapshot_full_node_container_chain_home" {
type = string
description = "Snapshot full-node's home directory for the chain. Used to boot up the chain, and configure the `cmd` in ECS"
Expand Down Expand Up @@ -202,7 +214,7 @@ variable "full_node_snapshot_upload_period" {
variable "full_node_snapshot_ebs_volume_size" {
type = number
description = "Size (in GiB) of the EBS volume used for the fast sync full node"
default = 3000
default = 1000
}

variable "full_node_ec2_instance_type" {
Expand Down Expand Up @@ -474,6 +486,99 @@ variable "image_count" {
default = 100
}

# Toggles creation of the lifecycle configuration on the
# indexer_full_node_snapshots bucket (used as its `count` condition).
variable "enable_s3_snapshot_lifecycle" {
  description = "Enables S3 lifecycle on snapshot bucket. Default is true"
  type        = bool
  default     = true
}

# Expiration age (in days) used by the lifecycle rule on the
# indexer_full_node_snapshots bucket.
variable "s3_snapshot_expiration_days" {
  type        = number
  description = "Number of days to store fullnode snapshot on S3, defaults to 7."
  default     = 7

  # Reject values S3 would refuse at apply time (zero, negative or fractional days).
  validation {
    condition     = var.s3_snapshot_expiration_days >= 1 && floor(var.s3_snapshot_expiration_days) == var.s3_snapshot_expiration_days
    error_message = "The s3_snapshot_expiration_days value must be a positive whole number of days."
  }
}

# Toggles creation of the lifecycle configuration on the athena_rds_snapshots
# bucket (used as its `count` condition).
variable "enable_s3_rds_snapshot_lifecycle" {
  description = "Enables S3 lifecycle on rds snapshot bucket. Default is true"
  type        = bool
  default     = true
}

# Expiration age (in days) used by the lifecycle rule on the
# athena_rds_snapshots bucket.
variable "s3_rds_snapshot_expiration_days" {
  type        = number
  description = "Number of days to store rds snapshot on S3, defaults to 14."
  default     = 14

  # Reject values S3 would refuse at apply time (zero, negative or fractional days).
  validation {
    condition     = var.s3_rds_snapshot_expiration_days >= 1 && floor(var.s3_rds_snapshot_expiration_days) == var.s3_rds_snapshot_expiration_days
    error_message = "The s3_rds_snapshot_expiration_days value must be a positive whole number of days."
  }
}

# Toggles creation of the lifecycle configuration on the load_balancer bucket
# (used as its `count` condition).
variable "enable_s3_load_balancer_logs_lifecycle" {
  type        = bool
  description = "Enables S3 lifecycle on load balancer logs bucket. Default is true"
  default     = true
}

# Expiration age (in days) used by the lifecycle rule on the load_balancer bucket.
variable "s3_load_balancer_logs_expiration_days" {
  type        = number
  description = "Number of days to store load balancer logs on S3, defaults to 14."
  default     = 14

  # Reject values S3 would refuse at apply time (zero, negative or fractional days).
  validation {
    condition     = var.s3_load_balancer_logs_expiration_days >= 1 && floor(var.s3_load_balancer_logs_expiration_days) == var.s3_load_balancer_logs_expiration_days
    error_message = "The s3_load_balancer_logs_expiration_days value must be a positive whole number of days."
  }
}

# Controls `count` on aws_db_instance.read_replica_2 and on its Route 53 record.
variable "create_read_replica_2" {
  description = "Create read replica 2 or not. Default: true"
  type        = bool
  default     = true
}

# Passed to aws_db_instance.main as its `multi_az` argument.
variable "enable_rds_main_multiaz" {
  description = "Enable Multi-AZ on the RDS main instance. Default: true"
  type        = bool
  default     = true
}

# CPU architecture set in the runtime_platform block of the Indexer ECS task
# definition.
variable "indexer_ecs_task_cpu_architecture" {
  type        = string
  description = "Type of ecs cpu architecture. Accept: X86_64 or ARM64"
  default     = "X86_64"

  # Only the two listed values are accepted; matching is case-sensitive.
  validation {
    condition = contains(
      ["X86_64", "ARM64"],
      var.indexer_ecs_task_cpu_architecture
    )
    error_message = "Err: invalid cpu architecture. Must be one of {X86_64 | ARM64}."
  }
}

# CPU architecture passed to the full node modules as ecs_task_cpu_architecture.
variable "fullnode_ecs_task_cpu_architecture" {
  type        = string
  description = "Type of ecs cpu architecture. Accept: X86_64 or ARM64"
  default     = "X86_64"

  # Only the two listed values are accepted; matching is case-sensitive.
  validation {
    condition = contains(
      ["X86_64", "ARM64"],
      var.fullnode_ecs_task_cpu_architecture
    )
    error_message = "Err: invalid cpu architecture. Must be one of {X86_64 | ARM64}."
  }
}

# CPU architecture for the Lambda functions; the value is lower-cased before
# being placed in the function's `architectures` list.
variable "lambda_cpu_architecture" {
  type        = string
  description = "Type of lambda cpu architecture. Accept: X86_64 or ARM64"
  default     = "X86_64"

  # Only the two listed values are accepted; matching is case-sensitive.
  validation {
    condition = contains(
      ["X86_64", "ARM64"],
      var.lambda_cpu_architecture
    )
    error_message = "Err: invalid cpu architecture. Must be one of {X86_64 | ARM64}."
  }
}

# Gates the backup full node module together with the VPC peering connection,
# routes and MSK security-group entry that reference it.
variable "create_backup_full_node" {
  description = "Whether to create the backup full node. Default: false (intended for test and dev environments). Mainnet and Testnet should enable it."
  type        = bool
  default     = false
}

variable "vulcan_ecs_desired_count" {
type = number
description = "Number of desired vulcan instances."
Expand All @@ -491,9 +596,3 @@ variable "socks_ecs_desired_count" {
description = "Number of desired socks instances."
default = 5
}

variable "full_node_root_block_device_size" {
type = number
description = "Size of Size of root block device in gigabytes."
default = 4000
}
3 changes: 2 additions & 1 deletion indexer/vpc.tf
Original file line number Diff line number Diff line change
Expand Up @@ -84,8 +84,9 @@ resource "aws_vpc_peering_connection" "full_node_peer" {
}

resource "aws_vpc_peering_connection" "backup_full_node_peer" {
count = var.create_backup_full_node ? 1 : 0
peer_vpc_id = aws_vpc.main.id
vpc_id = module.backup_full_node_ap_northeast_1.aws_vpc_id
vpc_id = module.backup_full_node_ap_northeast_1[0].aws_vpc_id
# Auto-accept allows the VPC peering connection to be made programmatically with no manual steps
# to accept the VPC peering connection in the console
# This can only be done if both VPCs are in the same region and AWS account (which they are)
Expand Down
2 changes: 1 addition & 1 deletion modules/validator/ec2.tf
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ data "aws_ami" "amazon_linux_ecs_ami" {

filter {
name = "name"
values = ["amzn2-ami-ecs-inf-hvm-*-x86_64-ebs"]
values = [var.ecs_task_cpu_architecture == "X86_64" ? "amzn2-ami-ecs-inf-hvm-*-x86_64-ebs" : "amzn2-ami-ecs-hvm-*-arm64-ebs"]
}
}

Expand Down
5 changes: 5 additions & 0 deletions modules/validator/ecs.tf
Original file line number Diff line number Diff line change
Expand Up @@ -230,6 +230,11 @@ resource "aws_ecs_task_definition" "main" {
}
}

runtime_platform {
operating_system_family = "LINUX"
cpu_architecture = var.ecs_task_cpu_architecture
}

tags = {
Name = "${var.environment}-${var.name}-task"
Environment = var.environment
Expand Down
Loading

0 comments on commit a51b44f

Please sign in to comment.