diff --git a/Cargo.lock b/Cargo.lock index 8ec0e5f..d14f6f5 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -17,17 +17,6 @@ version = "1.0.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f26201604c87b1e01bd3d98f8d5d9a8fcbb815e8cedb41ffccbeb4bf593a35fe" -[[package]] -name = "ahash" -version = "0.7.8" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "891477e0c6a8957309ee5c45a6368af3ae14bb510732d2684ffa19af310920f9" -dependencies = [ - "getrandom", - "once_cell", - "version_check", -] - [[package]] name = "ahash" version = "0.8.11" @@ -793,9 +782,6 @@ name = "hashbrown" version = "0.12.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8a9ee70c43aaf417c914396645a0fa852624801b24ebb7ae78fe8272889ac888" -dependencies = [ - "ahash 0.7.8", -] [[package]] name = "hashbrown" @@ -803,7 +789,7 @@ version = "0.14.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e5274423e17b7c9fc20b6e7e208532f9b19825d82dfd615708b70edd83df41f1" dependencies = [ - "ahash 0.8.11", + "ahash", "allocator-api2", ] @@ -1302,7 +1288,7 @@ version = "0.87.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2d8893eb18fbf6bb6c80ef6ee7dd11ec32b1dc3c034c988ac1b3a84d46a230ae" dependencies = [ - "ahash 0.8.11", + "ahash", "async-trait", "backoff", "derivative", @@ -1786,7 +1772,7 @@ version = "0.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "dbe55bddb694583a9db101e5ae5b31f570f2ccce312ac7d64c2e4a430510c4b3" dependencies = [ - "ahash 0.8.11", + "ahash", "async-trait", "blake2", "bytes", @@ -1820,7 +1806,7 @@ version = "0.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "51dbd9509e3bb25a699bee76ba1befbffb4e733694d7e682d4bfe35a1a48cbb4" dependencies = [ - "ahash 0.8.11", + "ahash", "async-trait", "brotli", "bytes", @@ -1933,7 +1919,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9bb8f0df84b4b9afd63742c78e6c4b39413554f857e7d41502825e4ff9798e3e" dependencies = [ "arrayvec", - "hashbrown 0.12.3", + "hashbrown 0.14.5", "parking_lot", "rand", ] diff --git a/bootstrap/cell/main.tf b/bootstrap/cell/main.tf new file mode 100644 index 0000000..09ad158 --- /dev/null +++ b/bootstrap/cell/main.tf @@ -0,0 +1,33 @@ +module "pvc" { + source = "../pvc" + + namespace = var.namespace + name = "pvc-${var.salt}" + storage_size = var.storage_size + storage_class = var.storage_class + volume_name = var.volume_name +} + +module "instances" { + for_each = var.instances + source = "../instance" + + namespace = var.namespace + tolerations = var.tolerations + salt = var.salt + instance_name = each.key + network = each.key + pvc_name = "pvc-${var.salt}" + dolos_version = each.value.dolos_version + replicas = coalesce(each.value.replicas, 1) + resources = coalesce(each.value.resources, { + requests = { + cpu = "50m" + memory = "512Mi" + } + limits = { + cpu = "1000m" + memory = "512Mi" + } + }) +} diff --git a/bootstrap/cell/variables.tf b/bootstrap/cell/variables.tf new file mode 100644 index 0000000..5a44a26 --- /dev/null +++ b/bootstrap/cell/variables.tf @@ -0,0 +1,74 @@ +variable "namespace" { + type = string +} + +variable "salt" { + type = string +} + +variable "extension_subdomain" { + type = string +} + +variable "dns_zone" { + default = "demeter.run" +} + +variable "storage_size" { + type = string +} + +variable "storage_class" { + type = string +} + +variable "volume_name" { + type = string +} + +variable "tolerations" { + type = list(object({ + effect = string + key = string + operator = string + value = string + })) + default = [ + { + effect = "NoSchedule" + key = "demeter.run/compute-profile" + operator = "Equal" + value = "disk-intensive" + }, + { + effect = "NoSchedule" + key = "demeter.run/compute-arch" + operator = "Equal" + value = "arm64" + }, + { + effect = "NoSchedule" + key = "demeter.run/availability-sla" + operator = "Equal" + value = "consistent" + } + ] +} + +// Instances +variable "instances" { + type = map(object({ + dolos_version = string + replicas = optional(number) + resources = optional(object({ + limits = object({ + cpu = string + memory = string + }) + requests = object({ + cpu = string + memory = string + }) + })) + })) +} diff --git a/bootstrap/cloudflared/config.tf b/bootstrap/cloudflared/config.tf index ebd35d4..793046c 100644 --- a/bootstrap/cloudflared/config.tf +++ b/bootstrap/cloudflared/config.tf @@ -14,9 +14,9 @@ resource "kubernetes_config_map" "tunnel-config" { tunnel_id = var.tunnel_id metrics_port = var.metrics_port hostname = var.hostname - service = "proxy" namespace = var.namespace port = 8080 + networks = var.networks })}" } } diff --git a/bootstrap/cloudflared/config.yml.tfpl b/bootstrap/cloudflared/config.yml.tfpl index 4822d1b..0199a06 100644 --- a/bootstrap/cloudflared/config.yml.tfpl +++ b/bootstrap/cloudflared/config.yml.tfpl @@ -15,9 +15,11 @@ no-autoupdate: true # from the internet to cloudflared, run `cloudflared tunnel route dns `. # E.g. `cloudflared tunnel route dns example-tunnel tunnel.example.com`. ingress: - - hostname: ${hostname} - service: https://${service}.${namespace}.svc.cluster.local:${port} +%{ for network in networks ~} + - hostname: ${network}-${hostname} + service: https://proxy-${network}.${namespace}.svc.cluster.local:${port} originRequest: noTLSVerify: true http2Origin: true +%{ endfor ~} - service: http_status:404 diff --git a/bootstrap/cloudflared/deployment.tf b/bootstrap/cloudflared/deployment.tf index 3f434e7..9b8a005 100644 --- a/bootstrap/cloudflared/deployment.tf +++ b/bootstrap/cloudflared/deployment.tf @@ -1,9 +1,13 @@ +locals { + name = "cloudflared" +} + resource "kubernetes_deployment" "cloudflared" { wait_for_rollout = false depends_on = [kubernetes_secret.tunnel_credentials] metadata { - name = "cloudflared" + name = local.name namespace = var.namespace labels = { @@ -102,23 +106,15 @@ resource "kubernetes_deployment" "cloudflared" { } } - toleration { - effect = "NoSchedule" - key = "demeter.run/compute-profile" - operator = "Exists" - } - - toleration { - effect = "NoSchedule" - key = "demeter.run/compute-arch" - operator = "Exists" - } + dynamic "toleration" { + for_each = var.tolerations - toleration { - effect = "NoSchedule" - key = "demeter.run/availability-sla" - operator = "Equal" - value = "consistent" + content { + effect = toleration.value.effect + key = toleration.value.key + operator = toleration.value.operator + value = toleration.value.value + } } } } diff --git a/bootstrap/cloudflared/main.tf b/bootstrap/cloudflared/main.tf index 298804e..fedc100 100644 --- a/bootstrap/cloudflared/main.tf +++ b/bootstrap/cloudflared/main.tf @@ -2,6 +2,10 @@ variable "namespace" { type = string } +variable "networks" { + type = list(string) +} + variable "tunnel_id" { type = string } @@ -25,6 +29,35 @@ variable "replicas" { default = 2 } +variable "tolerations" { + type = list(object({ + effect = string + key = string + operator = string + value = string + })) + default = [ + { + effect = "NoSchedule" + key = "demeter.run/compute-profile" + operator = "Equal" + value = "general-purpose" + }, + { + effect = "NoSchedule" + key = "demeter.run/compute-arch" + operator = "Equal" + value = "x86" + }, + { + effect = "NoSchedule" + key = "demeter.run/availability-sla" + operator = "Equal" + value = "best-effort" + } + ] +} + variable "resources" { type = object({ limits = object({ diff --git a/bootstrap/configs/mainnet.toml b/bootstrap/configs/mainnet.toml index 67c5e97..cccaca8 100644 --- a/bootstrap/configs/mainnet.toml +++ b/bootstrap/configs/mainnet.toml @@ -4,7 +4,7 @@ network_magic = 764824073 is_testnet = false [storage] -path = "/var/data/db" +path = "/var/data/mainnet/db" [genesis] byron_path = "/etc/genesis/mainnet/byron.json" diff --git a/bootstrap/configs/preprod.toml b/bootstrap/configs/preprod.toml index cb25b26..4bc3610 100644 --- a/bootstrap/configs/preprod.toml +++ b/bootstrap/configs/preprod.toml @@ -4,7 +4,7 @@ network_magic = 1 is_testnet = true [storage] -path = "/var/data/db" +path = "/var/data/preprod/db" [genesis] byron_path = "/etc/genesis/preprod/byron.json" diff --git a/bootstrap/configs/preview.toml b/bootstrap/configs/preview.toml index b0863c6..4630b4d 100644 --- a/bootstrap/configs/preview.toml +++ b/bootstrap/configs/preview.toml @@ -4,7 +4,7 @@ network_magic = 2 is_testnet = true [storage] -path = "/var/data/db" +path = "/var/data/preview/db" [genesis] byron_path = "/etc/genesis/preview/byron.json" diff --git a/bootstrap/configs/vector-testnet.toml b/bootstrap/configs/vector-testnet.toml index 0fd6841..d79dd24 100644 --- a/bootstrap/configs/vector-testnet.toml +++ b/bootstrap/configs/vector-testnet.toml @@ -5,11 +5,11 @@ network_id = 1 phase1_validation_enabled = false [rolldb] -path = "/var/data/rolldb" +path = "/var/data/vector-testnet/rolldb" k_param = 1000 [applydb] -path = "/var/data/applydb" +path = "/var/data/vector-testnet/applydb" [serve.grpc] diff --git a/bootstrap/proxy/certs.tf b/bootstrap/feature/certs.tf similarity index 84% rename from bootstrap/proxy/certs.tf rename to bootstrap/feature/certs.tf index fd165bf..8140374 100644 --- a/bootstrap/proxy/certs.tf +++ b/bootstrap/feature/certs.tf @@ -1,7 +1,7 @@ resource "kubernetes_config_map" "proxy-certs" { metadata { namespace = var.namespace - name = local.certs_configmap + name = var.certs_configmap } data = { diff --git a/bootstrap/feature/main.tf b/bootstrap/feature/main.tf index a810843..d0c4e23 100644 --- a/bootstrap/feature/main.tf +++ b/bootstrap/feature/main.tf @@ -15,6 +15,11 @@ variable "extension_subdomain" { default = "utxorpc-m0" } +variable "certs_configmap" { + type = string + default = "proxy-certs" +} + variable "dns_zone" { default = "demeter.run" } diff --git a/bootstrap/feature/operator.tf b/bootstrap/feature/operator.tf index fd31920..5b8b912 100644 --- a/bootstrap/feature/operator.tf +++ b/bootstrap/feature/operator.tf @@ -80,14 +80,14 @@ resource "kubernetes_deployment_v1" "operator" { toleration { effect = "NoSchedule" key = "demeter.run/compute-profile" - operator = "Equal" - value = "general-purpose" + operator = "Exists" } toleration { effect = "NoSchedule" key = "demeter.run/compute-arch" - operator = "Exists" + operator = "Equal" + value = "x86" } toleration { diff --git a/bootstrap/proxy/tls.crt b/bootstrap/feature/tls.crt similarity index 100% rename from bootstrap/proxy/tls.crt rename to bootstrap/feature/tls.crt diff --git a/bootstrap/proxy/tls.key b/bootstrap/feature/tls.key similarity index 100% rename from bootstrap/proxy/tls.key rename to bootstrap/feature/tls.key diff --git a/bootstrap/instance/main.tf b/bootstrap/instance/main.tf index 49529ca..fb649d9 100644 --- a/bootstrap/instance/main.tf +++ b/bootstrap/instance/main.tf @@ -12,6 +12,10 @@ variable "salt" { type = string } +variable "pvc_name" { + type = string +} + variable "instance_name" { type = string } @@ -24,6 +28,35 @@ variable "dolos_version" { type = string } +variable "tolerations" { + type = list(object({ + effect = string + key = string + operator = string + value = string + })) + default = [ + { + effect = "NoSchedule" + key = "demeter.run/compute-profile" + operator = "Equal" + value = "general-purpose" + }, + { + effect = "NoSchedule" + key = "demeter.run/compute-arch" + operator = "Equal" + value = "x86" + }, + { + effect = "NoSchedule" + key = "demeter.run/availability-sla" + operator = "Equal" + value = "best-effort" + } + ] +} + variable "resources" { type = object({ limits = object({ @@ -34,10 +67,6 @@ variable "resources" { cpu = string memory = string }) - storage = object({ - size = string - class = string - }) }) default = { requests = { @@ -48,11 +77,6 @@ variable "resources" { cpu = "1000m" memory = "512Mi" } - storage = { - size = "30Gi" - class = "fast" - } - } } diff --git a/bootstrap/instance/sts.tf b/bootstrap/instance/sts.tf index eae9ac2..e3cb073 100644 --- a/bootstrap/instance/sts.tf +++ b/bootstrap/instance/sts.tf @@ -14,21 +14,6 @@ resource "kubernetes_stateful_set_v1" "utxorpc" { replicas = var.replicas service_name = "utxorpc" - volume_claim_template { - metadata { - name = "data" - } - spec { - access_modes = ["ReadWriteOnce"] - resources { - requests = { - storage = var.resources.storage.size - } - } - storage_class_name = var.resources.storage.class - } - } - selector { match_labels = { "demeter.run/instance" = local.instance @@ -44,29 +29,32 @@ resource "kubernetes_stateful_set_v1" "utxorpc" { } spec { # @TODO: once the bootstrap command works in non-interactive, we can restore this. - init_container { - name = "init" - image = "ghcr.io/txpipe/dolos:${var.dolos_version}" - args = [ - "-c", - "/etc/config/dolos.toml", - "bootstrap", - "--download-dir", - "/var/data/snapshot", - ] - resources { - limits = var.resources.limits - requests = var.resources.requests - } - volume_mount { - name = "config" - mount_path = "/etc/config" - } - volume_mount { - name = "data" - mount_path = "/var/data" - } - } + # init_container { + # name = "init" + # image = "ghcr.io/txpipe/dolos:${var.dolos_version}" + # # command = ["sleep", "infinity"] + # args = [ + # "-c", + # "/etc/config/dolos.toml", + # "bootstrap", + # "--download-dir", + # "/var/data/${var.network}/snapshot", + # "--skip-if-not-empty", + # # "--skip-download", + # ] + # resources { + # limits = var.resources.limits + # requests = var.resources.requests + # } + # volume_mount { + # name = "config" + # mount_path = "/etc/config" + # } + # volume_mount { + # name = "data" + # mount_path = "/var/data" + # } + # } container { name = local.instance image = "ghcr.io/txpipe/dolos:${var.dolos_version}" @@ -75,7 +63,6 @@ resource "kubernetes_stateful_set_v1" "utxorpc" { "/etc/config/dolos.toml", "daemon" ] - # command = ["sleep", "infinity"] resources { limits = var.resources.limits requests = var.resources.requests @@ -97,11 +84,18 @@ resource "kubernetes_stateful_set_v1" "utxorpc" { name = "data" mount_path = "/var/data" } + volume_mount { name = "config" mount_path = "/etc/config" } + } + volume { + name = "data" + persistent_volume_claim { + claim_name = var.pvc_name + } } volume { @@ -112,23 +106,15 @@ resource "kubernetes_stateful_set_v1" "utxorpc" { } termination_grace_period_seconds = 180 - toleration { - effect = "NoSchedule" - key = "demeter.run/compute-profile" - operator = "Exists" - } - - toleration { - effect = "NoSchedule" - key = "demeter.run/compute-arch" - operator = "Equal" - value = "arm64" - } + dynamic "toleration" { + for_each = var.tolerations - toleration { - effect = "NoSchedule" - key = "demeter.run/availability-sla" - operator = "Exists" + content { + effect = toleration.value.effect + key = toleration.value.key + operator = toleration.value.operator + value = toleration.value.value + } } } } diff --git a/bootstrap/main.tf b/bootstrap/main.tf index 7a8f0f6..f0beba1 100644 --- a/bootstrap/main.tf +++ b/bootstrap/main.tf @@ -16,8 +16,9 @@ module "feature" { } module "configs" { - source = "./configs" - for_each = { for network in var.networks : "${network}" => network } + depends_on = [kubernetes_namespace_v1.namespace] + source = "./configs" + for_each = { for network in var.networks : "${network}" => network } namespace = var.namespace network = each.value @@ -26,28 +27,31 @@ module "configs" { module "services" { depends_on = [kubernetes_namespace_v1.namespace] - for_each = { for network in var.networks : "${network}" => network } - source = "./service" + source = "./services" namespace = var.namespace - network = each.value + networks = var.networks } module "proxy" { depends_on = [kubernetes_namespace_v1.namespace] source = "./proxy" + for_each = { for network in var.networks : "${network}" => network } - namespace = var.namespace - image_tag = var.proxy_image_tag - replicas = var.proxy_replicas - resources = var.proxy_resources + namespace = var.namespace + network = each.value + image_tag = var.proxies_image_tag + replicas = var.proxies_replicas + resources = var.proxies_resources + tolerations = var.proxies_tolerations } module "cloudflared" { - depends_on = [module.proxy] + depends_on = [kubernetes_namespace_v1.namespace] source = "./cloudflared" namespace = var.namespace + networks = var.networks tunnel_id = var.cloudflared_tunnel_id hostname = "${var.extension_subdomain}.${var.dns_zone}" tunnel_secret = var.cloudflared_tunnel_secret @@ -56,31 +60,44 @@ module "cloudflared" { image_tag = var.cloudflared_image_tag replicas = var.cloudflared_replicas resources = var.cloudflared_resources + tolerations = var.cloudflared_tolerations } -module "instances" { - depends_on = [module.feature, module.configs] - for_each = var.instances - source = "./instance" +module "cells" { + depends_on = [module.configs, module.feature] + for_each = var.cells + source = "./cell" - namespace = var.namespace - network = each.value.network - salt = each.key - instance_name = "${each.value.network}-${each.key}" - dolos_version = coalesce(each.value.dolos_version, "v0.13.1") - replicas = coalesce(each.value.replicas, 1) - resources = coalesce(each.value.resources, { - requests = { - cpu = "50m" - memory = "512Mi" - } - limits = { - cpu = "1000m" - memory = "512Mi" - } - storage = { - size = "30Gi" - class = "fast" + namespace = var.namespace + salt = each.key + extension_subdomain = var.extension_subdomain + dns_zone = var.dns_zone + tolerations = coalesce(each.value.tolerations, [ + { + effect = "NoSchedule" + key = "demeter.run/compute-profile" + operator = "Equal" + value = "disk-intensive" + }, + { + effect = "NoSchedule" + key = "demeter.run/compute-arch" + operator = "Equal" + value = "arm64" + }, + { + effect = "NoSchedule" + key = "demeter.run/availability-sla" + operator = "Equal" + value = "consistent" } - }) + ]) + + // PVC + storage_size = each.value.pvc.storage_size + storage_class = each.value.pvc.storage_class + volume_name = each.value.pvc.volume_name + + // Instances + instances = each.value.instances } diff --git a/bootstrap/proxy/deployment.tf b/bootstrap/proxy/deployment.tf index bbdf26d..067dd8b 100644 --- a/bootstrap/proxy/deployment.tf +++ b/bootstrap/proxy/deployment.tf @@ -51,6 +51,11 @@ resource "kubernetes_deployment_v1" "utxorpc_proxy" { protocol = "TCP" } + env { + name = "NETWORK" + value = var.network + } + env { name = "PROXY_NAMESPACE" value = var.namespace @@ -90,29 +95,19 @@ resource "kubernetes_deployment_v1" "utxorpc_proxy" { volume { name = "certs" config_map { - name = local.certs_configmap + name = var.certs_configmap } } - toleration { - effect = "NoSchedule" - key = "demeter.run/compute-profile" - operator = "Equal" - value = "general-purpose" - } + dynamic "toleration" { + for_each = var.tolerations - toleration { - effect = "NoSchedule" - key = "demeter.run/compute-arch" - operator = "Equal" - value = "x86" - } - - toleration { - effect = "NoSchedule" - key = "demeter.run/availability-sla" - operator = "Equal" - value = "consistent" + content { + effect = toleration.value.effect + key = toleration.value.key + operator = toleration.value.operator + value = toleration.value.value + } } } } diff --git a/bootstrap/proxy/main.tf b/bootstrap/proxy/main.tf index 50e6e6a..d97e034 100644 --- a/bootstrap/proxy/main.tf +++ b/bootstrap/proxy/main.tf @@ -1,18 +1,21 @@ locals { - name = "proxy" - role = "proxy" + name = "proxy-${var.network}" + role = "proxy-${var.network}" prometheus_port = 9187 prometheus_addr = "0.0.0.0:${local.prometheus_port}" proxy_port = 8080 proxy_addr = "0.0.0.0:${local.proxy_port}" - certs_configmap = "proxy-certs" } variable "namespace" { type = string } +variable "network" { + type = string +} + variable "replicas" { type = number default = 1 @@ -22,6 +25,38 @@ variable "image_tag" { type = string } +variable "certs_configmap" { + type = string + default = "proxy-certs" +} + +variable "tolerations" { + type = list(object({ + effect = string + key = string + operator = string + value = optional(string) + })) + default = [ + { + effect = "NoSchedule" + key = "demeter.run/compute-profile" + operator = "Exists" + }, + { + effect = "NoSchedule" + key = "demeter.run/compute-arch" + operator = "Equal" + value = "x86" + }, + { + effect = "NoSchedule" + key = "demeter.run/availability-sla" + operator = "Exists" + } + ] +} + variable "resources" { type = object({ limits = object({ diff --git a/bootstrap/proxy/service.tf b/bootstrap/proxy/service.tf deleted file mode 100644 index 06a65d8..0000000 --- a/bootstrap/proxy/service.tf +++ /dev/null @@ -1,30 +0,0 @@ -resource "kubernetes_service_v1" "proxy_service" { - metadata { - name = local.name - namespace = var.namespace - # annotations = { - # "service.beta.kubernetes.io/aws-load-balancer-nlb-target-type" : "instance" - # "service.beta.kubernetes.io/aws-load-balancer-scheme" : "internet-facing" - # "service.beta.kubernetes.io/aws-load-balancer-type" : "external" - # "service.beta.kubernetes.io/aws-load-balancer-healthcheck-protocol" : "HTTPS" - # "service.beta.kubernetes.io/aws-load-balancer-healthcheck-path" : "/dmtr_health" - # } - } - - spec { - # load_balancer_class = "service.k8s.aws/nlb" - selector = { - role = local.role - } - - port { - name = "proxy" - port = local.proxy_port - target_port = local.proxy_port - protocol = "TCP" - } - - # type = "LoadBalancer" - type = "ClusterIP" - } -} diff --git a/bootstrap/pvc/main.tf b/bootstrap/pvc/main.tf new file mode 100644 index 0000000..9c28463 --- /dev/null +++ b/bootstrap/pvc/main.tf @@ -0,0 +1,39 @@ +variable "namespace" { + description = "the namespace where the resources will be created" +} + +variable "volume_name" { + description = "the name of the volume" +} + +variable "name" { + description = "the name of the pvc" +} + +variable "storage_size" { + description = "the size of the volume" +} + +variable "storage_class" { + description = "the class of the volume" +} + +resource "kubernetes_persistent_volume_claim" "shared_disk" { + wait_until_bound = false + + metadata { + name = var.name + namespace = var.namespace + } + + spec { + access_modes = ["ReadWriteOnce"] + resources { + requests = { + storage = var.storage_size + } + } + storage_class_name = var.storage_class + volume_name = var.volume_name + } +} diff --git a/bootstrap/service/main.tf b/bootstrap/service/main.tf deleted file mode 100644 index 9d321a3..0000000 --- a/bootstrap/service/main.tf +++ /dev/null @@ -1,49 +0,0 @@ -variable "namespace" { - description = "The namespace where the resources will be created" -} - -variable "network" { - description = "Cardano node network" -} - -resource "kubernetes_service_v1" "well_known_service_grpc" { - metadata { - name = "utxorpc-${var.network}-grpc" - namespace = var.namespace - } - - spec { - port { - name = "grpc" - protocol = "TCP" - port = 50051 - } - - selector = { - "cardano.demeter.run/network" = var.network - } - - type = "ClusterIP" - } -} - -resource "kubernetes_service_v1" "well_known_service_grpc_web" { - metadata { - name = "utxorpc-${var.network}-grpc-web" - namespace = var.namespace - } - - spec { - port { - name = "grpc-web" - protocol = "TCP" - port = 50051 - } - - selector = { - "cardano.demeter.run/network" = var.network - } - - type = "ClusterIP" - } -} diff --git a/bootstrap/services/main.tf b/bootstrap/services/main.tf new file mode 100644 index 0000000..04cafdb --- /dev/null +++ b/bootstrap/services/main.tf @@ -0,0 +1,86 @@ +variable "namespace" { + description = "The namespace where the resources will be created" +} + +variable "networks" { + type = list(string) + default = ["mainnet", "preprod", "preview", "vector-testnet"] +} + +resource "kubernetes_service_v1" "well_known_service_grpc" { + for_each = { for network in var.networks : "${network}" => network } + + metadata { + name = "utxorpc-${each.value}-grpc" + namespace = var.namespace + } + + spec { + port { + name = "grpc" + protocol = "TCP" + port = 50051 + } + + selector = { + "cardano.demeter.run/network" = each.value + } + + type = "ClusterIP" + } +} + +resource "kubernetes_service_v1" "well_known_service_grpc_web" { + for_each = { for network in var.networks : "${network}" => network } + metadata { + name = "utxorpc-${each.value}-grpc-web" + namespace = var.namespace + } + + spec { + port { + name = "grpc-web" + protocol = "TCP" + port = 50051 + } + + selector = { + "cardano.demeter.run/network" = each.value + } + + type = "ClusterIP" + } +} + +resource "kubernetes_service_v1" "proxies" { + for_each = { for network in var.networks : "${network}" => network } + + metadata { + name = "proxy-${each.value}" + namespace = var.namespace + # annotations = { + # "service.beta.kubernetes.io/aws-load-balancer-nlb-target-type" : "instance" + # "service.beta.kubernetes.io/aws-load-balancer-scheme" : "internet-facing" + # "service.beta.kubernetes.io/aws-load-balancer-type" : "external" + # "service.beta.kubernetes.io/aws-load-balancer-healthcheck-protocol" : "HTTPS" + # "service.beta.kubernetes.io/aws-load-balancer-healthcheck-path" : "/dmtr_health" + # } + } + + spec { + # load_balancer_class = "service.k8s.aws/nlb" + selector = { + role = "proxy-${each.value}" + } + + port { + name = "proxy" + port = 8080 + target_port = 8080 + protocol = "TCP" + } + + # type = "LoadBalancer" + type = "ClusterIP" + } +} diff --git a/bootstrap/variables.tf b/bootstrap/variables.tf index 9041b05..11b7215 100644 --- a/bootstrap/variables.tf +++ b/bootstrap/variables.tf @@ -29,17 +29,17 @@ variable "dns_zone" { default = "demeter.run" } -// Proxy -variable "proxy_image_tag" { +// Proxies +variable "proxies_image_tag" { type = string } -variable "proxy_replicas" { +variable "proxies_replicas" { type = number default = 1 } -variable "proxy_resources" { +variable "proxies_resources" { type = object({ limits = object({ cpu = string @@ -52,7 +52,7 @@ variable "proxy_resources" { }) default = { limits : { - cpu : "50m", + cpu : "2", memory : "250Mi" } requests : { @@ -62,6 +62,33 @@ variable "proxy_resources" { } } +variable "proxies_tolerations" { + type = list(object({ + effect = string + key = string + operator = string + value = optional(string) + })) + default = [ + { + effect = "NoSchedule" + key = "demeter.run/compute-profile" + operator = "Exists" + }, + { + effect = "NoSchedule" + key = "demeter.run/compute-arch" + operator = "Equal" + value = "x86" + }, + { + effect = "NoSchedule" + key = "demeter.run/availability-sla" + operator = "Exists" + } + ] +} + // Cloudflared variable "cloudflared_tunnel_id" { type = string @@ -115,25 +142,58 @@ variable "cloudflared_resources" { } } -// Instances -variable "instances" { +variable "cloudflared_tolerations" { + type = list(object({ + effect = string + key = string + operator = string + value = optional(string) + })) + default = [ + { + effect = "NoSchedule" + key = "demeter.run/compute-profile" + operator = "Exists" + }, + { + effect = "NoSchedule" + key = "demeter.run/compute-arch" + operator = "Exists" + }, + { + effect = "NoSchedule" + key = "demeter.run/availability-sla" + operator = "Exists" + } + ] +} + +variable "cells" { type = map(object({ - network = string - replicas = optional(number) - dolos_version = optional(string) - resources = optional(object({ - limits = object({ - cpu = string - memory = string - }) - requests = object({ - cpu = string - memory = string - }) - storage = object({ - size = string - class = string - }) + tolerations = optional(list(object({ + effect = string + key = string + operator = string + value = string + }))) + pvc = object({ + storage_class = string + storage_size = string + volume_name = string + }) + instances = map(object({ + dolos_version = string + replicas = optional(number) + resources = optional(object({ + limits = object({ + cpu = string + memory = string + }) + requests = object({ + cpu = string + memory = string + }) + })) })) })) } diff --git a/operator/src/controller.rs b/operator/src/controller.rs index 23f406d..7d6534a 100644 --- a/operator/src/controller.rs +++ b/operator/src/controller.rs @@ -61,7 +61,7 @@ async fn reconcile(crd: Arc, ctx: Arc) -> Result { Some(key) => key.clone(), None => build_api_key(&crd).await?, }; - let (hostname, _) = build_hostname(&key); + let (hostname, _) = build_hostname(&key, &crd.spec.network); let status = UtxoRpcPortStatus { grpc_endpoint_url: hostname, diff --git a/operator/src/utils.rs b/operator/src/utils.rs index 6b088ae..42e7135 100644 --- a/operator/src/utils.rs +++ b/operator/src/utils.rs @@ -27,12 +27,12 @@ pub async fn patch_resource_status( Ok(()) } -pub fn build_hostname(key: &str) -> (String, String) { +pub fn build_hostname(key: &str, network: &str) -> (String, String) { let config = get_config(); let extension_subdomain = &config.extension_subdomain; let dns_zone = &config.dns_zone; - let hostname = format!("{extension_subdomain}.{dns_zone}"); - let hostname_key = format!("{key}.{extension_subdomain}.{dns_zone}"); + let hostname = format!("{network}-{extension_subdomain}.{dns_zone}"); + let hostname_key = format!("{key}.{network}-{extension_subdomain}.{dns_zone}"); (hostname, hostname_key) } @@ -94,7 +94,7 @@ mod test { "", UtxoRpcPortSpec { auth_token: None, - operator_version: "1".to_string(), + operator_version: Some("1".to_string()), network: "preview".to_string(), throughput_tier: Some("0".to_string()), utxorpc_version: Some("v1".to_string()), @@ -110,12 +110,12 @@ mod test { async fn test_build_hostname() { set_configs(); let key = "dmtr_utxorpc_v1_preview_ashjdcnoasdj"; - let (hostname, hostname_key) = build_hostname(key); + let (hostname, hostname_key) = build_hostname(key, "mainnet"); - assert_eq!(hostname, "extension_subdomain.dns_zone".to_string()); + assert_eq!(hostname, "mainnet-extension_subdomain.dns_zone".to_string()); assert_eq!( hostname_key, - "dmtr_utxorpc_v1_preview_ashjdcnoasdj.extension_subdomain.dns_zone".to_string() + "dmtr_utxorpc_v1_preview_ashjdcnoasdj.mainnet-extension_subdomain.dns_zone".to_string() ); } } diff --git a/proxy/src/config.rs b/proxy/src/config.rs index 94977aa..f9ed116 100644 --- a/proxy/src/config.rs +++ b/proxy/src/config.rs @@ -2,6 +2,7 @@ use std::env; #[derive(Debug, Clone)] pub struct Config { + pub network: String, pub proxy_addr: String, pub proxy_namespace: String, pub prometheus_addr: String, @@ -10,10 +11,12 @@ pub struct Config { pub utxorpc_dns: String, pub utxorpc_port: u16, pub health_endpoint: String, + pub health_pool_interval: std::time::Duration, } impl Config { pub fn new() -> Self { Self { + network: env::var("NETWORK").expect("NETWORK must be set"), proxy_addr: env::var("PROXY_ADDR").expect("PROXY_ADDR must be set"), proxy_namespace: env::var("PROXY_NAMESPACE").expect("PROXY_NAMESPACE must be set"), prometheus_addr: env::var("PROMETHEUS_ADDR").expect("PROMETHEUS_ADDR must be set"), @@ -25,8 +28,22 @@ impl Config { .parse() .expect("Unable to parse port."), health_endpoint: "/dmtr_health".to_string(), + health_pool_interval: std::time::Duration::from_secs(10), } } + + pub fn instance(&self) -> String { + format!( + "{}.{}:{}", + self.service(), + self.utxorpc_dns, + self.utxorpc_port + ) + } + + pub fn service(&self) -> String { + format!("utxorpc-{}-grpc", self.network) + } } impl Default for Config { fn default() -> Self { diff --git a/proxy/src/health.rs b/proxy/src/health.rs new file mode 100644 index 0000000..5b043c3 --- /dev/null +++ b/proxy/src/health.rs @@ -0,0 +1,102 @@ +use async_trait::async_trait; +use operator::{ + k8s_openapi::api::core::v1::{Endpoints, Pod}, + kube::{api::ListParams, Api, Client}, +}; +use pingora::{server::ShutdownWatch, services::background::BackgroundService}; +use std::{collections::HashSet, sync::Arc}; +use tracing::{info, warn}; + +use crate::{Config, State}; + +pub struct HealthBackgroundService { + state: Arc, + config: Arc, +} +impl HealthBackgroundService { + pub fn new(state: Arc, config: Arc) -> Self { + Self { state, config } + } + + async fn get_health(&self) -> bool { + // Create a Kubernetes client + let client = Client::try_default() + .await + .expect("Unable to instance k8s client."); + + // Get the Endpoints associated with the service + let endpoints_api: Api = Api::default_namespaced(client.clone()); + let endpoints = match endpoints_api.get(&self.config.service()).await { + Ok(endpoints) => endpoints, + Err(err) => { + warn!( + error = err.to_string(), + "Error getting endpoints for health." + ); + return false; + } + }; + + // Extract the IPs of the pods from the Endpoints + let mut pod_ips = HashSet::new(); + if let Some(subsets) = endpoints.subsets { + for subset in subsets { + if let Some(addresses) = subset.addresses { + for address in addresses { + pod_ips.insert(address.ip); + } + } + } + } + + // Get the Pods in the namespace + let pods_api: Api = Api::default_namespaced(client); + let pods = match pods_api.list(&ListParams::default()).await { + Ok(pods) => pods, + Err(err) => { + warn!(error = err.to_string(), "Error getting pods for health."); + return false; + } + }; + + // Filter the pods to match the IPs found in the Endpoints + let running_pods: Vec<_> = pods + .items + .into_iter() + .filter(|pod| { + if let Some(status) = &pod.status { + if let Some(pod_ip) = &status.pod_ip { + return pod_ips.contains(pod_ip); + } + } + false + }) + .collect(); + + !running_pods.is_empty() + } + + async fn update_health(&self) { + let current_health = *self.state.upstream_health.read().await; + + let new_health = self.get_health().await; + + match (current_health, new_health) { + (false, true) => info!("Upstream is now healthy, ready to proxy requests."), + (true, false) => warn!("Upstream is now deamed unhealthy, no pods in running state"), + _ => {} + } + + *self.state.upstream_health.write().await = new_health; + } +} + +#[async_trait] +impl BackgroundService for HealthBackgroundService { + async fn start(&self, mut _shutdown: ShutdownWatch) { + loop { + self.update_health().await; + tokio::time::sleep(self.config.health_pool_interval).await; + } + } +} diff --git a/proxy/src/main.rs b/proxy/src/main.rs index 6062884..1a9b8a9 100644 --- a/proxy/src/main.rs +++ b/proxy/src/main.rs @@ -1,6 +1,7 @@ use auth::AuthBackgroundService; use config::Config; use dotenv::dotenv; +use health::HealthBackgroundService; use operator::{kube::ResourceExt, UtxoRpcPort}; use pingora::{ server::{configuration::Opt, Server}, @@ -14,6 +15,7 @@ use tracing::Level; mod auth; mod config; +mod health; mod proxy; fn main() { @@ -44,19 +46,18 @@ fn main() { tls_settings.enable_h2(); utxorpc_http_proxy.add_tls_with_settings(&config.proxy_addr, None, tls_settings); - // utxorpc_http_proxy - // .add_tls( - // &config.proxy_addr, - // &config.ssl_crt_path, - // &config.ssl_key_path, - // ) - // .unwrap(); server.add_service(utxorpc_http_proxy); let mut prometheus_service = pingora::services::listening::Service::prometheus_http_service(); prometheus_service.add_tcp(&config.prometheus_addr); server.add_service(prometheus_service); + let health_background_service = background_service( + "K8S Auth Service", + HealthBackgroundService::new(state.clone(), config.clone()), + ); + server.add_service(health_background_service); + server.run_forever(); } @@ -64,6 +65,7 @@ fn main() { pub struct State { consumers: RwLock>, metrics: Metrics, + upstream_health: RwLock, } impl State { pub async fn get_consumer(&self, key: &str) -> Option { diff --git a/proxy/src/proxy.rs b/proxy/src/proxy.rs index a3af136..4c6dd54 100644 --- a/proxy/src/proxy.rs +++ b/proxy/src/proxy.rs @@ -31,10 +31,18 @@ impl UtxoRpcProxy { async fn respond_health(&self, session: &mut Session, ctx: &mut Context) { ctx.is_health_request = true; session.set_keepalive(None); - let header = Box::new(ResponseHeader::build(200, None).unwrap()); + + let is_healthy = *self.state.upstream_health.read().await; + let (code, message) = if is_healthy { + (200, "OK") + } else { + (500, "UNHEALTHY") + }; + + let header = Box::new(ResponseHeader::build(code, None).unwrap()); session.write_response_header(header, true).await.unwrap(); session - .write_response_body(Some(Bytes::from("OK")), true) + .write_response_body(Some(Bytes::from(message)), true) .await .unwrap(); } @@ -65,17 +73,19 @@ impl ProxyHttp for UtxoRpcProxy { } let key = self.extract_key(session); - let consumer = self.state.get_consumer(&key).await; - if consumer.is_none() { - return session.respond_error(401).await.map(|_| true); + ctx.consumer = match self.state.get_consumer(&key).await { + Some(consumer) => consumer, + None => { + return session.respond_error(401).await.map(|_| true); + } + }; + + if ctx.consumer.network != self.config.network { + return session.respond_error(404).await.map(|_| true); } - ctx.consumer = consumer.unwrap(); - ctx.instance = format!( - "utxorpc-{}-grpc.{}:{}", - ctx.consumer.network, self.config.utxorpc_dns, self.config.utxorpc_port - ); + ctx.instance = self.config.instance(); Ok(false) }