diff --git a/applications/rag/variables.tf b/applications/rag/variables.tf
index 21802c0a1..2c3f5d5d3 100644
--- a/applications/rag/variables.tf
+++ b/applications/rag/variables.tf
@@ -55,6 +55,7 @@ variable "enable_grafana_on_ray_dashboard" {
   description = "Add option to enable or disable grafana for the ray dashboard. Enabling requires anonymous access."
   default     = false
 }
+
 variable "create_ray_service_account" {
   type        = bool
   description = "Creates a google IAM service account & k8s service account & configures workload identity"
diff --git a/applications/ray/main.tf b/applications/ray/main.tf
index 59e9863c9..3717d929c 100644
--- a/applications/ray/main.tf
+++ b/applications/ray/main.tf
@@ -51,16 +51,13 @@ data "google_container_cluster" "default" {
 }
 
 locals {
-  endpoint              = var.create_cluster ? "https://${module.infra[0].endpoint}" : "https://${data.google_container_cluster.default[0].endpoint}"
-  ca_certificate        = var.create_cluster ? base64decode(module.infra[0].ca_certificate) : base64decode(data.google_container_cluster.default[0].master_auth[0].cluster_ca_certificate)
-  private_cluster       = var.create_cluster ? var.private_cluster : data.google_container_cluster.default[0].private_cluster_config.0.enable_private_endpoint
-  cluster_membership_id = var.cluster_membership_id == "" ? var.cluster_name : var.cluster_membership_id
-  enable_autopilot      = var.create_cluster ? var.autopilot_cluster : data.google_container_cluster.default[0].enable_autopilot
-  enable_tpu            = var.create_cluster ? var.enable_tpu : data.google_container_cluster.default[0].enable_tpu
-  host                  = local.private_cluster ? "https://connectgateway.googleapis.com/v1/projects/${data.google_project.project.number}/locations/${var.cluster_location}/gkeMemberships/${local.cluster_membership_id}" : local.endpoint
-}
-
-locals {
+  endpoint              = var.create_cluster ? "https://${module.infra[0].endpoint}" : "https://${data.google_container_cluster.default[0].endpoint}"
+  ca_certificate        = var.create_cluster ? base64decode(module.infra[0].ca_certificate) : base64decode(data.google_container_cluster.default[0].master_auth[0].cluster_ca_certificate)
+  private_cluster       = var.create_cluster ? var.private_cluster : data.google_container_cluster.default[0].private_cluster_config.0.enable_private_endpoint
+  cluster_membership_id = var.cluster_membership_id == "" ? var.cluster_name : var.cluster_membership_id
+  enable_autopilot      = var.create_cluster ? var.autopilot_cluster : data.google_container_cluster.default[0].enable_autopilot
+  enable_tpu            = var.create_cluster ? var.enable_tpu : data.google_container_cluster.default[0].enable_tpu
+  host                  = local.private_cluster ? "https://connectgateway.googleapis.com/v1/projects/${data.google_project.project.number}/locations/${var.cluster_location}/gkeMemberships/${local.cluster_membership_id}" : local.endpoint
   workload_identity_service_account = var.goog_cm_deployment_name != "" ? "${var.goog_cm_deployment_name}-${var.workload_identity_service_account}" : var.workload_identity_service_account
   ray_cluster_default_uri           = "https://console.cloud.google.com/kubernetes/service/${var.cluster_location}/${var.cluster_name}/${var.kubernetes_namespace}/${var.ray_cluster_name}-kuberay-head-svc/overview?project=${var.project_id}"
 }
@@ -142,18 +139,36 @@ module "gcs" {
 }
 
 module "kuberay-cluster" {
-  count                  = var.create_ray_cluster == true ? 1 : 0
-  source                 = "../../modules/kuberay-cluster"
-  providers              = { helm = helm.ray, kubernetes = kubernetes.ray }
-  name                   = var.ray_cluster_name
-  namespace              = var.kubernetes_namespace
-  project_id             = var.project_id
-  enable_tpu             = local.enable_tpu
-  enable_gpu             = var.enable_gpu
-  gcs_bucket             = var.gcs_bucket
-  autopilot_cluster      = local.enable_autopilot
-  google_service_account = local.workload_identity_service_account
-  grafana_host           = var.enable_grafana_on_ray_dashboard ? module.kuberay-monitoring[0].grafana_uri : ""
-  depends_on             = [module.gcs, module.kuberay-operator]
+  count                     = var.create_ray_cluster == true ? 1 : 0
+  source                    = "../../modules/kuberay-cluster"
+  providers                 = { helm = helm.ray, kubernetes = kubernetes.ray }
+  name                      = var.ray_cluster_name
+  namespace                 = var.kubernetes_namespace
+  project_id                = var.project_id
+  enable_tpu                = local.enable_tpu
+  enable_gpu                = var.enable_gpu
+  gcs_bucket                = var.gcs_bucket
+  autopilot_cluster         = local.enable_autopilot
+  google_service_account    = local.workload_identity_service_account
+  grafana_host              = var.enable_grafana_on_ray_dashboard ? module.kuberay-monitoring[0].grafana_uri : ""
+  network_policy_allow_cidr = var.kuberay_network_policy_allow_cidr
+  disable_network_policy    = var.disable_ray_cluster_network_policy
+  depends_on                = [module.gcs, module.kuberay-operator]
 }
+
+# Assign resource quotas to Ray namespace to ensure that they don't overutilize resources
+resource "kubernetes_resource_quota" "ray_namespace_resource_quota" {
+  provider = kubernetes.ray
+  count    = var.disable_resource_quotas ? 0 : 1
+  metadata {
+    name      = "ray-resource-quota"
+    namespace = var.kubernetes_namespace
+  }
+
+  spec {
+    hard = var.resource_quotas
+  }
+
+  depends_on = [module.namespace]
+}
 
diff --git a/applications/ray/variables.tf b/applications/ray/variables.tf
index d894abbc9..7e8402496 100644
--- a/applications/ray/variables.tf
+++ b/applications/ray/variables.tf
@@ -183,3 +183,39 @@ variable "goog_cm_deployment_name" {
   type    = string
   default = ""
 }
+
+# These default resource quotas are set intentionally high as an example that won't be limiting for most Ray clusters.
+# Consult https://kubernetes.io/docs/concepts/policy/resource-quotas/ for additional quotas that may be set.
+variable "resource_quotas" {
+  description = "Kubernetes ResourceQuota object to attach to the Ray cluster's namespace"
+  type        = map(string)
+  default = {
+    cpu                       = "1000"
+    memory                    = "10Ti"
+    "requests.nvidia.com/gpu" = "100"
+    "requests.google.com/tpu" = "100"
+  }
+}
+
+variable "disable_resource_quotas" {
+  description = "Set to true to remove resource quotas from your Ray clusters. Not recommended"
+  type        = bool
+  default     = false
+}
+
+# This is a single CIDR range allowed to access a Ray cluster's job submission API and Dashboard.
+#
+# Example:
+# kuberay_network_policy_allow_cidr = "10.0.0.0/8"
+#
+variable "kuberay_network_policy_allow_cidr" {
+  description = "CIDR range that is allowed to access this Ray cluster's job submission and dashboard port."
+  type        = string
+  default     = ""
+}
+
+variable "disable_ray_cluster_network_policy" {
+  description = "Set to true to disable Kubernetes Network Policy for this demo's Ray clusters. Defaults to false (network policy enabled). Disabling is not recommended."
+  type        = bool
+  default     = false
+}
diff --git a/modules/kuberay-cluster/main.tf b/modules/kuberay-cluster/main.tf
index 4869da541..f783407e6 100644
--- a/modules/kuberay-cluster/main.tf
+++ b/modules/kuberay-cluster/main.tf
@@ -95,10 +95,10 @@ data "kubernetes_service" "head-svc" {
 }
 
 # Allow ingress to the kuberay head from outside the cluster
-resource "kubernetes_network_policy" "kuberay-head-network-policy" {
+resource "kubernetes_network_policy" "ray-job-and-dashboard-namespace-network-policy" {
   count = var.disable_network_policy ? 0 : 1
   metadata {
-    name      = "terraform-kuberay-head-network-policy"
+    name      = "terraform-kuberay-head-namespace-network-policy"
     namespace = var.namespace
   }
 
@@ -110,12 +110,7 @@ resource "kubernetes_network_policy" "kuberay-head-network-policy" {
   }
 
   ingress {
-    # Ray Client server
-    ports {
-      port     = "10001"
-      protocol = "TCP"
-    }
-    # Ray Dashboard
+    # Ray job submission and dashboard
     ports {
       port     = "8265"
       protocol = "TCP"
@@ -123,33 +118,44 @@ resource "kubernetes_network_policy" "kuberay-head-network-policy" {
     }
 
     from {
       namespace_selector {
-        match_labels = {
-          "kubernetes.io/metadata.name" = var.namespace
-        }
-      }
-      dynamic "namespace_selector" {
-        for_each = var.network_policy_allow_namespaces_by_label
-        content {
-          match_labels = {
-            each.key = each.value
-          }
+        match_expressions {
+          key      = "kubernetes.io/metadata.name"
+          operator = "In"
+          values   = var.network_policy_allow_namespaces
         }
       }
+    }
+  }
 
-      dynamic "pod_selector" {
-        for_each = var.network_policy_allow_pods_by_label
-        content {
-          match_labels = {
-            each.key = each.value
-          }
-        }
-      }
+  policy_types = ["Ingress"]
+  }
+}
 
-      dynamic "ip_block" {
-        for_each = var.network_policy_allow_ips
-        content {
-          cidr = each.key
-        }
+# Allow ingress to the kuberay head from outside the cluster
+resource "kubernetes_network_policy" "kuberay-job-and-dashboard-cidr-network-policy" {
+  count = var.network_policy_allow_cidr != "" && !var.disable_network_policy ? 1 : 0
+  metadata {
+    name      = "terraform-kuberay-head-cidr-network-policy"
+    namespace = var.namespace
+  }
+
+  spec {
+    pod_selector {
+      match_labels = {
+        "ray.io/is-ray-node" : "yes"
+      }
+    }
+
+    ingress {
+      # Ray job submission and dashboard
+      ports {
+        port     = "8265"
+        protocol = "TCP"
+      }
+
+      from {
+        ip_block {
+          cidr = var.network_policy_allow_cidr
         }
       }
     }
@@ -158,7 +164,7 @@ resource "kubernetes_network_policy" "kuberay-head-network-policy" {
   }
 }
 
-# Allow all intranamespace traffic to allow intracluster traffic
+# Allow all same namespace and gmp traffic
 resource "kubernetes_network_policy" "kuberay-cluster-allow-network-policy" {
   count = var.disable_network_policy ? 0 : 1
   metadata {
@@ -180,9 +186,10 @@ resource "kubernetes_network_policy" "kuberay-cluster-allow-network-policy" {
 
     from {
       namespace_selector {
-        match_labels = {
-          "kubernetes.io/metadata.name" = var.namespace
-          "kubernetes.io/metadata.name" = "gke-gmp-system"
+        match_expressions {
+          key      = "kubernetes.io/metadata.name"
+          operator = "In"
+          values   = [var.namespace, "gke-gmp-system"]
         }
       }
     }
diff --git a/modules/kuberay-cluster/variables.tf b/modules/kuberay-cluster/variables.tf
index 445da06db..40e8d935d 100644
--- a/modules/kuberay-cluster/variables.tf
+++ b/modules/kuberay-cluster/variables.tf
@@ -132,41 +132,29 @@ variable "security_context" {
   }
 }
 
-# Note: By default, intra-namespace ingress is allowed to let the cluster talk to itself
+# Note: Same namespace and gmp-system ingress are hardcoded into the allowlist.
 #
-# This is a list of maps of arbitrary key/value pairs of namespace labels allowed to access
-# a Ray cluster's job submission API and Dashboard. These labels act as ORs, not ANDs.
+# This is a list of Kubernetes namespaces, by name, that are allowed to access
+# a Ray cluster's job submission API and Dashboard.
 #
 # Example:
-# network_policy_allow_namespaces_by_label = [{user: "jane"}, {"kubernetes.io/metadata.name": "janespace"}]
+# network_policy_allow_namespaces = ["onespace", "twospace", "redspace", "bluespace"]
 #
-variable "network_policy_allow_namespaces_by_label" {
+variable "network_policy_allow_namespaces" {
   description = "Namespaces allowed to access this kuberay cluster"
-  type        = list(map(string))
-  default     = []
-}
-
-# This is a list of maps of arbitrary key/value pairs of pod labels allowed to access
-# a Ray cluster's job submission API and Dashboard. These labels act as ORs, not ANDs.
-#
-# Example:
-# network_policy_allow_pods_by_label = [{role: "frontend"}, {"app": "jupyter"}]
-#
-variable "network_policy_allow_pods_by_label" {
-  description = "Pods allowed to access this kuberay cluster"
-  type        = list(map(string))
-  default     = []
+  type        = list(string)
+  default     = [""]
 }
 
-# This is a list of CIDR ranges allowed to access a Ray cluster's job submission API and Dashboard.
+# This is a single CIDR range allowed to access a Ray cluster's job submission API and Dashboard.
 #
 # Example:
-# network_policy_allow_ips = ['10.0.0.0/8', '192.168.0.0/24']
+# network_policy_allow_cidr = "10.0.0.0/8"
 #
-variable "network_policy_allow_ips" {
-  description = "CIDR ranges allowed to access this kuberay cluster"
-  type        = list(string)
-  default     = []
+variable "network_policy_allow_cidr" {
+  description = "CIDR range allowed to access this kuberay cluster"
+  type        = string
+  default     = ""
 }
 
 variable "db_secret_name" {
diff --git a/modules/kuberay-monitoring/charts/gmp-engine/templates/deployment.yaml b/modules/kuberay-monitoring/charts/gmp-engine/templates/deployment.yaml
index 4a268c695..a354907f5 100644
--- a/modules/kuberay-monitoring/charts/gmp-engine/templates/deployment.yaml
+++ b/modules/kuberay-monitoring/charts/gmp-engine/templates/deployment.yaml
@@ -47,3 +47,7 @@ spec:
         httpGet:
           path: /-/healthy
           port: web
+      resources:
+        requests:
+          memory: {{ .Values.memory }}
+          cpu: {{ .Values.cpu }}
diff --git a/modules/kuberay-monitoring/charts/gmp-engine/values.yaml b/modules/kuberay-monitoring/charts/gmp-engine/values.yaml
index 681ecc7f5..de4935494 100644
--- a/modules/kuberay-monitoring/charts/gmp-engine/values.yaml
+++ b/modules/kuberay-monitoring/charts/gmp-engine/values.yaml
@@ -13,6 +13,9 @@ image:
 
 replicaCount: 2
 
+cpu: "1m"
+memory: "5Mi"
+
 podMonitoring:
 - name: ray-monitoring
   selector: