Add Ray Cluster ResourceQuotas (#380)
* Add networkpolicy for kuberay clusters
* Turn on Dataplanev2 for GKE Standard clusters
* Add an option to disable network policy
* Disable network policy by default in RAG App
* Disabled Ray cluster network policy by default in RAG App due to
  gmp-operator's inability to handle inconsistent network state on
  startup.
* Add Ray resource quotas
* Add resource requests to GMP
* Fix broken iam.gke.io label in kuberay-cluster module
* Rework networkpolicy definition and variables
* Terraform format fixes
* Add namespace dependency for resource quotas
bjornsen committed Mar 21, 2024
1 parent 551c24a commit 4131ea6
Showing 7 changed files with 136 additions and 82 deletions.
1 change: 1 addition & 0 deletions applications/rag/variables.tf
@@ -55,6 +55,7 @@ variable "enable_grafana_on_ray_dashboard" {
description = "Add option to enable or disable grafana for the ray dashboard. Enabling requires anonymous access."
default = false
}

variable "create_ray_service_account" {
type = bool
description = "Creates a google IAM service account & k8s service account & configures workload identity"
61 changes: 38 additions & 23 deletions applications/ray/main.tf
@@ -51,16 +51,13 @@ data "google_container_cluster" "default" {
}

locals {
  endpoint              = var.create_cluster ? "https://${module.infra[0].endpoint}" : "https://${data.google_container_cluster.default[0].endpoint}"
  ca_certificate        = var.create_cluster ? base64decode(module.infra[0].ca_certificate) : base64decode(data.google_container_cluster.default[0].master_auth[0].cluster_ca_certificate)
  private_cluster       = var.create_cluster ? var.private_cluster : data.google_container_cluster.default[0].private_cluster_config.0.enable_private_endpoint
  cluster_membership_id = var.cluster_membership_id == "" ? var.cluster_name : var.cluster_membership_id
  enable_autopilot      = var.create_cluster ? var.autopilot_cluster : data.google_container_cluster.default[0].enable_autopilot
  enable_tpu            = var.create_cluster ? var.enable_tpu : data.google_container_cluster.default[0].enable_tpu
  host                  = local.private_cluster ? "https://connectgateway.googleapis.com/v1/projects/${data.google_project.project.number}/locations/${var.cluster_location}/gkeMemberships/${local.cluster_membership_id}" : local.endpoint

  workload_identity_service_account = var.goog_cm_deployment_name != "" ? "${var.goog_cm_deployment_name}-${var.workload_identity_service_account}" : var.workload_identity_service_account
  ray_cluster_default_uri           = "https://console.cloud.google.com/kubernetes/service/${var.cluster_location}/${var.cluster_name}/${var.kubernetes_namespace}/${var.ray_cluster_name}-kuberay-head-svc/overview?project=${var.project_id}"
}
@@ -142,18 +139,36 @@ module "gcs" {
}

module "kuberay-cluster" {
count = var.create_ray_cluster == true ? 1 : 0
source = "../../modules/kuberay-cluster"
providers = { helm = helm.ray, kubernetes = kubernetes.ray }
name = var.ray_cluster_name
namespace = var.kubernetes_namespace
project_id = var.project_id
enable_tpu = local.enable_tpu
enable_gpu = var.enable_gpu
gcs_bucket = var.gcs_bucket
autopilot_cluster = local.enable_autopilot
google_service_account = local.workload_identity_service_account
grafana_host = var.enable_grafana_on_ray_dashboard ? module.kuberay-monitoring[0].grafana_uri : ""
depends_on = [module.gcs, module.kuberay-operator]
count = var.create_ray_cluster == true ? 1 : 0
source = "../../modules/kuberay-cluster"
providers = { helm = helm.ray, kubernetes = kubernetes.ray }
name = var.ray_cluster_name
namespace = var.kubernetes_namespace
project_id = var.project_id
enable_tpu = local.enable_tpu
enable_gpu = var.enable_gpu
gcs_bucket = var.gcs_bucket
autopilot_cluster = local.enable_autopilot
google_service_account = local.workload_identity_service_account
grafana_host = var.enable_grafana_on_ray_dashboard ? module.kuberay-monitoring[0].grafana_uri : ""
network_policy_allow_cidr = var.kuberay_network_policy_allow_cidr
disable_network_policy = var.disable_ray_cluster_network_policy
depends_on = [module.gcs, module.kuberay-operator]
}


# Assign resource quotas to the Ray namespace to ensure that its workloads don't overutilize cluster resources
resource "kubernetes_resource_quota" "ray_namespace_resource_quota" {
provider = kubernetes.ray
count = var.disable_resource_quotas ? 0 : 1
metadata {
name = "ray-resource-quota"
namespace = var.kubernetes_namespace
}

spec {
hard = var.resource_quotas
}

depends_on = [module.namespace]
}
36 changes: 36 additions & 0 deletions applications/ray/variables.tf
@@ -183,3 +183,39 @@ variable "goog_cm_deployment_name" {
type = string
default = ""
}

# These default resource quotas are set intentionally high as an example that won't be limiting for most Ray clusters.
# Consult https://kubernetes.io/docs/concepts/policy/resource-quotas/ for additional quotas that may be set.
variable "resource_quotas" {
description = "Kubernetes ResourceQuota object to attach to the Ray cluster's namespace"
type = map(string)
default = {
cpu = "1000"
memory = "10Ti"
"requests.nvidia.com/gpu" = "100"
"requests.google.com/tpu" = "100"
}
}
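The map above can be overridden at deploy time to tighten or extend the limits. A minimal terraform.tfvars sketch — the values and the extra "pods" key are illustrative; any key accepted by a ResourceQuota's spec.hard may be used:

# terraform.tfvars (hypothetical values)
resource_quotas = {
  cpu                       = "64"
  memory                    = "512Gi"
  pods                      = "500"
  "requests.nvidia.com/gpu" = "8"
}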

variable "disable_resource_quotas" {
description = "Set to true to remove resource quotas from your Ray clusters. Not recommended"
type = bool
default = false
}

# This is the CIDR range allowed to access a Ray cluster's job submission API and Dashboard.
#
# Example:
# kuberay_network_policy_allow_cidr = "10.0.0.0/8"
#
variable "kuberay_network_policy_allow_cidr" {
description = "List of CIDRs that are allowed to access this Ray cluster's job submission and dashboard port."
type = string
default = ""
}

variable "disable_ray_cluster_network_policy" {
description = "Disables Kubernetes Network Policy for Ray Clusters for this demo. Defaulting to 'true' aka disabled pending fixes to the kuberay-monitoring module. This should be defaulted to false."
type = bool
default = false
}
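Taken together with the CIDR variable above, a caller that wants the policy enforced could set both values. A sketch with an illustrative range:

# terraform.tfvars (hypothetical values)
disable_ray_cluster_network_policy = false
kuberay_network_policy_allow_cidr  = "10.0.0.0/8"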
77 changes: 42 additions & 35 deletions modules/kuberay-cluster/main.tf
@@ -95,10 +95,10 @@ data "kubernetes_service" "head-svc" {
}

# Allow ingress to the kuberay head from outside the cluster
resource "kubernetes_network_policy" "kuberay-head-network-policy" {
resource "kubernetes_network_policy" "ray-job-and-dashboard-namespace-network-policy" {
count = var.disable_network_policy ? 0 : 1
metadata {
name = "terraform-kuberay-head-network-policy"
name = "terraform-kuberay-head-namespace-network-policy"
namespace = var.namespace
}

@@ -110,46 +110,52 @@ resource "kubernetes_network_policy" "kuberay-head-network-policy" {
}

    ingress {
      # Ray job submission and dashboard
      ports {
        port     = "8265"
        protocol = "TCP"
      }

      from {
        namespace_selector {
          match_expressions {
            key      = "kubernetes.io/metadata.name"
            operator = "In"
            values   = var.network_policy_allow_namespaces
          }
        }
      }
    }

    policy_types = ["Ingress"]
  }
}
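For namespace-based access, a caller of this module can pass the new list directly. A minimal sketch — the namespace names are hypothetical and the other required module arguments are elided:

module "kuberay-cluster" {
  source = "../../modules/kuberay-cluster"
  # ...other required arguments elided...

  # Grants the "jupyter" and "mlops" namespaces (illustrative names) ingress
  # to the Ray head's job submission and dashboard port 8265.
  network_policy_allow_namespaces = ["jupyter", "mlops"]
}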

dynamic "ip_block" {
for_each = var.network_policy_allow_ips
content {
cidr = each.key
}
# Allow ingress to the kuberay head from outside the cluster
resource "kubernetes_network_policy" "kuberay-job-and-dashboard-cidr-network-policy" {
count = var.network_policy_allow_cidr != "" && !var.disable_network_policy ? 1 : 0
metadata {
name = "terraform-kuberay-head-cidr-network-policy"
namespace = var.namespace
}

spec {
pod_selector {
match_labels = {
"ray.io/is-ray-node" : "yes"
}
}

ingress {
# Ray job submission and dashboard
ports {
port = "8265"
protocol = "TCP"
}

from {
ip_block {
cidr = var.network_policy_allow_cidr
}
}
}
@@ -158,7 +164,7 @@ resource "kubernetes_network_policy" "kuberay-head-network-policy" {
}
}

# Allow all same-namespace and gmp-system traffic
resource "kubernetes_network_policy" "kuberay-cluster-allow-network-policy" {
count = var.disable_network_policy ? 0 : 1
metadata {
@@ -180,9 +186,10 @@ resource "kubernetes_network_policy" "kuberay-cluster-allow-network-policy" {

from {
namespace_selector {
        match_expressions {
          key      = "kubernetes.io/metadata.name"
          operator = "In"
          values   = [var.namespace, "gke-gmp-system"]
        }
}
}
36 changes: 12 additions & 24 deletions modules/kuberay-cluster/variables.tf
@@ -132,41 +132,29 @@ variable "security_context" {
}
}

# Note: Same-namespace and gmp-system ingress are hardcoded into the allowlist.
#
# This is a list of Kubernetes namespaces, by name, that are allowed to access
# a Ray cluster's job submission API and Dashboard.
#
# Example:
# network_policy_allow_namespaces = ["onespace", "twospace", "redspace", "bluespace"]
#
variable "network_policy_allow_namespaces" {
  description = "Namespaces allowed to access this kuberay cluster"
  type        = list(string)
  default     = [""]
}

# This is the CIDR range allowed to access a Ray cluster's job submission API and Dashboard.
#
# Example:
# network_policy_allow_cidr = "10.0.0.0/8"
#
variable "network_policy_allow_cidr" {
  description = "CIDR range allowed to access this kuberay cluster"
  type        = string
  default     = ""
}

variable "db_secret_name" {
4 changes: 4 additions & 0 deletions
@@ -47,3 +47,7 @@ spec:
httpGet:
path: /-/healthy
port: web
resources:
requests:
memory: {{ .Values.memory }}
cpu: {{ .Values.cpu }}
3 changes: 3 additions & 0 deletions modules/kuberay-monitoring/charts/gmp-engine/values.yaml
@@ -13,6 +13,9 @@ image:

replicaCount: 2

cpu: "1m"
memory: "5Mi"

podMonitoring:
- name: ray-monitoring
selector:
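If the chart is installed through Terraform's hashicorp/helm v2 provider, the new request sizes can be overridden per release. A sketch under that assumption — the release name, namespace, and chart path are illustrative, and the kuberay-monitoring module may wire the chart differently:

resource "helm_release" "gmp_engine" {
  name      = "gmp-engine"
  chart     = "./modules/kuberay-monitoring/charts/gmp-engine"
  namespace = "monitoring" # hypothetical namespace

  # Override the chart defaults above (cpu: 1m, memory: 5Mi).
  set {
    name  = "cpu"
    value = "10m"
  }
  set {
    name  = "memory"
    value = "32Mi"
  }
}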
