Skip to content

Commit

Permalink
Refactor: move workload identity service account out of kuberay-opera…
Browse files Browse the repository at this point in the history
…tor (#769)

* Refactor: create module for workload identity service account

Change-Id: I29e985e77a1ff2d5f4a8d9493c1e65907c89c100

* fix: add todo

Change-Id: I3357e8f9dd16c7958dff0f0cf0f990fce980f474

---------

Co-authored-by: Gen Lu <[email protected]>
  • Loading branch information
genlu2011 and Gen Lu committed Aug 15, 2024
1 parent 3345790 commit 1f4c968
Show file tree
Hide file tree
Showing 4 changed files with 63 additions and 75 deletions.
64 changes: 39 additions & 25 deletions applications/rag/main.tf
Original file line number Diff line number Diff line change
Expand Up @@ -153,15 +153,13 @@ module "namespace" {
}

module "kuberay-operator" {
source = "../../modules/kuberay-operator"
providers = { helm = helm.rag, kubernetes = kubernetes.rag }
name = "kuberay-operator"
project_id = var.project_id
create_namespace = true
namespace = local.kubernetes_namespace
google_service_account = local.ray_service_account
create_service_account = var.create_ray_service_account
autopilot_cluster = local.enable_autopilot
source = "../../modules/kuberay-operator"
providers = { helm = helm.rag, kubernetes = kubernetes.rag }
name = "kuberay-operator"
project_id = var.project_id
create_namespace = true
namespace = local.kubernetes_namespace
autopilot_cluster = local.enable_autopilot
}

module "gcs" {
Expand Down Expand Up @@ -225,6 +223,32 @@ module "kuberay-logging" {
depends_on = [module.namespace]
}

# Workload Identity service account for Ray in the RAG application.
# Declared at the application level rather than inside modules/kuberay-operator.
module "kuberay-workload-identity" {
providers = { kubernetes = kubernetes.rag }
source = "terraform-google-modules/kubernetes-engine/google//modules/workload-identity"
version = "30.0.0" # Pinned to a previous version because the then-current version (30.1.0) showed inconsistent behaviour with workload identity service accounts
# Negation of create_ray_service_account: an existing GCP service account is used when creation is disabled.
use_existing_gcp_sa = !var.create_ray_service_account
name = local.ray_service_account
namespace = local.kubernetes_namespace
project_id = var.project_id
# Roles passed to the workload-identity module for this service account.
roles = ["roles/cloudsql.client", "roles/monitoring.viewer"]
automount_service_account_token = true
# NOTE(review): presumably ensures the namespace exists before the service account is created — confirm.
depends_on = [module.namespace]
}

# KubeRay monitoring module for the RAG application.
# Its grafana_uri output is passed to module.kuberay-cluster as grafana_host.
module "kuberay-monitoring" {
source = "../../modules/kuberay-monitoring"
providers = { helm = helm.rag, kubernetes = kubernetes.rag }
project_id = var.project_id
autopilot_cluster = local.enable_autopilot
namespace = local.kubernetes_namespace
create_namespace = true
enable_grafana_on_ray_dashboard = var.enable_grafana_on_ray_dashboard
# Same service account name that module.kuberay-workload-identity is given as its name.
k8s_service_account = local.ray_service_account
// TODO(genlu): remove the module.kuberay-operator dependency after migrating to the Ray addon.
depends_on = [module.namespace, module.kuberay-operator, module.kuberay-workload-identity]
}

module "kuberay-cluster" {
source = "../../modules/kuberay-cluster"
providers = { helm = helm.rag, kubernetes = kubernetes.rag }
Expand All @@ -233,16 +257,17 @@ module "kuberay-cluster" {
enable_gpu = true
gcs_bucket = var.gcs_bucket
autopilot_cluster = local.enable_autopilot
db_secret_name = module.cloudsql.db_secret_name
cloudsql_instance_name = local.cloudsql_instance
db_region = local.cloudsql_instance_region
google_service_account = local.ray_service_account
grafana_host = module.kuberay-monitoring.grafana_uri
disable_network_policy = var.disable_ray_cluster_network_policy
depends_on = [module.kuberay-operator]
use_custom_image = true
additional_labels = var.additional_labels

# Implicit dependency
db_secret_name = module.cloudsql.db_secret_name
grafana_host = module.kuberay-monitoring.grafana_uri

# IAP Auth parameters
add_auth = var.ray_dashboard_add_auth
create_brand = var.create_brand
Expand All @@ -256,19 +281,8 @@ module "kuberay-cluster" {
k8s_backend_service_port = var.ray_dashboard_k8s_backend_service_port
domain = var.ray_dashboard_domain
members_allowlist = var.ray_dashboard_members_allowlist != "" ? split(",", var.ray_dashboard_members_allowlist) : []
}

module "kuberay-monitoring" {
source = "../../modules/kuberay-monitoring"
providers = { helm = helm.rag, kubernetes = kubernetes.rag }
project_id = var.project_id
autopilot_cluster = local.enable_autopilot
namespace = local.kubernetes_namespace
create_namespace = true
enable_grafana_on_ray_dashboard = var.enable_grafana_on_ray_dashboard
k8s_service_account = local.ray_service_account
# TODO(umeshkumhar): remove kuberay-operator depends, figure out service account dependency
depends_on = [module.namespace, module.kuberay-operator]
//TODO(genlu): remove the module.kuberay-operator dependency after migrating to the Ray addon.
depends_on = [module.gcs, module.kuberay-operator, module.kuberay-workload-identity]
}

module "inference-server" {
Expand Down
35 changes: 24 additions & 11 deletions applications/ray/main.tf
Original file line number Diff line number Diff line change
Expand Up @@ -134,16 +134,27 @@ module "namespace" {
namespace = local.kubernetes_namespace
}

# Workload Identity service account for Ray in the standalone Ray application.
# Declared at the application level rather than inside modules/kuberay-operator.
module "kuberay-workload-identity" {
providers = { kubernetes = kubernetes.ray }
source = "terraform-google-modules/kubernetes-engine/google//modules/workload-identity"
version = "30.0.0" # Pinned to a previous version because the then-current version (30.1.0) showed inconsistent behaviour with workload identity service accounts
# Negation of create_service_account: an existing GCP service account is used when creation is disabled.
use_existing_gcp_sa = !var.create_service_account
name = local.workload_identity_service_account
namespace = local.kubernetes_namespace
project_id = var.project_id
# Roles passed to the workload-identity module for this service account.
roles = ["roles/cloudsql.client", "roles/monitoring.viewer"]
automount_service_account_token = true
# NOTE(review): presumably ensures the namespace exists before the service account is created — confirm.
depends_on = [module.namespace]
}

module "kuberay-operator" {
source = "../../modules/kuberay-operator"
providers = { helm = helm.ray, kubernetes = kubernetes.ray }
name = "kuberay-operator"
create_namespace = true
namespace = local.kubernetes_namespace
project_id = var.project_id
autopilot_cluster = local.enable_autopilot
google_service_account = local.workload_identity_service_account
create_service_account = var.create_service_account
source = "../../modules/kuberay-operator"
providers = { helm = helm.ray, kubernetes = kubernetes.ray }
name = "kuberay-operator"
create_namespace = true
namespace = local.kubernetes_namespace
project_id = var.project_id
autopilot_cluster = local.enable_autopilot
}

module "kuberay-logging" {
Expand All @@ -164,7 +175,8 @@ module "kuberay-monitoring" {
create_namespace = true
enable_grafana_on_ray_dashboard = var.enable_grafana_on_ray_dashboard
k8s_service_account = local.workload_identity_service_account
depends_on = [module.kuberay-operator]
//TODO(genlu): remove the module.kuberay-operator dependency after migrating to the Ray addon.
depends_on = [module.kuberay-workload-identity, module.kuberay-operator]
}

module "gcs" {
Expand Down Expand Up @@ -204,7 +216,8 @@ module "kuberay-cluster" {
k8s_backend_service_port = var.ray_dashboard_k8s_backend_service_port
domain = var.ray_dashboard_domain
members_allowlist = var.ray_dashboard_members_allowlist != "" ? split(",", var.ray_dashboard_members_allowlist) : []
depends_on = [module.gcs, module.kuberay-operator]
//TODO(genlu): remove the module.kuberay-operator dependency after migrating to the Ray addon.
depends_on = [module.gcs, module.kuberay-operator, module.kuberay-workload-identity]
}


Expand Down
27 changes: 0 additions & 27 deletions modules/kuberay-operator/kuberay.tf
Original file line number Diff line number Diff line change
Expand Up @@ -23,33 +23,6 @@ resource "helm_release" "kuberay-operator" {
create_namespace = var.create_namespace
}

# Workload Identity service account for KubeRay workloads, configured from this
# module's google_service_account / create_service_account variables.
module "kuberay-workload-identity" {
source = "terraform-google-modules/kubernetes-engine/google//modules/workload-identity"
version = "30.0.0" # Pinned to a previous version because the then-current version (30.1.0) showed inconsistent behaviour with workload identity service accounts
# Negation of create_service_account: an existing GCP service account is used when creation is disabled.
use_existing_gcp_sa = !var.create_service_account
name = var.google_service_account
namespace = var.namespace
project_id = var.project_id
# Roles passed to the workload-identity module for this service account.
roles = ["roles/cloudsql.client", "roles/monitoring.viewer"]

automount_service_account_token = true

# Ordered after the operator Helm release.
depends_on = [helm_release.kuberay-operator]
}

# Secret of type "kubernetes.io/service-account-token" named "kuberay-sa-token",
# annotated with the service account name from var.google_service_account.
# NOTE(review): presumably relies on Kubernetes populating the token for the annotated service account — confirm.
resource "kubernetes_secret_v1" "service_account_token" {
metadata {
name = "kuberay-sa-token"
namespace = var.namespace
annotations = {
"kubernetes.io/service-account.name" = var.google_service_account
}
}
type = "kubernetes.io/service-account-token"

# Ordered after the workload-identity module that this secret's annotation refers to by name.
depends_on = [module.kuberay-workload-identity]
}

# Grant access to batchv1/Jobs to kuberay-operator since the kuberay-operator role is missing some permissions.
# See https://github.com/ray-project/kuberay/issues/1706 for more details.
# TODO: remove this role binding once the kuberay-operator helm chart is upgraded to v1.1
Expand Down
12 changes: 0 additions & 12 deletions modules/kuberay-operator/variables.tf
Original file line number Diff line number Diff line change
Expand Up @@ -34,15 +34,3 @@ variable "create_namespace" {
# Required boolean (no default); the rag and ray applications pass local.enable_autopilot.
variable "autopilot_cluster" {
type = bool
}

# Used in kuberay.tf as the workload-identity module's name and as the
# "kubernetes.io/service-account.name" annotation on the token secret.
variable "google_service_account" {
type = string
description = "Google service account name"
default = "kuberay-gcp-sa"
}

# Negated in kuberay.tf to set use_existing_gcp_sa on the workload-identity module.
variable "create_service_account" {
type = bool
description = "Creates a google service account & k8s service account & configures workload identity"
default = true
}

0 comments on commit 1f4c968

Please sign in to comment.