From be5f89ecbb33c0e904a907c687f66df843761de6 Mon Sep 17 00:00:00 2001 From: Shobhit Gupta <43795024+gushob21@users.noreply.github.com> Date: Thu, 29 Feb 2024 23:02:10 +0000 Subject: [PATCH 01/39] Adding ML platform reference architecture in the folder ml-platform (#265) initial push to kick off collaboration --- ml-platform/01_gcp_project/README.md | 107 ++++++ ml-platform/01_gcp_project/backend.tf | 21 ++ ml-platform/01_gcp_project/main.tf | 22 ++ .../modules/projects/outputs.tf | 17 + .../modules/projects/projects.tf | 96 ++++++ .../modules/projects/variables.tf | 43 +++ ml-platform/01_gcp_project/outputs.tf | 17 + ml-platform/01_gcp_project/providers.tf | 22 ++ ml-platform/01_gcp_project/variables.tf | 43 +++ ml-platform/02_gke/README.md | 139 ++++++++ ml-platform/02_gke/backend.tf | 20 ++ ml-platform/02_gke/main.tf | 117 +++++++ .../02_gke/modules/cloud-nat/README.md | 108 ++++++ ml-platform/02_gke/modules/cloud-nat/main.tf | 80 +++++ .../02_gke/modules/cloud-nat/outputs.tf | 34 ++ .../02_gke/modules/cloud-nat/variables.tf | 145 ++++++++ .../02_gke/modules/cloud-nat/versions.tf | 30 ++ ml-platform/02_gke/modules/cluster/gke.tf | 124 +++++++ ml-platform/02_gke/modules/cluster/outputs.tf | 33 ++ .../02_gke/modules/cluster/variables.tf | 58 ++++ .../02_gke/modules/cluster/versions.tf | 27 ++ ml-platform/02_gke/modules/network/README.md | 110 ++++++ ml-platform/02_gke/modules/network/outputs.tf | 28 ++ .../02_gke/modules/network/variables.tf | 56 ++++ .../02_gke/modules/network/versions.tf | 22 ++ ml-platform/02_gke/modules/network/vpc.tf | 46 +++ .../02_gke/modules/node-pools/nodepools.tf | 64 ++++ .../02_gke/modules/node-pools/variables.tf | 83 +++++ .../02_gke/modules/node-pools/versions.tf | 27 ++ .../02_gke/modules/vm-reservations/outputs.tf | 17 + .../modules/vm-reservations/reservations.tf | 30 ++ .../modules/vm-reservations/variables.tf | 51 +++ .../modules/vm-reservations/versions.tf | 27 ++ ml-platform/02_gke/outputs.tf | 17 + 
ml-platform/02_gke/providers.tf | 27 ++ ml-platform/02_gke/variables.tf | 126 +++++++ ml-platform/03_configsync/README.md | 145 ++++++++ ml-platform/03_configsync/backend.tf | 20 ++ .../03_configsync/create_cluster_yamls.sh | 82 +++++ ml-platform/03_configsync/main.tf | 131 ++++++++ ml-platform/03_configsync/outputs.tf | 20 ++ ml-platform/03_configsync/providers.tf | 39 +++ .../acm-template/manifests/apps/.gitkeep | 0 .../acm-template/manifests/clusters/.gitkeep | 0 .../templates/_cluster_template/cluster.yaml | 21 ++ .../_cluster_template/config-selector.yaml | 23 ++ .../kuberay/kustomization.yaml | 30 ++ .../kuberay/rayclusters.yaml | 22 ++ .../_cluster_template/kuberay/rayjobs.yaml | 22 ++ .../kuberay/rayservices.yaml | 22 ++ .../_cluster_template/kuberay/rbac.yaml | 44 +++ .../_cluster_template/kuberay/values.yaml | 113 +++++++ .../_cluster_template/kustomization.yaml | 19 ++ .../templates/_cluster_template/selector.yaml | 22 ++ .../_cluster_template/team/kustomization.yaml | 22 ++ .../_cluster_template/team/namespace.yaml | 20 ++ .../team/network-policy.yaml | 32 ++ .../_cluster_template/team/rbac.yaml | 59 ++++ .../_cluster_template/team/reposync.yaml | 135 ++++++++ .../app/fluentd_config.yaml | 44 +++ .../app/kustomization.yaml | 27 ++ .../app/serviceaccount.yaml | 21 ++ .../_namespace_template/app/values.yaml | 313 ++++++++++++++++++ ml-platform/03_configsync/variables.tf | 45 +++ ml-platform/04_setup_clusters/README.md | 138 ++++++++ ml-platform/05_setup_teams/README.md | 169 ++++++++++ ml-platform/06_operating_teams/README.md | 154 +++++++++ ml-platform/README.md | 82 +++++ 68 files changed, 4070 insertions(+) create mode 100644 ml-platform/01_gcp_project/README.md create mode 100644 ml-platform/01_gcp_project/backend.tf create mode 100644 ml-platform/01_gcp_project/main.tf create mode 100644 ml-platform/01_gcp_project/modules/projects/outputs.tf create mode 100644 ml-platform/01_gcp_project/modules/projects/projects.tf create mode 100644 
ml-platform/01_gcp_project/modules/projects/variables.tf create mode 100644 ml-platform/01_gcp_project/outputs.tf create mode 100644 ml-platform/01_gcp_project/providers.tf create mode 100644 ml-platform/01_gcp_project/variables.tf create mode 100644 ml-platform/02_gke/README.md create mode 100644 ml-platform/02_gke/backend.tf create mode 100644 ml-platform/02_gke/main.tf create mode 100644 ml-platform/02_gke/modules/cloud-nat/README.md create mode 100644 ml-platform/02_gke/modules/cloud-nat/main.tf create mode 100644 ml-platform/02_gke/modules/cloud-nat/outputs.tf create mode 100644 ml-platform/02_gke/modules/cloud-nat/variables.tf create mode 100644 ml-platform/02_gke/modules/cloud-nat/versions.tf create mode 100644 ml-platform/02_gke/modules/cluster/gke.tf create mode 100644 ml-platform/02_gke/modules/cluster/outputs.tf create mode 100644 ml-platform/02_gke/modules/cluster/variables.tf create mode 100644 ml-platform/02_gke/modules/cluster/versions.tf create mode 100644 ml-platform/02_gke/modules/network/README.md create mode 100644 ml-platform/02_gke/modules/network/outputs.tf create mode 100644 ml-platform/02_gke/modules/network/variables.tf create mode 100644 ml-platform/02_gke/modules/network/versions.tf create mode 100644 ml-platform/02_gke/modules/network/vpc.tf create mode 100644 ml-platform/02_gke/modules/node-pools/nodepools.tf create mode 100644 ml-platform/02_gke/modules/node-pools/variables.tf create mode 100644 ml-platform/02_gke/modules/node-pools/versions.tf create mode 100644 ml-platform/02_gke/modules/vm-reservations/outputs.tf create mode 100644 ml-platform/02_gke/modules/vm-reservations/reservations.tf create mode 100644 ml-platform/02_gke/modules/vm-reservations/variables.tf create mode 100644 ml-platform/02_gke/modules/vm-reservations/versions.tf create mode 100644 ml-platform/02_gke/outputs.tf create mode 100644 ml-platform/02_gke/providers.tf create mode 100644 ml-platform/02_gke/variables.tf create mode 100644 
ml-platform/03_configsync/README.md create mode 100644 ml-platform/03_configsync/backend.tf create mode 100755 ml-platform/03_configsync/create_cluster_yamls.sh create mode 100644 ml-platform/03_configsync/main.tf create mode 100644 ml-platform/03_configsync/outputs.tf create mode 100644 ml-platform/03_configsync/providers.tf create mode 100644 ml-platform/03_configsync/templates/acm-template/manifests/apps/.gitkeep create mode 100644 ml-platform/03_configsync/templates/acm-template/manifests/clusters/.gitkeep create mode 100644 ml-platform/03_configsync/templates/acm-template/templates/_cluster_template/cluster.yaml create mode 100644 ml-platform/03_configsync/templates/acm-template/templates/_cluster_template/config-selector.yaml create mode 100644 ml-platform/03_configsync/templates/acm-template/templates/_cluster_template/kuberay/kustomization.yaml create mode 100644 ml-platform/03_configsync/templates/acm-template/templates/_cluster_template/kuberay/rayclusters.yaml create mode 100644 ml-platform/03_configsync/templates/acm-template/templates/_cluster_template/kuberay/rayjobs.yaml create mode 100644 ml-platform/03_configsync/templates/acm-template/templates/_cluster_template/kuberay/rayservices.yaml create mode 100644 ml-platform/03_configsync/templates/acm-template/templates/_cluster_template/kuberay/rbac.yaml create mode 100644 ml-platform/03_configsync/templates/acm-template/templates/_cluster_template/kuberay/values.yaml create mode 100644 ml-platform/03_configsync/templates/acm-template/templates/_cluster_template/kustomization.yaml create mode 100644 ml-platform/03_configsync/templates/acm-template/templates/_cluster_template/selector.yaml create mode 100644 ml-platform/03_configsync/templates/acm-template/templates/_cluster_template/team/kustomization.yaml create mode 100644 ml-platform/03_configsync/templates/acm-template/templates/_cluster_template/team/namespace.yaml create mode 100644 
ml-platform/03_configsync/templates/acm-template/templates/_cluster_template/team/network-policy.yaml create mode 100644 ml-platform/03_configsync/templates/acm-template/templates/_cluster_template/team/rbac.yaml create mode 100644 ml-platform/03_configsync/templates/acm-template/templates/_cluster_template/team/reposync.yaml create mode 100644 ml-platform/03_configsync/templates/acm-template/templates/_namespace_template/app/fluentd_config.yaml create mode 100644 ml-platform/03_configsync/templates/acm-template/templates/_namespace_template/app/kustomization.yaml create mode 100644 ml-platform/03_configsync/templates/acm-template/templates/_namespace_template/app/serviceaccount.yaml create mode 100644 ml-platform/03_configsync/templates/acm-template/templates/_namespace_template/app/values.yaml create mode 100644 ml-platform/03_configsync/variables.tf create mode 100644 ml-platform/04_setup_clusters/README.md create mode 100644 ml-platform/05_setup_teams/README.md create mode 100644 ml-platform/06_operating_teams/README.md create mode 100644 ml-platform/README.md diff --git a/ml-platform/01_gcp_project/README.md b/ml-platform/01_gcp_project/README.md new file mode 100644 index 000000000..91fbdca18 --- /dev/null +++ b/ml-platform/01_gcp_project/README.md @@ -0,0 +1,107 @@ +Copyright 2024 Google LLC + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+ +## Requirements + +| Name | Version | +|------|---------| +| [google](#requirement\_google) | 4.72.1 | + + +## Modules + +| Name | Source | Version | +|------|--------|---------| +| [gcp-project](#module\_gcp-project) | ./modules/projects | n/a | + + +## Inputs + +| Name | Description | Type | Default | Required | +|-----------------------------------------------------------------------------------|--------------------------------------------------|------|------------------------------|:--------:| +| [billing\_account](#input\_billing\_account) | GCP billing account | `string` | n/a | yes | +| [env](#input\_env) | List of environments | `set(string)` |
[
"dev"
]
| no | +| [folder\_id](#input\_folder\_id) | Folder Id where the GCP projects will be created | `string` | `null` | no | +| [org\_id](#input\_org\_id) | The GCP org id | `string` | n/a | yes | +| [project\_name](#input\_project\_name) | Project name | `string` | `ml-platform` | no | + +## Outputs + +| Name | Description | +|------|-------------| +| [project\_ids](#output\_project\_ids) | n/a | + + +## Workflow + +This module accepts a list of environments and creates a GCP project for each environment. + +Typically, you would want to have dev, staging and production environments created in separate projects. To have such isolation, pass `env` input variable as `[ "dev", "staging", "prod" ]`. This will create one project each for the dev, staging and prod environments. You can update the input variable `env` based on how many environments/projects you want to create. + +However, if you want to use a single project for multiple environments, you can create just one project by passing one element to `env` input variable list, e.g. [ "dev" ] or ["my-playground"] etc. + +## Prerequisite +To run this Terraform Module, you need to have the following IAM roles: +- roles/resourcemanager.projectCreator + +## Usage + +- Create a new GCP project that will host the TF state bucket. + - To create a new project, open `cloudshell` and run the following command: + ``` + gcloud projects create PROJECT_ID + ``` + - Associate billing account to the project + ``` + gcloud beta billing projects link PROJECT_ID \ + --billing-account BILLING_ACCOUNT_ID + ``` + +- Create a GCS bucket in the project for storing TF state. + - To create a new bucket, run the following command in `cloudshell` + ``` + gcloud storage buckets create gs://PROJECT_ID-tf-state --location=REGION --project PROJECT_ID + ``` +- Clone the repo and change dir + ``` + git clone https://github.com/GoogleCloudPlatform/ai-on-gke + cd ai-on-gke/ml-platform/01_gcp_project + ``` +- In backend.tf replace `YOUR_STATE_BUCKET` with the name of the GCS bucket. 
+- In variables.tf: + - replace `YOUR_GCP_ORG_ID` with your GCP Org ID. + - replace `YOUR_BILLING_ACCOUNT` with your GCP billing account. + - (optional) override the default value of `folder_id` with the numeric ID of the folder the projects should be created under. If you leave `folder_id` null, the projects will be created under your org. + - (optional) override the default value of `env`. See [workflow](#workflow) for details. + +- terraform init +- terraform plan +- terraform apply --auto-approve + + +## Clean up + +1. The easiest way to prevent continued billing for the resources that you created for this tutorial is to delete the project you created for the tutorial. Run the following commands from Cloud Shell: + + ```bash + gcloud config unset project && \ + echo y | gcloud projects delete PROJECT_ID + ``` + +2. If the project needs to be left intact, another option is to destroy the infrastructure created from this module. Note, this does not destroy the Cloud Storage bucket containing the Terraform state and service enablement created out of Terraform. + + ```bash + cd ml-platform/01_gcp_project && \ + terraform destroy --auto-approve + ``` \ No newline at end of file diff --git a/ml-platform/01_gcp_project/backend.tf b/ml-platform/01_gcp_project/backend.tf new file mode 100644 index 000000000..5b9bff1bd --- /dev/null +++ b/ml-platform/01_gcp_project/backend.tf @@ -0,0 +1,21 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +# State for this step is stored in a GCS bucket under the "01_gcp_project" prefix. +terraform { + backend "gcs" { + bucket = "YOUR_STATE_BUCKET" + prefix = "01_gcp_project" + } +} + diff --git a/ml-platform/01_gcp_project/main.tf b/ml-platform/01_gcp_project/main.tf new file mode 100644 index 000000000..305bfce2c --- /dev/null +++ b/ml-platform/01_gcp_project/main.tf @@ -0,0 +1,22 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Instantiates the local projects module, passing the root input variables straight through. +module "gcp-project" { + source = "./modules/projects" + project_name = var.project_name + env = var.env + org_id = var.org_id + folder_id = var.folder_id + billing_account = var.billing_account +} diff --git a/ml-platform/01_gcp_project/modules/projects/outputs.tf b/ml-platform/01_gcp_project/modules/projects/outputs.tf new file mode 100644 index 000000000..e087e6c85 --- /dev/null +++ b/ml-platform/01_gcp_project/modules/projects/outputs.tf @@ -0,0 +1,17 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +# Projects created by this module, keyed by environment. Only one of the two google_project resources can be non-empty, so it is selected with the same folder_id test that their for_each expressions use. +output "project_ids" { + value = var.folder_id == null ? google_project.project_under_org : google_project.project_under_folder +} \ No newline at end of file diff --git a/ml-platform/01_gcp_project/modules/projects/projects.tf b/ml-platform/01_gcp_project/modules/projects/projects.tf new file mode 100644 index 000000000..55c88ee86 --- /dev/null +++ b/ml-platform/01_gcp_project/modules/projects/projects.tf @@ -0,0 +1,96 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Random hex suffix shared by every generated project_id. +resource "random_id" "random_project_id_suffix" { + byte_length = 2 +} + +# One project per environment, created under the folder when folder_id is set. +resource "google_project" "project_under_folder" { + for_each = var.folder_id != null ? var.env : toset([]) + name = format("%s-%s",var.project_name,each.value) + project_id = format("%s-%s-%s",var.project_name,random_id.random_project_id_suffix.hex,each.value) + folder_id = var.folder_id + billing_account = var.billing_account +} + +# One project per environment, created directly under the organization when folder_id is null. +resource "google_project" "project_under_org" { + for_each = var.folder_id == null ? var.env : toset([]) + name = format("%s-%s",var.project_name,each.value) + project_id = format("%s-%s-%s",var.project_name,random_id.random_project_id_suffix.hex,each.value) + org_id = var.org_id + billing_account = var.billing_account +} + +# Each google_project_service resource below enables one API on every created project. The plain project_id attribute is passed as the target project. +resource "google_project_service" "project_services" { + for_each = var.folder_id == null ? 
google_project.project_under_org : google_project.project_under_folder + project = each.value.project_id + service = "cloudresourcemanager.googleapis.com" + disable_on_destroy = true + disable_dependent_services = true + depends_on = [google_project.project_under_folder,google_project.project_under_org] +} + +resource "google_project_service" "project_services-1" { + for_each = var.folder_id == null ? google_project.project_under_org : google_project.project_under_folder + project = each.value.project_id + service = "iam.googleapis.com" + disable_on_destroy = true + disable_dependent_services = true + depends_on = [google_project.project_under_folder,google_project.project_under_org] +} + +resource "google_project_service" "project_services-2" { + for_each = var.folder_id == null ? google_project.project_under_org : google_project.project_under_folder + project = each.value.project_id + service = "container.googleapis.com" + disable_on_destroy = true + disable_dependent_services = true + depends_on = [google_project.project_under_folder,google_project.project_under_org] +} + +resource "google_project_service" "project_services-3" { + for_each = var.folder_id == null ? google_project.project_under_org : google_project.project_under_folder + project = each.value.project_id + service = "compute.googleapis.com" + disable_on_destroy = true + disable_dependent_services = true + depends_on = [google_project.project_under_folder,google_project.project_under_org] +} + +resource "google_project_service" "project_services-4" { + for_each = var.folder_id == null ? google_project.project_under_org : google_project.project_under_folder + project = each.value.project_id + service = "anthos.googleapis.com" + disable_on_destroy = true + disable_dependent_services = true + depends_on = [google_project.project_under_folder,google_project.project_under_org] +} + +resource "google_project_service" "project_services-5" { + for_each = var.folder_id == null ? 
google_project.project_under_org : google_project.project_under_folder + project = each.value.project_id + service = "anthosconfigmanagement.googleapis.com" + disable_on_destroy = true + disable_dependent_services = true + depends_on = [google_project.project_under_folder,google_project.project_under_org] +} + +resource "google_project_service" "project_services-6" { + for_each = var.folder_id == null ? google_project.project_under_org : google_project.project_under_folder + project = each.value.project_id + service = "gkehub.googleapis.com" + disable_on_destroy = true + disable_dependent_services = true + depends_on = [google_project.project_under_folder,google_project.project_under_org] +} \ No newline at end of file diff --git a/ml-platform/01_gcp_project/modules/projects/variables.tf b/ml-platform/01_gcp_project/modules/projects/variables.tf new file mode 100644 index 000000000..91f8cd0f9 --- /dev/null +++ b/ml-platform/01_gcp_project/modules/projects/variables.tf @@ -0,0 +1,43 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +# Inputs of the projects module; the root module passes a value for each of them. +variable "org_id" { + type = string + description = "The GCP org id" + default = "" +} + +variable "env" { + type = set(string) + description = "List of environments" + default = ["dev"] +} + +variable "folder_id" { + type = string + description = "Folder id where the GCP projects will be created" + default = null +} + +variable "billing_account" { + type = string + description = "GCP billing account" + default = "" +} + +variable "project_name" { + type = string + description = "GCP project name" + default = "" +} diff --git a/ml-platform/01_gcp_project/outputs.tf b/ml-platform/01_gcp_project/outputs.tf new file mode 100644 index 000000000..7e4d72a6c --- /dev/null +++ b/ml-platform/01_gcp_project/outputs.tf @@ -0,0 +1,17 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Map of environment name to the project_id of the project created for it. +output "project_ids" { + value = { for k, v in module.gcp-project.project_ids : k => v.project_id } +} \ No newline at end of file diff --git a/ml-platform/01_gcp_project/providers.tf b/ml-platform/01_gcp_project/providers.tf new file mode 100644 index 000000000..1817d23eb --- /dev/null +++ b/ml-platform/01_gcp_project/providers.tf @@ -0,0 +1,22 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Pins the Google provider to one exact release. +terraform { + required_providers { + google = { + version = "4.72.1" + source = "hashicorp/google" + } + } +} \ No newline at end of file diff --git a/ml-platform/01_gcp_project/variables.tf b/ml-platform/01_gcp_project/variables.tf new file mode 100644 index 000000000..bb1adda73 --- /dev/null +++ b/ml-platform/01_gcp_project/variables.tf @@ -0,0 +1,43 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +# Root inputs. The YOUR_* defaults are placeholders that must be replaced before running Terraform. +variable "org_id" { + type = string + description = "The GCP org id" + default = "YOUR_GCP_ORG_ID" +} + +variable "env" { + type = set(string) + description = "List of environments" + default = ["dev"] +} + +variable "folder_id" { + type = string + description = "Folder Id where the GCP projects will be created" + default = null +} + +variable "billing_account" { + type = string + description = "GCP billing account" + default = "YOUR_BILLING_ACCOUNT" +} + +variable "project_name" { + type = string + description = "GCP project name" + default = "ml-platform" +} \ No newline at end of file diff --git a/ml-platform/02_gke/README.md b/ml-platform/02_gke/README.md new file mode 100644 index 000000000..a136d3ef3 --- /dev/null +++ b/ml-platform/02_gke/README.md @@ -0,0 +1,139 @@ +Copyright 2024 Google LLC + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+## Requirements + +| Name | Version | +|------|---------| +| [google](#requirement\_google) | 4.72.1 | +| [google-beta](#requirement\_google-beta) | 4.72.1 | + +## Modules + +| Name | Source | Version | +|------|--------|---------| +| [cloud-nat](#module\_cloud-nat) | ./modules/cloud-nat | n/a | +| [create-vpc](#module\_create-vpc) | ./modules/network | n/a | +| [gke](#module\_gke) | ./modules/cluster | n/a | +| [node\_pool-ondemand](#module\_node\_pool-ondemand) | ./modules/node-pools | n/a | +| [node\_pool-reserved](#module\_node\_pool-reserved) | ./modules/node-pools | n/a | +| [node\_pool-spot](#module\_node\_pool-spot) | ./modules/node-pools | n/a | +| [reservation](#module\_reservation) | ./modules/vm-reservations | n/a | + +## Inputs + +| Name | Description | Type | Default | Required | +|------|-------------|------|-------------------------------------------------------------------------------------------------------------------------------------------------------------------|:--------:| +| [cluster\_name](#input\_cluster\_name) | Name of the GKE cluster | `string` | `"gke-ml"` | no | +| [lookup\_state\_bucket](#input\_lookup\_state\_bucket) | GCS bucket to look up TF state from previous steps. | `string` | n/a | yes | +| [network\_name](#input\_network\_name) | VPC network where GKE cluster will be created | `string` | `"ml-vpc"` | no | +| [ondemand\_taints](#input\_ondemand\_taints) | Taints to be applied to the on-demand node pool. |
list(object({
key = string
value = any
effect = string
}))
|
[
{
"effect": "NO_SCHEDULE",
"key": "ondemand",
"value": true
}
]
| no | +| [project\_id](#input\_project\_id) | The GCP project where the resources will be created. It is a map with environments as keys and project\_ids as values | `map` | n/a
 An example : 
project_id = {
"dev": "gkebatchexpce3c8dcb",
"prod": "gkebatchexpce3c8dcb",
"staging": "gkebatchexpce3c8dcb"
}
| yes | +| [reserved\_taints](#input\_reserved\_taints) | Taints to be applied to the reserved node pool. |
list(object({
key = string
value = any
effect = string
}))
|
[
{
"effect": "NO_SCHEDULE",
"key": "reserved",
"value": true
}
]
| no | +| [routing\_mode](#input\_routing\_mode) | VPC routing mode. | `string` | `"GLOBAL"` | no | +| [spot\_taints](#input\_spot\_taints) | Taints to be applied to the spot node pool. |
list(object({
key = string
value = any
effect = string
}))
|
[
{
"effect": "NO_SCHEDULE",
"key": "spot",
"value": true
}
]
| no | +| [subnet\_01\_description](#input\_subnet\_01\_description) | Description of the first subnet. | `string` | `"subnet 01"` | no | +| [subnet\_01\_ip](#input\_subnet\_01\_ip) | CIDR of the first subnet. | `string` | `"10.40.0.0/22"` | no | +| [subnet\_01\_name](#input\_subnet\_01\_name) | Name of the first subnet in the VPC network. | `string` | `"ml-vpc-subnet-01"` | no | +| [subnet\_01\_region](#input\_subnet\_01\_region) | Region of the first subnet. | `string` | `"us-central1"` | no | +| [subnet\_02\_description](#input\_subnet\_02\_description) | Description of the second subnet. | `string` | `"subnet 02"` | no | +| [subnet\_02\_ip](#input\_subnet\_02\_ip) | CIDR of the second subnet. | `string` | `"10.12.0.0/22"` | no | +| [subnet\_02\_name](#input\_subnet\_02\_name) | Name of the second subnet in the VPC network. | `string` | `"gke-vpc-subnet-02"` | no | +| [subnet\_02\_region](#input\_subnet\_02\_region) | Region of the second subnet. | `string` | `"us-west2"` | no | + +## Outputs + +| Name | Description | +|------|------------------| +| [gke\_cluster](#output\_gke\_cluster) | GKE cluster info | + +## Prerequisite +To run this Terraform Module, you need to have the following IAM roles on the projects where the GKE clusters will be created: +- roles/Owner + +## Usage +- Skip this step if you have run [01_gcp_project][projects] to create GCP projects. If you are starting from this module, run these steps. + - Create a new GCP project that will host the TF state bucket or use an existing project. + - To create a new project, open `cloudshell` and run the following command: + ``` + gcloud projects create + ``` + - Associate billing account to the project + ``` + gcloud beta billing projects link \ + --billing-account + ``` + + - Create a GCS bucket in the project for storing TF state. 
+ - To create a new bucket, run the following command in `cloudshell` + ``` + gcloud storage buckets create gs://-tf-state --location= --project + ``` +- Clone the repo and change dir + ``` + git clone https://github.com/GoogleCloudPlatform/ai-on-gke + cd ml-platform/02_gke + ``` +- In backend.tf replace `YOUR_STATE_BUCKET` with the name of the GCS bucket. +- In variables.tf, provide the values of the following variables: + - `project_id` : If you created the projects using [01_gcp_project][projects] module, no need to provide a value for it as TF will read the project ids from the state file. + If you are providing your existing project ids, provide it in the following format. + + The following is an example of creating three env in the same GCP project : + ``` + { "dev" : "project1", "staging" : "project1", "prod" : "project1" } + ``` + The following is an example of creating three env in three different projects: + ``` + { "dev" : "project1", "staging" : "project2", "prod" : "project3" } + ``` + + - `lookup_state_bucket` : provide the name of the GCS bucket. + + +- If you did not use [01_gcp_projects][projects] module to create GCP projects and are supplying your project ids in variables.tf, enable the following APIs in those project. + - In `cloudshell`, run: + ``` + gcloud config set project + + gcloud services enable cloudresourcemanager.googleapis.com iam.googleapis.com container.googleapis.com gkehub.googleapis.com anthos.googleapis.com anthosconfigmanagement.googleapis.com compute.googleapis.com + ``` + +- terraform init +- terraform plan +- terraform apply --auto-approve + +When Terraform apply has been completed, you will get the following resources: +- A VPC network per environment with a NAT gateway and Cloud router. +- A private GKE cluster per environment. This cluster will be created in the respective VPC. +- VM reservation for `nvidia-l4` +- Three node pools, spot, reserved and on-demand respectively. + + +## Clean up + +1. 
The easiest way to prevent continued billing for the resources that you created for this tutorial is to delete the project you created for the tutorial. Run the following commands from Cloud Shell: + + ```bash + gcloud config unset project && \ + echo y | gcloud projects delete <PROJECT_ID> + ``` + +2. If the project needs to be left intact, another option is to destroy the infrastructure created from this module. Note, this does not destroy the Cloud Storage bucket containing the Terraform state or undo the service enablement done outside of Terraform. + + ```bash + cd ml-platform/02_gke && \ + terraform destroy --auto-approve + ``` + +[projects]: ../01_gcp_project/README.md \ No newline at end of file diff --git a/ml-platform/02_gke/backend.tf b/ml-platform/02_gke/backend.tf new file mode 100644 index 000000000..97deced77 --- /dev/null +++ b/ml-platform/02_gke/backend.tf @@ -0,0 +1,20 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +terraform { + backend "gcs" { + prefix = "02_gke" + bucket = "YOUR_STATE_BUCKET" + } +} diff --git a/ml-platform/02_gke/main.tf b/ml-platform/02_gke/main.tf new file mode 100644 index 000000000..2d8ed3dea --- /dev/null +++ b/ml-platform/02_gke/main.tf @@ -0,0 +1,117 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +data "terraform_remote_state" "gcp-projects" { + count = length(keys(var.project_id)) == 0 ? 1 : 0 # read the 01_gcp_project state only when no project ids were supplied + backend = "gcs" + config = { + bucket = var.lookup_state_bucket + prefix = "01_gcp_project" + } +} + +locals { + parsed_project_id = length(keys(var.project_id)) == 0 ? data.terraform_remote_state.gcp-projects[0].outputs.project_ids : var.project_id # map of environment name => project id +} + +module "create-vpc" { + for_each = local.parsed_project_id + source = "./modules/network" + project_id = each.value + network_name = format("%s-%s",var.network_name,each.key) + routing_mode = var.routing_mode + subnet_01_name = format("%s-%s",var.subnet_01_name,each.key) + subnet_01_ip = var.subnet_01_ip + subnet_01_region = var.subnet_01_region + subnet_02_name = format("%s-%s",var.subnet_02_name,each.key) + subnet_02_ip = var.subnet_02_ip + subnet_02_region = var.subnet_02_region + #default_route_name = format("%s-%s","default-route",each.key) +} + +resource "google_gke_hub_feature" "configmanagement_acm_feature" { + count = length(distinct(values(local.parsed_project_id))) # one feature per distinct project, even when environments share a project + name = "configmanagement" + project = distinct(values(local.parsed_project_id))[count.index] + location = "global" + provider = google-beta +} + +module "gke" { + for_each = local.parsed_project_id + source = "./modules/cluster" + cluster_name = format("%s-%s",var.cluster_name,each.key) + network = module.create-vpc[each.key].vpc + subnet = module.create-vpc[each.key].subnet-1 + project_id = each.value + region = var.subnet_01_region + zone = "${var.subnet_01_region}-a" + master_auth_networks_ipcidr = var.subnet_01_ip + 
depends_on = [ google_gke_hub_feature.configmanagement_acm_feature ] + env = each.key +} +module "reservation" { + for_each = local.parsed_project_id + source = "./modules/vm-reservations" + cluster_name = module.gke[each.key].cluster_name + zone = "${var.subnet_01_region}-a" + project_id = each.value + depends_on = [ module.gke ] +} +module "node_pool-reserved" { + for_each = local.parsed_project_id + source = "./modules/node-pools" + node_pool_name = "reservation" + project_id = each.value + cluster_name = module.gke[each.key].cluster_name + region = var.subnet_01_region + taints = var.reserved_taints + resource_type = "reservation" + reservation_name = module.reservation[each.key].reservation_name +} + +module "node_pool-ondemand" { + for_each = local.parsed_project_id + source = "./modules/node-pools" + node_pool_name = "ondemand" + project_id = each.value + cluster_name = module.gke[each.key].cluster_name + region = var.subnet_01_region + taints = var.ondemand_taints + resource_type = "ondemand" +} + +module "node_pool-spot" { + for_each = local.parsed_project_id + source = "./modules/node-pools" + node_pool_name = "spot" + project_id = each.value + cluster_name = module.gke[each.key].cluster_name + region = var.subnet_01_region + taints = var.spot_taints + resource_type = "spot" + +} + +module "cloud-nat" { + for_each = local.parsed_project_id + source = "./modules/cloud-nat" + project_id = each.value + region = split("/", module.create-vpc[each.key].subnet-1)[3] # NOTE(review): 4th "/"-separated segment of the subnet id, presumably the region - confirm + name = format("%s-%s","nat-for-acm",each.key) + network = module.create-vpc[each.key].vpc + create_router = true + router = format("%s-%s","router-for-acm",each.key) + depends_on = [ module.create-vpc ] +} diff --git a/ml-platform/02_gke/modules/cloud-nat/README.md b/ml-platform/02_gke/modules/cloud-nat/README.md new file mode 100644 index 000000000..6952d4e9f --- /dev/null +++ b/ml-platform/02_gke/modules/cloud-nat/README.md @@ -0,0 +1,108 @@ +Copyright 2024 Google LLC + +Licensed
under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +# Terraform Google Cloud NAT Module + +This module handles opinionated Google Cloud Platform Cloud NAT creation and configuration. + +## Compatibility +This module is meant for use with Terraform 0.13+ and tested using Terraform 1.0+. If you find incompatibilities using Terraform >=0.13, please open an issue. + +## Usage + +```hcl +module "cloud-nat" { + source = "terraform-google-modules/cloud-nat/google" + version = "~> 1.2" + project_id = var.project_id + region = var.region + router = google_compute_router.router.name +} +``` + +Then perform the following commands on the root folder: + +- `terraform init` to get the plugins +- `terraform plan` to see the infrastructure plan +- `terraform apply` to apply the infrastructure build +- `terraform destroy` to destroy the built infrastructure + + +## Inputs + +| Name | Description | Type | Default | Required | +|------|-------------|------|---------|:--------:| +| create\_router | Create router instead of using an existing one, uses 'router' variable for new resource name. | `bool` | `false` | no | +| enable\_dynamic\_port\_allocation | Enable Dynamic Port Allocation. If minPorts is set, minPortsPerVm must be set to a power of two greater than or equal to 32. | `bool` | `false` | no | +| enable\_endpoint\_independent\_mapping | Specifies if endpoint independent mapping is enabled. | `bool` | `null` | no | +| icmp\_idle\_timeout\_sec | Timeout (in seconds) for ICMP connections. Defaults to 30s if not set. 
Changing this forces a new NAT to be created. | `string` | `"30"` | no | +| log\_config\_enable | Indicates whether or not to export logs | `bool` | `false` | no | +| log\_config\_filter | Specifies the desired filtering of logs on this NAT. Valid values are: "ERRORS\_ONLY", "TRANSLATIONS\_ONLY", "ALL" | `string` | `"ALL"` | no | +| max\_ports\_per\_vm | Maximum number of ports allocated to a VM from this NAT. This field can only be set when enableDynamicPortAllocation is enabled. This will be ignored if enable\_dynamic\_port\_allocation is set to false. | `string` | `null` | no | +| min\_ports\_per\_vm | Minimum number of ports allocated to a VM from this NAT config. Defaults to 64 if not set. Changing this forces a new NAT to be created. | `string` | `"64"` | no | +| name | Defaults to 'cloud-nat-RANDOM\_SUFFIX'. Changing this forces a new NAT to be created. | `string` | `""` | no | +| nat\_ips | List of self\_links of external IPs. Changing this forces a new NAT to be created. Value of `nat_ip_allocate_option` is inferred based on nat\_ips. If present set to MANUAL\_ONLY, otherwise AUTO\_ONLY. | `list(string)` | `[]` | no | +| network | VPC name, only if router is not passed in and is created by the module. | `string` | `""` | no | +| project\_id | The project ID to deploy to | `string` | n/a | yes | +| region | The region to deploy to | `string` | n/a | yes | +| router | The name of the router in which this NAT will be configured. Changing this forces a new NAT to be created. | `string` | n/a | yes | +| router\_asn | Router ASN, only if router is not passed in and is created by the module. | `string` | `"64514"` | no | +| router\_keepalive\_interval | Router keepalive\_interval, only if router is not passed in and is created by the module. | `string` | `"20"` | no | +| source\_subnetwork\_ip\_ranges\_to\_nat | Defaults to ALL\_SUBNETWORKS\_ALL\_IP\_RANGES. How NAT should be configured per Subnetwork.
Valid values include: ALL\_SUBNETWORKS\_ALL\_IP\_RANGES, ALL\_SUBNETWORKS\_ALL\_PRIMARY\_IP\_RANGES, LIST\_OF\_SUBNETWORKS. Changing this forces a new NAT to be created. | `string` | `"ALL_SUBNETWORKS_ALL_IP_RANGES"` | no | +| subnetworks | Specifies one or more subnetwork NAT configurations |
list(object({
name = string,
source_ip_ranges_to_nat = list(string)
secondary_ip_range_names = list(string)
}))
| `[]` | no | +| tcp\_established\_idle\_timeout\_sec | Timeout (in seconds) for TCP established connections. Defaults to 1200s if not set. Changing this forces a new NAT to be created. | `string` | `"1200"` | no | +| tcp\_time\_wait\_timeout\_sec | Timeout (in seconds) for TCP connections that are in TIME\_WAIT state. Defaults to 120s if not set. | `string` | `"120"` | no | +| tcp\_transitory\_idle\_timeout\_sec | Timeout (in seconds) for TCP transitory connections. Defaults to 30s if not set. Changing this forces a new NAT to be created. | `string` | `"30"` | no | +| udp\_idle\_timeout\_sec | Timeout (in seconds) for UDP connections. Defaults to 30s if not set. Changing this forces a new NAT to be created. | `string` | `"30"` | no | + +## Outputs + +| Name | Description | +|------|-------------| +| name | Name of the Cloud NAT | +| nat\_ip\_allocate\_option | NAT IP allocation mode | +| region | Cloud NAT region | +| router\_name | Cloud NAT router name | + + + +## Requirements + +Before this module can be used on a project, you must ensure that the following pre-requisites are fulfilled: + +1. Terraform and kubectl are [installed](#software-dependencies) on the machine where Terraform is executed. +2. The Service Account you execute the module with has the right [permissions](#iam-roles). +3. The APIs are [active](#enable-apis) on the project you will launch the cluster in. +4. If you are using a Shared VPC, the APIs must also be activated on the Shared VPC host project and your service account needs the proper permissions there. 
+ +### Terraform plugins + +- [Terraform](https://www.terraform.io/downloads.html) >= 0.13.0 +- [terraform-provider-google](https://github.com/terraform-providers/terraform-provider-google) plugin v4.27.0 + +### Configure a Service Account + +In order to execute this module you must have a Service Account with the +following project roles: + +- [roles/compute.networkAdmin](https://cloud.google.com/nat/docs/using-nat#iam_permissions) + +### Enable APIs + +In order to operate with the Service Account you must activate the following APIs on the project where the Service Account was created: + +- Compute Engine API - compute.googleapis.com + +## Contributing + +Refer to the [contribution guidelines](./CONTRIBUTING.md) for information on contributing to this module. diff --git a/ml-platform/02_gke/modules/cloud-nat/main.tf b/ml-platform/02_gke/modules/cloud-nat/main.tf new file mode 100644 index 000000000..8efd57188 --- /dev/null +++ b/ml-platform/02_gke/modules/cloud-nat/main.tf @@ -0,0 +1,80 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +resource "random_string" "name_suffix" { + length = 6 + upper = false + special = false +} + +locals { + # intermediate locals + default_name = "cloud-nat-${random_string.name_suffix.result}" + # locals for google_compute_router_nat + nat_ip_allocate_option = length(var.nat_ips) > 0 ? "MANUAL_ONLY" : "AUTO_ONLY" + name = var.name != "" ? var.name : local.default_name + router = var.create_router ? 
google_compute_router.router[0].name : var.router +} + +resource "google_compute_router" "router" { + count = var.create_router ? 1 : 0 + name = var.router + project = var.project_id + region = var.region + network = var.network + bgp { + asn = var.router_asn + keepalive_interval = var.router_keepalive_interval + } +} + +resource "google_compute_router_nat" "main" { + project = var.project_id + region = var.region + name = local.name + router = local.router + nat_ip_allocate_option = local.nat_ip_allocate_option + nat_ips = var.nat_ips + source_subnetwork_ip_ranges_to_nat = var.source_subnetwork_ip_ranges_to_nat + min_ports_per_vm = var.min_ports_per_vm + max_ports_per_vm = var.enable_dynamic_port_allocation ? var.max_ports_per_vm : null + udp_idle_timeout_sec = var.udp_idle_timeout_sec + icmp_idle_timeout_sec = var.icmp_idle_timeout_sec + tcp_established_idle_timeout_sec = var.tcp_established_idle_timeout_sec + tcp_transitory_idle_timeout_sec = var.tcp_transitory_idle_timeout_sec + tcp_time_wait_timeout_sec = var.tcp_time_wait_timeout_sec + enable_endpoint_independent_mapping = var.enable_endpoint_independent_mapping + enable_dynamic_port_allocation = var.enable_dynamic_port_allocation + + dynamic "subnetwork" { + for_each = var.subnetworks + content { + name = subnetwork.value.name + source_ip_ranges_to_nat = subnetwork.value.source_ip_ranges_to_nat + secondary_ip_range_names = contains(subnetwork.value.source_ip_ranges_to_nat, "LIST_OF_SECONDARY_IP_RANGES") ? subnetwork.value.secondary_ip_range_names : [] + } + } + + dynamic "log_config" { + for_each = var.log_config_enable == true ? 
[{ + enable = var.log_config_enable + filter = var.log_config_filter + }] : [] + + content { + enable = log_config.value.enable + filter = log_config.value.filter + } + } +} diff --git a/ml-platform/02_gke/modules/cloud-nat/outputs.tf b/ml-platform/02_gke/modules/cloud-nat/outputs.tf new file mode 100644 index 000000000..86bf7c39d --- /dev/null +++ b/ml-platform/02_gke/modules/cloud-nat/outputs.tf @@ -0,0 +1,34 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +output "name" { + description = "Name of the Cloud NAT" + value = local.name +} + +output "nat_ip_allocate_option" { + description = "NAT IP allocation mode" + value = local.nat_ip_allocate_option +} + +output "region" { + description = "Cloud NAT region" + value = google_compute_router_nat.main.region +} + +output "router_name" { + description = "Cloud NAT router name" + value = local.router +} + diff --git a/ml-platform/02_gke/modules/cloud-nat/variables.tf b/ml-platform/02_gke/modules/cloud-nat/variables.tf new file mode 100644 index 000000000..84cd6fbbb --- /dev/null +++ b/ml-platform/02_gke/modules/cloud-nat/variables.tf @@ -0,0 +1,145 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +variable "project_id" { + type = string + description = "The project ID to deploy to" +} + +variable "region" { + type = string + description = "The region to deploy to" +} + +variable "icmp_idle_timeout_sec" { + type = string + description = "Timeout (in seconds) for ICMP connections. Defaults to 30s if not set. Changing this forces a new NAT to be created." + default = "30" +} + +variable "min_ports_per_vm" { + type = string + description = "Minimum number of ports allocated to a VM from this NAT config. Defaults to 64 if not set. Changing this forces a new NAT to be created." + default = "64" +} + +variable "max_ports_per_vm" { + type = string + description = "Maximum number of ports allocated to a VM from this NAT. This field can only be set when enableDynamicPortAllocation is enabled.This will be ignored if enable_dynamic_port_allocation is set to false." + default = null +} + +variable "name" { + type = string + description = "Defaults to 'cloud-nat-RANDOM_SUFFIX'. Changing this forces a new NAT to be created." + default = "" +} + +variable "nat_ips" { + type = list(string) + description = "List of self_links of external IPs. Changing this forces a new NAT to be created. Value of `nat_ip_allocate_option` is inferred based on nat_ips. If present set to MANUAL_ONLY, otherwise AUTO_ONLY." + default = [] +} + +variable "network" { + type = string + description = "VPC name, only if router is not passed in and is created by the module." 
+ default = "" +} + +variable "create_router" { + type = bool + description = "Create router instead of using an existing one, uses 'router' variable for new resource name." + default = false +} + +variable "router" { + type = string + description = "The name of the router in which this NAT will be configured. Changing this forces a new NAT to be created." +} + +variable "router_asn" { + type = string + description = "Router ASN, only if router is not passed in and is created by the module." + default = "64514" +} + +variable "router_keepalive_interval" { + type = string + description = "Router keepalive_interval, only if router is not passed in and is created by the module." + default = "20" +} + +variable "source_subnetwork_ip_ranges_to_nat" { + type = string + description = "Defaults to ALL_SUBNETWORKS_ALL_IP_RANGES. How NAT should be configured per Subnetwork. Valid values include: ALL_SUBNETWORKS_ALL_IP_RANGES, ALL_SUBNETWORKS_ALL_PRIMARY_IP_RANGES, LIST_OF_SUBNETWORKS. Changing this forces a new NAT to be created." + default = "ALL_SUBNETWORKS_ALL_IP_RANGES" +} + +variable "tcp_established_idle_timeout_sec" { + type = string + description = "Timeout (in seconds) for TCP established connections. Defaults to 1200s if not set. Changing this forces a new NAT to be created." + default = "1200" +} + +variable "tcp_transitory_idle_timeout_sec" { + type = string + description = "Timeout (in seconds) for TCP transitory connections. Defaults to 30s if not set. Changing this forces a new NAT to be created." + default = "30" +} + +variable "tcp_time_wait_timeout_sec" { + type = string + description = "Timeout (in seconds) for TCP connections that are in TIME_WAIT state. Defaults to 120s if not set." + default = "120" +} + +variable "udp_idle_timeout_sec" { + type = string + description = "Timeout (in seconds) for UDP connections. Defaults to 30s if not set. Changing this forces a new NAT to be created." 
+ default = "30" +} + +variable "subnetworks" { + description = "Specifies one or more subnetwork NAT configurations" + type = list(object({ + name = string, + source_ip_ranges_to_nat = list(string) + secondary_ip_range_names = list(string) + })) + default = [] +} + +variable "log_config_enable" { + type = bool + description = "Indicates whether or not to export logs" + default = false +} +variable "log_config_filter" { + type = string + description = "Specifies the desired filtering of logs on this NAT. Valid values are: \"ERRORS_ONLY\", \"TRANSLATIONS_ONLY\", \"ALL\"" + default = "ALL" +} + +variable "enable_dynamic_port_allocation" { + type = bool + description = "Enable Dynamic Port Allocation. If minPorts is set, minPortsPerVm must be set to a power of two greater than or equal to 32." + default = false + +} +variable "enable_endpoint_independent_mapping" { + type = bool + description = "Specifies if endpoint independent mapping is enabled." + default = null +} diff --git a/ml-platform/02_gke/modules/cloud-nat/versions.tf b/ml-platform/02_gke/modules/cloud-nat/versions.tf new file mode 100644 index 000000000..8422786e6 --- /dev/null +++ b/ml-platform/02_gke/modules/cloud-nat/versions.tf @@ -0,0 +1,30 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License.
+ +terraform { + required_providers { + + google = { + source = "hashicorp/google" + #version = ">= 4.51, < 5.0" + version = "4.72.1" + } + + random = { + source = "hashicorp/random" + version = "~> 2.2" + } + } + +} diff --git a/ml-platform/02_gke/modules/cluster/gke.tf b/ml-platform/02_gke/modules/cluster/gke.tf new file mode 100644 index 000000000..34186dbc8 --- /dev/null +++ b/ml-platform/02_gke/modules/cluster/gke.tf @@ -0,0 +1,124 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +data "google_client_config" "default" {} + +data "google_project" "project" { + project_id = var.project_id +} + +resource "google_container_cluster" "gke_batch" { + provider = google-beta + name = var.cluster_name + project = var.project_id + location = var.region + network = var.network + subnetwork = var.subnet + node_locations = ["${var.region}-a", "${var.region}-b", "${var.region}-c"] + initial_node_count = 2 + workload_identity_config { + workload_pool = "${var.project_id}.svc.id.goog" + } + addons_config { + gcp_filestore_csi_driver_config { + enabled = true + } + gcs_fuse_csi_driver_config { + enabled = true + } + gce_persistent_disk_csi_driver_config { + enabled = true + } + } + cluster_autoscaling { + enabled = true + autoscaling_profile = "OPTIMIZE_UTILIZATION" + resource_limits { + resource_type = "cpu" + minimum = 4 + maximum = 600 + } + resource_limits { + resource_type = "memory" + minimum = 16 + maximum = 2400 + } + resource_limits { + resource_type = "nvidia-tesla-t4" + maximum = 300 + } + resource_limits { + resource_type = "nvidia-l4" + maximum = 30 + } + resource_limits { + resource_type = "nvidia-tesla-a100" + maximum = 50 + } + resource_limits { + resource_type = "nvidia-a100-80gb" + maximum = 30 + } + resource_limits { + resource_type = "nvidia-tesla-v100" + maximum = 30 + } + resource_limits { + resource_type = "nvidia-tesla-p100" + maximum = 30 + } + resource_limits { + resource_type = "nvidia-tesla-p4" + maximum = 30 + } + resource_limits { + resource_type = "nvidia-tesla-k80" + maximum = 30 + } + auto_provisioning_defaults { + management { + auto_repair = true + auto_upgrade = true + } + + upgrade_settings { + strategy = "SURGE" + max_surge = 0 + max_unavailable = 1 + } + + oauth_scopes = [ + "https://www.googleapis.com/auth/cloud-platform" + ] + } + } + release_channel { + channel = "RAPID" + } + private_cluster_config { + enable_private_nodes = true + enable_private_endpoint = true + master_ipv4_cidr_block = "172.16.0.32/28" + } + 
master_authorized_networks_config { + + cidr_blocks { + cidr_block = var.master_auth_networks_ipcidr + display_name = "vpc-cidr" + } + } + ip_allocation_policy {} +} + + diff --git a/ml-platform/02_gke/modules/cluster/outputs.tf b/ml-platform/02_gke/modules/cluster/outputs.tf new file mode 100644 index 000000000..b26d3be8e --- /dev/null +++ b/ml-platform/02_gke/modules/cluster/outputs.tf @@ -0,0 +1,33 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +output "cluster_location" { + value = google_container_cluster.gke_batch.location +} + +output "cluster_name" { + value = google_container_cluster.gke_batch.name +} + +output "cluster_id" { + value = google_container_cluster.gke_batch.id +} + +output "gke_project_id" { + value = var.project_id +} + +output "env" { + value = var.env +} \ No newline at end of file diff --git a/ml-platform/02_gke/modules/cluster/variables.tf b/ml-platform/02_gke/modules/cluster/variables.tf new file mode 100644 index 000000000..66e3cda06 --- /dev/null +++ b/ml-platform/02_gke/modules/cluster/variables.tf @@ -0,0 +1,58 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +variable "project_id" { + type = string + description = "The GCP project where the resources will be created" + default = "" +} +variable "cluster_name" { + type = string + description = "GKE cluster name" + default = "" +} + +variable "region" { + type = string + description = "The GCP region where the GKE cluster will be created" + default = "us-central1" +} + +variable "zone" { + type = string + description = "The GCP zone where the reservation will be created" + default = "us-central1-a" +} + +variable "master_auth_networks_ipcidr" { + type = string + description = "master authorized network" +} + +variable "network" { + type = string + description = "VPC network where the cluster will be created" +} + +variable "subnet" { + type = string + description = "subnetwork where the cluster will be created" + +} + +variable "env" { + type = string + description = "environment" + +} \ No newline at end of file diff --git a/ml-platform/02_gke/modules/cluster/versions.tf b/ml-platform/02_gke/modules/cluster/versions.tf new file mode 100644 index 000000000..dc628619e --- /dev/null +++ b/ml-platform/02_gke/modules/cluster/versions.tf @@ -0,0 +1,27 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +terraform { + required_providers { + google-beta = { + source = "hashicorp/google-beta" + version = "4.72.1" + } + google = { + source = "hashicorp/google" + version = "4.72.1" + } + } +} + diff --git a/ml-platform/02_gke/modules/network/README.md b/ml-platform/02_gke/modules/network/README.md new file mode 100644 index 000000000..6de9bdc13 --- /dev/null +++ b/ml-platform/02_gke/modules/network/README.md @@ -0,0 +1,110 @@ +Copyright 2024 Google LLC + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +## Requirements + +| Name | Version | +|------|---------| +| [google](#requirement\_google) | >= 4.28.0 | + +## Modules + +| Name | Source | Version | +|------|--------|---------| +| [vpc](#module\_vpc) | terraform-google-modules/network/google | 5.2.0 | + +## Inputs + +| Name | Description | Type | Default | Required | +|------|-------------|------|---------|:--------:| +| [network\_name](#input\_network\_name) | Name of the VPC network. | `string` | n/a | yes | +| [project\_id](#input\_project\_id) | Id of the GCP project where VPC is to be created. 
| `string` | n/a | yes | +| [routing\_mode](#input\_routing\_mode) | The network routing mode. | `string` | n/a | yes | +| [subnet\_01\_description](#input\_subnet\_01\_description) | Subnet description. | `string` | n/a | yes | +| [subnet\_01\_ip](#input\_subnet\_01\_ip) | IP range of first subnet. | `string` | n/a | yes | +| [subnet\_01\_name](#input\_subnet\_01\_name) | Name of first subnet. | `string` | n/a | yes | +| [subnet\_01\_region](#input\_subnet\_01\_region) | Region of first subnet. | `string` | n/a | yes | +| [subnet\_01\_secondary\_pod\_name](#input\_subnet\_01\_secondary\_pod\_name) | Name of pods IP range. | `string` | n/a | yes | +| [subnet\_01\_secondary\_pod\_range](#input\_subnet\_01\_secondary\_pod\_range) | IP range of the pods. | `string` | n/a | yes | +| [subnet\_01\_secondary\_svc\_1\_name](#input\_subnet\_01\_secondary\_svc\_1\_name) | Name of service IP range. | `string` | n/a | yes | +| [subnet\_01\_secondary\_svc\_1\_range](#input\_subnet\_01\_secondary\_svc\_1\_range) | IP range of the service. | `string` | n/a | yes | +| [subnet\_01\_secondary\_svc\_2\_name](#input\_subnet\_01\_secondary\_svc\_2\_name) | Name of service IP range. | `string` | n/a | yes | +| [subnet\_01\_secondary\_svc\_2\_range](#input\_subnet\_01\_secondary\_svc\_2\_range) | IP range of the service. | `string` | n/a | yes | +| [subnet\_02\_description](#input\_subnet\_02\_description) | Subnet description. | `string` | n/a | yes | +| [subnet\_02\_ip](#input\_subnet\_02\_ip) | IP range of second subnet. | `string` | n/a | yes | +| [subnet\_02\_name](#input\_subnet\_02\_name) | Name of the second subnet. | `string` | n/a | yes | +| [subnet\_02\_region](#input\_subnet\_02\_region) | Region of second subnet. | `string` | n/a | yes | +| [subnet\_02\_secondary\_pod\_name](#input\_subnet\_02\_secondary\_pod\_name) | Name of pods IP range. | `string` | n/a | yes | +| [subnet\_02\_secondary\_pod\_range](#input\_subnet\_02\_secondary\_pod\_range) | IP range of the pods. 
| `string` | n/a | yes | +| [subnet\_02\_secondary\_svc\_1\_name](#input\_subnet\_02\_secondary\_svc\_1\_name) | Name of service IP range. | `string` | n/a | yes | +| [subnet\_02\_secondary\_svc\_1\_range](#input\_subnet\_02\_secondary\_svc\_1\_range) | IP range of the service. | `string` | n/a | yes | +| [subnet\_02\_secondary\_svc\_2\_name](#input\_subnet\_02\_secondary\_svc\_2\_name) | Name of service IP range. | `string` | n/a | yes | +| [subnet\_02\_secondary\_svc\_2\_range](#input\_subnet\_02\_secondary\_svc\_2\_range) | IP range of the service. | `string` | n/a | yes | + +## Outputs + +| Name | Description | +|------|-------------| +| [network](#output\_network) | Object containing details of the VPC network. | + +## Usage + +```hcl + source = "git::https://github.com/YOUR_GITHUB_ORG/terraform-modules.git//vpc/" + project_id = "my-project" + network_name = "my-network" + routing_mode = "GLOBAL" + subnet_01_name = "subnet-1" + subnet_01_ip = "10.40.0.0/22" + subnet_01_region = "us-central1" + subnet_01_description = "subnet 1" + subnet_02_name = "subnet-2" + subnet_02_ip = "10.12.0.0/22" + subnet_02_region = "us-central1" + subnet_02_description = "subnet 2" + subnet_01_secondary_svc_1_name = "subnet1-service1" + subnet_01_secondary_svc_1_range = "10.5.0.0/20" + subnet_01_secondary_svc_2_name = "subnet1-service2" + subnet_01_secondary_svc_2_range = "10.5.16.0/20" + subnet_01_secondary_pod_name = "subnet1-pod" + subnet_01_secondary_pod_range = "10.0.0.0/14" + subnet_02_secondary_svc_1_name = "subnet2-service1" + subnet_02_secondary_svc_1_range = "10.13.0.0/20" + subnet_02_secondary_svc_2_name = "subnet2-service2" + subnet_02_secondary_svc_2_range = "10.13.16.0/20" + subnet_02_secondary_pod_name = "subnet2-pod" + subnet_02_secondary_pod_range = "10.8.0.0/14" + +} +``` + +## Workflow + +This module is called from [multi-tenant platform repo][muti-tenant-platform-repo] that stands up multi-tenant infrastructure for [dev][dev-multi-tenant], 
[staging][staging-multi-tenant] and [prod][prod-multi-tenant] environments to create a VPC network. Additionally, this module can be called by [infrastructure repo][infra-repo] if the application needs its own VPC networks inside its projects. + +## Contributing + +* [Contributing guidelines][contributing-guidelines] +* [Code of conduct][code-of-conduct] + + + +[contributing-guidelines]: CONTRIBUTING.md +[code-of-conduct]: code-of-conduct.md + + +[muti-tenant-platform-repo]: ../../platform-template +[dev-multi-tenant]: ../../platform-template/env/dev/main.tf?plain=1#L50 +[staging-multi-tenant]: ../../platform-template/env/staging/main.tf?plain=1#L50 +[prod-multi-tenant]: ../../platform-template/env/prod/main.tf?plain=1#L50 +[infra-repo]: ../../app-factory-template/README.md?plain=1#L64 diff --git a/ml-platform/02_gke/modules/network/outputs.tf b/ml-platform/02_gke/modules/network/outputs.tf new file mode 100644 index 000000000..bf9d36dad --- /dev/null +++ b/ml-platform/02_gke/modules/network/outputs.tf @@ -0,0 +1,28 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +output "vpc" { + value = google_compute_network.vpc-network.id + description = "VPC." +} + +output "subnet-1" { + value = google_compute_subnetwork.subnet-1.id + description = "subnet1." +} + +output "subnet-2" { + value = google_compute_subnetwork.subnet-2.id + description = "subnet2." 
+} \ No newline at end of file diff --git a/ml-platform/02_gke/modules/network/variables.tf b/ml-platform/02_gke/modules/network/variables.tf new file mode 100644 index 000000000..e85ab0e48 --- /dev/null +++ b/ml-platform/02_gke/modules/network/variables.tf @@ -0,0 +1,56 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +variable "project_id" { + description = "Id of the GCP project where VPC is to be created." + type = string +} +variable "network_name" { + description = "Name of the VPC network." + type = string +} +variable "routing_mode" { + description = "The network routing mode." + type = string + default = "GLOBAL" +} +variable "subnet_01_name" { + description = "Name of first subnet." + type = string +} +variable "subnet_01_ip" { + description = "IP range of first subnet." + type = string +} +variable "subnet_01_region" { + description = "Region of first subnet." + type = string +} + +variable "subnet_02_name" { + description = "Name of the second subnet." + type = string +} +variable "subnet_02_ip" { + description = "IP range of second subnet." + type = string +} +variable "subnet_02_region" { + description = "Region of second subnet." + type = string +} +//variable "default_route_name" { +// description = "Name of the default route to internet." 
+// type = string +//} diff --git a/ml-platform/02_gke/modules/network/versions.tf b/ml-platform/02_gke/modules/network/versions.tf new file mode 100644 index 000000000..033f83d8f --- /dev/null +++ b/ml-platform/02_gke/modules/network/versions.tf @@ -0,0 +1,22 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +terraform { + required_providers { + google = { + source = "hashicorp/google" + version = ">= 4.28.0" + } + } +} \ No newline at end of file diff --git a/ml-platform/02_gke/modules/network/vpc.tf b/ml-platform/02_gke/modules/network/vpc.tf new file mode 100644 index 000000000..ad7071b5a --- /dev/null +++ b/ml-platform/02_gke/modules/network/vpc.tf @@ -0,0 +1,46 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +resource "google_compute_network" "vpc-network" { + project = var.project_id + name = var.network_name + auto_create_subnetworks = false + routing_mode = var.routing_mode +} + +resource "google_compute_subnetwork" "subnet-1" { + project = var.project_id + name = var.subnet_01_name + ip_cidr_range = var.subnet_01_ip + region = var.subnet_01_region + network = google_compute_network.vpc-network.id + private_ip_google_access = true +} + +resource "google_compute_subnetwork" "subnet-2" { + project = var.project_id + name = var.subnet_02_name + ip_cidr_range = var.subnet_02_ip + region = var.subnet_02_region + network = google_compute_network.vpc-network.id + private_ip_google_access = true +} + +//resource "google_compute_route" "default-route" { +//name = var.default_route_name +//dest_range = "0.0.0.0/0" +//network = google_compute_network.vpc-network.id +//priority = 1000 +//next_hop_gateway = "default-internet-gateway" +//} diff --git a/ml-platform/02_gke/modules/node-pools/nodepools.tf b/ml-platform/02_gke/modules/node-pools/nodepools.tf new file mode 100644 index 000000000..6eec2bc7d --- /dev/null +++ b/ml-platform/02_gke/modules/node-pools/nodepools.tf @@ -0,0 +1,64 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +resource "google_container_node_pool" "node-pool" { + name = format("%s-%s",var.cluster_name,var.node_pool_name) + project = var.project_id + cluster = var.cluster_name + location = var.region + node_config { + machine_type = var.machine_type + taint = var.taints + labels = { + "resource-type" : var.resource_type + } + + guest_accelerator { + type = var.accelerator + count = var.accelerator_count + } + oauth_scopes = [ + "https://www.googleapis.com/auth/cloud-platform" + ] + + dynamic "reservation_affinity" { + for_each = var.reservation_name != "" ? [1] : [ ] + content { + consume_reservation_type = "SPECIFIC_RESERVATION" + key = "compute.googleapis.com/reservation-name" + values = [var.reservation_name] + } + } + } + autoscaling { + total_min_node_count = var.autoscaling["total_min_node_count"] + total_max_node_count = var.autoscaling["total_max_node_count"] + location_policy = var.autoscaling["location_policy"] + } + + timeouts { + create = "30m" + update = "20m" + } + + lifecycle { + ignore_changes = [ + node_config[0].labels, + node_config[0].taint, + ] + } + network_config { + enable_private_nodes = true + } +} \ No newline at end of file diff --git a/ml-platform/02_gke/modules/node-pools/variables.tf b/ml-platform/02_gke/modules/node-pools/variables.tf new file mode 100644 index 000000000..f217268b8 --- /dev/null +++ b/ml-platform/02_gke/modules/node-pools/variables.tf @@ -0,0 +1,83 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +variable "node_pool_name" { + type = string + description = "Name of the node pool" +} +variable "project_id" { + type = string + description = "The GCP project where the resources will be created" + default = "" +} +variable "cluster_name" { + type = string + description = "GKE cluster name" + default = "" +} +variable "region" { + type = string + description = "The GCP location (region or zone) where the node pool will be created" + default = "us-central1-a" +} + +variable "machine_type" { + type = string + description = "The machine type to use." + default = "g2-standard-24" +} + +variable "taints" { + description = "Taints to be applied to the node pool." + type = list(object({ + key = string + value = any + effect = string + })) +} + +variable "resource_type" { + description = "ondemand/spot/reserved." + type = string + default = "ondemand" +} + + +variable "accelerator" { + type = string + description = "The GPU accelerator to use." + default = "nvidia-l4" +} + +variable "accelerator_count" { + type = number + description = "The number of accelerators per machine."
+ default = 2 +} +variable "machine_reservation_count" { + type = number + description = "Number of machines reserved instances with GPUs" + default = 4 +} + +variable "autoscaling" { + type = map(any) + default = { "total_min_node_count" : 0, "total_max_node_count" : 24, "location_policy" : "ANY"} +} + +variable "reservation_name" { + description = "reservation name to which the nodepool will be associated" + type = string + default = "" +} \ No newline at end of file diff --git a/ml-platform/02_gke/modules/node-pools/versions.tf b/ml-platform/02_gke/modules/node-pools/versions.tf new file mode 100644 index 000000000..dc628619e --- /dev/null +++ b/ml-platform/02_gke/modules/node-pools/versions.tf @@ -0,0 +1,27 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +terraform { + required_providers { + google-beta = { + source = "hashicorp/google-beta" + version = "4.72.1" + } + google = { + source = "hashicorp/google" + version = "4.72.1" + } + } +} + diff --git a/ml-platform/02_gke/modules/vm-reservations/outputs.tf b/ml-platform/02_gke/modules/vm-reservations/outputs.tf new file mode 100644 index 000000000..367c796d1 --- /dev/null +++ b/ml-platform/02_gke/modules/vm-reservations/outputs.tf @@ -0,0 +1,17 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +output "reservation_name" { + value = google_compute_reservation.machine_reservation.name +} \ No newline at end of file diff --git a/ml-platform/02_gke/modules/vm-reservations/reservations.tf b/ml-platform/02_gke/modules/vm-reservations/reservations.tf new file mode 100644 index 000000000..3e35e47c5 --- /dev/null +++ b/ml-platform/02_gke/modules/vm-reservations/reservations.tf @@ -0,0 +1,30 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License.
+ +resource "google_compute_reservation" "machine_reservation" { + project = var.project_id + specific_reservation_required = true + name = format("%s-%s",var.cluster_name,"reservation") + zone = var.zone + specific_reservation { + count = var.machine_reservation_count + instance_properties { + machine_type = var.machine_type + guest_accelerators { + accelerator_type = var.accelerator + accelerator_count = var.accelerator_count + } + } + } +} \ No newline at end of file diff --git a/ml-platform/02_gke/modules/vm-reservations/variables.tf b/ml-platform/02_gke/modules/vm-reservations/variables.tf new file mode 100644 index 000000000..3a8e3482d --- /dev/null +++ b/ml-platform/02_gke/modules/vm-reservations/variables.tf @@ -0,0 +1,51 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +variable "project_id" { + type = string + description = "The GCP project where the resources will be created" + default = "" +} +variable "cluster_name" { + type = string + description = "GKE cluster name" + default = "" +} +variable "zone" { + type = string + description = "The GCP zone where the reservation will be created" + default = "us-central1-a" +} +variable "machine_type" { + type = string + description = "The machine type to use." + default = "g2-standard-24" +} + +variable "accelerator" { + type = string + description = "The GPU accelerator to use." 
+ default = "nvidia-l4" +} + +variable "accelerator_count" { + type = number + description = "The number of accelerators per machine." + default = 2 +} +variable "machine_reservation_count" { + type = number + description = "Number of machines reserved instances with GPUs" + default = 2 +} diff --git a/ml-platform/02_gke/modules/vm-reservations/versions.tf b/ml-platform/02_gke/modules/vm-reservations/versions.tf new file mode 100644 index 000000000..dc628619e --- /dev/null +++ b/ml-platform/02_gke/modules/vm-reservations/versions.tf @@ -0,0 +1,27 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +terraform { + required_providers { + google-beta = { + source = "hashicorp/google-beta" + version = "4.72.1" + } + google = { + source = "hashicorp/google" + version = "4.72.1" + } + } +} + diff --git a/ml-platform/02_gke/outputs.tf b/ml-platform/02_gke/outputs.tf new file mode 100644 index 000000000..76dca95a5 --- /dev/null +++ b/ml-platform/02_gke/outputs.tf @@ -0,0 +1,17 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +output "gke_cluster" { + value = module.gke +} \ No newline at end of file diff --git a/ml-platform/02_gke/providers.tf b/ml-platform/02_gke/providers.tf new file mode 100644 index 000000000..dc628619e --- /dev/null +++ b/ml-platform/02_gke/providers.tf @@ -0,0 +1,27 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +terraform { + required_providers { + google-beta = { + source = "hashicorp/google-beta" + version = "4.72.1" + } + google = { + source = "hashicorp/google" + version = "4.72.1" + } + } +} + diff --git a/ml-platform/02_gke/variables.tf b/ml-platform/02_gke/variables.tf new file mode 100644 index 000000000..bd3da28f1 --- /dev/null +++ b/ml-platform/02_gke/variables.tf @@ -0,0 +1,126 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +variable "project_id" { + type = map(string) + description = "The GCP project where the resources will be created. It is a map with environments as keys and project_ids as values" + default = {} + #Below is an example of not null project_id variable + #default = { "dev" : "gkebatchexpce3c8dcb", "staging" : "gkebatchexpce3c8dcb", "prod" : "gkebatchexpce3c8dcb" } +} + +variable "network_name" { + default = "ml-vpc" + description = "VPC network where GKE cluster will be created" + type = string +} +variable "routing_mode" { + default = "GLOBAL" + description = "VPC routing mode." + type = string +} +variable "subnet_01_name" { + default = "ml-vpc-subnet-01" + description = "Name of the first subnet in the VPC network." + type = string +} +variable "subnet_01_ip" { + default = "10.40.0.0/22" + description = "CIDR of the first subnet." + type = string +} +variable "subnet_01_region" { + default = "us-central1" + description = "Region of the first subnet." + type = string +} +variable "subnet_01_description" { + default = "subnet 01" + description = "Description of the first subnet." + type = string +} +variable "subnet_02_name" { + default = "gke-vpc-subnet-02" + description = "Name of the second subnet in the VPC network." + type = string +} +variable "subnet_02_ip" { + default = "10.12.0.0/22" + description = "CIDR of the second subnet." + type = string +} +variable "subnet_02_region" { + default = "us-west2" + description = "Region of the second subnet." + type = string +} +variable "subnet_02_description" { + default = "subnet 02" + description = "Description of the second subnet." + type = string +} + +variable "lookup_state_bucket" { + description = "GCS bucket to look up TF state from previous steps." + type = string + default = "YOUR_STATE_BUCKET" +} + +variable "cluster_name" { + description = "Name of the GKE cluster" + default = "gke-ml" + type = string +} +variable "reserved_taints" { + description = "Taints to be applied to the reserved node pool."
+ type = list(object({ + key = string + value = any + effect = string + })) + default = [{ + key = "reserved" + value = true + effect = "NO_SCHEDULE" + }] +} + +variable "ondemand_taints" { + description = "Taints to be applied to the on-demand node pool." + type = list(object({ + key = string + value = any + effect = string + })) + default = [{ + key = "ondemand" + value = true + effect = "NO_SCHEDULE" + }] +} + +variable "spot_taints" { + description = "Taints to be applied to the spot node pool." + type = list(object({ + key = string + value = any + effect = string + })) + default = [{ + key = "spot" + value = true + effect = "NO_SCHEDULE" + }] +} + diff --git a/ml-platform/03_configsync/README.md b/ml-platform/03_configsync/README.md new file mode 100644 index 000000000..ef777933e --- /dev/null +++ b/ml-platform/03_configsync/README.md @@ -0,0 +1,145 @@ +Copyright 2024 Google LLC + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License.
+## Requirements + +| Name | Version | +|------|--------| +| [github](#requirement\_github) | >= 4.3.0 | +| [google](#requirement\_google) | >= 4.72.1 | +| [google-beta](#requirement\_google-beta) | >= 4.72.1 | + +## Inputs + +| Name | Description | Type | Default | Required | +|----------------------------------------------------------------------------------------------------|-----------------------------------------------------------------------------------------------------------------------------|----------|---------|:--------:| +| [project\_id](#input\_project\_id) | Id of the GCP Project where the resources will be created. It is a map with environments as keys and project ids as values. | `map` | n/a | yes | +| [github\_user](#github\_user) | GitHub user name. | `string` | n/a | yes | +| [github\_email](#input\_github\_email) | GitHub user email. | `string` | n/a | yes | +| [github\_org](#input\_github\_org) | GitHub org. | `string` | n/a | yes | +| [github\_token](#input\_github\_token) | GitHub access token | `string` | n/a | yes | +| [lookup\_state\_bucket](#input\_lookup\_state\_bucket) | Lookup TF State bucket. Used for looking up resources created in steps 01 and 02. | `string` | n/a | yes | +| [configsync\_repo\_name](#input\_configsync\_repo\_name) | Configsync repo name to be created in GitHub. | `string` | n/a | no | + +## Prerequisite +- You have created GKE clusters using [02_gke][cluster] module. +- You have the role `roles/Owner` on the projects where you have created GKE clusters. + +## Usage +- Clone the repo and change dir + ``` + git clone https://github.com/GoogleCloudPlatform/ai-on-gke + cd ml-platform/03_configsync + ``` +- In backend.tf replace `YOUR_STATE_BUCKET` with the name of the GCS bucket. +- In variables.tf, provide the values of the following variables: + - `github_user` : GitHub user. We recommend you use a system user account. + - `github_email` : Email of the system user account. 
+ - `github_org` : GitHub org where the config sync repo will be created. + - `lookup_state_bucket` : name of the GCS bucket. + - `configsync_repo_name` : Suitable name for your config sync repo. + +- You also need to provide a personal access token for the GitHub user. Generate a [personal access token][personal-access-token] with access to create and delete repo for the user in GitHub and pass it as env variable: + - export TF_VAR_github_token="``" +- terraform init +- terraform plan +- terraform apply --auto-approve + + +This module performs the following actions: +- Looks up project_id from the state file if not provided. +- Looks up GKE clusters created in step 02. +- Creates a GitHub repository and branches corresponding to each environment and applies branch protection rules on it. This is the configsync repo. +- Creates Config sync on each GKE cluster. +- Hydrates templates into K8s manifests and commits them to the default branch of the GitHub repo to do initial cluster setup. + +## Config sync repo workflow +After this module has been successfully completed, you will get a [root-sync][root-sync] object created on all the GKE clusters. + +Go to Google Cloud Console, click on the navigation menu and click on Kubernetes Engine > Config. Now, click on the `PACKAGES` tab. You will see three [root-sync][root-sync] objects created, one for each cluster. Review the `Source url` against the `dev` cluster. It should be something like: +``` +https://github.com//experiment-acm-repo/tree/dev/manifests/clusters + +``` +This means that the `dev` cluster is associated with the `manifests/clusters` folder on the `dev` branch of the configsync repo. So, manifests under `manifests/clusters` folder on the `dev` branch will be synced with the dev cluster. +Similarly, the folder `manifests/clusters` on `staging` branch will be synced with the `staging` cluster and `manifests/clusters` on `prod` branch will be synced with `prod` cluster.
+ +We will follow GitOps methodology to create resources on the clusters. This means you can only make changes to the default branch while other branches are protected. In order to merge changes to non-default branches, you will need to create a pull request. + +The following documentation will assume that you have three clusters `dev`, `staging` and `prod` and that resulted in three branches on the configsync repo `dev`, `staging` and `prod`. The `dev` branch is the default branch. + +To follow the `GitOps` approach, you will make changes and push them to the `dev` branch. Config sync will then sync the `dev` branch with the `dev` cluster. If the changes look good in the `dev` environment, +and are ready to be moved to `staging`, you create a pull request from `dev` to `staging` branch. Once this pull request is approved and merged, the `staging` branch will be synced with the `staging` cluster reflecting the changes in the staging environment. +Similarly, when you are ready to promote the changes to the production environment, create a pull request from `staging` to `prod` branch and merge it. + +## Managing cluster-level and application-level objects + +It is recommended to have a separation of duties on who should be able to create what objects in a cluster. +The principle to follow should be that the cluster-level objects can only be created by platform admins while the application teams should be able to create their own application level objects. + +To achieve this separation, we will use [root-sync][root-sync] and [repo-sync][repo-sync]. [root-sync][root-sync] allows you to create cluster-scoped objects while [repo-sync][repo-sync] allows you to create namespace-scoped objects. + +### Cluster-level objects +Since the [root-sync][root-sync] object is associated with the folder `manifests/clusters`, the cluster level objects will be created from this folder. This includes creating CRDs, namespaces etc.
So, for example, if you want to create a namespace as a platform admin, create a `yaml` file with the required K8s definition and save it under `manifests/clusters`. The namespace will be created on the cluster as soon as the sync happens. + +Note that the owner of the repo should create a CODEOWNERS file to allow access to the platform admins to this folder so that only they can make cluster level objects. The Application teams should not have access to `manifests/clusters`. + +In the section [04_setup_clusters][cluster-setup], you will create cluster scoped objects. + + +### Application-level objects +It is recommended to provide each Application its own dedicated namespace. This means, only the application and related resources will be created in that namespace. The owner of the application or the app team will get full access to the namespace so they can manage their application without having to be dependent on the platform admins. + +Since the namespace is a cluster-scoped object, platform admin will need to create the namespace for the application and grant the app team members access on the namespace. Additionally, they will provide a [repo-sync][repo-sync] repo to the app teams so they can use that to manage their application's Kubernetes resources. Once this setup is done, the app team members can manage the application inside the namespace with the manifests in the [repo-sync][repo-sync] repo. + +In the section [05_setup_teams][team-setup], you will learn how the platform admins will set up an application by providing a namespace to the App team along with a [repo-sync][repo-sync] that the app teams will use to manage their applications. + +In the section [06_operating_teams][operating-teams], you will learn how the app teams can use their [repo-sync][repo-sync] to manage their application. + +## Troubleshooting +If you do not have [GitHub pro membership][github-pro], you cannot apply branch protection rules on your repositories in GitHub.
This will cause a `409` error when you run `terraform apply`. You can ignore these errors. The downside is that you will not get branch protection rules on your configsync repository and can accidentally push changes directly to the non-default branches (`staging` and `prod`; `dev` is the default branch). In other words, it will break the `GitOps` flow. + +## Contributing + +* [Contributing guidelines][contributing-guidelines] +* [Code of conduct][code-of-conduct] + + + +[contributing-guidelines]: CONTRIBUTING.md +[code-of-conduct]: code-of-conduct.md +[personal-access-token]: https://docs.github.com/en/authentication/keeping-your-account-and-data-secure/managing-your-personal-access-tokens +[repo-sync]: https://cloud.google.com/anthos-config-management/docs/reference/rootsync-reposync-fields +[root-sync]: https://cloud.google.com/anthos-config-management/docs/reference/rootsync-reposync-fields +[cluster-setup]: ../04_setup_clusters/README.md +[team-setup]: ../05_setup_teams/README.md +[operating-teams]: ../06_operating_teams +[cluster]: ../02_gke +[github-pro]: https://docs.github.com/en/get-started/learning-about-github/githubs-plans + +## Clean up + +1. The easiest way to prevent continued billing for the resources that you created for this tutorial is to delete the project you created for the tutorial. Run the following commands from Cloud Shell, replacing `YOUR_PROJECT_ID` with the ID of that project: + + ```bash + gcloud config unset project && \ + echo y | gcloud projects delete YOUR_PROJECT_ID + ``` + +2. If the project needs to be left intact, another option is to destroy the infrastructure created from this module. Note that this does not destroy the Cloud Storage bucket containing the Terraform state or the service enablement performed outside of Terraform.
+
+    ```bash
+    cd ml-platform/03_configsync && \
+    terraform destroy --auto-approve
+    ```
+
diff --git a/ml-platform/03_configsync/backend.tf b/ml-platform/03_configsync/backend.tf
new file mode 100644
index 000000000..b9d73f15f
--- /dev/null
+++ b/ml-platform/03_configsync/backend.tf
@@ -0,0 +1,20 @@
+# Copyright 2024 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+terraform {
+  backend "gcs" {
+    prefix = "03_config_sync_prerequisite"
+    bucket = "YOUR_STATE_BUCKET"
+  }
+}
diff --git a/ml-platform/03_configsync/create_cluster_yamls.sh b/ml-platform/03_configsync/create_cluster_yamls.sh
new file mode 100755
index 000000000..3c659a198
--- /dev/null
+++ b/ml-platform/03_configsync/create_cluster_yamls.sh
@@ -0,0 +1,82 @@
+#!/bin/bash
+#
+# Copyright 2024 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Adds one cluster to the Config Sync repository: clones the repository,
+# seeds it from templates/acm-template when it is still empty, renders the
+# cluster and selector manifests into manifests/clusters and pushes the result.
+#
+# Arguments (passed by the local-exec provisioner in main.tf):
+#   1  github_org     GitHub organization (accepted for compatibility, unused)
+#   2  acm_repo_name  repository in "owner/name" form
+#   3  github_user    git user name, also used for HTTPS authentication
+#   4  github_email   e-mail address recorded on the commit
+#   5  cluster_env    environment the cluster belongs to
+#   6  cluster_name   name of the cluster
+#   7  index          zero-based position of the cluster, used to stagger runs
+#
+# Environment:
+#   TF_VAR_github_token  token used to authenticate against github.com
+
+# Stop at the first failing command instead of committing a half-rendered tree.
+set -euo pipefail
+
+if [ "$#" -ne 7 ]; then
+  echo "usage: $0 github_org acm_repo_name github_user github_email cluster_env cluster_name index" >&2
+  exit 1
+fi
+
+acm_repo_name="${2}"
+github_user="${3}"
+github_email="${4}"
+cluster_env="${5}"
+cluster_name="${6}"
+index="${7}"
+: "${TF_VAR_github_token:?TF_VAR_github_token must be set}"
+
+# Runs for different clusters can overlap and all of them push to the same
+# branch, so every run waits 20 seconds longer than the previous index.
+sleep_time=20
+sleep "$(( sleep_time * (index + 1) ))"
+
+# Directory of this script; the acm-template skeleton lives next to it.
+script_dir="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+log="$(pwd)/log"
+
+# Clone into a unique scratch directory and always remove it on exit, also
+# when a later command fails.
+clone_dir="$(mktemp -d "$(pwd)/${acm_repo_name##*/}-XXXXXXXXXX")"
+trap 'rm -rf "${clone_dir}"' EXIT
+
+echo "Download repo is ${clone_dir}" >> "${log}"
+git clone "https://${github_user}:${TF_VAR_github_token}@github.com/${acm_repo_name}" "${clone_dir}"
+cd "${clone_dir}"
+
+# Set the committer for this clone only; the global git configuration of the
+# user running Terraform is left untouched.
+git config user.name "${github_user}"
+git config user.email "${github_email}"
+
+# A repository that has neither directory yet is seeded from the skeleton.
+if [ ! -d "manifests" ] && [ ! -d "templates" ]; then
+  echo "copying files" >> "${log}"
+  cp -r "${script_dir}/templates/acm-template/"* .
+else
+  echo "not copying files" >> "${log}"
+fi
+echo "env is ${cluster_env}" >> "${log}"
+
+# The template directory is named with an underscore (_cluster_template).
+template_dir="${clone_dir}/templates/_cluster_template"
+cluster_file="manifests/clusters/${cluster_name}-cluster.yaml"
+selector_file="manifests/clusters/${cluster_env}-selector.yaml"
+
+mkdir -p manifests/clusters
+cp "${template_dir}/cluster.yaml" "${cluster_file}"
+cp "${template_dir}/selector.yaml" "${selector_file}"
+# kuberay is a directory, so it has to be copied recursively.
+cp -r "${template_dir}/kuberay" manifests/clusters/
+
+# Replace the placeholders used by the templates.
+sed -i "s/CLUSTER_NAME/${cluster_name}/g; s/ENV/${cluster_env}/g" "${cluster_file}"
+sed -i "s/ENV/${cluster_env}/g" "${selector_file}"
+
+git add --all .
+# Nothing is staged when the rendered files are already in the repository;
+# skip the commit and the push in that case instead of failing.
+if git diff --cached --quiet; then
+  echo "nothing to commit for ${cluster_name}" >> "${log}"
+else
+  git commit -m "Adding ${cluster_name} cluster to the ${cluster_env} environment."
+  git push origin
+fi
diff --git a/ml-platform/03_configsync/main.tf b/ml-platform/03_configsync/main.tf
new file mode 100644
index 000000000..b8ad93325
--- /dev/null
+++ b/ml-platform/03_configsync/main.tf
@@ -0,0 +1,131 @@
+# Copyright 2024 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Reads the outputs of the 02_gke state, which describe the GKE clusters.
+data "terraform_remote_state" "gke-clusters" {
+  backend = "gcs"
+  config = {
+    bucket = var.lookup_state_bucket
+    prefix = "02_gke"
+  }
+}
+
+locals {
+  # One entry per cluster. The resources below read the attributes
+  # gke_project_id, cluster_name, cluster_id and env of every entry.
+  parsed_gke_info = data.terraform_remote_state.gke-clusters.outputs.gke_cluster
+  project_id_list = [for k, v in local.parsed_gke_info : v.gke_project_id]
+
+  # Repository name without the owner prefix.
+  acm_repo_name = github_repository.acm_repo.name
+
+  # keys() returns the keys in lexicographical order, so the default branch is
+  # the key that sorts first.
+  default_branch = keys(local.parsed_gke_info)[0]
+
+  # Branches that receive protection rules: all of them except the default one.
+  protected_branches = {
+    for k, v in local.parsed_gke_info : k => v if k != local.default_branch
+  }
+}
+
+# NOTE(review): the "configmanagement" feature referenced by
+# google_gke_hub_feature_membership.feature_member is not created in this file;
+# presumably it is enabled elsewhere - confirm. The disabled definition is kept
+# for reference:
+#
+# resource "google_gke_hub_feature" "configmanagement_acm_feature" {
+#   count    = length(distinct(local.project_id_list))
+#   name     = "configmanagement"
+#   project  = distinct(local.project_id_list)[count.index]
+#   location = "global"
+#   provider = google-beta
+# }
+
+# Creates one GKE Hub membership per cluster.
+resource "google_gke_hub_membership" "membership" {
+  provider      = google-beta
+  for_each      = local.parsed_gke_info
+  project       = each.value["gke_project_id"]
+  membership_id = each.value["cluster_name"]
+  endpoint {
+    gke_cluster {
+      resource_link = format("%s/%s", "//container.googleapis.com", each.value["cluster_id"])
+    }
+  }
+  lifecycle {
+    # Attribute references are written unquoted; the quoted form is deprecated.
+    ignore_changes = [labels, description]
+  }
+}
+
+# Private repository that Config Sync reads the manifests from.
+resource "github_repository" "acm_repo" {
+  name         = var.configsync_repo_name
+  description  = "Repo for Config Sync"
+  visibility   = "private"
+  has_issues   = false
+  has_projects = false
+  has_wiki     = false
+
+  allow_merge_commit     = true
+  allow_squash_merge     = true
+  allow_rebase_merge     = true
+  delete_branch_on_merge = false
+  auto_init              = true
+  vulnerability_alerts   = true
+}
+
+# Create a branch for each env.
+resource "github_branch" "branch" {
+  for_each   = local.parsed_gke_info
+  repository = local.acm_repo_name
+  branch     = each.key
+}
+
+# Set the default branch to the lowest env.
+resource "github_branch_default" "default_branch" {
+  repository = local.acm_repo_name
+  branch     = local.default_branch
+  # The branch has to exist before it can become the default.
+  depends_on = [github_branch.branch]
+}
+
+# Protect branches other than the default branch. The default branch stays
+# unprotected because create_cluster_yamls.sh pushes to it directly.
+resource "github_branch_protection_v3" "branch_protection" {
+  for_each   = local.protected_branches
+  repository = local.acm_repo_name
+  branch     = each.key
+  required_pull_request_reviews {
+    required_approving_review_count = 1
+    require_code_owner_reviews      = true
+  }
+  # Push restrictions without any user, team or app listed.
+  restrictions {
+  }
+
+  depends_on = [github_branch.branch]
+}
+
+# Configures Config Sync and Policy Controller for every membership and, on
+# creation, adds the manifests of the cluster to the repository.
+resource "google_gke_hub_feature_membership" "feature_member" {
+  provider   = google-beta
+  for_each   = local.parsed_gke_info
+  project    = each.value["gke_project_id"]
+  location   = "global"
+  feature    = "configmanagement"
+  membership = google_gke_hub_membership.membership[each.key].membership_id
+  configmanagement {
+    version = "1.17.0"
+    config_sync {
+      source_format = "unstructured"
+      git {
+        sync_repo = "https://github.com/${github_repository.acm_repo.full_name}.git"
+        # NOTE(review): branches are named after the map keys while the branch
+        # synced here is the env attribute; this assumes both are equal - confirm.
+        sync_branch = each.value["env"]
+        policy_dir  = "manifests/clusters"
+        secret_type = "token"
+      }
+    }
+    policy_controller {
+      enabled                    = true
+      template_library_installed = true
+      referential_rules_enabled  = true
+    }
+  }
+
+  # Arguments are single-quoted so that the shell passes each value as exactly
+  # one argument. The last argument is the position of the cluster in the map.
+  provisioner "local-exec" {
+    command = "${path.module}/create_cluster_yamls.sh '${var.github_org}' '${github_repository.acm_repo.full_name}' '${var.github_user}' '${var.github_email}' '${each.value["env"]}' '${each.value["cluster_name"]}' '${index(keys(local.parsed_gke_info), each.key)}'"
+  }
+}
diff --git a/ml-platform/03_configsync/outputs.tf b/ml-platform/03_configsync/outputs.tf
new file mode 100644
index 000000000..2e9c6603f
--- /dev/null
+++ b/ml-platform/03_configsync/outputs.tf
@@ -0,0 +1,20 @@
+# Copyright 2024 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+output "membership" {
+  description = "The google_gke_hub_membership resources created by this module, one per cluster."
+  value       = google_gke_hub_membership.membership
+}
+
+output "val" {
+  description = "Cluster information read from the 02_gke remote state (local.parsed_gke_info)."
+  value       = local.parsed_gke_info
+}
diff --git a/ml-platform/03_configsync/providers.tf b/ml-platform/03_configsync/providers.tf
new file mode 100644
index 000000000..6ba18fc39
--- /dev/null
+++ b/ml-platform/03_configsync/providers.tf
@@ -0,0 +1,39 @@
+# Copyright 2024 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+terraform {
+  required_providers {
+    google-beta = {
+      source  = "hashicorp/google-beta"
+      version = "4.72.1"
+    }
+    google = {
+      source  = "hashicorp/google"
+      version = "4.72.1"
+    }
+    kubernetes = {
+      source  = "hashicorp/kubernetes"
+      version = "2.21.1"
+    }
+    github = {
+      # The GitHub provider is published under the "integrations" namespace;
+      # the hashicorp/github address is deprecated.
+      source  = "integrations/github"
+      version = ">= 4.3.0"
+    }
+  }
+}
+
+# The owner and the token are supplied through input variables.
+provider "github" {
+  owner = var.github_org
+  token = var.github_token
+}
diff --git a/ml-platform/03_configsync/templates/acm-template/manifests/apps/.gitkeep b/ml-platform/03_configsync/templates/acm-template/manifests/apps/.gitkeep
new file mode 100644
index 000000000..e69de29bb
diff --git a/ml-platform/03_configsync/templates/acm-template/manifests/clusters/.gitkeep b/ml-platform/03_configsync/templates/acm-template/manifests/clusters/.gitkeep
new file mode 100644
index 000000000..e69de29bb
diff --git a/ml-platform/03_configsync/templates/acm-template/templates/_cluster_template/cluster.yaml b/ml-platform/03_configsync/templates/acm-template/templates/_cluster_template/cluster.yaml
new file mode 100644
index 000000000..c27d6a578
--- /dev/null
+++ b/ml-platform/03_configsync/templates/acm-template/templates/_cluster_template/cluster.yaml
@@ -0,0 +1,21 @@
+# Copyright 2024 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Template of the Cluster object that create_cluster_yamls.sh renders into
+# manifests/clusters. The upper-case values are placeholders which that script
+# substitutes with sed.
+kind: Cluster
+apiVersion: clusterregistry.k8s.io/v1alpha1
+metadata:
+  name: CLUSTER_NAME
+  labels:
+    # Label matched by the selector rendered from selector.yaml.
+    environment: ENV
+    # Label matched by the selector in config-selector.yaml.
+    clusterName: CLUSTER_NAME
\ No newline at end of file
diff --git a/ml-platform/03_configsync/templates/acm-template/templates/_cluster_template/config-selector.yaml b/ml-platform/03_configsync/templates/acm-template/templates/_cluster_template/config-selector.yaml
new file mode 100644
index 000000000..3f22b4d64
--- /dev/null
+++ b/ml-platform/03_configsync/templates/acm-template/templates/_cluster_template/config-selector.yaml
@@ -0,0 +1,23 @@
+# Copyright 2024 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# ClusterSelector named "config" that matches on the clusterName label; the
+# label value is a placeholder.
+# NOTE(review): create_cluster_yamls.sh does not copy or render this template -
+# confirm where it is used.
+kind: ClusterSelector
+apiVersion: configmanagement.gke.io/v1
+metadata:
+  name: config
+spec:
+  selector:
+    matchLabels:
+      clusterName: CLUSTER_NAME
+
diff --git a/ml-platform/03_configsync/templates/acm-template/templates/_cluster_template/kuberay/kustomization.yaml b/ml-platform/03_configsync/templates/acm-template/templates/_cluster_template/kuberay/kustomization.yaml
new file mode 100644
index 000000000..cc63d55d2
--- /dev/null
+++ b/ml-platform/03_configsync/templates/acm-template/templates/_cluster_template/kuberay/kustomization.yaml
@@ -0,0 +1,30 @@
+# Copyright 2024 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Kustomization for the KubeRay operator: the kuberay-operator Helm chart plus
+# the Role and RoleBinding from rbac.yaml.
+apiVersion: kustomize.config.k8s.io/v1beta1
+kind: Kustomization
+resources:
+- rbac.yaml
+# Each patch names one of the ray.io CRDs and deletes its status field.
+patches:
+- path: rayclusters.yaml
+- path: rayservices.yaml
+- path: rayjobs.yaml
+
+# Chart rendered with the settings in values.yaml; the CRDs shipped with the
+# chart are included in the output.
+helmCharts:
+- name: kuberay-operator
+  repo: https://ray-project.github.io/kuberay-helm/
+  version: 1.0.0
+  releaseName: kuberay-operator
+  includeCRDs: true
+  valuesFile: values.yaml
diff --git a/ml-platform/03_configsync/templates/acm-template/templates/_cluster_template/kuberay/rayclusters.yaml b/ml-platform/03_configsync/templates/acm-template/templates/_cluster_template/kuberay/rayclusters.yaml
new file mode 100644
index 000000000..d552cc2d9
--- /dev/null
+++ b/ml-platform/03_configsync/templates/acm-template/templates/_cluster_template/kuberay/rayclusters.yaml
@@ -0,0 +1,22 @@
+# Copyright 2024 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Kustomize patch for the rayclusters.ray.io CRD, listed under "patches" in
+# kustomization.yaml: sets the controller-gen annotation and deletes the
+# status field.
+apiVersion: apiextensions.k8s.io/v1
+kind: CustomResourceDefinition
+metadata:
+  name: rayclusters.ray.io
+  annotations:
+    controller-gen.kubebuilder.io/version: v0.6.0
+status:
+  $patch: delete
\ No newline at end of file
diff --git a/ml-platform/03_configsync/templates/acm-template/templates/_cluster_template/kuberay/rayjobs.yaml b/ml-platform/03_configsync/templates/acm-template/templates/_cluster_template/kuberay/rayjobs.yaml
new file mode 100644
index 000000000..c18a0c21b
--- /dev/null
+++ b/ml-platform/03_configsync/templates/acm-template/templates/_cluster_template/kuberay/rayjobs.yaml
@@ -0,0 +1,22 @@
+# Copyright 2024 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Kustomize patch for the rayjobs.ray.io CRD, listed under "patches" in
+# kustomization.yaml: sets the controller-gen annotation and deletes the
+# status field.
+apiVersion: apiextensions.k8s.io/v1
+kind: CustomResourceDefinition
+metadata:
+  name: rayjobs.ray.io
+  annotations:
+    controller-gen.kubebuilder.io/version: v0.6.0
+status:
+  $patch: delete
\ No newline at end of file
diff --git a/ml-platform/03_configsync/templates/acm-template/templates/_cluster_template/kuberay/rayservices.yaml b/ml-platform/03_configsync/templates/acm-template/templates/_cluster_template/kuberay/rayservices.yaml
new file mode 100644
index 000000000..4e10d8ab6
--- /dev/null
+++ b/ml-platform/03_configsync/templates/acm-template/templates/_cluster_template/kuberay/rayservices.yaml
@@ -0,0 +1,22 @@
+# Copyright 2024 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Kustomize patch for the rayservices.ray.io CRD, listed under "patches" in
+# kustomization.yaml: sets the controller-gen annotation and deletes the
+# status field.
+apiVersion: apiextensions.k8s.io/v1
+kind: CustomResourceDefinition
+metadata:
+  name: rayservices.ray.io
+  annotations:
+    controller-gen.kubebuilder.io/version: v0.6.0
+status:
+  $patch: delete
\ No newline at end of file
diff --git a/ml-platform/03_configsync/templates/acm-template/templates/_cluster_template/kuberay/rbac.yaml b/ml-platform/03_configsync/templates/acm-template/templates/_cluster_template/kuberay/rbac.yaml
new file mode 100644
index 000000000..a0a1a686d
--- /dev/null
+++ b/ml-platform/03_configsync/templates/acm-template/templates/_cluster_template/kuberay/rbac.yaml
@@ -0,0 +1,44 @@
+# Copyright 2024 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Role that allows reading and writing every resource of every API group.
+# No namespace is set on the Role or on the RoleBinding below.
+apiVersion: rbac.authorization.k8s.io/v1
+kind: Role
+metadata:
+  name: kuberay-operator-role
+rules:
+- apiGroups:
+  - '*'
+  resources:
+  - '*'
+  verbs:
+  - get
+  - list
+  - watch
+  - create
+  - update
+  - patch
+  - delete
+---
+# Grants the Role above to the kuberay-operator service account of the
+# "default" namespace.
+apiVersion: rbac.authorization.k8s.io/v1
+kind: RoleBinding
+metadata:
+  name: kuberay-operator-binding
+roleRef:
+  apiGroup: rbac.authorization.k8s.io
+  kind: Role
+  name: kuberay-operator-role
+subjects:
+- kind: ServiceAccount
+  name: kuberay-operator
+  namespace: default
diff --git a/ml-platform/03_configsync/templates/acm-template/templates/_cluster_template/kuberay/values.yaml b/ml-platform/03_configsync/templates/acm-template/templates/_cluster_template/kuberay/values.yaml
new file mode 100644
index 000000000..7226bf446
--- /dev/null
+++ b/ml-platform/03_configsync/templates/acm-template/templates/_cluster_template/kuberay/values.yaml
@@ -0,0 +1,113 @@
+# Copyright 2024 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Values for the kuberay-operator Helm chart; kustomization.yaml in this
+# directory references this file through "valuesFile".
+image:
+  repository: kuberay/operator
+  tag: v1.0.0
+  pullPolicy: IfNotPresent
+
+nameOverride: "kuberay-operator"
+fullnameOverride: "kuberay-operator"
+
+serviceAccount:
+  # Specifies whether a service account should be created
+  create: true
+  # The name of the service account to use.
+  # If not set and create is true, a name is generated using the fullname template
+  name: "kuberay-operator"
+
+service:
+  type: ClusterIP
+  port: 8080
+
+resources:
+  # We usually recommend not to specify default resources and to leave this as a conscious
+  # choice for the user. This also increases chances charts run on environments with little
+  # resources, such as Minikube. If you do want to specify resources, uncomment the following
+  # lines, adjust them as necessary, and remove the curly braces after 'resources:'.
+  limits:
+    cpu: 100m
+    # Anecdotally, managing 500 Ray pods requires roughly 500MB memory.
+    # Monitor memory usage and adjust as needed.
+    memory: 512Mi
+  # requests:
+  #   cpu: 100m
+  #   memory: 512Mi
+
+livenessProbe:
+  initialDelaySeconds: 10
+  periodSeconds: 5
+  failureThreshold: 5
+
+readinessProbe:
+  initialDelaySeconds: 10
+  periodSeconds: 5
+  failureThreshold: 5
+
+batchScheduler:
+  enabled: false
+
+  # Set up `securityContext` to improve Pod security.
+  # See https://github.com/ray-project/kuberay/blob/master/docs/guidance/pod-security.md for further guidance.
+securityContext: {}
+
+
+  # If rbacEnable is set to false, no RBAC resources will be created, including the Role for leader election, the Role for Pods and Services, and so on.
+rbacEnable: true
+
+  # When crNamespacedRbacEnable is set to true, the KubeRay operator will create a Role for RayCluster preparation (e.g., Pods, Services)
+  # and a corresponding RoleBinding for each namespace listed in the "watchNamespace" parameter. Please note that even if crNamespacedRbacEnable
+  # is set to false, the Role and RoleBinding for leader election will still be created.
+  #
+  # Note:
+  # (1) This variable is only effective when rbacEnable and singleNamespaceInstall are both set to true.
+  # (2) In most cases, it should be set to true, unless you are using a Kubernetes cluster managed by GitOps tools such as ArgoCD.
+crNamespacedRbacEnable: true
+
+  # When singleNamespaceInstall is true:
+  # - Install namespaced RBAC resources such as Role and RoleBinding instead of cluster-scoped ones like ClusterRole and ClusterRoleBinding so that
+  #   the chart can be installed by users with permissions restricted to a single namespace.
+  #   (Please note that this excludes the CRDs, which can only be installed at the cluster scope.)
+  # - If "watchNamespace" is not set, the KubeRay operator will, by default, only listen
+  #   to resource events within its own namespace.
+singleNamespaceInstall: true
+
+# The KubeRay operator will watch the custom resources in the namespaces listed in the "watchNamespace" parameter.
+# The list is left empty here; the entries below are commented-out examples.
+watchNamespace:
+# - ml-team
+# - ds-team
+
+# Environment variables
+# No variable is set here; every entry below is a commented-out example.
+env:
+# If not set or set to true, kuberay auto injects an init container waiting for ray GCS.
+# If false, you will need to inject your own init container to ensure ray GCS is up before the ray workers start.
+# Warning: we highly recommend setting to true and let kuberay handle for you.
+# - name: ENABLE_INIT_CONTAINER_INJECTION
+#   value: "true"
+# If not set or set to "", kuberay will pick up the default k8s cluster domain `cluster.local`
+# Otherwise, kuberay will use your custom domain
+# - name: CLUSTER_DOMAIN
+#   value: ""
+# If not set or set to false, when running on OpenShift with Ingress creation enabled, kuberay will create OpenShift route
+# Otherwise, regardless of the type of cluster with Ingress creation enabled, kuberay will create Ingress
+# - name: USE_INGRESS_ON_OPENSHIFT
+#   value: "true"
+# Unconditionally requeue after the number of seconds specified in the
+# environment variable RAYCLUSTER_DEFAULT_REQUEUE_SECONDS_ENV. If the
+# environment variable is not set, requeue after the default value (300).
+# - name: RAYCLUSTER_DEFAULT_REQUEUE_SECONDS_ENV
+#   value: 300
+# If not set or set to "true", KubeRay will clean up the Redis storage namespace when a GCS FT-enabled RayCluster is deleted.
+# - name: ENABLE_GCS_FT_REDIS_CLEANUP
+#   value: "true"
\ No newline at end of file
diff --git a/ml-platform/03_configsync/templates/acm-template/templates/_cluster_template/kustomization.yaml b/ml-platform/03_configsync/templates/acm-template/templates/_cluster_template/kustomization.yaml
new file mode 100644
index 000000000..448f68961
--- /dev/null
+++ b/ml-platform/03_configsync/templates/acm-template/templates/_cluster_template/kustomization.yaml
@@ -0,0 +1,19 @@
+# Copyright 2024 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Cluster-level kustomization: the NVIDIA driver installer DaemonSet, fetched
+# from the master branch of its upstream repository, and the kuberay directory
+# next to this file.
+apiVersion: kustomize.config.k8s.io/v1beta1
+kind: Kustomization
+resources:
+- "https://raw.githubusercontent.com/GoogleCloudPlatform/container-engine-accelerators/master/nvidia-driver-installer/cos/daemonset-preloaded-latest.yaml"
+- ./kuberay
\ No newline at end of file
diff --git a/ml-platform/03_configsync/templates/acm-template/templates/_cluster_template/selector.yaml b/ml-platform/03_configsync/templates/acm-template/templates/_cluster_template/selector.yaml
new file mode 100644
index 000000000..cfd6f6ede
--- /dev/null
+++ b/ml-platform/03_configsync/templates/acm-template/templates/_cluster_template/selector.yaml
@@ -0,0 +1,22 @@
+# Copyright 2024 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Template of the per-environment ClusterSelector that create_cluster_yamls.sh
+# renders into manifests/clusters. The upper-case value is a placeholder which
+# that script substitutes with sed; the selector matches clusters by their
+# "environment" label.
+kind: ClusterSelector
+apiVersion: configmanagement.gke.io/v1
+metadata:
+  name: ENV
+spec:
+  selector:
+    matchLabels:
+      environment: ENV
\ No newline at end of file
diff --git a/ml-platform/03_configsync/templates/acm-template/templates/_cluster_template/team/kustomization.yaml b/ml-platform/03_configsync/templates/acm-template/templates/_cluster_template/team/kustomization.yaml
new file mode 100644
index 000000000..93f6f77e9
--- /dev/null
+++ b/ml-platform/03_configsync/templates/acm-template/templates/_cluster_template/team/kustomization.yaml
@@ -0,0 +1,22 @@
+# Copyright 2024 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Bundles the per-team objects defined in this directory: the namespace, its
+# network policy, the RBAC objects and the RepoSync configuration.
+apiVersion: kustomize.config.k8s.io/v1beta1
+kind: Kustomization
+
+resources:
+- namespace.yaml
+- network-policy.yaml
+- rbac.yaml
+- reposync.yaml
\ No newline at end of file
diff --git a/ml-platform/03_configsync/templates/acm-template/templates/_cluster_template/team/namespace.yaml b/ml-platform/03_configsync/templates/acm-template/templates/_cluster_template/team/namespace.yaml
new file mode 100644
index 000000000..832e04dc4
--- /dev/null
+++ b/ml-platform/03_configsync/templates/acm-template/templates/_cluster_template/team/namespace.yaml
@@ -0,0 +1,20 @@
+# Copyright 2024 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Namespace template for a team. NAMESPACE and APP_NAME are placeholders.
+# NOTE(review): create_cluster_yamls.sh does not fill them in; presumably they
+# are replaced during team setup - confirm.
+apiVersion: v1
+kind: Namespace
+metadata:
+  name: NAMESPACE
+  labels:
+    app: APP_NAME
\ No newline at end of file
diff --git a/ml-platform/03_configsync/templates/acm-template/templates/_cluster_template/team/network-policy.yaml b/ml-platform/03_configsync/templates/acm-template/templates/_cluster_template/team/network-policy.yaml
new file mode 100644
index 000000000..de02d2a5a
--- /dev/null
+++ b/ml-platform/03_configsync/templates/acm-template/templates/_cluster_template/team/network-policy.yaml
@@ -0,0 +1,32 @@
+# Copyright 2024 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +apiVersion: networking.k8s.io/v1 +kind: NetworkPolicy +metadata: + name: deny-from-other-namespaces + namespace: NAMESPACE +spec: + podSelector: + # Apply policy to all pods in this namespace. + matchLabels: {} + ingress: + - from: + # Allow traffic between all pods in this namespace. + - podSelector: {} + # Example that allows traffic from another app's namespace. + #- from: + # - namespaceSelector: + # matchLabels: + # app: another-app-namespace \ No newline at end of file diff --git a/ml-platform/03_configsync/templates/acm-template/templates/_cluster_template/team/rbac.yaml b/ml-platform/03_configsync/templates/acm-template/templates/_cluster_template/team/rbac.yaml new file mode 100644 index 000000000..398a617bb --- /dev/null +++ b/ml-platform/03_configsync/templates/acm-template/templates/_cluster_template/team/rbac.yaml @@ -0,0 +1,59 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +apiVersion: rbac.authorization.k8s.io/v1 +kind: Role +metadata: + name: NAMESPACE-FullAccess + namespace: NAMESPACE +rules: +- apiGroups: + - '*' + resources: + - '*' + verbs: + - get + - list + - watch + - create + - update + - patch + - delete +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: RoleBinding +metadata: + name: NAMESPACE-user-access + namespace: NAMESPACE +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: Role + name: NAMESPACE-FullAccess +subjects: +- kind: User + name: USERNAME1 +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: RoleBinding +metadata: + name: kuberay-sa-access + namespace: NAMESPACE +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: Role + name: NAMESPACE-FullAccess +subjects: +- kind: ServiceAccount + name: kuberay-operator + namespace: default \ No newline at end of file diff --git a/ml-platform/03_configsync/templates/acm-template/templates/_cluster_template/team/reposync.yaml b/ml-platform/03_configsync/templates/acm-template/templates/_cluster_template/team/reposync.yaml new file mode 100644 index 000000000..191a5b7f0 --- /dev/null +++ b/ml-platform/03_configsync/templates/acm-template/templates/_cluster_template/team/reposync.yaml @@ -0,0 +1,135 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +#ROOT_SOURCE/namespaces/NAMESPACE/repo-sync.yaml +apiVersion: configsync.gke.io/v1beta1 +kind: RepoSync +metadata: + name: dev-NAMESPACE + namespace: NAMESPACE + annotations: + configmanagement.gke.io/cluster-selector: dev +spec: + sourceType: git + # Since this is for a namespace repository, the format is unstructured + sourceFormat: unstructured + git: + repo: "GIT_REPO" + revision: "dev" + #branch: NAMESPACE_BRANCH + dir: "manifests/apps/NAMESPACE" + auth: token + secretRef: + name: git-creds +--- +#ROOT_REPO/namespaces/NAMESPACE/sync-rolebinding.yaml +apiVersion: rbac.authorization.k8s.io/v1 +kind: RoleBinding +metadata: + name: dev-rb-NAMESPACE + namespace: NAMESPACE + annotations: + configmanagement.gke.io/cluster-selector: dev +subjects: +- kind: ServiceAccount + name: ns-reconciler-NAMESPACE-dev-NAMESPACE-NUMBER_OF_CHARACTERS_IN_REPOSYNC_NAME + namespace: config-management-system +roleRef: + kind: ClusterRole + name: cluster-admin + apiGroup: rbac.authorization.k8s.io +--- + +#ROOT_SOURCE/namespaces/NAMESPACE/repo-sync.yaml +apiVersion: configsync.gke.io/v1beta1 +kind: RepoSync +metadata: + name: staging-NAMESPACE + namespace: NAMESPACE + annotations: + configmanagement.gke.io/cluster-selector: staging +spec: + sourceType: git + # Since this is for a namespace repository, the format is unstructured + sourceFormat: unstructured + git: + repo: "GIT_REPO" + revision: "staging" + #branch: NAMESPACE_BRANCH + dir: "manifests/apps/NAMESPACE" + auth: token + secretRef: + name: git-creds +--- +#ROOT_REPO/namespaces/NAMESPACE/sync-rolebinding.yaml +apiVersion: rbac.authorization.k8s.io/v1 +kind: RoleBinding +metadata: + name: staging-rb-NAMESPACE + namespace: NAMESPACE + annotations: + configmanagement.gke.io/cluster-selector: staging +subjects: +- kind: ServiceAccount + name: ns-reconciler-NAMESPACE-staging-NAMESPACE-NUMBER_OF_CHARACTERS_IN_REPOSYNC_NAME + namespace: config-management-system +roleRef: + kind: ClusterRole + name: cluster-admin + apiGroup: rbac.authorization.k8s.io +--- + +#ROOT_SOURCE/namespaces/NAMESPACE/repo-sync.yaml 
+apiVersion: configsync.gke.io/v1beta1 +kind: RepoSync +metadata: + name: prod-NAMESPACE + namespace: NAMESPACE + annotations: + configmanagement.gke.io/cluster-selector: prod +spec: + sourceType: git + # Since this is for a namespace repository, the format is unstructured + sourceFormat: unstructured + git: + repo: "GIT_REPO" + revision: "prod" + #branch: NAMESPACE_BRANCH + dir: "manifests/apps/NAMESPACE" + auth: token + secretRef: + name: git-creds +--- +#ROOT_REPO/namespaces/NAMESPACE/sync-rolebinding.yaml +apiVersion: rbac.authorization.k8s.io/v1 +kind: RoleBinding +metadata: + name: prod-rb-NAMESPACE + namespace: NAMESPACE + annotations: + configmanagement.gke.io/cluster-selector: prod +subjects: +- kind: ServiceAccount + name: ns-reconciler-NAMESPACE-prod-NAMESPACE-NUMBER_OF_CHARACTERS_IN_REPOSYNC_NAME + namespace: config-management-system +roleRef: + kind: ClusterRole + name: cluster-admin + apiGroup: rbac.authorization.k8s.io +--- +#What should be the name of the reconciler's service account? +#If the RepoSync name is repo-sync, SERVICE_ACCOUNT_NAME is ns-reconciler-NAMESPACE. +# Otherwise, it is ns-reconciler-NAMESPACE-REPO_SYNC_NAME-REPO_SYNC_NAME_LENGTH. +#For example, if your RepoSync name is prod, then the SERVICE_ACCOUNT_NAME would be ns-reconciler-NAMESPACE-prod-4. The integer 4 is used as prod contains 4 characters. 
+# https://cloud.google.com/anthos-config-management/docs/how-to/multiple-repositories \ No newline at end of file diff --git a/ml-platform/03_configsync/templates/acm-template/templates/_namespace_template/app/fluentd_config.yaml b/ml-platform/03_configsync/templates/acm-template/templates/_namespace_template/app/fluentd_config.yaml new file mode 100644 index 000000000..e85cf38d0 --- /dev/null +++ b/ml-platform/03_configsync/templates/acm-template/templates/_namespace_template/app/fluentd_config.yaml @@ -0,0 +1,44 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +apiVersion: v1 +kind: ConfigMap +metadata: + name: fluentbit-config +data: + fluent-bit.conf: | + [SERVICE] + Parsers_File parsers.conf + [INPUT] + Name tail + Path /tmp/ray/session_latest/logs/* + Tag ray + Path_Key filename + Refresh_Interval 5 + [FILTER] + Name parser + Match ray + Key_Name filename + Parser rayjob + Reserve_Data On + [OUTPUT] + Name stdout + Format json_lines + Match * + + parsers.conf: | + [PARSER] + Name rayjob + Format regex + Regex (?<job_id>raysubmit_[^.]*) \ No newline at end of file diff --git a/ml-platform/03_configsync/templates/acm-template/templates/_namespace_template/app/kustomization.yaml b/ml-platform/03_configsync/templates/acm-template/templates/_namespace_template/app/kustomization.yaml new file mode 100644 index 000000000..19240b9a8 --- /dev/null +++ b/ml-platform/03_configsync/templates/acm-template/templates/_namespace_template/app/kustomization.yaml @@ -0,0 +1,27 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +apiVersion: kustomize.config.k8s.io/v1beta1 +kind: Kustomization +namespace: NAMESPACE + +resources: +- fluentd_config.yaml + +helmCharts: +- name: ray-cluster + repo: https://ray-project.github.io/kuberay-helm/ + version: 1.0.0 + releaseName: ray-cluster + valuesFile: values.yaml diff --git a/ml-platform/03_configsync/templates/acm-template/templates/_namespace_template/app/serviceaccount.yaml b/ml-platform/03_configsync/templates/acm-template/templates/_namespace_template/app/serviceaccount.yaml new file mode 100644 index 000000000..245824cc0 --- /dev/null +++ b/ml-platform/03_configsync/templates/acm-template/templates/_namespace_template/app/serviceaccount.yaml @@ -0,0 +1,21 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +apiVersion: v1 +kind: ServiceAccount +metadata: + name: KUBERNETES_SERVICE_ACCOUNT + namespace: NAMESPACE + annotations: + iam.gke.io/gcp-service-account: GOOGLE_SERVICE_ACCOUNT diff --git a/ml-platform/03_configsync/templates/acm-template/templates/_namespace_template/app/values.yaml b/ml-platform/03_configsync/templates/acm-template/templates/_namespace_template/app/values.yaml new file mode 100644 index 000000000..ba86e3191 --- /dev/null +++ b/ml-platform/03_configsync/templates/acm-template/templates/_namespace_template/app/values.yaml @@ -0,0 +1,313 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +image: + # Replace this with your own image if needed. + repository: rayproject/ray + tag: 2.7.1-py310-gpu + pullPolicy: IfNotPresent + +nameOverride: "kuberay" +fullnameOverride: "" + +imagePullSecrets: [] +# - name: an-existing-secret + +head: + groupName: headgroup + # If enableInTreeAutoscaling is true, the autoscaler sidecar will be added to the Ray head pod. + # Ray autoscaler integration is supported only for Ray versions >= 1.11.0 + # Ray autoscaler integration is Beta with KubeRay >= 0.3.0 and Ray >= 2.0.0. + # enableInTreeAutoscaling: true + # autoscalerOptions is an OPTIONAL field specifying configuration overrides for the Ray autoscaler. + # The example configuration shown below below represents the DEFAULT values. 
+ # autoscalerOptions: + # upscalingMode: Default + # idleTimeoutSeconds: 60 + # securityContext: {} + # env: [] + # envFrom: [] + # resources specifies optional resource request and limit overrides for the autoscaler container. + # For large Ray clusters, we recommend monitoring container resource usage to determine if overriding the defaults is required. + # resources: + # limits: + # cpu: "500m" + # memory: "512Mi" + # requests: + # cpu: "500m" + # memory: "512Mi" + labels: + cloud.google.com/gke-ray-node-type: head + created-by: ray-on-gke + rayStartParams: + dashboard-host: '0.0.0.0' + block: 'true' + # containerEnv specifies environment variables for the Ray container, + # Follows standard K8s container env schema. + containerEnv: + # - name: EXAMPLE_ENV + # value: "1" + - name: RAY_memory_monitor_refresh_ms + value: "0" + envFrom: [] + # - secretRef: + # name: my-env-secret + # ports optionally allows specifying ports for the Ray container. + ports: [] + # resource requests and limits for the Ray head container. + # Modify as needed for your application. + # Note that the resources in this example are much too small for production; + # we don't recommend allocating less than 8G memory for a Ray pod in production. + # Ray pods should be sized to take up entire K8s nodes when possible. + # Always set CPU and memory limits for Ray pods. + # It is usually best to set requests equal to limits. + # See https://docs.ray.io/en/latest/cluster/kubernetes/user-guides/config.html#resources + # for further guidance. + resources: + limits: + cpu: "8" + nvidia.com/gpu: "1" + # To avoid out-of-memory issues, never allocate less than 2G memory for the Ray head. 
+ memory: "20G" + ephemeral-storage: 20Gi + requests: + cpu: "8" + nvidia.com/gpu: "1" + memory: "20G" + ephemeral-storage: 10Gi + annotations: {} + nodeSelector: + iam.gke.io/gke-metadata-server-enabled: "true" + cloud.google.com/gke-accelerator: "nvidia-l4" + tolerations: + - key: "nvidia.com/gpu" + operator: "Exists" + effect: "NoSchedule" + - key: "reserved" + operator: "Exists" + effect: "NoSchedule" + affinity: {} + # Ray container security context. + securityContext: {} + volumes: + - name: ray-logs + emptyDir: {} + - name: fluentbit-config + configMap: + name: fluentbit-config + # Ray writes logs to /tmp/ray/session_latests/logs + volumeMounts: + - mountPath: /tmp/ray + name: ray-logs + # sidecarContainers specifies additional containers to attach to the Ray pod. + # Follows standard K8s container spec. + sidecarContainers: + - name: fluentbit + image: fluent/fluent-bit:1.9.6 + # These resource requests for Fluent Bit should be sufficient in production. + resources: + requests: + cpu: 100m + memory: 128Mi + ephemeral-storage: 2Gi + limits: + cpu: 100m + memory: 128Mi + ephemeral-storage: 4Gi + volumeMounts: + - mountPath: /tmp/ray + name: ray-logs + - mountPath: /fluent-bit/etc/ + name: fluentbit-config + +worker: + # If you want to disable the default workergroup + # uncomment the line below + # disabled: true + groupName: workergroup + replicas: 1 + type: worker + labels: + cloud.google.com/gke-ray-node-type: worker + created-by: ray-on-gke + rayStartParams: + block: 'true' + initContainerImage: 'busybox:1.28' # Enable users to specify the image for init container. Users can pull the busybox image from their private repositories. + # Security context for the init container. + initContainerSecurityContext: {} + # containerEnv specifies environment variables for the Ray container, + # Follows standard K8s container env schema. 
+ containerEnv: [] + # - name: EXAMPLE_ENV + # value: "1" + envFrom: [] + # - secretRef: + # name: my-env-secret + # ports optionally allows specifying ports for the Ray container. + ports: [] + # resource requests and limits for the Ray head container. + # Modify as needed for your application. + # Note that the resources in this example are much too small for production; + # we don't recommend allocating less than 8G memory for a Ray pod in production. + # Ray pods should be sized to take up entire K8s nodes when possible. + # Always set CPU and memory limits for Ray pods. + # It is usually best to set requests equal to limits. + # See https://docs.ray.io/en/latest/cluster/kubernetes/user-guides/config.html#resources + # for further guidance. + resources: + limits: + cpu: "1" + nvidia.com/gpu: "1" + memory: "20G" + ephemeral-storage: 20Gi + requests: + cpu: "1" + nvidia.com/gpu: "1" + memory: "20G" + ephemeral-storage: 10Gi + annotations: + key: value + nodeSelector: + iam.gke.io/gke-metadata-server-enabled: "true" + cloud.google.com/gke-accelerator: "nvidia-l4" + tolerations: + - key: "nvidia.com/gpu" + operator: "Exists" + effect: "NoSchedule" + - key: "reserved" + operator: "Exists" + effect: "NoSchedule" + affinity: {} + # Ray container security context. + securityContext: {} + volumes: + - name: ray-logs + emptyDir: {} + - name: fluentbit-config + configMap: + name: fluentbit-config + # Ray writes logs to /tmp/ray/session_latests/logs + volumeMounts: + - mountPath: /tmp/ray + name: ray-logs + # sidecarContainers specifies additional containers to attach to the Ray pod. + # Follows standard K8s container spec. + sidecarContainers: + - name: fluentbit + image: fluent/fluent-bit:1.9.6 + # These resource requests for Fluent Bit should be sufficient in production. 
+ resources: + requests: + cpu: 100m + memory: 128Mi + ephemeral-storage: 2Gi + limits: + cpu: 100m + memory: 128Mi + ephemeral-storage: 4Gi + volumeMounts: + - mountPath: /tmp/ray + name: ray-logs + - mountPath: /fluent-bit/etc/ + name: fluentbit-config + +# The map's key is used as the groupName. +# For example, key:small-group in the map below +# will be used as the groupName +additionalWorkerGroups: + smallGroup: + # Disabled by default + disabled: true + replicas: 1 + minReplicas: 1 + maxReplicas: 3 + type: worker + labels: {} + rayStartParams: + block: 'true' + initContainerImage: 'busybox:1.28' # Enable users to specify the image for init container. Users can pull the busybox image from their private repositories. + # Security context for the init container. + initContainerSecurityContext: {} + # containerEnv specifies environment variables for the Ray container, + # Follows standard K8s container env schema. + containerEnv: [] + # - name: EXAMPLE_ENV + # value: "1" + envFrom: [] + # - secretRef: + # name: my-env-secret + # ports optionally allows specifying ports for the Ray container. + ports: [] + # resource requests and limits for the Ray head container. + # Modify as needed for your application. + # Note that the resources in this example are much too small for production; + # we don't recommend allocating less than 8G memory for a Ray pod in production. + # Ray pods should be sized to take up entire K8s nodes when possible. + # Always set CPU and memory limits for Ray pods. + # It is usually best to set requests equal to limits. + # See https://docs.ray.io/en/latest/cluster/kubernetes/user-guides/config.html#resources + # for further guidance. 
+ resources: + limits: + cpu: 1 + memory: "1G" + requests: + cpu: 1 + memory: "1G" + annotations: + key: value + nodeSelector: {} + tolerations: + - key: "nvidia.com/gpu" + operator: "Exists" + effect: "NoSchedule" + - key: "reserved" + operator: "Exists" + effect: "NoSchedule" + affinity: {} + # Ray container security context. + securityContext: {} + volumes: + - name: ray-logs + emptyDir: {} + - name: fluentbit-config + configMap: + name: fluentbit-config + # Ray writes logs to /tmp/ray/session_latests/logs + volumeMounts: + - mountPath: /tmp/ray + name: ray-logs + # sidecarContainers specifies additional containers to attach to the Ray pod. + # Follows standard K8s container spec. + sidecarContainers: + - name: fluentbit + image: fluent/fluent-bit:1.9.6 + # These resource requests for Fluent Bit should be sufficient in production. + resources: + requests: + cpu: 100m + memory: 128Mi + ephemeral-storage: 2Gi + limits: + cpu: 100m + memory: 128Mi + ephemeral-storage: 4Gi + volumeMounts: + - mountPath: /tmp/ray + name: ray-logs + - mountPath: /fluent-bit/etc/ + name: fluentbit-config + +service: + type: ClusterIP \ No newline at end of file diff --git a/ml-platform/03_configsync/variables.tf b/ml-platform/03_configsync/variables.tf new file mode 100644 index 000000000..000789e66 --- /dev/null +++ b/ml-platform/03_configsync/variables.tf @@ -0,0 +1,45 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +variable "lookup_state_bucket" { + description = "GCS bucket to look up TF state from previous steps." + type = string + default = "YOUR_STATE_BUCKET" +} + +variable "configsync_repo_name" { + type = string + description = "Name of the GitHub repo that will be synced to the cluster with Config sync." + default = "config-sync-repo" +} + +variable "github_user" { + description = "GitHub user name." + type = string + default = "YOUR_GIT_USER" +} +variable "github_email" { + description = "GitHub user email." + type = string + default = "YOUR_GIT_USER_EMAIL" +} +variable "github_org" { + type = string + description = "GitHub org." + default = "YOUR_GIT_ORG" +} +variable "github_token" { + type = string + description = "GitHub token. It is a token with write permissions as it will create a repo in the GitHub org." +} diff --git a/ml-platform/04_setup_clusters/README.md b/ml-platform/04_setup_clusters/README.md new file mode 100644 index 000000000..5613a0b52 --- /dev/null +++ b/ml-platform/04_setup_clusters/README.md @@ -0,0 +1,138 @@ +Copyright 2024 Google LLC + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. + +### This doc is meant for the platform admins or the group that has admin level permissions on the GKE clusters. The steps mentioned in these docs must be executed by them. + +## Prerequisite +- You have successfully run through [03_configsync][configsync] module. 
+ +### Complete config sync setup + +Go to Google Cloud Console, click on the navigation menu and click on Kubernetes Engine > Config. Now, click on the `PACKAGES` +tab. You will notice that the `Sync status` will show as stalled for all [root-sync][root-sync]. +This is because config sync needs to authenticate with GitHub to be able to read the manifests in the configsync repo. It expects a secret named `git-creds` in the `config-management-system` namespace on the cluster. +This secret stores the GitHub user and its [personal access token][personal-access-token]. The [personal access token][personal-access-token] should have read-only access so config sync can read the repo to perform the sync. + +Follow these steps to create a new secret `git-creds` in the `config-management-system` namespace: +- For the GitHub user account that you plan to use, generate a [personal access token][personal-access-token] with read access to the configsync repo. It is recommended to use a [machine user account][machine-user-account] for this but you can use a personal user account just to try this reference architecture. +- Get IAM role `roles/gkehub.editor` to be able to use the connect gateway to access the GKE cluster. If you are the owner of the project, this step can be skipped. 
+- Open cloudshell and run these commands: + ``` + gcloud config set project <PROJECT_ID> + + gcloud container fleet memberships get-credentials <DEV_CLUSTER_NAME> + + kubectl create secret generic git-creds --namespace="config-management-system" --from-literal=username=<GIT_USER> --from-literal=token=<TOKEN> + + gcloud container fleet memberships get-credentials <STAGING_CLUSTER_NAME> + + kubectl create secret generic git-creds --namespace="config-management-system" --from-literal=username=<GIT_USER> --from-literal=token=<TOKEN> + + gcloud container fleet memberships get-credentials <PROD_CLUSTER_NAME> + + kubectl create secret generic git-creds --namespace="config-management-system" --from-literal=username=<GIT_USER> --from-literal=token=<TOKEN> + ``` + +After the `git-creds` secret has been created, you will see the `Sync status` for dev cluster will change from `stalled` to `synced` with a green tick mark against it. The `Sync status` for `staging` and `prod` clusters will change from stalled to Error. This is because the `staging` and `prod` branches of the repo have no content yet. + +Create a pull request from `dev` to `staging` and merge it. After the merge, the `Sync status` of the `staging` cluster will change from `Stalled` to `Synced`. Now, create a PR from `staging` to `prod` and merge it. The `Sync status` for `prod` cluster will change from `Stalled` to `Synced`. + +You just followed `GitOps` to promote changes from `dev` to higher environments. + +### Review the config sync repo +Open the configsync repo and go to `manifests/clusters`, you will see there is a cluster selector created for each cluster via yaml files. + +### Install a cluster scoped software +This section describes how platform admins will use the configsync repo to manage cluster scoped software or cluster level objects. Such software could be used by multiple teams in their namespaces. An example of such software is [kuberay][kuberay] that can manage ray clusters in multiple namespaces. + + +Let's install [Kuberay][kuberay] as a cluster level software that includes CRDs and deployments. 
Kuberay has a component called operator that facilitates `ray` on Kubernetes. We will install Kuberay operator in default namespace. The operator will then orchestrate `ray clusters` created in different namespaces by different teams in the future. +Perform the following steps: +- Clone the configsync repo and change directory. The default branch `dev` is checked out. + ``` + git clone <CONFIGSYNC_REPO_URL> repo + cd repo + ``` + +- From the provided templates under `templates/_cluster_template`, copy kustomization.yaml to `manifests/clusters` which is synced with the GKE clusters. kustomization.yaml will become the entrypoint for the [root-sync][root-sync] in the `manifests/clusters` folder and it syncs all the files defined in kustomization.yaml with the cluster. + ``` + cp templates/_cluster_template/kustomization.yaml manifests/clusters + ``` + +- Copy the directory containing the manifests to install kuberay to the directory that is synced with the GKE clusters. + ``` + cp -r templates/_cluster_template/kuberay manifests/clusters + ``` + Note that the directory `kuberay` is supplied as a template with this reference architecture. You can modify it based on your requirements. + +- Add cluster selector files in kustomization.yaml so config sync syncs these files with the clusters. The selectors are useful when you want to apply changes on one or multiple clusters selectively. + ``` + cat <<EOF >>manifests/clusters/kustomization.yaml + + - ./gke-ml-dev-cluster.yaml + - ./gke-ml-staging-cluster.yaml + - ./gke-ml-prod-cluster.yaml + - ./dev-selector.yaml + - ./staging-selector.yaml + - ./prod-selector.yaml + EOF + ``` + +- Commit the changes and push them to dev branch. + ``` + git add . + git commit -m "Installing Kuberay operator" + git push + ``` + +You just pushed the manifests to install kuberay operator in default namespace to the `dev` branch. Go to Google Cloud Console, click on the navigation menu and click on Kubernetes Engine > Config. Now, click on the `PACKAGES` tab. 
Verify that the dev cluster is in `Synced` status. + +Verify in the `dev` cluster that [Kuberay operator][kuberay] has been installed successfully. +Open cloudshell and run these commands: +- gcloud config set project `<PROJECT_ID>` +- gcloud container fleet memberships get-credentials `<DEV_CLUSTER_NAME>` +- kubectl get crd | grep ray + - This should show result similar to the following: + ``` + rayclusters.ray.io 2024-02-12T21:19:06Z + rayjobs.ray.io 2024-02-12T21:19:09Z + rayservices.ray.io 2024-02-12T21:19:12Z + ``` +- kubectl get pods + - This should show result similar to the following: + ``` + NAME READY STATUS RESTARTS AGE + kuberay-operator-56b8d98766-2nvht 1/1 Running 0 6m26s + ``` +As you can see, we have installed the CRDs and the deployment for the kuberay operator. + +## Contributing + +* [Contributing guidelines][contributing-guidelines] +* [Code of conduct][code-of-conduct] + + + +[contributing-guidelines]: CONTRIBUTING.md +[code-of-conduct]: code-of-conduct.md +[repo-sync]: https://cloud.google.com/anthos-config-management/docs/reference/rootsync-reposync-fields +[root-sync]: https://cloud.google.com/anthos-config-management/docs/reference/rootsync-reposync-fields +[personal-access-token]: https://docs.github.com/en/authentication/keeping-your-account-and-data-secure/managing-your-personal-access-tokens +[machine-user-account]: https://docs.github.com/en/get-started/learning-about-github/types-of-github-accounts +[kuberay]: https://ray-project.github.io/kuberay/ +[configsync]: ../03_configsync + + + + diff --git a/ml-platform/05_setup_teams/README.md b/ml-platform/05_setup_teams/README.md new file mode 100644 index 000000000..e0c90e4d4 --- /dev/null +++ b/ml-platform/05_setup_teams/README.md @@ -0,0 +1,169 @@ +Copyright 2024 Google LLC + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. + + +### This doc is meant for the platform admins or the group that has admin level permissions on the GKE clusters. The steps mentioned in these docs must be executed by them. + +## Prerequisite +- You have successfully run through [04_setup_clusters][cluster-setup] module. + +## Setup teams +Typically, each team can own one or more namespaces and the team's users will get access to create, update and delete objects in those namespaces but they will be restricted from creating, updating or deleting cluster level objects or the objects in other namespaces. + +The platform admin will set up the teams (create the namespace and grant the team's users permissions on it) using the configsync repo (via [root sync][root-sync]) and provide the app teams the means to manage objects in their own namespace without further involvement. + +Setting up a team has the following steps: +- create a new namespace for the team and grant the users permissions on the namespace. + + Note: In this reference architecture, we create the namespace with the same name as the team. In a real-world scenario, a team can own multiple namespaces so you might want to create namespaces with the application name that will be deployed in it. +- create a network-policy (optional). App teams can do it later. +- create a [reposync][repo-sync] object on the GKE clusters that will be associated with the repo/dir that is owned by the app teams. The app teams can manage the namespace scoped resources via their repo/dir by adding the kubernetes manifests there. 
+ +### Prepare the changes to create a team + +In order to create a new namespace, perform the following steps: +- Clone the configsync repo and change directory. The default branch `dev` is checked out. + ``` + git clone `` repo + cd repo + ``` +- Copy the team template directory to the directory that is synced with the GKE clusters. The team template directory contains manifests to create namespace,[rbac][rbac],network policy and [reposync][repo-sync] + ``` + cp -r templates/_cluster_template/team manifests/clusters/ + ``` + `` is the name of the team for which the namespace is being created. It can also be the name of the application. + Note that the team template is provided with this reference architecture. You can modify it based on your requirements. + + + +- Change the placeholders in the files under `manifests/clusters/` + - replace NAMSESPACE with the name of the namespace/team in the files under `manifests/clusters/` + ``` + sed -i 's#NAMESPACE##g' manifests/clusters//* + ``` + - replace GIT_REPO with the link to the Git repository that you want to sync with this reposync in `manifests/clusters//reposync.yaml`. + ``` + sed -i 's#GIT_REPO##g' manifests/clusters//reposync.yaml + ``` + - manually replace NUMBER_OF_CHARACTERS_IN_REPOSYNC_NAME in `manifests/clusters//reposync.yaml` + e.g if the reposync name is prod-myteam, replace NUMBER_OF_CHARACTERS_IN_REPOSYNC_NAME with 11. + +- Create a new directory that the reposync object is pointing to. + ``` + mkdir manifests/apps/ + touch manifests/apps//.gitkeep + ``` + +- Add the path to the new team dir in kustomization.yaml to include it in the sync. + ``` + cat <>manifests/clusters/kustomization.yaml + - ./ + EOF + ``` + + +### Review the files: +Go to `manifests/clusters/` +- kustomization.yaml specifies which yaml files should be synced with the cluster. +- namespace.yaml defines the code to create a new namespace. 
+- rbac.yaml creates a role for full access to the namespace and assigns the role to the team's users. + - This can be changed to a more restricted role or you can create multiple roles for different users. + - There is also a rolebinding that provides [kuberay operator][kuberay] service account access to this namespace. This is required for [kuberay][kuberay] to be able to manage the ray clusters inside this namespace. +- reposync.yaml creates [reposync][repo-sync] object on the cluster for the given namespace. The [reposync][repo-sync] object will be connected to a repo and will be used by the app team to create, update and delete the namespace scoped objects like rayclusters etc. + - The app team either can bring their own repo and provide it to the platform admins so they can update reposync.yaml accordingly. + - Alternatively, if your organization wants to follow mono repo structure, platform admin can create a subfolder named `` in this repo for each team under `manifests/apps` and provide the path `manifests/apps/`to the [reposync][repo-sync] object for that namespace. Platform admin can permission only the required team members to be able to edit the files under `manifests/apps/``` folder. + - see the `repo`, `revision` and `dir` tags in `reposync.yaml` that define what repo and dir will be synced for this [reposync][repo-sync]. + - see [mono repo vs multi repos](#mono-repo-vs-multi-repos) if you want to decide which one to use. + +### Apply the changes: +Commit the changes and push them to dev branch. +``` +git add . +git commit -m "Adding a new team" +git push +``` + +The changes are pushed to `dev` branch so the namespace and related objects will be created in dev GKE cluster. +Now create pull request from `dev` to `staging` branch and merge it. Then create a pull request from `staging` to `prod` branch and merge it. This will create the namespace and related objects in `staging` and `prod` GKE clusters. 
+ +Go to Google Cloud Console, click on the navigation menu and click on Kubernetes Engine > Config. Now, click on the `PACKAGES` tab. You will see a new [repo-sync][repo-sync] object created on each cluster but they will be in `Stalled` state. +This is because config sync needs to authenticate with GitHub to be able to read the manifests in the repo. It expects a secret named `git-creds` in the namespace for configuring [reposync][repo-sync] with the GitHub repo. +This secret stores the github user and its [personal access token][personal-access-token]. + +Follow these steps to create the `git-creds` secret in each cluster: +- For the GitHub user account that you plan to use, generate a [personal access token][personal-access-token] with read access to the configsync repo. It is recommended to use a [machine user account][machine-user-account] for this but you can use a personal user account just to try this reference architecture. +- Get IAM role roles/gkehubeditor to be able to use connect gateway to access the GKE cluster. If you are the owner of the project, this step can be skipped. +- Open cloudshell and run these commands: + ``` + gcloud config set project + + gcloud container fleet memberships get-credentials + + kubectl create secret generic git-creds --namespace="" --from-literal=username= --from-literal=token= + + gcloud container fleet memberships get-credentials + + kubectl create secret generic git-creds --namespace="" --from-literal=username= --from-literal=token= + + gcloud container fleet memberships get-credentials + + kubectl create secret generic git-creds --namespace="" --from-literal=username= --from-literal=token= + ``` + + +Go to Google Cloud Console, click on the navigation menu and click on Kubernetes Engine > Config. Now, click on the `PACKAGES` tab. You will see that the new [repo-sync][repo-sync] objects have `Sync status` as `Synced` with a green tick mark against them. 
This confirms that the [reposync][repo-sync] objects have been successfully created on all the clusters. + +This marks the completion of the team/namespace setup. + + +The platform admin will provide access to this dir to the members of the team. The team members will create manifests under this folder to manage their namespace scoped objects. + + +Important : +Platform admins should also restrict access to this directory so only the members of the team can update the files under it. +This can be done by creating CODEOWNERS files to allow the required team members to have access to this dir. This way you will ensure that only the team members can manage Kubernetes objects in this namespace and no other team can do that. If the team members try to create cluster-scoped objects from this dir, it will result in error as this folder is connected to [reposync][repo-sync] objects, which don't allow cluster level access. + + +### Mono repo vs multi repos +The platform admins and the app teams need to make a decision on what repo structure they will use for config sync. + +Using mono repo means: +- The same repo will be used for cluster level objects(created by platform admins) and namespace level objects(created by app teams). +- The platform admins will be the owner of the repo and maintain CODEOWNERS files to provide granular access to the platform admins and the app teams. +- However, if the app teams want to promote changes from one env to another, they will rely on platform admins or the repo owners to approve the PR. + +Using multiple repos means: +- The [rootsync][root-sync] will be tied to a repo that only platform-admins own and they can create cluster level objects from this repo. +- The [reposync][repo-sync] for individual teams will be created and tied to their own git repos. There is no need for granular permissions by platform admins as the app teams use their own repos to create namespace level objects. 
+- The app teams can create and merge PRs to their own repo independently to promote changes from one env to another. + + +## Contributing + +* [Contributing guidelines][contributing-guidelines] +* [Code of conduct][code-of-conduct] + + + +[contributing-guidelines]: CONTRIBUTING.md +[code-of-conduct]: code-of-conduct.md +[repo-sync]: https://cloud.google.com/anthos-config-management/docs/reference/rootsync-reposync-fields +[root-sync]: https://cloud.google.com/anthos-config-management/docs/reference/rootsync-reposync-fields +[personal-access-token]: https://docs.github.com/en/authentication/keeping-your-account-and-data-secure/managing-your-personal-access-tokens +[machine-user-account]: https://docs.github.com/en/get-started/learning-about-github/types-of-github-accounts +[kuberay]: https://ray-project.github.io/kuberay/ +[rbac]: https://kubernetes.io/docs/reference/access-authn-authz/rbac/ +[cluster-setup]: ../04_setup_clusters + + diff --git a/ml-platform/06_operating_teams/README.md b/ml-platform/06_operating_teams/README.md new file mode 100644 index 000000000..f550b3c0b --- /dev/null +++ b/ml-platform/06_operating_teams/README.md @@ -0,0 +1,154 @@ +Copyright 2024 Google LLC + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. + + +This doc describes how you as an App team member will use the configsync repo to manage your applications scoped to your namespace. +We will demonstrate this with an example of installing `ray` in the namespace. 
Typically, you can install any software or deploy any application in your namespace in the same fashion. + +## Prerequisite +- You have successfully run through [05_setup_teams][team-setup] module. + + +## Install a software(ray) + +This section is meant for the app teams that have permission only on a given namespace in the GKE clusters. The steps mentioned in this section must be executed by them. + +`Ray` is an open-source unified compute framework that makes it easy to scale AI and Python workloads — from reinforcement learning to deep learning to tuning, and model serving. +It is very commonly used by Machine Learning teams. In order to run `Ray` on Kubernetes, you need `Kuberay` operator. The `kuberay` operator can manage the ray clusters installed in different namespaces. So, multiple teams that need to use `ray` can each install it in their own namespace while the kuberay operator manages all of them. +Installing `kuberay` requires cluster level access as it creates the CRDs. We demonstrated installing `kuberay` in [cluster-setup][cluster-setup]. +Here we will show how to install `ray` in a namespace and configure `kuberay` to manage it. + +As an app team member, you will have access to `manifests/apps/``` folder in this repo if you are using a [mono repo][mono-repo] structure. You can perform the following steps to add `ray` manifests to the folder. The [reposync][repo-sync] will sync the manifests to the namespace on the cluster and you will get `ray` installed in your namespace. + +Note: If you are using multi repo structure, you will have access to the entire git repo and you can add the manifests in the similar fashion in the required directory to install `ray`. + +### Create the manifests +- Open `cloudshell` and run the following commands: + ``` + git clone repo && cd repo + + cp -r templates/_namespace_template/app/* manifests/apps// + ``` + +- Replace NAMESPACE with the name of the namespace in the newly copied files. 
+ ``` + sed -i 's#NAMESPACE##g' manifests/apps//* + ``` + +### Review the manifests +- `kustomization.yaml` specifies which yaml files should be synced with the cluster for this namespace. It references to a helm chart to install `ray` +- `values.yaml` contains the overriding values for the kuberay helm chart. +- `fluentd_config.yaml` specifies Configmap that will be applied to the namespace. +- `serviceaccount.yaml`(optional) sepcifies the kubernetes service account. This service account can be used for [workload identity][workload-identity]. + +Note that these files are provided as a template with the reference architecture for installing ray cluster. You can modify these templates as needed. + +### Apply the manifest: +- Go to `cloudshell` where you cloned the repo and copied the new files. + ``` + git add . + + git commit -m "Installing ray in namespace " + + git push + ``` + +The changes are pushed to `dev` branch so `ray` is installed on `dev` GKE cluster. To apply these changes to `staging`, create a pull request from `dev` to `staging` branch and merge it. Similarly, in order to apply the changes to `prod` cluster, create a pull request from `staging` to `prod` branch and merge it. + +Go to Google Cloud Console, click on the navigation menu and click on Kubernetes Engine > Config. Now, click on the `PACKAGES` tab. The [repo-sync][repo-sync] objects should show `Sync status` as `Synced` with green tick against it. + +### Verify the raycluster is in ready state in the namespace. +- Open cloudshell and run these commands: + ``` + gcloud config set project + + gcloud container fleet memberships get-credentials + + kubectl get raycluster -n + ``` +- This should show result similar to the following: + + ``` + NAME DESIRED WORKERS AVAILABLE WORKERS STATUS AGE + ray-cluster-kuberay 4m9s + ``` + +### Update kuberay operator to manage ray in your namespace + +This section is meant for platform admins or the group that has admin level permissions on the GKE clusters. 
The steps mentioned in these docs must be executed by them. + +[Kuberay][kuberay] operator manages `ray` on Kubernetes. You need to configure kuberay operator so that it manages `ray` in your namespace. `kuberay` was installed via [rootsync][root-sync] from the folder `manifests/clusters` by platform-admin so they should be performing the following step. +- Go to `cloudshell` where you cloned the repo. + +- Open `manifests/clusters/kuberay/values.yaml` +- add the namespace under `watchNamespace` tag. e.g. + ``` + watchNamespace: + - + ``` +- Commit and push the changes + ``` + git add . + + git commit -m "Updating kuberay operator to watch the namespace " + + git push + ``` +To apply these changes to `staging`, create a pull request from `dev` to `staging` branch and merge it. Similarly, in order to apply the changes to `prod` cluster, create a pull request from `staging` to `prod` branch and merge it. + +[kuberay][kuberay] operator will start managing the `ray` in your namespace on all the clusters. + +### Verify the ray head and worker has been started in your namespace. +- Open `cloudshell` and run these commands: + ``` + gcloud config set project + + gcloud container fleet memberships get-credentials + ``` +- Run `kubectl get raycluster -n ``` . This should show result similar to the following indicating the raycluster is now ready: + + ``` + NAME DESIRED WORKERS AVAILABLE WORKERS STATUS AGE + ray-cluster-kuberay 1 1 ready 29m + + ``` + +- Run `kubectl get pods -n ``` . 
This should show result similar to the following: + + ``` + NAME READY STATUS RESTARTS AGE + ray-cluster-kuberay-head-sp6dg 2/2 Running 0 3m21s + ray-cluster-kuberay-worker-workergroup-rzpjw 2/2 Running 0 3m21s + ``` + +## Contributing + +* [Contributing guidelines][contributing-guidelines] +* [Code of conduct][code-of-conduct] + + + +[contributing-guidelines]: CONTRIBUTING.md +[code-of-conduct]: code-of-conduct.md +[repo-sync]: https://cloud.google.com/anthos-config-management/docs/reference/rootsync-reposync-fields +[root-sync]: https://cloud.google.com/anthos-config-management/docs/reference/rootsync-reposync-fields +[personal-access-token]: https://docs.github.com/en/authentication/keeping-your-account-and-data-secure/managing-your-personal-access-tokens +[machine-user-account]: https://docs.github.com/en/get-started/learning-about-github/types-of-github-accounts +[kuberay]: https://ray-project.github.io/kuberay/ +[workload-identity]: https://cloud.google.com/kubernetes-engine/docs/how-to/workload-identity +[cluster-setup]: ../04_setup_clusters/README.md +[mono-repo]: ../05_setup_teams/README.md#mono-repo-vs-multi-repos +[team-setup]: ../05_setup_teams + + diff --git a/ml-platform/README.md b/ml-platform/README.md new file mode 100644 index 000000000..5b9d74241 --- /dev/null +++ b/ml-platform/README.md @@ -0,0 +1,82 @@ +Copyright 2024 Google LLC + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +# Reference architecture demonstrating how to build your ML platform on GKE. 
+ +## Purpose + +This tutorial demonstrates repeatable patterns to setup a multi environment ML platform on private [Google Kubernetes Engine](https://cloud.google.com/kubernetes-engine/docs/concepts/kubernetes-engine-overview) (GKE) that can be extended for end-to-end MLOps. + +It addresses following personae and provides means to automate and simplify their CUJs. + +### Platform Admin + +**CUJ 1** : Provide templates with built-in standard practices to stamp out GKE platforms to be used by ML Engineers/Data Scientist. + +**CUJ 2** : Provide space for the ML teams on GKE cluster to run their workloads and the permissions following the principle of least privilege. + +**CUJ 3** : Provide secure methods to the ML teams and the Operators to connect to the private GKE clusters. + +**CUJ 4** : Enforcing security policies on the underlying platform. + + + +### ML Engineers + +**CUJ 1** : Use ML tools like `ray` to perform their day to day tasks like data pre-processing, ML training etc. + +**CUJ 2** : Use a development environment like Jupyter Notebook for faster inner loop of ML development. + +### Operators + +**CUJ 1**: Act as a bridge between the Platform admins and the ML Engineers by providing and maintaining softwares needed by the ML engineers so they can focus on their job. + +**CUJ 2**: Deploying the models. + +**CUJ 3**: Building observability on the models. + +**CUJ 4**: Operationalizing the models. + +## Prerequistes + +1. This tutorial has been tested on [Cloud Shell](https://shell.cloud.google.com) which comes preinstalled with [Google Cloud SDK](https://cloud.google.com/sdk) is required to complete this tutorial. + +2. It is recommended to start the tutorial in a fresh project since the easiest way to clean up once complete is to delete the project. See [here](https://cloud.google.com/resource-manager/docs/creating-managing-projects) for more details. + +3. 
This tutorial requires a number of different GCP Quotas (>= 60 T4 GPUs and 400 CPU cores) in the region of your choosing. Please visit the [IAM -> Quotas page](https://console.cloud.google.com/iam-admin/quotas) in the context of your project and region to request additional quota before proceeding with this tutorial. + +## Deploy resources. + +Follow these steps in order to build the platform and use it. + +- Run Terraform in [01_gcp_project folder][projects]. This module creates GCP projects for your ML environments. This is an optional module. If you already have created GCP projects, directly run 02_gke module. + +- Run Terraform in [02_gke folder][gke]. This modules creates private GKE clusters for each environment. + +- Run Terraform in [03_configsync folder][configsync]. This modules enables Config management on GKE clusters, creates a repository in GitHub and creates a [root-sync][root-sync] on the clusters connected to the repo. + +- Run steps in [04_setup_clusters][setup-clusters]. This modules walks through how as platform admin you can set up cluster level software to the ML teams. + +- Run steps in [05_setup_teams][setup-teams]. This modules walks through how as platform admin you can set up spaces for ML teams on the cluster and transfer ownership to operators to maintain that space. + +- Run steps in [06_operating_teams][operating-teams]. This module walks through how as an operator you will provide the softwares required by ML engineers. 
+ + +[projects]: ./01_gcp_project/README.md +[gke]: ./02_gke/README.md +[configsync]: ./03_configsync/README.md +[setup-clusters]: ./04_setup_clusters/README.md +[setup-teams]: ./05_setup_teams/README.md +[operating-teams]: ./06_operating_teams/README.md +[repo-sync]: https://cloud.google.com/anthos-config-management/docs/reference/rootsync-reposync-fields +[root-sync]: https://cloud.google.com/anthos-config-management/docs/reference/rootsync-reposync-fields \ No newline at end of file From 9736e165ed46f30905c203036a62ab8231f4f568 Mon Sep 17 00:00:00 2001 From: Shobhit Gupta <43795024+gushob21@users.noreply.github.com> Date: Thu, 29 Feb 2024 23:46:23 +0000 Subject: [PATCH 02/39] fixing project_id variable defaults (#267) * Fixing project_id defaults --- ml-platform/02_gke/README.md | 36 ++++++++++++++++----------------- ml-platform/02_gke/variables.tf | 2 +- 2 files changed, 19 insertions(+), 19 deletions(-) diff --git a/ml-platform/02_gke/README.md b/ml-platform/02_gke/README.md index a136d3ef3..836ffcc86 100644 --- a/ml-platform/02_gke/README.md +++ b/ml-platform/02_gke/README.md @@ -32,24 +32,24 @@ limitations under the License. ## Inputs -| Name | Description | Type | Default | Required | -|------|-------------|------|-------------------------------------------------------------------------------------------------------------------------------------------------------------------|:--------:| -| [cluster\_name](#input\_cluster\_name) | Name of the GKE cluster | `string` | `"gke-ml"` | no | -| [lookup\_state\_bucket](#input\_lookup\_state\_bucket) | GCS bucket to look up TF state from previous steps. | `string` | n/a | yes | -| [network\_name](#input\_network\_name) | VPC network where GKE cluster will be created | `string` | `"ml-vpc"` | no | -| [ondemand\_taints](#input\_ondemand\_taints) | Taints to be applied to the on-demand node pool. |
list(object({
key = string
value = any
effect = string
}))
|
[
{
"effect": "NO_SCHEDULE",
"key": "ondemand",
"value": true
}
]
| no | -| [project\_id](#input\_project\_id) | The GCP project where the resources will be created. It is a map with environments a skeys and project\_ids s values | `map` | n/a
 An example : 
project_id = {
"dev": "gkebatchexpce3c8dcb",
"prod": "gkebatchexpce3c8dcb",
"staging": "gkebatchexpce3c8dcb"
}
| yes | -| [reserved\_taints](#input\_reserved\_taints) | Taints to be applied to the reserved node pool. |
list(object({
key = string
value = any
effect = string
}))
|
[
{
"effect": "NO_SCHEDULE",
"key": "reserved",
"value": true
}
]
| no | -| [routing\_mode](#input\_routing\_mode) | VPC routing mode. | `string` | `"GLOBAL"` | no | -| [spot\_taints](#input\_spot\_taints) | Taints to be applied to the spot node pool. |
list(object({
key = string
value = any
effect = string
}))
|
[
{
"effect": "NO_SCHEDULE",
"key": "spot",
"value": true
}
]
| no | -| [subnet\_01\_description](#input\_subnet\_01\_description) | Description of the first subnet. | `string` | `"subnet 01"` | no | -| [subnet\_01\_ip](#input\_subnet\_01\_ip) | CIDR of the first subnet. | `string` | `"10.40.0.0/22"` | no | -| [subnet\_01\_name](#input\_subnet\_01\_name) | Name of the first subnet in the VPC network. | `string` | `"ml-vpc-subnet-01"` | no | -| [subnet\_01\_region](#input\_subnet\_01\_region) | Region of the first subnet. | `string` | `"us-central1"` | no | -| [subnet\_02\_description](#input\_subnet\_02\_description) | Description of the second subnet. | `string` | `"subnet 02"` | no | -| [subnet\_02\_ip](#input\_subnet\_02\_ip) | CIDR of the second subnet. | `string` | `"10.12.0.0/22"` | no | -| [subnet\_02\_name](#input\_subnet\_02\_name) | Name of the second subnet in the VPC network. | `string` | `"gke-vpc-subnet-02"` | no | -| [subnet\_02\_region](#input\_subnet\_02\_region) | Region of the second subnet. | `string` | `"us-west2"` | no | +| Name | Description | Type | Default | Required | +|------|-------------|------|---------------------------------------------------------------------------------------------------------------------------------------------|:--------:| +| [cluster\_name](#input\_cluster\_name) | Name of the GKE cluster | `string` | `"gke-ml"` | no | +| [lookup\_state\_bucket](#input\_lookup\_state\_bucket) | GCS bucket to look up TF state from previous steps. | `string` | n/a | yes | +| [network\_name](#input\_network\_name) | VPC network where GKE cluster will be created | `string` | `"ml-vpc"` | no | +| [ondemand\_taints](#input\_ondemand\_taints) | Taints to be applied to the on-demand node pool. |
list(object({
key = string
value = any
effect = string
}))
|
[
{
"effect": "NO_SCHEDULE",
"key": "ondemand",
"value": true
}
]
| no | +| [project\_id](#input\_project\_id) | The GCP project where the resources will be created. It is a map with environments a skeys and project\_ids s values | `map` | n/a
 An example : 
project_id = {
"dev": "project_id1",
"staging": "project_id2",
"prod": "project_id3"
}
| yes | +| [reserved\_taints](#input\_reserved\_taints) | Taints to be applied to the reserved node pool. |
list(object({
key = string
value = any
effect = string
}))
|
[
{
"effect": "NO_SCHEDULE",
"key": "reserved",
"value": true
}
]
| no | +| [routing\_mode](#input\_routing\_mode) | VPC routing mode. | `string` | `"GLOBAL"` | no | +| [spot\_taints](#input\_spot\_taints) | Taints to be applied to the spot node pool. |
list(object({
key = string
value = any
effect = string
}))
|
[
{
"effect": "NO_SCHEDULE",
"key": "spot",
"value": true
}
]
| no | +| [subnet\_01\_description](#input\_subnet\_01\_description) | Description of the first subnet. | `string` | `"subnet 01"` | no | +| [subnet\_01\_ip](#input\_subnet\_01\_ip) | CIDR of the first subnet. | `string` | `"10.40.0.0/22"` | no | +| [subnet\_01\_name](#input\_subnet\_01\_name) | Name of the first subnet in the VPC network. | `string` | `"ml-vpc-subnet-01"` | no | +| [subnet\_01\_region](#input\_subnet\_01\_region) | Region of the first subnet. | `string` | `"us-central1"` | no | +| [subnet\_02\_description](#input\_subnet\_02\_description) | Description of the second subnet. | `string` | `"subnet 02"` | no | +| [subnet\_02\_ip](#input\_subnet\_02\_ip) | CIDR of the second subnet. | `string` | `"10.12.0.0/22"` | no | +| [subnet\_02\_name](#input\_subnet\_02\_name) | Name of the second subnet in the VPC network. | `string` | `"gke-vpc-subnet-02"` | no | +| [subnet\_02\_region](#input\_subnet\_02\_region) | Region of the second subnet. | `string` | `"us-west2"` | no | ## Outputs diff --git a/ml-platform/02_gke/variables.tf b/ml-platform/02_gke/variables.tf index bd3da28f1..83ccafcb1 100644 --- a/ml-platform/02_gke/variables.tf +++ b/ml-platform/02_gke/variables.tf @@ -17,7 +17,7 @@ variable "project_id" { description = "The GCP project where the resources will be created. 
It is a map with environments a skeys and project_ids s values" default = {} #Below is an example of not null project_id variable - #default = { "dev" : "gkebatchexpce3c8dcb", "staging" : "gkebatchexpce3c8dcb", "prod" : "gkebatchexpce3c8dcb" } + #default = { "dev" : "project_id1", "staging" : "project_id2", "prod" : "project_id3" } } variable "network_name" { From 6bde1aece0d5d56c6ae3b74e47f147fb1231d628 Mon Sep 17 00:00:00 2001 From: Shobhit Gupta <43795024+gushob21@users.noreply.github.com> Date: Fri, 1 Mar 2024 00:27:33 +0000 Subject: [PATCH 03/39] Fixing documentation (#268) * Fixing documentation --- ml-platform/04_setup_clusters/README.md | 2 +- ml-platform/README.md | 16 ++++++---------- 2 files changed, 7 insertions(+), 11 deletions(-) diff --git a/ml-platform/04_setup_clusters/README.md b/ml-platform/04_setup_clusters/README.md index 5613a0b52..64f8240df 100644 --- a/ml-platform/04_setup_clusters/README.md +++ b/ml-platform/04_setup_clusters/README.md @@ -54,7 +54,7 @@ You just followed `GitOps` to promote changes from `dev` to higher environments. Open the configsync repo and go to `manifests/clusters`, you will see there is a cluster selector created for each cluster via yaml files. ### Install a cluster scoped software -This section describes how platform admins will use the configsync repo to manage cluster scoped software or cluster level objects. These softwares could be used by multiple teams in their namespaces. An example of such softwares is [kuberay][kuberay] that can manage ray clusters in multiple namespace. +This section describes how platform admins will use the configsync repo to manage cluster scoped software or cluster level objects. These software could be used by multiple teams in their namespaces. An example of such software is [kuberay][kuberay] that can manage ray clusters in multiple namespace. Let's install [Kuberay][kuberay] as a cluster level software that includes CRDs and deployments. 
Kuberay has a component called operator that facilitates `ray` on Kubernetes. We will install Kuberay operator in default namespace. The operator will then orchestrate `ray clusters` created in different namespace by different teams in the future. diff --git a/ml-platform/README.md b/ml-platform/README.md index 5b9d74241..54037e5b2 100644 --- a/ml-platform/README.md +++ b/ml-platform/README.md @@ -35,26 +35,22 @@ It addresses following personae and provides means to automate and simplify thei **CUJ 1** : Use ML tools like `ray` to perform their day to day tasks like data pre-processing, ML training etc. -**CUJ 2** : Use a development environment like Jupyter Notebook for faster inner loop of ML development. +**CUJ 2** : Use a development environment like Jupyter Notebook for faster inner loop of ML development. **[TBD]** ### Operators -**CUJ 1**: Act as a bridge between the Platform admins and the ML Engineers by providing and maintaining softwares needed by the ML engineers so they can focus on their job. +**CUJ 1**: Act as a bridge between the Platform admins and the ML Engineers by providing and maintaining software needed by the ML engineers so they can focus on their job. -**CUJ 2**: Deploying the models. +**CUJ 2**: Deploying the models. **[TBD]** -**CUJ 3**: Building observability on the models. +**CUJ 3**: Building observability on the models. **[TBD]** -**CUJ 4**: Operationalizing the models. +**CUJ 4**: Operationalizing the models. **[TBD]** ## Prerequistes 1. This tutorial has been tested on [Cloud Shell](https://shell.cloud.google.com) which comes preinstalled with [Google Cloud SDK](https://cloud.google.com/sdk) is required to complete this tutorial. -2. It is recommended to start the tutorial in a fresh project since the easiest way to clean up once complete is to delete the project. See [here](https://cloud.google.com/resource-manager/docs/creating-managing-projects) for more details. - -3. 
This tutorial requires a number of different GCP Quotas (>= 60 T4 GPUs and 400 CPU cores) in the region of your choosing. Please visit the [IAM -> Quotas page](https://console.cloud.google.com/iam-admin/quotas) in the context of your project and region to request additional quota before proceeding with this tutorial. - ## Deploy resources. Follow these steps in order to build the platform and use it. @@ -69,7 +65,7 @@ Follow these steps in order to build the platform and use it. - Run steps in [05_setup_teams][setup-teams]. This modules walks through how as platform admin you can set up spaces for ML teams on the cluster and transfer ownership to operators to maintain that space. -- Run steps in [06_operating_teams][operating-teams]. This module walks through how as an operator you will provide the softwares required by ML engineers. +- Run steps in [06_operating_teams][operating-teams]. This module walks through how as an operator you will provide the software required by ML engineers. 
[projects]: ./01_gcp_project/README.md From 9ab2b071d595addf8acf57bd33f4fa4c7cf14149 Mon Sep 17 00:00:00 2001 From: Aaron Rueth Date: Wed, 6 Mar 2024 16:57:46 +0000 Subject: [PATCH 04/39] Formatted Terraform files --- ml-platform/01_gcp_project/backend.tf | 1 - ml-platform/01_gcp_project/main.tf | 10 +- .../modules/projects/outputs.tf | 2 +- .../modules/projects/projects.tf | 36 +++--- ml-platform/01_gcp_project/outputs.tf | 4 +- ml-platform/01_gcp_project/providers.tf | 2 +- ml-platform/02_gke/main.tf | 120 +++++++++--------- .../02_gke/modules/cloud-nat/outputs.tf | 1 - .../02_gke/modules/cloud-nat/versions.tf | 2 +- ml-platform/02_gke/modules/cluster/gke.tf | 6 +- ml-platform/02_gke/modules/cluster/outputs.tf | 2 +- .../02_gke/modules/cluster/variables.tf | 10 +- .../02_gke/modules/cluster/versions.tf | 1 - ml-platform/02_gke/modules/network/outputs.tf | 2 +- .../02_gke/modules/network/variables.tf | 31 +++-- .../02_gke/modules/network/versions.tf | 2 +- ml-platform/02_gke/modules/network/vpc.tf | 36 ++---- .../02_gke/modules/node-pools/nodepools.tf | 20 +-- .../02_gke/modules/node-pools/variables.tf | 21 +-- .../02_gke/modules/node-pools/versions.tf | 1 - .../02_gke/modules/vm-reservations/outputs.tf | 4 +- .../modules/vm-reservations/reservations.tf | 4 +- .../modules/vm-reservations/variables.tf | 4 + .../modules/vm-reservations/versions.tf | 1 - ml-platform/02_gke/outputs.tf | 2 +- ml-platform/02_gke/providers.tf | 1 - ml-platform/02_gke/variables.tf | 60 +++++---- ml-platform/03_configsync/main.tf | 36 +++--- ml-platform/03_configsync/outputs.tf | 7 +- ml-platform/03_configsync/variables.tf | 25 ++-- 30 files changed, 231 insertions(+), 223 deletions(-) diff --git a/ml-platform/01_gcp_project/backend.tf b/ml-platform/01_gcp_project/backend.tf index 5b9bff1bd..b54d5aca8 100644 --- a/ml-platform/01_gcp_project/backend.tf +++ b/ml-platform/01_gcp_project/backend.tf @@ -18,4 +18,3 @@ terraform { bucket = "YOUR_STATE_BUCKET" } } - diff --git 
a/ml-platform/01_gcp_project/main.tf b/ml-platform/01_gcp_project/main.tf index 305bfce2c..1dadd943e 100644 --- a/ml-platform/01_gcp_project/main.tf +++ b/ml-platform/01_gcp_project/main.tf @@ -13,10 +13,10 @@ # limitations under the License. module "gcp-project" { - source = "./modules/projects" - org_id = var.org_id - folder_id = var.folder_id - env = var.env + source = "./modules/projects" + org_id = var.org_id + folder_id = var.folder_id + env = var.env billing_account = var.billing_account - project_name = var.project_name + project_name = var.project_name } diff --git a/ml-platform/01_gcp_project/modules/projects/outputs.tf b/ml-platform/01_gcp_project/modules/projects/outputs.tf index e087e6c85..431fe53dd 100644 --- a/ml-platform/01_gcp_project/modules/projects/outputs.tf +++ b/ml-platform/01_gcp_project/modules/projects/outputs.tf @@ -14,4 +14,4 @@ output "project_ids" { value = "${google_project.project_under_folder}" == {} ? "${google_project.project_under_org}" : "${google_project.project_under_folder}" -} \ No newline at end of file +} diff --git a/ml-platform/01_gcp_project/modules/projects/projects.tf b/ml-platform/01_gcp_project/modules/projects/projects.tf index 55c88ee86..76f7d1ef3 100644 --- a/ml-platform/01_gcp_project/modules/projects/projects.tf +++ b/ml-platform/01_gcp_project/modules/projects/projects.tf @@ -17,18 +17,18 @@ resource "random_id" "random_project_id_suffix" { } resource "google_project" "project_under_folder" { - for_each = var.folder_id != null ? var.env : toset([]) - name = format("%s-%s",var.project_name,each.value) - project_id = format("%s-%s-%s",var.project_name,random_id.random_project_id_suffix.hex,each.value) - folder_id = var.folder_id + for_each = var.folder_id != null ? 
var.env : toset([]) + name = format("%s-%s", var.project_name, each.value) + project_id = format("%s-%s-%s", var.project_name, random_id.random_project_id_suffix.hex, each.value) + folder_id = var.folder_id billing_account = var.billing_account } resource "google_project" "project_under_org" { - for_each = var.folder_id == null ? var.env : toset([]) - name = format("%s-%s",var.project_name,each.value) - project_id = format("%s-%s-%s",var.project_name,random_id.random_project_id_suffix.hex,each.value) - org_id = var.org_id + for_each = var.folder_id == null ? var.env : toset([]) + name = format("%s-%s", var.project_name, each.value) + project_id = format("%s-%s-%s", var.project_name, random_id.random_project_id_suffix.hex, each.value) + org_id = var.org_id billing_account = var.billing_account } @@ -38,7 +38,7 @@ resource "google_project_service" "project_services" { service = "cloudresourcemanager.googleapis.com" disable_on_destroy = true disable_dependent_services = true - depends_on = [google_project.project_under_folder,google_project.project_under_org] + depends_on = [google_project.project_under_folder, google_project.project_under_org] } resource "google_project_service" "project_services-1" { @@ -47,25 +47,25 @@ resource "google_project_service" "project_services-1" { service = "iam.googleapis.com" disable_on_destroy = true disable_dependent_services = true - depends_on = [google_project.project_under_folder,google_project.project_under_org] + depends_on = [google_project.project_under_folder, google_project.project_under_org] } resource "google_project_service" "project_services-2" { - for_each = var.folder_id == null ? google_project.project_under_org: google_project.project_under_folder + for_each = var.folder_id == null ? 
google_project.project_under_org : google_project.project_under_folder project = each.value.id service = "container.googleapis.com" disable_on_destroy = true disable_dependent_services = true - depends_on = [google_project.project_under_folder,google_project.project_under_org] + depends_on = [google_project.project_under_folder, google_project.project_under_org] } resource "google_project_service" "project_services-3" { - for_each = var.folder_id == null ? google_project.project_under_org: google_project.project_under_folder + for_each = var.folder_id == null ? google_project.project_under_org : google_project.project_under_folder project = each.value.id service = "compute.googleapis.com" disable_on_destroy = true disable_dependent_services = true - depends_on = [google_project.project_under_folder,google_project.project_under_org] + depends_on = [google_project.project_under_folder, google_project.project_under_org] } resource "google_project_service" "project_services-4" { @@ -74,7 +74,7 @@ resource "google_project_service" "project_services-4" { service = "anthos.googleapis.com" disable_on_destroy = true disable_dependent_services = true - depends_on = [google_project.project_under_folder,google_project.project_under_org] + depends_on = [google_project.project_under_folder, google_project.project_under_org] } resource "google_project_service" "project_services-5" { @@ -83,7 +83,7 @@ resource "google_project_service" "project_services-5" { service = "anthosconfigmanagement.googleapis.com" disable_on_destroy = true disable_dependent_services = true - depends_on = [google_project.project_under_folder,google_project.project_under_org] + depends_on = [google_project.project_under_folder, google_project.project_under_org] } resource "google_project_service" "project_services-6" { @@ -92,5 +92,5 @@ resource "google_project_service" "project_services-6" { service = "gkehub.googleapis.com" disable_on_destroy = true disable_dependent_services = true - depends_on = 
[google_project.project_under_folder,google_project.project_under_org] -} \ No newline at end of file + depends_on = [google_project.project_under_folder, google_project.project_under_org] +} diff --git a/ml-platform/01_gcp_project/outputs.tf b/ml-platform/01_gcp_project/outputs.tf index 7e4d72a6c..11352c942 100644 --- a/ml-platform/01_gcp_project/outputs.tf +++ b/ml-platform/01_gcp_project/outputs.tf @@ -13,5 +13,5 @@ # limitations under the License. output "project_ids" { - value = {for k, v in "${module.gcp-project.project_ids}" : k => v.project_id} -} \ No newline at end of file + value = { for k, v in "${module.gcp-project.project_ids}" : k => v.project_id } +} diff --git a/ml-platform/01_gcp_project/providers.tf b/ml-platform/01_gcp_project/providers.tf index 1817d23eb..95ff9fe61 100644 --- a/ml-platform/01_gcp_project/providers.tf +++ b/ml-platform/01_gcp_project/providers.tf @@ -19,4 +19,4 @@ terraform { version = "4.72.1" } } -} \ No newline at end of file +} diff --git a/ml-platform/02_gke/main.tf b/ml-platform/02_gke/main.tf index 2d8ed3dea..d8fbc0b21 100644 --- a/ml-platform/02_gke/main.tf +++ b/ml-platform/02_gke/main.tf @@ -13,11 +13,11 @@ # limitations under the License. data "terraform_remote_state" "gcp-projects" { - count = length(keys("${var.project_id}")) == 0 ? 1 : 0 + count = length(keys("${var.project_id}")) == 0 ? 
1 : 0 backend = "gcs" config = { - bucket = var.lookup_state_bucket - prefix = "01_gcp_project" + bucket = var.lookup_state_bucket + prefix = "01_gcp_project" } } @@ -26,17 +26,17 @@ locals { } module "create-vpc" { - for_each = local.parsed_project_id - source = "./modules/network" - project_id = each.value - network_name = format("%s-%s",var.network_name,each.key) - routing_mode = var.routing_mode - subnet_01_name = format("%s-%s",var.subnet_01_name,each.key) - subnet_01_ip = var.subnet_01_ip - subnet_01_region = var.subnet_01_region - subnet_02_name = format("%s-%s",var.subnet_02_name,each.key) - subnet_02_ip = var.subnet_02_ip - subnet_02_region = var.subnet_02_region + for_each = local.parsed_project_id + source = "./modules/network" + project_id = each.value + network_name = format("%s-%s", var.network_name, each.key) + routing_mode = var.routing_mode + subnet_01_name = format("%s-%s", var.subnet_01_name, each.key) + subnet_01_ip = var.subnet_01_ip + subnet_01_region = var.subnet_01_region + subnet_02_name = format("%s-%s", var.subnet_02_name, each.key) + subnet_02_ip = var.subnet_02_ip + subnet_02_region = var.subnet_02_region #default_route_name = format("%s-%s","default-route",each.key) } @@ -49,69 +49,69 @@ resource "google_gke_hub_feature" "configmanagement_acm_feature" { } module "gke" { - for_each = local.parsed_project_id - source = "./modules/cluster" - cluster_name = format("%s-%s",var.cluster_name,each.key) - network = module.create-vpc[each.key].vpc - subnet = module.create-vpc[each.key].subnet-1 - project_id = each.value - region = var.subnet_01_region - zone = "${var.subnet_01_region}-a" - master_auth_networks_ipcidr = var.subnet_01_ip - depends_on = [ google_gke_hub_feature.configmanagement_acm_feature ] - env = each.key + for_each = local.parsed_project_id + source = "./modules/cluster" + cluster_name = format("%s-%s", var.cluster_name, each.key) + network = module.create-vpc[each.key].vpc + subnet = module.create-vpc[each.key].subnet-1 + 
project_id = each.value + region = var.subnet_01_region + zone = "${var.subnet_01_region}-a" + master_auth_networks_ipcidr = var.subnet_01_ip + depends_on = [google_gke_hub_feature.configmanagement_acm_feature] + env = each.key } module "reservation" { - for_each = local.parsed_project_id - source = "./modules/vm-reservations" - cluster_name = module.gke[each.key].cluster_name - zone = "${var.subnet_01_region}-a" - project_id = each.value - depends_on = [ module.gke ] + for_each = local.parsed_project_id + source = "./modules/vm-reservations" + cluster_name = module.gke[each.key].cluster_name + zone = "${var.subnet_01_region}-a" + project_id = each.value + depends_on = [module.gke] } module "node_pool-reserved" { - for_each = local.parsed_project_id - source = "./modules/node-pools" - node_pool_name = "reservation" - project_id = each.value - cluster_name = module.gke[each.key].cluster_name - region = "${var.subnet_01_region}" - taints = var.reserved_taints - resource_type = "reservation" + for_each = local.parsed_project_id + source = "./modules/node-pools" + node_pool_name = "reservation" + project_id = each.value + cluster_name = module.gke[each.key].cluster_name + region = var.subnet_01_region + taints = var.reserved_taints + resource_type = "reservation" reservation_name = module.reservation[each.key].reservation_name } module "node_pool-ondemand" { - for_each = local.parsed_project_id - source = "./modules/node-pools" + for_each = local.parsed_project_id + source = "./modules/node-pools" node_pool_name = "ondemand" - project_id = each.value + project_id = each.value cluster_name = module.gke[each.key].cluster_name - region = "${var.subnet_01_region}" - taints = var.ondemand_taints - resource_type = "ondemand" + region = var.subnet_01_region + taints = var.ondemand_taints + resource_type = "ondemand" } module "node_pool-spot" { - for_each = local.parsed_project_id - source = "./modules/node-pools" + for_each = local.parsed_project_id + source = 
"./modules/node-pools" node_pool_name = "spot" - project_id = each.value + project_id = each.value cluster_name = module.gke[each.key].cluster_name - region = "${var.subnet_01_region}" - taints = var.spot_taints - resource_type = "spot" + region = var.subnet_01_region + taints = var.spot_taints + resource_type = "spot" } module "cloud-nat" { - for_each = local.parsed_project_id - source = "./modules/cloud-nat" - project_id = each.value - region = split("/", module.create-vpc[each.key].subnet-1)[3] - name = format("%s-%s","nat-for-acm",each.key) - network = module.create-vpc[each.key].vpc - create_router = true - router = format("%s-%s","router-for-acm",each.key) - depends_on = [ module.create-vpc ] + for_each = local.parsed_project_id + source = "./modules/cloud-nat" + project_id = each.value + region = split("/", module.create-vpc[each.key].subnet-1)[3] + name = format("%s-%s", "nat-for-acm", each.key) + network = module.create-vpc[each.key].vpc + create_router = true + router = format("%s-%s", "router-for-acm", each.key) + depends_on = [module.create-vpc] } diff --git a/ml-platform/02_gke/modules/cloud-nat/outputs.tf b/ml-platform/02_gke/modules/cloud-nat/outputs.tf index 86bf7c39d..acd7f8ce6 100644 --- a/ml-platform/02_gke/modules/cloud-nat/outputs.tf +++ b/ml-platform/02_gke/modules/cloud-nat/outputs.tf @@ -31,4 +31,3 @@ output "router_name" { description = "Cloud NAT router name" value = local.router } - diff --git a/ml-platform/02_gke/modules/cloud-nat/versions.tf b/ml-platform/02_gke/modules/cloud-nat/versions.tf index 8422786e6..ee7532c5e 100644 --- a/ml-platform/02_gke/modules/cloud-nat/versions.tf +++ b/ml-platform/02_gke/modules/cloud-nat/versions.tf @@ -16,7 +16,7 @@ terraform { required_providers { google = { - source = "hashicorp/google" + source = "hashicorp/google" #version = ">= 4.51, < 5.0" version = "4.72.1" } diff --git a/ml-platform/02_gke/modules/cluster/gke.tf b/ml-platform/02_gke/modules/cluster/gke.tf index 34186dbc8..418068752 100644 --- 
a/ml-platform/02_gke/modules/cluster/gke.tf +++ b/ml-platform/02_gke/modules/cluster/gke.tf @@ -107,14 +107,14 @@ resource "google_container_cluster" "gke_batch" { channel = "RAPID" } private_cluster_config { - enable_private_nodes = true + enable_private_nodes = true enable_private_endpoint = true - master_ipv4_cidr_block = "172.16.0.32/28" + master_ipv4_cidr_block = "172.16.0.32/28" } master_authorized_networks_config { cidr_blocks { - cidr_block = var.master_auth_networks_ipcidr + cidr_block = var.master_auth_networks_ipcidr display_name = "vpc-cidr" } } diff --git a/ml-platform/02_gke/modules/cluster/outputs.tf b/ml-platform/02_gke/modules/cluster/outputs.tf index b26d3be8e..57bd8a0de 100644 --- a/ml-platform/02_gke/modules/cluster/outputs.tf +++ b/ml-platform/02_gke/modules/cluster/outputs.tf @@ -30,4 +30,4 @@ output "gke_project_id" { output "env" { value = var.env -} \ No newline at end of file +} diff --git a/ml-platform/02_gke/modules/cluster/variables.tf b/ml-platform/02_gke/modules/cluster/variables.tf index 66e3cda06..5d76462c4 100644 --- a/ml-platform/02_gke/modules/cluster/variables.tf +++ b/ml-platform/02_gke/modules/cluster/variables.tf @@ -36,23 +36,23 @@ variable "zone" { } variable "master_auth_networks_ipcidr" { - type = string + type = string description = "master authorized network" } variable "network" { - type = string + type = string description = "VPC network where the cluster will be created" } variable "subnet" { - type = string + type = string description = "subnetwork where the cluster will be created" } variable "env" { - type = string + type = string description = "environment" -} \ No newline at end of file +} diff --git a/ml-platform/02_gke/modules/cluster/versions.tf b/ml-platform/02_gke/modules/cluster/versions.tf index dc628619e..fc374eab1 100644 --- a/ml-platform/02_gke/modules/cluster/versions.tf +++ b/ml-platform/02_gke/modules/cluster/versions.tf @@ -24,4 +24,3 @@ terraform { } } } - diff --git 
a/ml-platform/02_gke/modules/network/outputs.tf b/ml-platform/02_gke/modules/network/outputs.tf index bf9d36dad..13026f645 100644 --- a/ml-platform/02_gke/modules/network/outputs.tf +++ b/ml-platform/02_gke/modules/network/outputs.tf @@ -25,4 +25,4 @@ output "subnet-1" { output "subnet-2" { value = google_compute_subnetwork.subnet-2.id description = "subnet2." -} \ No newline at end of file +} diff --git a/ml-platform/02_gke/modules/network/variables.tf b/ml-platform/02_gke/modules/network/variables.tf index e85ab0e48..db344133d 100644 --- a/ml-platform/02_gke/modules/network/variables.tf +++ b/ml-platform/02_gke/modules/network/variables.tf @@ -14,43 +14,46 @@ variable "project_id" { description = "Id of the GCP project where VPC is to be created." - type = string + type = string } + variable "network_name" { description = "Name of the VPC network." - type = string + type = string } + variable "routing_mode" { description = "The network routing mode." - type = string - default = "GLOBAL" + type = string + default = "GLOBAL" } + variable "subnet_01_name" { description = "Name of first subnet." - type = string + type = string } + variable "subnet_01_ip" { description = "IP range of first subnet." - type = string + type = string } + variable "subnet_01_region" { description = "Region of first subnet." - type = string + type = string } variable "subnet_02_name" { description = "Name of the second subnet." - type = string + type = string } + variable "subnet_02_ip" { description = "IP range of second subnet." - type = string + type = string } + variable "subnet_02_region" { description = "Region of second subnet." - type = string + type = string } -//variable "default_route_name" { -// description = "Name of the default route to internet." 
-// type = string -//} diff --git a/ml-platform/02_gke/modules/network/versions.tf b/ml-platform/02_gke/modules/network/versions.tf index 033f83d8f..c5f8c84a4 100644 --- a/ml-platform/02_gke/modules/network/versions.tf +++ b/ml-platform/02_gke/modules/network/versions.tf @@ -19,4 +19,4 @@ terraform { version = ">= 4.28.0" } } -} \ No newline at end of file +} diff --git a/ml-platform/02_gke/modules/network/vpc.tf b/ml-platform/02_gke/modules/network/vpc.tf index ad7071b5a..a80166be5 100644 --- a/ml-platform/02_gke/modules/network/vpc.tf +++ b/ml-platform/02_gke/modules/network/vpc.tf @@ -13,34 +13,26 @@ # limitations under the License. resource "google_compute_network" "vpc-network" { - project = var.project_id - name = var.network_name - auto_create_subnetworks = false - routing_mode = var.routing_mode + project = var.project_id + name = var.network_name + auto_create_subnetworks = false + routing_mode = var.routing_mode } resource "google_compute_subnetwork" "subnet-1" { - project = var.project_id - name = var.subnet_01_name - ip_cidr_range = var.subnet_01_ip - region = var.subnet_01_region - network = google_compute_network.vpc-network.id + project = var.project_id + name = var.subnet_01_name + ip_cidr_range = var.subnet_01_ip + region = var.subnet_01_region + network = google_compute_network.vpc-network.id private_ip_google_access = true } resource "google_compute_subnetwork" "subnet-2" { - project = var.project_id - name = var.subnet_02_name - ip_cidr_range = var.subnet_02_ip - region = var.subnet_02_region - network = google_compute_network.vpc-network.id + project = var.project_id + name = var.subnet_02_name + ip_cidr_range = var.subnet_02_ip + region = var.subnet_02_region + network = google_compute_network.vpc-network.id private_ip_google_access = true } - -//resource "google_compute_route" "default-route" { -//name = var.default_route_name -//dest_range = "0.0.0.0/0" -//network = google_compute_network.vpc-network.id -//priority = 1000 -//next_hop_gateway 
= "default-internet-gateway" -//} diff --git a/ml-platform/02_gke/modules/node-pools/nodepools.tf b/ml-platform/02_gke/modules/node-pools/nodepools.tf index 6eec2bc7d..402e45695 100644 --- a/ml-platform/02_gke/modules/node-pools/nodepools.tf +++ b/ml-platform/02_gke/modules/node-pools/nodepools.tf @@ -13,19 +13,19 @@ # limitations under the License. resource "google_container_node_pool" "node-pool" { - name = format("%s-%s",var.cluster_name,var.node_pool_name) - project = var.project_id - cluster = var.cluster_name - location = var.region + name = format("%s-%s", var.cluster_name, var.node_pool_name) + project = var.project_id + cluster = var.cluster_name + location = var.region node_config { machine_type = var.machine_type - taint = var.taints + taint = var.taints labels = { "resource-type" : var.resource_type } guest_accelerator { - type = var.accelerator + type = var.accelerator count = var.accelerator_count } oauth_scopes = [ @@ -33,11 +33,11 @@ resource "google_container_node_pool" "node-pool" { ] dynamic "reservation_affinity" { - for_each = var.reservation_name != "" ? [1] : [ ] + for_each = var.reservation_name != "" ? [1] : [] content { consume_reservation_type = "SPECIFIC_RESERVATION" - key = "compute.googleapis.com/reservation-name" - values = [var.reservation_name] + key = "compute.googleapis.com/reservation-name" + values = [var.reservation_name] } } } @@ -61,4 +61,4 @@ resource "google_container_node_pool" "node-pool" { network_config { enable_private_nodes = true } -} \ No newline at end of file +} diff --git a/ml-platform/02_gke/modules/node-pools/variables.tf b/ml-platform/02_gke/modules/node-pools/variables.tf index f217268b8..973d7a1fe 100644 --- a/ml-platform/02_gke/modules/node-pools/variables.tf +++ b/ml-platform/02_gke/modules/node-pools/variables.tf @@ -13,19 +13,22 @@ # limitations under the License. 
variable "node_pool_name" { - type = string + type = string description = "Name of the node pool" } + variable "project_id" { type = string description = "The GCP project where the resources will be created" default = "" } + variable "cluster_name" { type = string description = "GKE cluster name" default = "" } + variable "region" { type = string description = "The GCP zone where the reservation will be created" @@ -49,11 +52,10 @@ variable "taints" { variable "resource_type" { description = "ondemand/spot/reserved." - type = string - default = "ondemand" + type = string + default = "ondemand" } - variable "accelerator" { type = string description = "The GPU accelerator to use." @@ -65,6 +67,7 @@ variable "accelerator_count" { description = "The number of accelerators per machine." default = 2 } + variable "machine_reservation_count" { type = number description = "Number of machines reserved instances with GPUs" @@ -72,12 +75,12 @@ variable "machine_reservation_count" { } variable "autoscaling" { - type = map - default = { "total_min_node_count" : 0, "total_max_node_count" : 24, "location_policy" : "ANY"} + type = map(any) + default = { "total_min_node_count" : 0, "total_max_node_count" : 24, "location_policy" : "ANY" } } variable "reservation_name" { description = "reservation name to which the nodepool will be associated" - type = string - default = "" -} \ No newline at end of file + type = string + default = "" +} diff --git a/ml-platform/02_gke/modules/node-pools/versions.tf b/ml-platform/02_gke/modules/node-pools/versions.tf index dc628619e..fc374eab1 100644 --- a/ml-platform/02_gke/modules/node-pools/versions.tf +++ b/ml-platform/02_gke/modules/node-pools/versions.tf @@ -24,4 +24,3 @@ terraform { } } } - diff --git a/ml-platform/02_gke/modules/vm-reservations/outputs.tf b/ml-platform/02_gke/modules/vm-reservations/outputs.tf index 367c796d1..11ffcc6d8 100644 --- a/ml-platform/02_gke/modules/vm-reservations/outputs.tf +++ 
b/ml-platform/02_gke/modules/vm-reservations/outputs.tf @@ -13,5 +13,5 @@ # limitations under the License. output "reservation_name" { - value = split("/",google_compute_reservation.machine_reservation.id)[5] -} \ No newline at end of file + value = split("/", google_compute_reservation.machine_reservation.id)[5] +} diff --git a/ml-platform/02_gke/modules/vm-reservations/reservations.tf b/ml-platform/02_gke/modules/vm-reservations/reservations.tf index 3e35e47c5..03438d0f7 100644 --- a/ml-platform/02_gke/modules/vm-reservations/reservations.tf +++ b/ml-platform/02_gke/modules/vm-reservations/reservations.tf @@ -15,7 +15,7 @@ resource "google_compute_reservation" "machine_reservation" { project = var.project_id specific_reservation_required = true - name = format("%s-%s",var.cluster_name,"reservation") + name = format("%s-%s", var.cluster_name, "reservation") zone = var.zone specific_reservation { count = var.machine_reservation_count @@ -27,4 +27,4 @@ resource "google_compute_reservation" "machine_reservation" { } } } -} \ No newline at end of file +} diff --git a/ml-platform/02_gke/modules/vm-reservations/variables.tf b/ml-platform/02_gke/modules/vm-reservations/variables.tf index 3a8e3482d..7ca5e5af3 100644 --- a/ml-platform/02_gke/modules/vm-reservations/variables.tf +++ b/ml-platform/02_gke/modules/vm-reservations/variables.tf @@ -17,16 +17,19 @@ variable "project_id" { description = "The GCP project where the resources will be created" default = "" } + variable "cluster_name" { type = string description = "GKE cluster name" default = "" } + variable "zone" { type = string description = "The GCP zone where the reservation will be created" default = "us-central1-a" } + variable "machine_type" { type = string description = "The machine type to use." @@ -44,6 +47,7 @@ variable "accelerator_count" { description = "The number of accelerators per machine." 
default = 2 } + variable "machine_reservation_count" { type = number description = "Number of machines reserved instances with GPUs" diff --git a/ml-platform/02_gke/modules/vm-reservations/versions.tf b/ml-platform/02_gke/modules/vm-reservations/versions.tf index dc628619e..fc374eab1 100644 --- a/ml-platform/02_gke/modules/vm-reservations/versions.tf +++ b/ml-platform/02_gke/modules/vm-reservations/versions.tf @@ -24,4 +24,3 @@ terraform { } } } - diff --git a/ml-platform/02_gke/outputs.tf b/ml-platform/02_gke/outputs.tf index 76dca95a5..08500e25e 100644 --- a/ml-platform/02_gke/outputs.tf +++ b/ml-platform/02_gke/outputs.tf @@ -14,4 +14,4 @@ output "gke_cluster" { value = module.gke -} \ No newline at end of file +} diff --git a/ml-platform/02_gke/providers.tf b/ml-platform/02_gke/providers.tf index dc628619e..fc374eab1 100644 --- a/ml-platform/02_gke/providers.tf +++ b/ml-platform/02_gke/providers.tf @@ -24,4 +24,3 @@ terraform { } } } - diff --git a/ml-platform/02_gke/variables.tf b/ml-platform/02_gke/variables.tf index 83ccafcb1..05765c043 100644 --- a/ml-platform/02_gke/variables.tf +++ b/ml-platform/02_gke/variables.tf @@ -13,7 +13,7 @@ # limitations under the License. variable "project_id" { - type = map + type = map(any) description = "The GCP project where the resources will be created. It is a map with environments a skeys and project_ids s values" default = {} #Below is an example of not null project_id variable @@ -21,67 +21,76 @@ variable "project_id" { } variable "network_name" { - default = "ml-vpc" + default = "ml-vpc" description = "VPC network where GKE cluster will be created" - type = string + type = string } + variable "routing_mode" { - default = "GLOBAL" + default = "GLOBAL" description = "VPC routing mode." - type = string + type = string } + variable "subnet_01_name" { - default = "ml-vpc-subnet-01" + default = "ml-vpc-subnet-01" description = "Name of the first subnet in the VPC network." 
- type = string + type = string } + variable "subnet_01_ip" { - default = "10.40.0.0/22" + default = "10.40.0.0/22" description = "CIDR of the first subnet." - type = string + type = string } + variable "subnet_01_region" { - default = "us-central1" + default = "us-central1" description = "Region of the first subnet." - type = string + type = string } + variable "subnet_01_description" { - default = "subnet 01" + default = "subnet 01" description = "Description of the first subnet." - type = string + type = string } variable "subnet_02_name" { - default = "gke-vpc-subnet-02" + default = "gke-vpc-subnet-02" description = "Name of the second subnet in the VPC network." - type = string + type = string } + variable "subnet_02_ip" { - default = "10.12.0.0/22" + default = "10.12.0.0/22" description = "CIDR of the second subnet." - type = string + type = string } + variable "subnet_02_region" { - default = "us-west2" + default = "us-west2" description = "Region of the second subnet." - type = string + type = string } + variable "subnet_02_description" { - default = "subnet 02" + default = "subnet 02" description = "Description of the second subnet." - type = string + type = string } variable "lookup_state_bucket" { description = "GCS bucket to look up TF state from previous steps." - type = string - default = "YOUR_STATE_BUCKET" + type = string + default = "YOUR_STATE_BUCKET" } variable "cluster_name" { description = "Name of the GKE cluster" - default = "gke-ml" - type = string + default = "gke-ml" + type = string } + variable "reserved_taints" { description = "Taints to be applied to the reserved node pool." 
type = list(object({ @@ -123,4 +132,3 @@ variable "spot_taints" { effect = "NO_SCHEDULE" }] } - diff --git a/ml-platform/03_configsync/main.tf b/ml-platform/03_configsync/main.tf index b8ad93325..671abee5c 100644 --- a/ml-platform/03_configsync/main.tf +++ b/ml-platform/03_configsync/main.tf @@ -15,14 +15,14 @@ data "terraform_remote_state" "gke-clusters" { backend = "gcs" config = { - bucket = var.lookup_state_bucket - prefix = "02_gke" + bucket = var.lookup_state_bucket + prefix = "02_gke" } } locals { parsed_gke_info = data.terraform_remote_state.gke-clusters.outputs.gke_cluster - project_id_list = [for k,v in "${data.terraform_remote_state.gke-clusters.outputs.gke_cluster}" : v.gke_project_id] + project_id_list = [for k, v in "${data.terraform_remote_state.gke-clusters.outputs.gke_cluster}" : v.gke_project_id] } //resource "google_gke_hub_feature" "configmanagement_acm_feature" { @@ -40,20 +40,20 @@ resource "google_gke_hub_membership" "membership" { membership_id = each.value["cluster_name"] endpoint { gke_cluster { - resource_link = format("%s/%s","//container.googleapis.com",each.value["cluster_id"]) + resource_link = format("%s/%s", "//container.googleapis.com", each.value["cluster_id"]) } } lifecycle { ignore_changes = [ - "labels","description" + "labels", "description" ] } #depends_on = [ google_gke_hub_feature.configmanagement_acm_feature ] } resource "github_repository" "acm_repo" { - name = var.configsync_repo_name - description = "Repo for Config Sync" + name = var.configsync_repo_name + description = "Repo for Config Sync" visibility = "private" has_issues = false has_projects = false @@ -63,27 +63,27 @@ resource "github_repository" "acm_repo" { allow_squash_merge = true allow_rebase_merge = true delete_branch_on_merge = false - auto_init = true - vulnerability_alerts = true + auto_init = true + vulnerability_alerts = true } //Create a branch for each env resource "github_branch" "branch" { for_each = local.parsed_gke_info - repository = 
split("/",github_repository.acm_repo.full_name)[1] + repository = split("/", github_repository.acm_repo.full_name)[1] branch = each.key depends_on = [github_repository.acm_repo] } //Set default branch as the lowest env resource "github_branch_default" "default_branch" { - repository = split("/",github_repository.acm_repo.full_name)[1] + repository = split("/", github_repository.acm_repo.full_name)[1] branch = tostring(keys(local.parsed_gke_info)[0]) #rename = true depends_on = [github_branch.branch] } #Protect branches other than the default branch resource "github_branch_protection_v3" "branch_protection" { - for_each = local.parsed_gke_info - repository = split("/",github_repository.acm_repo.full_name)[1] + for_each = local.parsed_gke_info + repository = split("/", github_repository.acm_repo.full_name)[1] branch = each.key required_pull_request_reviews { required_approving_review_count = 1 @@ -98,7 +98,7 @@ resource "github_branch_protection_v3" "branch_protection" { resource "google_gke_hub_feature_membership" "feature_member" { provider = google-beta - for_each = local.parsed_gke_info + for_each = local.parsed_gke_info project = each.value["gke_project_id"] location = "global" feature = "configmanagement" @@ -108,21 +108,21 @@ resource "google_gke_hub_feature_membership" "feature_member" { config_sync { source_format = "unstructured" git { - sync_repo = "https://github.com/${github_repository.acm_repo.full_name}.git" + sync_repo = "https://github.com/${github_repository.acm_repo.full_name}.git" sync_branch = each.value["env"] policy_dir = "manifests/clusters" secret_type = "token" } } policy_controller { - enabled = true + enabled = true template_library_installed = true - referential_rules_enabled = true + referential_rules_enabled = true } } provisioner "local-exec" { - command = "${path.module}/create_cluster_yamls.sh ${var.github_org} ${github_repository.acm_repo.full_name} ${var.github_user} ${var.github_email} ${each.value["env"]} 
${each.value["cluster_name"]} ${index(keys(local.parsed_gke_info),each.key)}" + command = "${path.module}/create_cluster_yamls.sh ${var.github_org} ${github_repository.acm_repo.full_name} ${var.github_user} ${var.github_email} ${each.value["env"]} ${each.value["cluster_name"]} ${index(keys(local.parsed_gke_info), each.key)}" } #depends_on = [ diff --git a/ml-platform/03_configsync/outputs.tf b/ml-platform/03_configsync/outputs.tf index 2e9c6603f..a19b71988 100644 --- a/ml-platform/03_configsync/outputs.tf +++ b/ml-platform/03_configsync/outputs.tf @@ -15,6 +15,7 @@ output "membership" { value = google_gke_hub_membership.membership } -output "val"{ -value = local.parsed_gke_info -} \ No newline at end of file + +output "val" { + value = local.parsed_gke_info +} diff --git a/ml-platform/03_configsync/variables.tf b/ml-platform/03_configsync/variables.tf index 000789e66..f04844d23 100644 --- a/ml-platform/03_configsync/variables.tf +++ b/ml-platform/03_configsync/variables.tf @@ -14,32 +14,35 @@ variable "lookup_state_bucket" { description = "GCS bucket to look up TF state from previous steps." - type = string - default = "YOUR_STATE_BUCKET" + type = string + default = "YOUR_STATE_BUCKET" } variable "configsync_repo_name" { - type = string + type = string description = "Name of the GitHub repo that will be synced to the cluster with Config sync." - default = "config-sync-repo" + default = "config-sync-repo" } variable "github_user" { description = "GitHub user name." - type = string - default = "YOUR_GIT_USER" + type = string + default = "YOUR_GIT_USER" } + variable "github_email" { description = "GitHub user email." - type = string - default = "YOUR_GIT_USER_EMAIL" + type = string + default = "YOUR_GIT_USER_EMAIL" } + variable "github_org" { - type = string + type = string description = "GitHub org." - default = "YOUR_GIT_ORG" + default = "YOUR_GIT_ORG" } + variable "github_token" { - type = string + type = string description = "GitHub token. 
It is a token with write permissions as it will create a repo in the GitHub org." } From 2aba065ae294362724ef3d940ffade86571e038f Mon Sep 17 00:00:00 2001 From: Aaron Rueth Date: Wed, 6 Mar 2024 17:05:55 +0000 Subject: [PATCH 05/39] Commented out the license in the README files --- ml-platform/01_gcp_project/README.md | 3 ++- ml-platform/02_gke/README.md | 2 ++ ml-platform/02_gke/modules/cloud-nat/README.md | 2 ++ ml-platform/02_gke/modules/network/README.md | 2 ++ ml-platform/03_configsync/README.md | 2 ++ ml-platform/04_setup_clusters/README.md | 3 ++- ml-platform/05_setup_teams/README.md | 4 ++-- ml-platform/06_operating_teams/README.md | 4 ++-- ml-platform/README.md | 2 ++ 9 files changed, 18 insertions(+), 6 deletions(-) diff --git a/ml-platform/01_gcp_project/README.md b/ml-platform/01_gcp_project/README.md index 91fbdca18..269576a79 100644 --- a/ml-platform/01_gcp_project/README.md +++ b/ml-platform/01_gcp_project/README.md @@ -1,3 +1,4 @@ + ## Requirements | Name | Version | diff --git a/ml-platform/02_gke/README.md b/ml-platform/02_gke/README.md index 836ffcc86..248d1ee57 100644 --- a/ml-platform/02_gke/README.md +++ b/ml-platform/02_gke/README.md @@ -1,3 +1,4 @@ + ## Requirements | Name | Version | diff --git a/ml-platform/02_gke/modules/cloud-nat/README.md b/ml-platform/02_gke/modules/cloud-nat/README.md index 6952d4e9f..e498d7958 100644 --- a/ml-platform/02_gke/modules/cloud-nat/README.md +++ b/ml-platform/02_gke/modules/cloud-nat/README.md @@ -1,3 +1,4 @@ + # Terraform Google Cloud NAT Module This module handles opinionated Google Cloud Platform Cloud NAT creation and configuration. 
diff --git a/ml-platform/02_gke/modules/network/README.md b/ml-platform/02_gke/modules/network/README.md index 6de9bdc13..14e1591d8 100644 --- a/ml-platform/02_gke/modules/network/README.md +++ b/ml-platform/02_gke/modules/network/README.md @@ -1,3 +1,4 @@ + ## Requirements | Name | Version | diff --git a/ml-platform/03_configsync/README.md b/ml-platform/03_configsync/README.md index ef777933e..70da6a370 100644 --- a/ml-platform/03_configsync/README.md +++ b/ml-platform/03_configsync/README.md @@ -1,3 +1,4 @@ + ## Requirements | Name | Version | diff --git a/ml-platform/04_setup_clusters/README.md b/ml-platform/04_setup_clusters/README.md index 64f8240df..8e69190ee 100644 --- a/ml-platform/04_setup_clusters/README.md +++ b/ml-platform/04_setup_clusters/README.md @@ -1,3 +1,4 @@ + ### This doc is meant for the platform admins or the group that has admin level permissions on the GKE clusters. The steps mentioned in these docs must be executed by them. ## Prerequisite diff --git a/ml-platform/05_setup_teams/README.md b/ml-platform/05_setup_teams/README.md index e0c90e4d4..70516f1a6 100644 --- a/ml-platform/05_setup_teams/README.md +++ b/ml-platform/05_setup_teams/README.md @@ -1,3 +1,4 @@ + ### This doc is meant for the platform admins or the group that has admin level permissions on the GKE clusters. The steps mentioned in these docs must be executed by them. ## Prerequisite diff --git a/ml-platform/06_operating_teams/README.md b/ml-platform/06_operating_teams/README.md index f550b3c0b..9b5712210 100644 --- a/ml-platform/06_operating_teams/README.md +++ b/ml-platform/06_operating_teams/README.md @@ -1,3 +1,4 @@ + This doc describes how you as an App team member will use the configsync repo to manage your applications scoped to your namespace. We will demonstrate this with an example of installing `ray` in the namespace. Typically, you can install any software or deploy any application in your namespace in the same fashion. 
diff --git a/ml-platform/README.md b/ml-platform/README.md index 54037e5b2..59e712b18 100644 --- a/ml-platform/README.md +++ b/ml-platform/README.md @@ -1,3 +1,4 @@ + # Reference architecture demonstrating how to build your ML platform on GKE. ## Purpose From 2aa60173bb7836613305a265014d83d2eafbb0f0 Mon Sep 17 00:00:00 2001 From: Shobhit Gupta <43795024+gushob21@users.noreply.github.com> Date: Mon, 11 Mar 2024 17:57:52 +0000 Subject: [PATCH 06/39] Mlops platform updates (#326) * Adding default single env installation and updated documentation --- ml-platform/01_gcp_project/README.md | 108 ----- ml-platform/01_gcp_project/backend.tf | 20 - ml-platform/01_gcp_project/main.tf | 22 - ml-platform/01_gcp_project/providers.tf | 22 - ml-platform/01_gcp_project/variables.tf | 43 -- ml-platform/02_gke/README.md | 141 ------- ml-platform/02_gke/main.tf | 117 ------ .../02_gke/modules/cloud-nat/versions.tf | 30 -- .../02_gke/modules/network/versions.tf | 22 - ml-platform/02_gke/outputs.tf | 17 - ml-platform/03_configsync/README.md | 147 ------- ml-platform/03_configsync/backend.tf | 20 - ml-platform/03_configsync/main.tf | 131 ------ ml-platform/03_configsync/outputs.tf | 21 - ml-platform/03_configsync/variables.tf | 48 --- ml-platform/04_setup_clusters/README.md | 139 ------- ml-platform/05_setup_teams/README.md | 169 -------- ml-platform/06_operating_teams/README.md | 154 ------- ml-platform/README.md | 278 ++++++++++--- ml-platform/{02_gke => }/backend.tf | 3 +- .../create_cluster_yamls.sh | 45 +- ml-platform/create_git_cred.sh | 38 ++ ml-platform/create_namespace.sh | 57 +++ ml-platform/install_kuberay_operator.sh | 47 +++ ml-platform/install_ray_cluster.sh | 48 +++ ml-platform/main.tf | 390 ++++++++++++++++++ ml-platform/manage_ray_ns.sh | 43 ++ ml-platform/mlenv.auto.tfvars | 9 + .../{02_gke => }/modules/cloud-nat/README.md | 2 - .../{02_gke => }/modules/cloud-nat/main.tf | 0 .../{02_gke => }/modules/cloud-nat/outputs.tf | 1 + .../modules/cloud-nat/variables.tf | 0 
ml-platform/modules/cloud-nat/versions.tf | 50 +++ .../{02_gke => }/modules/cluster/gke.tf | 19 +- .../{02_gke => }/modules/cluster/outputs.tf | 2 +- .../{02_gke => }/modules/cluster/variables.tf | 2 +- .../cluster}/versions.tf | 23 +- .../{02_gke => }/modules/network/README.md | 2 - .../{02_gke => }/modules/network/outputs.tf | 2 +- .../{02_gke => }/modules/network/variables.tf | 11 +- .../network}/versions.tf | 15 +- .../{02_gke => }/modules/network/vpc.tf | 8 + .../modules/node-pools/nodepools.tf | 11 +- .../modules/node-pools/variables.tf | 9 +- .../node-pools/versions.tf} | 23 +- .../modules/projects/outputs.tf | 2 +- .../modules/projects/projects.tf | 2 +- .../modules/projects/variables.tf | 0 .../cluster => modules/projects}/versions.tf | 15 +- .../modules/vm-reservations/outputs.tf | 2 +- .../modules/vm-reservations/reservations.tf | 2 +- .../modules/vm-reservations/variables.tf | 4 - .../modules/vm-reservations/versions.tf | 39 ++ ml-platform/{01_gcp_project => }/outputs.tf | 8 +- .../acm-template/manifests/apps/.gitkeep | 0 .../acm-template/manifests/clusters/.gitkeep | 0 .../templates/_cluster_template/cluster.yaml | 0 .../_cluster_template/config-selector.yaml | 0 .../kuberay/kustomization.yaml | 0 .../kuberay/rayclusters.yaml | 0 .../_cluster_template/kuberay/rayjobs.yaml | 0 .../kuberay/rayservices.yaml | 0 .../_cluster_template/kuberay/rbac.yaml | 0 .../_cluster_template/kuberay/values.yaml | 3 +- .../_cluster_template/kustomization.yaml | 0 .../templates/_cluster_template/selector.yaml | 0 .../_cluster_template/team/kustomization.yaml | 0 .../_cluster_template/team/namespace.yaml | 2 +- .../team/network-policy.yaml | 0 .../_cluster_template/team/rbac.yaml | 0 .../_cluster_template/team/reposync.yaml | 2 +- .../app/fluentd_config.yaml | 0 .../app/kustomization.yaml | 0 .../app/serviceaccount.yaml | 0 .../_namespace_template/app/values.yaml | 0 ml-platform/{02_gke => }/variables.tf | 123 +++++- .../providers.tf => versions.tf} | 21 +- 77 files 
changed, 1178 insertions(+), 1556 deletions(-) delete mode 100644 ml-platform/01_gcp_project/README.md delete mode 100644 ml-platform/01_gcp_project/backend.tf delete mode 100644 ml-platform/01_gcp_project/main.tf delete mode 100644 ml-platform/01_gcp_project/providers.tf delete mode 100644 ml-platform/01_gcp_project/variables.tf delete mode 100644 ml-platform/02_gke/README.md delete mode 100644 ml-platform/02_gke/main.tf delete mode 100644 ml-platform/02_gke/modules/cloud-nat/versions.tf delete mode 100644 ml-platform/02_gke/modules/network/versions.tf delete mode 100644 ml-platform/02_gke/outputs.tf delete mode 100644 ml-platform/03_configsync/README.md delete mode 100644 ml-platform/03_configsync/backend.tf delete mode 100644 ml-platform/03_configsync/main.tf delete mode 100644 ml-platform/03_configsync/outputs.tf delete mode 100644 ml-platform/03_configsync/variables.tf delete mode 100644 ml-platform/04_setup_clusters/README.md delete mode 100644 ml-platform/05_setup_teams/README.md delete mode 100644 ml-platform/06_operating_teams/README.md rename ml-platform/{02_gke => }/backend.tf (96%) rename ml-platform/{03_configsync => }/create_cluster_yamls.sh (53%) create mode 100755 ml-platform/create_git_cred.sh create mode 100755 ml-platform/create_namespace.sh create mode 100755 ml-platform/install_kuberay_operator.sh create mode 100755 ml-platform/install_ray_cluster.sh create mode 100644 ml-platform/main.tf create mode 100755 ml-platform/manage_ray_ns.sh create mode 100644 ml-platform/mlenv.auto.tfvars rename ml-platform/{02_gke => }/modules/cloud-nat/README.md (99%) rename ml-platform/{02_gke => }/modules/cloud-nat/main.tf (100%) rename ml-platform/{02_gke => }/modules/cloud-nat/outputs.tf (99%) rename ml-platform/{02_gke => }/modules/cloud-nat/variables.tf (100%) create mode 100644 ml-platform/modules/cloud-nat/versions.tf rename ml-platform/{02_gke => }/modules/cluster/gke.tf (87%) rename ml-platform/{02_gke => }/modules/cluster/outputs.tf (99%) rename 
ml-platform/{02_gke => }/modules/cluster/variables.tf (99%) rename ml-platform/{02_gke/modules/node-pools => modules/cluster}/versions.tf (71%) rename ml-platform/{02_gke => }/modules/network/README.md (99%) rename ml-platform/{02_gke => }/modules/network/outputs.tf (99%) rename ml-platform/{02_gke => }/modules/network/variables.tf (92%) rename ml-platform/{02_gke/modules/vm-reservations => modules/network}/versions.tf (80%) rename ml-platform/{02_gke => }/modules/network/vpc.tf (85%) rename ml-platform/{02_gke => }/modules/node-pools/nodepools.tf (90%) rename ml-platform/{02_gke => }/modules/node-pools/variables.tf (98%) rename ml-platform/{02_gke/providers.tf => modules/node-pools/versions.tf} (71%) rename ml-platform/{01_gcp_project => }/modules/projects/outputs.tf (99%) rename ml-platform/{01_gcp_project => }/modules/projects/projects.tf (99%) rename ml-platform/{01_gcp_project => }/modules/projects/variables.tf (100%) rename ml-platform/{02_gke/modules/cluster => modules/projects}/versions.tf (80%) rename ml-platform/{02_gke => }/modules/vm-reservations/outputs.tf (99%) rename ml-platform/{02_gke => }/modules/vm-reservations/reservations.tf (99%) rename ml-platform/{02_gke => }/modules/vm-reservations/variables.tf (99%) create mode 100644 ml-platform/modules/vm-reservations/versions.tf rename ml-platform/{01_gcp_project => }/outputs.tf (72%) rename ml-platform/{03_configsync => }/templates/acm-template/manifests/apps/.gitkeep (100%) rename ml-platform/{03_configsync => }/templates/acm-template/manifests/clusters/.gitkeep (100%) rename ml-platform/{03_configsync => }/templates/acm-template/templates/_cluster_template/cluster.yaml (100%) rename ml-platform/{03_configsync => }/templates/acm-template/templates/_cluster_template/config-selector.yaml (100%) rename ml-platform/{03_configsync => }/templates/acm-template/templates/_cluster_template/kuberay/kustomization.yaml (100%) rename ml-platform/{03_configsync => 
}/templates/acm-template/templates/_cluster_template/kuberay/rayclusters.yaml (100%) rename ml-platform/{03_configsync => }/templates/acm-template/templates/_cluster_template/kuberay/rayjobs.yaml (100%) rename ml-platform/{03_configsync => }/templates/acm-template/templates/_cluster_template/kuberay/rayservices.yaml (100%) rename ml-platform/{03_configsync => }/templates/acm-template/templates/_cluster_template/kuberay/rbac.yaml (100%) rename ml-platform/{03_configsync => }/templates/acm-template/templates/_cluster_template/kuberay/values.yaml (99%) rename ml-platform/{03_configsync => }/templates/acm-template/templates/_cluster_template/kustomization.yaml (100%) rename ml-platform/{03_configsync => }/templates/acm-template/templates/_cluster_template/selector.yaml (100%) rename ml-platform/{03_configsync => }/templates/acm-template/templates/_cluster_template/team/kustomization.yaml (100%) rename ml-platform/{03_configsync => }/templates/acm-template/templates/_cluster_template/team/namespace.yaml (97%) rename ml-platform/{03_configsync => }/templates/acm-template/templates/_cluster_template/team/network-policy.yaml (100%) rename ml-platform/{03_configsync => }/templates/acm-template/templates/_cluster_template/team/rbac.yaml (100%) rename ml-platform/{03_configsync => }/templates/acm-template/templates/_cluster_template/team/reposync.yaml (99%) rename ml-platform/{03_configsync => }/templates/acm-template/templates/_namespace_template/app/fluentd_config.yaml (100%) rename ml-platform/{03_configsync => }/templates/acm-template/templates/_namespace_template/app/kustomization.yaml (100%) rename ml-platform/{03_configsync => }/templates/acm-template/templates/_namespace_template/app/serviceaccount.yaml (100%) rename ml-platform/{03_configsync => }/templates/acm-template/templates/_namespace_template/app/values.yaml (100%) rename ml-platform/{02_gke => }/variables.tf (56%) rename ml-platform/{03_configsync/providers.tf => versions.tf} (81%) diff --git 
a/ml-platform/01_gcp_project/README.md b/ml-platform/01_gcp_project/README.md deleted file mode 100644 index 269576a79..000000000 --- a/ml-platform/01_gcp_project/README.md +++ /dev/null @@ -1,108 +0,0 @@ - -## Requirements - -| Name | Version | -|------|---------| -| [google](#requirement\_google) | 4.72.1 | - - -## Modules - -| Name | Source | Version | -|------|--------|---------| -| [gcp-project](#module\_gcp-project) | ./modules/projects | n/a | - - -## Inputs - -| Name | Description | Type | Default | Required | -|-----------------------------------------------------------------------------------|--------------------------------------------------|------|------------------------------|:--------:| -| [billing\_account](#input\_billing\_account) | GCP billing account | `string` | n/a | yes | -| [env](#input\_env) | List of environments | `set(string)` |
[
"dev"
]
| no | -| [folder\_id](#input\_folder\_id) | Folder Id where the GCP projects will be created | `string` | `null` | no | -| [org\_id](#input\_org\_id) | The GCP orig id | `string` | n/a | yes | -| [project\_name](#input\_project\_name) | Project name | `string` | `ml-platfrom` | no | - -## Outputs - -| Name | Description | -|------|-------------| -| [project\_ids](#output\_project\_ids) | n/a | - - -## Workflow - -This module accepts a list of environments and creates a GCP project for each environment. - -Typically, you would want to have dev, staging and production environments created in separate projects. To have such isolation, pass `env` input variable as `[ "dev", "staging", "prod" ]`. This will create one project for dev, staging and prod environments. You can update the input variable `env` based on how many environments/projects you want to create. - -However, if you want to use a single project for multiple environments, you can create just one project by passing one element to `env` input variable list e.g [ "dev" ] or ["my-playground"] etc. - -## Prerequisite -To run this Terraform Module, you need to have the following IAM roles: -- roles/resourcemanager.projectCreator - -## Usage - -- Create a new GCP project that will host the TF state bucket. - - To create a new project, open `cloudshell` and run the following command: - ``` - gcloud projects create - ``` - - Associate billing account to the project - ``` - gcloud beta billing projects link \ - --billing-account - ``` - -- Create a GCS bucket in the project for storing TF state. - - To create a new bucket, run the following command in `cloudshell` - ``` - gcloud storage buckets create gs://-tf-state --location= --project - ``` -- Clone the repo and change dir - ``` - git clone https://github.com/GoogleCloudPlatform/ai-on-gke - cd ml-platform/01_gcp_project - ``` -- In backend.tf replace `YOUR_STATE_BUCKET` with the name of the GCS bucket. 
-- In variables.tf: - - replace `YOUR_GCP_ORG_ID` with your GCP Org ID. - - replace `YOUR_BILLING_ACCOUNT` with GCP your Billing account. - - (optional) overridde the default value of `folder_id` with the numeric ID of the folder this project should be created under. If you leave `folder_id` null, the projects will bw created under your org. - - (optional) override the default value of `env`. See [workflow](#workflow) for details. - -- terraform init -- terraform plan -- terraform apply --auto-approve - - -## Clean up - -1. The easiest way to prevent continued billing for the resources that you created for this tutorial is to delete the project you created for the tutorial. Run the following commands from Cloud Shell: - - ```bash - gcloud config unset project && \ - echo y | gcloud projects delete - ``` - -2. If the project needs to be left intact, another option is to destroy the infrastructure created from this module. Note, this does not destroy the Cloud Storage bucket containing the Terraform state and service enablement created out of Terraform. - - ```bash - cd ml-platform/01_gcp_project && \ - terraform destroy --auto-approve - ``` \ No newline at end of file diff --git a/ml-platform/01_gcp_project/backend.tf b/ml-platform/01_gcp_project/backend.tf deleted file mode 100644 index b54d5aca8..000000000 --- a/ml-platform/01_gcp_project/backend.tf +++ /dev/null @@ -1,20 +0,0 @@ -# Copyright 2024 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -terraform { - backend "gcs" { - prefix = "01_gcp_project" - bucket = "YOUR_STATE_BUCKET" - } -} diff --git a/ml-platform/01_gcp_project/main.tf b/ml-platform/01_gcp_project/main.tf deleted file mode 100644 index 1dadd943e..000000000 --- a/ml-platform/01_gcp_project/main.tf +++ /dev/null @@ -1,22 +0,0 @@ -# Copyright 2024 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -module "gcp-project" { - source = "./modules/projects" - org_id = var.org_id - folder_id = var.folder_id - env = var.env - billing_account = var.billing_account - project_name = var.project_name -} diff --git a/ml-platform/01_gcp_project/providers.tf b/ml-platform/01_gcp_project/providers.tf deleted file mode 100644 index 95ff9fe61..000000000 --- a/ml-platform/01_gcp_project/providers.tf +++ /dev/null @@ -1,22 +0,0 @@ -# Copyright 2024 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -terraform { - required_providers { - google = { - source = "hashicorp/google" - version = "4.72.1" - } - } -} diff --git a/ml-platform/01_gcp_project/variables.tf b/ml-platform/01_gcp_project/variables.tf deleted file mode 100644 index bb1adda73..000000000 --- a/ml-platform/01_gcp_project/variables.tf +++ /dev/null @@ -1,43 +0,0 @@ -# Copyright 2024 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -variable "org_id" { - type = string - description = "The GCP orig id" - default = "YOUR_GCP_ORG_ID" -} - -variable "env" { - type = set(string) - description = "List of environments" - default = ["dev"] -} - -variable "folder_id" { - type = string - description = "Folder Id where the GCP projects will be created" - default = null -} - -variable "billing_account" { - type = string - description = "GCP billing account" - default = "YOUR_BILLING_ACCOUNT" -} - -variable "project_name" { - type = string - description = "GCP project name" - default = "ml-platform" -} \ No newline at end of file diff --git a/ml-platform/02_gke/README.md b/ml-platform/02_gke/README.md deleted file mode 100644 index 248d1ee57..000000000 --- a/ml-platform/02_gke/README.md +++ /dev/null @@ -1,141 +0,0 @@ - -## Requirements - -| Name | Version | -|------|---------| -| [google](#requirement\_google) | 4.72.1 | -| [google-beta](#requirement\_google-beta) | 4.72.1 | - -## Modules - -| Name | Source | Version | -|------|--------|---------| -| [cloud-nat](#module\_cloud-nat) | 
./modules/cloud-nat | n/a | -| [create-vpc](#module\_create-vpc) | ./modules/network | n/a | -| [gke](#module\_gke) | ./modules/cluster | n/a | -| [node\_pool-ondemand](#module\_node\_pool-ondemand) | ./modules/node-pools | n/a | -| [node\_pool-reserved](#module\_node\_pool-reserved) | ./modules/node-pools | n/a | -| [node\_pool-spot](#module\_node\_pool-spot) | ./modules/node-pools | n/a | -| [reservation](#module\_reservation) | ./modules/vm-reservations | n/a | - -## Inputs - -| Name | Description | Type | Default | Required | -|------|-------------|------|---------------------------------------------------------------------------------------------------------------------------------------------|:--------:| -| [cluster\_name](#input\_cluster\_name) | Name of the GKE cluster | `string` | `"gke-ml"` | no | -| [lookup\_state\_bucket](#input\_lookup\_state\_bucket) | GCS bucket to look up TF state from previous steps. | `string` | n/a | yes | -| [network\_name](#input\_network\_name) | VPC network where GKE cluster will be created | `string` | `"ml-vpc"` | no | -| [ondemand\_taints](#input\_ondemand\_taints) | Taints to be applied to the on-demand node pool. |
list(object({
key = string
value = any
effect = string
}))
|
[
{
"effect": "NO_SCHEDULE",
"key": "ondemand",
"value": true
}
]
| no | -| [project\_id](#input\_project\_id) | The GCP project where the resources will be created. It is a map with environments a skeys and project\_ids s values | `map` | n/a
 An example : 
project_id = {
"dev": "project_id1",
"staging": "project_id2",
"prod": "project_id3"
}
| yes | -| [reserved\_taints](#input\_reserved\_taints) | Taints to be applied to the reserved node pool. |
list(object({
key = string
value = any
effect = string
}))
|
[
{
"effect": "NO_SCHEDULE",
"key": "reserved",
"value": true
}
]
| no | -| [routing\_mode](#input\_routing\_mode) | VPC routing mode. | `string` | `"GLOBAL"` | no | -| [spot\_taints](#input\_spot\_taints) | Taints to be applied to the spot node pool. |
list(object({
key = string
value = any
effect = string
}))
|
[
{
"effect": "NO_SCHEDULE",
"key": "spot",
"value": true
}
]
| no | -| [subnet\_01\_description](#input\_subnet\_01\_description) | Description of the first subnet. | `string` | `"subnet 01"` | no | -| [subnet\_01\_ip](#input\_subnet\_01\_ip) | CIDR of the first subnet. | `string` | `"10.40.0.0/22"` | no | -| [subnet\_01\_name](#input\_subnet\_01\_name) | Name of the first subnet in the VPC network. | `string` | `"ml-vpc-subnet-01"` | no | -| [subnet\_01\_region](#input\_subnet\_01\_region) | Region of the first subnet. | `string` | `"us-central1"` | no | -| [subnet\_02\_description](#input\_subnet\_02\_description) | Description of the second subnet. | `string` | `"subnet 02"` | no | -| [subnet\_02\_ip](#input\_subnet\_02\_ip) | CIDR of the second subnet. | `string` | `"10.12.0.0/22"` | no | -| [subnet\_02\_name](#input\_subnet\_02\_name) | Name of the second subnet in the VPC network. | `string` | `"gke-vpc-subnet-02"` | no | -| [subnet\_02\_region](#input\_subnet\_02\_region) | Region of the second subnet. | `string` | `"us-west2"` | no | - -## Outputs - -| Name | Description | -|------|------------------| -| [gke\_cluster](#output\_gke\_cluster) | GKE cluster info | - -## Prerequisite -To run this Terraform Module, you need to have the following IAM roles on the projects where the GKE clusters will be created: -- roles/Owner - -## Usage -- Skip this step if you have run [01_gcp_project][projects] to create GCP projects. If you are starting from this module, run these steps. - - Create a new GCP project that will host the TF state bucket or use an existing project. - - To create a new project, open `cloudshell` and run the following command: - ``` - gcloud projects create - ``` - - Associate billing account to the project - ``` - gcloud beta billing projects link \ - --billing-account - ``` - - - Create a GCS bucket in the project for storing TF state. 
- - To create a new bucket, run the following command in `cloudshell` - ``` - gcloud storage buckets create gs://-tf-state --location= --project - ``` -- Clone the repo and change dir - ``` - git clone https://github.com/GoogleCloudPlatform/ai-on-gke - cd ml-platform/02_gke - ``` -- In backend.tf replace `YOUR_STATE_BUCKET` with the name of the GCS bucket. -- In variables.tf, provide the values of the following variables: - - `project_id` : If you created the projects using [01_gcp_project][projects] module, no need to provide a value for it as TF will read the project ids from the state file. - If you are providing your existing project ids, provide it in the following format. - - The following is an example of creating three env in the same GCP project : - ``` - { "dev" : "project1", "staging" : "project1", "prod" : "project1" } - ``` - The following is an example of creating three env in three different projects: - ``` - { "dev" : "project1", "staging" : "project2", "prod" : "project3" } - ``` - - - `lookup_state_bucket` : provide the name of the GCS bucket. - - -- If you did not use [01_gcp_projects][projects] module to create GCP projects and are supplying your project ids in variables.tf, enable the following APIs in those project. - - In `cloudshell`, run: - ``` - gcloud config set project - - gcloud services enable cloudresourcemanager.googleapis.com iam.googleapis.com container.googleapis.com gkehub.googleapis.com anthos.googleapis.com anthosconfigmanagement.googleapis.com compute.googleapis.com - ``` - -- terraform init -- terraform plan -- terraform apply --auto-approve - -When Terraform apply has been completed, you will get the following resources: -- A VPC network per environment with a NAT gateway and Cloud router. -- A private GKE cluster per environment. This cluster will be created in the respective VPC. -- VM reservation for `nvidia-l4` -- Three node pools, spot, reserved and on-demand respectively. - - -## Clean up - -1. 
The easiest way to prevent continued billing for the resources that you created for this tutorial is to delete the project you created for the tutorial. Run the following commands from Cloud Shell: - - ```bash - gcloud config unset project && \ - echo y | gcloud projects delete - ``` - -2. If the project needs to be left intact, another option is to destroy the infrastructure created from this module. Note, this does not destroy the Cloud Storage bucket containing the Terraform state and service enablement created out of Terraform. - - ```bash - cd ml-platform/02_gke && \ - terraform destroy --auto-approve - ``` - -[projects]: ../01_gcp_project/README.md \ No newline at end of file diff --git a/ml-platform/02_gke/main.tf b/ml-platform/02_gke/main.tf deleted file mode 100644 index d8fbc0b21..000000000 --- a/ml-platform/02_gke/main.tf +++ /dev/null @@ -1,117 +0,0 @@ -# Copyright 2024 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -data "terraform_remote_state" "gcp-projects" { - count = length(keys("${var.project_id}")) == 0 ? 1 : 0 - backend = "gcs" - config = { - bucket = var.lookup_state_bucket - prefix = "01_gcp_project" - } -} - -locals { - parsed_project_id = length(keys("${var.project_id}")) == 0 ? 
data.terraform_remote_state.gcp-projects[0].outputs.project_ids : var.project_id -} - -module "create-vpc" { - for_each = local.parsed_project_id - source = "./modules/network" - project_id = each.value - network_name = format("%s-%s", var.network_name, each.key) - routing_mode = var.routing_mode - subnet_01_name = format("%s-%s", var.subnet_01_name, each.key) - subnet_01_ip = var.subnet_01_ip - subnet_01_region = var.subnet_01_region - subnet_02_name = format("%s-%s", var.subnet_02_name, each.key) - subnet_02_ip = var.subnet_02_ip - subnet_02_region = var.subnet_02_region - #default_route_name = format("%s-%s","default-route",each.key) -} - -resource "google_gke_hub_feature" "configmanagement_acm_feature" { - count = length(distinct(values(local.parsed_project_id))) - name = "configmanagement" - project = distinct(values(local.parsed_project_id))[count.index] - location = "global" - provider = google-beta -} - -module "gke" { - for_each = local.parsed_project_id - source = "./modules/cluster" - cluster_name = format("%s-%s", var.cluster_name, each.key) - network = module.create-vpc[each.key].vpc - subnet = module.create-vpc[each.key].subnet-1 - project_id = each.value - region = var.subnet_01_region - zone = "${var.subnet_01_region}-a" - master_auth_networks_ipcidr = var.subnet_01_ip - depends_on = [google_gke_hub_feature.configmanagement_acm_feature] - env = each.key -} -module "reservation" { - for_each = local.parsed_project_id - source = "./modules/vm-reservations" - cluster_name = module.gke[each.key].cluster_name - zone = "${var.subnet_01_region}-a" - project_id = each.value - depends_on = [module.gke] -} -module "node_pool-reserved" { - for_each = local.parsed_project_id - source = "./modules/node-pools" - node_pool_name = "reservation" - project_id = each.value - cluster_name = module.gke[each.key].cluster_name - region = var.subnet_01_region - taints = var.reserved_taints - resource_type = "reservation" - reservation_name = 
module.reservation[each.key].reservation_name -} - -module "node_pool-ondemand" { - for_each = local.parsed_project_id - source = "./modules/node-pools" - node_pool_name = "ondemand" - project_id = each.value - cluster_name = module.gke[each.key].cluster_name - region = var.subnet_01_region - taints = var.ondemand_taints - resource_type = "ondemand" -} - -module "node_pool-spot" { - for_each = local.parsed_project_id - source = "./modules/node-pools" - node_pool_name = "spot" - project_id = each.value - cluster_name = module.gke[each.key].cluster_name - region = var.subnet_01_region - taints = var.spot_taints - resource_type = "spot" - -} - -module "cloud-nat" { - for_each = local.parsed_project_id - source = "./modules/cloud-nat" - project_id = each.value - region = split("/", module.create-vpc[each.key].subnet-1)[3] - name = format("%s-%s", "nat-for-acm", each.key) - network = module.create-vpc[each.key].vpc - create_router = true - router = format("%s-%s", "router-for-acm", each.key) - depends_on = [module.create-vpc] -} diff --git a/ml-platform/02_gke/modules/cloud-nat/versions.tf b/ml-platform/02_gke/modules/cloud-nat/versions.tf deleted file mode 100644 index ee7532c5e..000000000 --- a/ml-platform/02_gke/modules/cloud-nat/versions.tf +++ /dev/null @@ -1,30 +0,0 @@ -# Copyright 2024 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -terraform { - required_providers { - - google = { - source = "hashicorp/google" - #version = ">= 4.51, < 5.0" - version = "4.72.1" - } - - random = { - source = "hashicorp/random" - version = "~> 2.2" - } - } - -} diff --git a/ml-platform/02_gke/modules/network/versions.tf b/ml-platform/02_gke/modules/network/versions.tf deleted file mode 100644 index c5f8c84a4..000000000 --- a/ml-platform/02_gke/modules/network/versions.tf +++ /dev/null @@ -1,22 +0,0 @@ -# Copyright 2024 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -terraform { - required_providers { - google = { - source = "hashicorp/google" - version = ">= 4.28.0" - } - } -} diff --git a/ml-platform/02_gke/outputs.tf b/ml-platform/02_gke/outputs.tf deleted file mode 100644 index 08500e25e..000000000 --- a/ml-platform/02_gke/outputs.tf +++ /dev/null @@ -1,17 +0,0 @@ -# Copyright 2024 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -output "gke_cluster" { - value = module.gke -} diff --git a/ml-platform/03_configsync/README.md b/ml-platform/03_configsync/README.md deleted file mode 100644 index 70da6a370..000000000 --- a/ml-platform/03_configsync/README.md +++ /dev/null @@ -1,147 +0,0 @@ - -## Requirements - -| Name | Version | -|------|--------| -| [github](#requirement\_github) | >= 4.3.0 | -| [google](#requirement\_google) | >= 4.72.1 | -| [google-beta](#requirement\_google-beta) | >= 4.72.1 | - -## Inputs - -| Name | Description | Type | Default | Required | -|----------------------------------------------------------------------------------------------------|-----------------------------------------------------------------------------------------------------------------------------|----------|---------|:--------:| -| [project\_id](#input\_project\_id) | Id of the GCP Project where the resources will be created. It is a map with environments as keys and project ids as values. | `map` | n/a | yes | -| [github\_user](#github\_user) | GitHub user name. | `string` | n/a | yes | -| [github\_email](#input\_github\_email) | GitHub user email. | `string` | n/a | yes | -| [github\_org](#input\_github\_org) | GitHub org. | `string` | n/a | yes | -| [github\_token](#input\_github\_token) | GitHub access token | `string` | n/a | yes | -| [lookup\_state\_bucket](#input\_lookup\_state\_bucket) | Lookup TF State bucket. Used for looking up resources created in steps 01 and 02. | `string` | n/a | yes | -| [configsync\_repo\_name](#input\_configsync\_repo\_name) | Configsync repo name to be created in GitHub. | `string` | n/a | no | - -## Prerequisite -- You have created GKE clusters using [02_gke][cluster] module. -- You have the role `roles/Owner` on the projects where you have created GKE clusters. 
- -## Usage -- Clone the repo and change dir - ``` - git clone https://github.com/GoogleCloudPlatform/ai-on-gke - cd ml-platform/03_configsync - ``` -- In backend.tf replace `YOUR_STATE_BUCKET` with the name of the GCS bucket. -- In variables.tf, provide the values of the following variables: - - `github_user` : GitHub user. We recommend you use a system user account. - - `github_email` : Email of the system user account. - - `github_org` : GitHub org where the config sync repo will be created. - - `lookup_state_bucket` : name of the GCS bucket. - - `configsync_repo_name` : Suitable name for your config sync repo. - -- You also need to provide a personal access token for the GitHub user. Generate a [personal access token][personal-access-token] with access to create and delete repo for the user in GitHub and pass it as env variable: - - export TF_VAR_github_token="``" -- terraform init -- terraform plan -- terraform apply --auto-approve - - -This module performs the following actions: -- Looks up project_id from the state file if not provided. -- Looks up GKE clusters created in step 02. -- Creates a GitHub repository and branches corresponding to each environment and apply branch protection rules on it. This is the configsync repo. -- Creates Config sync on each GKE clusters. -- Hydrates templates into K8s manifests and commit them to the default branch of the GitHub repo to do initial cluster setup. - -## Config sync repo workflow -After this module has been successfully completed, you will get a [root-sync][root-sync] object created on all the GKE clusters. - -Go to Google Cloud Console, click on the navigation menu and click on Kubernetes Engine > Config. Now, click on the `PACKAGES` tab. You will see three [root-sync][root-sync] objects created, one for each cluster. Review the `Source url` against the `dev` cluster. 
It should be something like: -``` -https://github.com//experiment-acm-repo/tree/dev/manifests/clusters - -``` -This means that the `dev` cluster is associated with the `manifests/clusters` folder on the `dev` branch of the configsync repo. So, manifests under `manifests/clusters` folder on the `dev` branch will be synced with the dev cluster. -Similarly, the folder `manifests/clusters` on `staging` branch will be synced with the `staging` cluster and `manifests/clusters` on `prod` branch will be sycned with `prod` cluster. - -We will follow GitOps methodology to create resources on the clusters. This means you can only make changes to the default branch while other branches are protected. In order to merge changes to non-default branches, you will need to create a pull request. - -The following documentation will assume that you have three clusters `dev`, `staging` and `prod` and that resulted in three branches on the configsync repo `dev`, `staging` and `prod`. The `dev` branch is the default branch. - -To follow `GitOps` approach, you will make changes and push them to the `dev` branch. Config sync will then sync the `dev` branch with the `dev` cluster. If the changes look good in `dev` environment, -and are ready to be moved to `staging` you create a pull request from `dev` to `staging` branch. Once this pull request is approved and merged, the `staging branch` will be synced with `staging` cluster reflecting the changes in staging environment. -Similarly, when you are ready to promote the changes in production environment, create a pull request from `staging` to `prod` branch and merge it. - -## Managing cluster-level and application-level objects - -It is recommended to have a separation of duties on who should be able to create what objects in a cluster. -The principle to follow should be that the cluster-level objects can only be created by platform admins while the application teams should be able to create their own application level objects. 
- -To achieve this separation, we will use [root-sync][root-sync] and [repo-sync][repo-sync]. [root-sync][root-sync] allows to creae cluster scoped objects while [repo-sync][repo-sync] allows to create namespace scoped objects. - -### Cluster-level objects -Since the [root-sync][root-sync] object is associated with the folder `manifests/clusters`, the cluster level objects will be created from this folder. This includes creating CRDs, namespaces etc. So, for example, if you want to create a namespace as a platform admin, create a `yaml` file with the required K8s definition and save it under `manifests/clusters`. The namespace will be created on the cluster as soon as the sync happens. - -Note that the owner of the repo should create a CODEOWNERS file to allow access to the platform admins to this folder so that only they can make cluster level objects. The Application teams should not have access to `manifests/clusters`. - -In the section [04_setup_clusters][cluster-setup], you will create cluster scoped objects. - - -### Application-level objects -It is recommended to provide each Application its dedicated namespace. This means, only the application and related resources will be created in that namespace. The owner of the application or the app team will be get full access on the namespace so they can manage their application without having to be dependent on the platform admins. - -Since the namespace is a cluster-scoped object, platform admin will need to create the namespace for the application and grant the app team members access on the namespace. Additionally, they will provide a [repo-sync][repo-sync] repo to the app teams so they can use that to manage their application's kubernetes resource. Once, this setup is done, the app team members can manage the application inside the namespace with the manifests in the [repo-sync][repo-sync] repo. 
- -In the section [05_setup_teams][team-setup], you will learn how the platform admins will set up an application by providing a namespace to the App team along with a [repo-sync][repo-sync] that the app teams will use to manage their applications. - -In the section [06_operating_teams][operating-teams], you will learn how the app teams can use their [repo-sync][repo-sync] to manage thir application. - -## Troubleshooting -If you do not have [GitHub pro membership][github-pro], you can not apply branch protection rules on your repositories in GitHub. This will cause `409 code` error when you run `terraform apply` . You can ignore these errors. The downside is that you will not get branch protection rules on your configsync repository and can accidentally push changes to the non-default branch which is `dev`. In other words, it will break the `GitOps` flow. - -## Contributing - -* [Contributing guidelines][contributing-guidelines] -* [Code of conduct][code-of-conduct] - - - -[contributing-guidelines]: CONTRIBUTING.md -[code-of-conduct]: code-of-conduct.md -[personal-access-token]: https://docs.github.com/en/authentication/keeping-your-account-and-data-secure/managing-your-personal-access-tokens -[repo-sync]: https://cloud.google.com/anthos-config-management/docs/reference/rootsync-reposync-fields -[root-sync]: https://cloud.google.com/anthos-config-management/docs/reference/rootsync-reposync-fields -[cluster-setup]: ../04_setup_clusters/README.md -[team-setup]: ../05_setup_teams/README.md -[operating-teams]: ../06_operating_teams -[cluster]: ../02_gke -[github-pro]: https://docs.github.com/en/get-started/learning-about-github/githubs-plans - -## Clean up - -1. The easiest way to prevent continued billing for the resources that you created for this tutorial is to delete the project you created for the tutorial. Run the following commands from Cloud Shell: - - ```bash - gcloud config unset project && \ - echo y | gcloud projects delete - ``` - -2. 
If the project needs to be left intact, another option is to destroy the infrastructure created from this module. Note, this does not destroy the Cloud Storage bucket containing the Terraform state and service enablement created out of Terraform. - - ```bash - cd ml-platform/03_configsync && \ - terraform destroy --auto-approve - ``` - diff --git a/ml-platform/03_configsync/backend.tf b/ml-platform/03_configsync/backend.tf deleted file mode 100644 index b9d73f15f..000000000 --- a/ml-platform/03_configsync/backend.tf +++ /dev/null @@ -1,20 +0,0 @@ -# Copyright 2024 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -terraform { - backend "gcs" { - prefix = "03_config_sync_prerequisite" - bucket = "YOUR_STATE_BUCKET" - } -} diff --git a/ml-platform/03_configsync/main.tf b/ml-platform/03_configsync/main.tf deleted file mode 100644 index 671abee5c..000000000 --- a/ml-platform/03_configsync/main.tf +++ /dev/null @@ -1,131 +0,0 @@ -# Copyright 2024 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License. - -data "terraform_remote_state" "gke-clusters" { - backend = "gcs" - config = { - bucket = var.lookup_state_bucket - prefix = "02_gke" - } -} - -locals { - parsed_gke_info = data.terraform_remote_state.gke-clusters.outputs.gke_cluster - project_id_list = [for k, v in "${data.terraform_remote_state.gke-clusters.outputs.gke_cluster}" : v.gke_project_id] -} - -//resource "google_gke_hub_feature" "configmanagement_acm_feature" { -// count = length(distinct(local.project_id_list)) -// name = "configmanagement" -// project = distinct(local.project_id_list)[count.index] -// location = "global" -// provider = google-beta -//} - -resource "google_gke_hub_membership" "membership" { - provider = google-beta - for_each = local.parsed_gke_info - project = each.value["gke_project_id"] - membership_id = each.value["cluster_name"] - endpoint { - gke_cluster { - resource_link = format("%s/%s", "//container.googleapis.com", each.value["cluster_id"]) - } - } - lifecycle { - ignore_changes = [ - "labels", "description" - ] - } - #depends_on = [ google_gke_hub_feature.configmanagement_acm_feature ] -} - -resource "github_repository" "acm_repo" { - name = var.configsync_repo_name - description = "Repo for Config Sync" - visibility = "private" - has_issues = false - has_projects = false - has_wiki = false - - allow_merge_commit = true - allow_squash_merge = true - allow_rebase_merge = true - delete_branch_on_merge = false - auto_init = true - vulnerability_alerts = true -} -//Create a branch for each env -resource "github_branch" "branch" { - for_each = local.parsed_gke_info - repository = split("/", github_repository.acm_repo.full_name)[1] - branch = each.key - depends_on = [github_repository.acm_repo] -} -//Set default branch as the lowest env -resource "github_branch_default" "default_branch" { - repository = split("/", github_repository.acm_repo.full_name)[1] - branch = 
tostring(keys(local.parsed_gke_info)[0]) - #rename = true - depends_on = [github_branch.branch] -} -#Protect branches other than the default branch -resource "github_branch_protection_v3" "branch_protection" { - for_each = local.parsed_gke_info - repository = split("/", github_repository.acm_repo.full_name)[1] - branch = each.key - required_pull_request_reviews { - required_approving_review_count = 1 - require_code_owner_reviews = true - } - restrictions { - - } - - depends_on = [github_branch.branch] -} - -resource "google_gke_hub_feature_membership" "feature_member" { - provider = google-beta - for_each = local.parsed_gke_info - project = each.value["gke_project_id"] - location = "global" - feature = "configmanagement" - membership = google_gke_hub_membership.membership[each.key].membership_id - configmanagement { - version = "1.17.0" - config_sync { - source_format = "unstructured" - git { - sync_repo = "https://github.com/${github_repository.acm_repo.full_name}.git" - sync_branch = each.value["env"] - policy_dir = "manifests/clusters" - secret_type = "token" - } - } - policy_controller { - enabled = true - template_library_installed = true - referential_rules_enabled = true - } - } - - provisioner "local-exec" { - command = "${path.module}/create_cluster_yamls.sh ${var.github_org} ${github_repository.acm_repo.full_name} ${var.github_user} ${var.github_email} ${each.value["env"]} ${each.value["cluster_name"]} ${index(keys(local.parsed_gke_info), each.key)}" - } - - #depends_on = [ - # google_gke_hub_feature.configmanagement_acm_feature - # ] -} diff --git a/ml-platform/03_configsync/outputs.tf b/ml-platform/03_configsync/outputs.tf deleted file mode 100644 index a19b71988..000000000 --- a/ml-platform/03_configsync/outputs.tf +++ /dev/null @@ -1,21 +0,0 @@ -# Copyright 2024 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -output "membership" { - value = google_gke_hub_membership.membership -} - -output "val" { - value = local.parsed_gke_info -} diff --git a/ml-platform/03_configsync/variables.tf b/ml-platform/03_configsync/variables.tf deleted file mode 100644 index f04844d23..000000000 --- a/ml-platform/03_configsync/variables.tf +++ /dev/null @@ -1,48 +0,0 @@ -# Copyright 2024 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -variable "lookup_state_bucket" { - description = "GCS bucket to look up TF state from previous steps." - type = string - default = "YOUR_STATE_BUCKET" -} - -variable "configsync_repo_name" { - type = string - description = "Name of the GitHub repo that will be synced to the cluster with Config sync." - default = "config-sync-repo" -} - -variable "github_user" { - description = "GitHub user name." - type = string - default = "YOUR_GIT_USER" -} - -variable "github_email" { - description = "GitHub user email." - type = string - default = "YOUR_GIT_USER_EMAIL" -} - -variable "github_org" { - type = string - description = "GitHub org." 
- default = "YOUR_GIT_ORG" -} - -variable "github_token" { - type = string - description = "GitHub token. It is a token with write permissions as it will create a repo in the GitHub org." -} diff --git a/ml-platform/04_setup_clusters/README.md b/ml-platform/04_setup_clusters/README.md deleted file mode 100644 index 8e69190ee..000000000 --- a/ml-platform/04_setup_clusters/README.md +++ /dev/null @@ -1,139 +0,0 @@ - -### This doc is meant for the platform admins or the group that has admin level permissions on the GKE clusters. The steps mentioned in these docs must be executed by them. - -## Prerequisite -- You have successfully run through [03_configsync][configsync] module. - -### Complete config synch setup - -Go to Google Cloud Console, click on the navigation menu and click on Kubernetes Engine > Config. Now, click on the `PACKAGES` -tab. You will notice that the `Sync status` will show as stalled for all [root-sync][root-sync]. -This is because, config sync needs to authenticate with GitHub to be able to read the manifests in the configsync repo. It expects a secret named `git-cred` in `config-menegement-system` namespace on the cluster. -This secret stores the github user and its [personal access token][personal-access-token]. The [personal access token][personal-access-token] should have the read only access so config sync can read the repo to perform the sync. - -Follow these steps to create a new secret `git-cred` in `config-menegement-system` namespace: -- For the GitHub user account that you plan to use, generate a [personal access token][personal-access-token] with read access to the configsync repo. It is recommended to use a [machine user account][machine-user-account] for this but you can use a personal user account just to try this reference architecture. -- Get IAM role `roles/gkehubeditor` to be able to use the connect gateway to access the GKE cluster. If you are the owner of the project, this step can be skipped. 
-- Open cloudshell and run these commands: - ``` - gcloud config set project - - gcloud container fleet memberships get-credentials - - kubectl create secret generic git-creds --namespace="config-management-system" --from-literal=username= --from-literal=token= - - gcloud container fleet memberships get-credentials - - kubectl create secret generic git-creds --namespace="config-management-system" --from-literal=username= --from-literal=token= - - gcloud container fleet memberships get-credentials - - kubectl create secret generic git-creds --namespace="config-management-system" --from-literal=username= --from-literal=token= - ``` - -After the `git-cred` secret has been created, you will see the `Sync status` for dev cluster will change from `stalled` to `synced` with a green tick mark against it. The `Synch status` for `staging` and `prod` clusters will change from stalled to Error. This is because the `staging` and `prod` branches of the repo has no content yet. - -Create a pull request from `dev` to `staging` and merge it. After the merge, the `Sync status` of the `staging` cluster will change from `Stalled` to `Synced`. Now, create a PR from `staging` to `prod` and merge it. The `Sync status` for `prod` cluster will change from `Stalled` to `Synced`. - -You just followed `GitOps` to promote changes from `dev` to higher environments. - -### Review the config synch repo -Open the configsync repo and go to `manifests/clusters`, you will see there is a cluster selector created for each cluster via yaml files. - -### Install a cluster scoped software -This section describes how platform admins will use the configsync repo to manage cluster scoped software or cluster level objects. These software could be used by multiple teams in their namespaces. An example of such software is [kuberay][kuberay] that can manage ray clusters in multiple namespace. - - -Let's install [Kuberay][kuberay] as a cluster level software that includes CRDs and deployments. 
Kuberay has a component called operator that facilitates `ray` on Kubernetes. We will install Kuberay operator in default namespace. The operator will then orchestrate `ray clusters` created in different namespace by different teams in the future. -Perform the following steps: -- Clone the configsync repo and change directory. The default branch `dev` is checked out. - ``` - git clone repo - cd repo - ``` - -- From the provided templates under `templates/_cluster_template`, copy kustomization.yaml to `manifests/clusters` which is synced with the GKE clusters. kustomization.yaml will become the entrypoint for the [root-sync][root-sync] in the `manifests/clusters` folder and it syncs all the files defined in kustomization.yaml with the cluster. - ``` - cp templates/_cluster_template/kustomization.yaml manifests/clusters - ``` - -- Copy the directory containing the manifests to install kuberay to the directory that is synced with the GKE clusters. - ``` - cp -r templates/_cluster_template/kuberay manifests/clusters - ``` - Note that the directory `kuberay` is supplied as a template with this reference architecture. You can modify it based on your requirements. - -- Add cluster selector files in kustomization.yaml so config sync syncs these files with the clusters. The selectors are useful when you want to apply changes on one or multiple clusters selectively. - ``` - cat <>manifests/clusters/kustomization.yaml - - - ./gke-ml-dev-cluster.yaml - - ./gke-ml-staging-cluster.yaml - - ./gke-ml-prod-cluster.yaml - - ./dev-selector.yaml - - ./staging-selector.yaml - - ./prod-selector.yaml - EOF - ``` - -- Commit the changes and push them to dev branch. - ``` - git add . - git commit -m "Installing Kuberay operator" - git push - ``` - -You just pushed the manifests to install kuberay operator in default namespace to the `dev` branch. Go to Google Cloud Console, click on the navigation menu and click on Kubernetes Engine > Config. Now, click on the `PACKAGES` tab. 
Verify that the dev cluster is in `Synced` status. - -Verify in the `dev` cluster that [Kuberay operator][kuberay] has been installed successfully. -Open cloudshell and run these commands: -- gcloud config set project `` -- gcloud container fleet memberships get-credentials `` -- kubectl get crd | grep ray - - This should show result similar to the following: - ``` - rayclusters.ray.io 2024-02-12T21:19:06Z - rayjobs.ray.io 2024-02-12T21:19:09Z - rayservices.ray.io 2024-02-12T21:19:12Z - ``` -- kubectl get pods - - This should show result similar to the following: - ``` - NAME READY STATUS RESTARTS AGE - kuberay-operator-56b8d98766-2nvht 1/1 Running 0 6m26s - ``` -As you can see , we have installed the CRDs and the deployment for the kuberay operator. - -## Contributing - -* [Contributing guidelines][contributing-guidelines] -* [Code of conduct][code-of-conduct] - - - -[contributing-guidelines]: CONTRIBUTING.md -[code-of-conduct]: code-of-conduct.md -[repo-sync]: https://cloud.google.com/anthos-config-management/docs/reference/rootsync-reposync-fields -[root-sync]: https://cloud.google.com/anthos-config-management/docs/reference/rootsync-reposync-fields -[personal-access-token]: https://docs.github.com/en/authentication/keeping-your-account-and-data-secure/managing-your-personal-access-tokens -[machine-user-account]: https://docs.github.com/en/get-started/learning-about-github/types-of-github-accounts -[kuberay]: https://ray-project.github.io/kuberay/ -[configsync]: ../03_configsync - - - - diff --git a/ml-platform/05_setup_teams/README.md b/ml-platform/05_setup_teams/README.md deleted file mode 100644 index 70516f1a6..000000000 --- a/ml-platform/05_setup_teams/README.md +++ /dev/null @@ -1,169 +0,0 @@ - -### This doc is meant for the platform admins or the group that has admin level permissions on the GKE clusters. The steps mentioned in these docs must be executed by them. 
- -## Prerequisite -- You have successfully run through [04_setup_clusters][cluster-setup] module. - -## Setup teams -Typically, each team can own one or more namespaces and the team's users will get access to create, update and delete objects in those namespaces but they will be restricted from creating, updating or deleting cluster level objects or the objects in other namespaces. - -The platform admin will set up the teams(create namespace and permission team's users on it) using the configsync repo(via [root sync][root-sync]) and provide the app teams the means to manage objetcs in their own namepsace without further involvment. - -Setting up a team has the following steps: -- create a new namespace for the team and permission the users on the namespace. - - Note: In this reference architecture, we create the namespace with the same name as the team. In real-world scenario, a team can own multiple namespaces so you might want to create namespaces with the application name that will be deployed in it. -- create a network-policy(optional). App teams can do it later. -- create a [reposync][repo-sync] object on the GKE clusters that will be associated with the repo/dir that is owned by the app teams. The app teams can manage the namespace scoped resources via their repo/dir by adding the kubernetes manifests there. - -### Prepare the changes to create a team - -In order to create a new namespace, perform the following steps: -- Clone the configsync repo and change directory. The default branch `dev` is checked out. - ``` - git clone `` repo - cd repo - ``` -- Copy the team template directory to the directory that is synced with the GKE clusters. The team template directory contains manifests to create namespace,[rbac][rbac],network policy and [reposync][repo-sync] - ``` - cp -r templates/_cluster_template/team manifests/clusters/ - ``` - `` is the name of the team for which the namespace is being created. It can also be the name of the application. 
- Note that the team template is provided with this reference architecture. You can modify it based on your requirements. - - - -- Change the placeholders in the files under `manifests/clusters/` - - replace NAMSESPACE with the name of the namespace/team in the files under `manifests/clusters/` - ``` - sed -i 's#NAMESPACE##g' manifests/clusters//* - ``` - - replace GIT_REPO with the link to the Git repository that you want to sync with this reposync in `manifests/clusters//reposync.yaml`. - ``` - sed -i 's#GIT_REPO##g' manifests/clusters//reposync.yaml - ``` - - manually replace NUMBER_OF_CHARACTERS_IN_REPOSYNC_NAME in `manifests/clusters//reposync.yaml` - e.g if the reposync name is prod-myteam, replace NUMBER_OF_CHARACTERS_IN_REPOSYNC_NAME with 11. - -- Create a new directory that the reposync object is pointing to. - ``` - mkdir manifests/apps/ - touch manifests/apps//.gitkeep - ``` - -- Add the path to the new team dir in kustomization.yaml to include it in the sync. - ``` - cat <>manifests/clusters/kustomization.yaml - - ./ - EOF - ``` - - -### Review the files: -Go to `manifests/clusters/` -- kustomization.yaml specifies which yaml files should be synced with the cluster. -- namespace.yaml defines the code to create a new namespace. -- rbac.yaml creates a role for full access to the namespace and assign the role to the team's users. - - This can be changed to a more restricted role or you can create multiple roles for different users. - - There is also a rolebinding that provides [kuberay operator][kuberay] service account access to this namespace. This is required for [kuberay][kuberay] to be able to manage the ray clusters inside this namespace. -- reposync.yaml creates [reposync][repo-sync] object on the cluster for the given namespace. The [reposync][repo-sync] object will be connected to a repo and will be used by the app team to create, update and delete the namespace scoped objects like rayclusters etc. 
- - The app team either can bring their own repo and provide it to the platform admins so they can update reposync.yaml accordingly. - - Alternatively, if your organization wants to follow mono repo structure, platform admin can create a subfolder named `` in this repo for each team under `manifests/apps` and provide the path `manifests/apps/`to the [reposync][repo-sync] object for that namespace. Platform admin can permission only the required team members to be able to edit the files under `manifests/apps/``` folder. - - see the `repo`, `revision` and `dir` tags in `reposync.yaml` that defines wha repo and dir will be synced for this [reposync][repo-sync]. - - see [mono repo vs multi repos](#mono-repo-vs-multi-repos) if you want to decide which one to use. - -### Apply the changes: -Commit the changes and push them to dev branch. -``` -git add . -git commit -m "Adding a new team" -git push -``` - -The changes are pushed to `dev` branch so the namespace and related objects will be created in dev GKE cluster. -Now create pull request from `dev` to `staging` branch and merge it. Then create a pull request from `staging` to `prod` branch and merge it. This will create the namespace and related objects in `staging` and `prod` GKE clusters. - -Go to Google Cloud Console, click on the navigation menu and click on Kubernetes Engine > Config. Now, click on the `PACKAGES` tab. You will see a new [repo-sync][repo-sync] object created on each cluster but they will be in `Stalled` state. -This is because config sync needs to authenticate with GitHub to be able to read the manifests in the repo. It expects a secret named `git-cred` in the namespace for configuring [reposync][repo-sync] with the GitHub repo. -This secret stores the github user and its [personal access token][personal-access-token]. 
- -Follow these steps to create a new secret in dev cluster `git-cred`: -- For the GitHub user account that you plan to use, generate a [personal access token][personal-access-token] with read access to the configsync repo. It is recommended to use a [machine user account][machine-user-account] for this but you can use a personal user account just to try this reference architecture. -- Get IAM role roles/gkehubeditor to be able to use connect gateway to access the GKE cluster. If you are the owner of the project, this step can be skipped. -- Open cloudshell and run these commands: - ``` - gcloud config set project - - gcloud container fleet memberships get-credentials - - kubectl create secret generic git-creds --namespace="" --from-literal=username= --from-literal=token= - - gcloud container fleet memberships get-credentials - - kubectl create secret generic git-creds --namespace="" --from-literal=username= --from-literal=token= - - gcloud container fleet memberships get-credentials - - kubectl create secret generic git-creds --namespace="" --from-literal=username= --from-literal=token= - ``` - - -Go to Google Cloud Console, click on the navigation menu and click on Kubernetes Engine > Config. Now, click on the `PACKAGES` tab. You will see a new [repo-sync][repo-sync] object will have `Synch status` as `Synced` with a green tick mark against them. This confirms that the [reposync][repo-sync] objects have been successfully created on all the clusters. - -This marks the completion of the team/namespace. - - -The platform admin will provide access to this dir to the members of the team. The team members will create manifests under this folder to manage their namespace scoped objects. - - -Important : -Platform admins should also restrict access to this directory so only the members of the team can update the files under it. -This can be done by creating CODEOWNERS files to allow the required team members to have access to this dir. 
This way you will ensure that only the team members can manage Kubernetes objects in this namespace and no other team can do that. If the team members try to create cluster-scoped objects from this dir, it will result in error as this folder is connected to [reposync][repo-sync] objects which doesn't allow cluster level access. - - -### Mono repo vs multi repos -The platform admins and the app teams need to make a decision on what repo structure they will use for config sync. - -Using mono repo means: -- The same repo will be used for cluster level objects(created by platform admins) and namespace level objects(created by app teams). -- The platform admins will be the owner of the repo and maintain CODEOWNERS files to provide granular access to the platform admins and the app teams. -- However, if the app teams want to promote changes from one env to another, they will reply on platform admins or the repo owners to approve the PR. - -Using multiple repos mean: -- The [rootsync][root-sync] will be tied to a repo that only platform-admins own and they can create cluster level objects from this repo. -- The [reposync][repo-sync] for individual teams will be created and tied to their own git repos. There is no need for granular permissions by platform admins as the app teams use their own repos to create namespace level objects. -- The app teams can create and merge PRs to their own repo independently to promote changes from one env to another. 
- - -## Contributing - -* [Contributing guidelines][contributing-guidelines] -* [Code of conduct][code-of-conduct] - - - -[contributing-guidelines]: CONTRIBUTING.md -[code-of-conduct]: code-of-conduct.md -[repo-sync]: https://cloud.google.com/anthos-config-management/docs/reference/rootsync-reposync-fields -[root-sync]: https://cloud.google.com/anthos-config-management/docs/reference/rootsync-reposync-fields -[personal-access-token]: https://docs.github.com/en/authentication/keeping-your-account-and-data-secure/managing-your-personal-access-tokens -[machine-user-account]: https://docs.github.com/en/get-started/learning-about-github/types-of-github-accounts -[kuberay]: https://ray-project.github.io/kuberay/ -[rbac]: https://kubernetes.io/docs/reference/access-authn-authz/rbac/ -[cluster-setup]: ../04_setup_clusters - - diff --git a/ml-platform/06_operating_teams/README.md b/ml-platform/06_operating_teams/README.md deleted file mode 100644 index 9b5712210..000000000 --- a/ml-platform/06_operating_teams/README.md +++ /dev/null @@ -1,154 +0,0 @@ - -This doc describes how you as an App team member will use the configsync repo to manage your applications scoped to your namespace. -We will demonstrate this with an example of installing `ray` in the namespace. Typically, you can install any software or deploy any application in your namespace in the same fashion. - -## Prerequisite -- You have successfully run through [05_setup_teams][team-setup] module. - - -## Install a software(ray) - -This section is meant for the app teams that have permission only on a given namespace in the GKE clusters. The steps mentioned in this section must be executed by them. - -`Ray` is s an open-source unified compute framework that makes it easy to scale AI and Python workloads — from reinforcement learning to deep learning to tuning, and model serving. -It is very commonly used by Machine Learning teams. In order to run `Ray` on Kubernetes, you need `Kuberay` operator. 
The `kuberay` operators can manage the ray clusters installed in different namespaces. So, if there are multiple teams that need to use `ray` can install it in their own namespace while the kuberay oeprator can manage all of them. -Installing `kuberay` requires cluster level access as it creates the CRDs. We demonstrated installing `kuberay` in [cluster-setup][cluster-setup]. -Here we will show how to install `ray` in a namespace and configure `kuebray` to manage it. - -As an app team member, you will have access to `manifests/apps/``` folder in this repo if you are using a [mono repo][mono-repo] structure. You can perform the following steps to add `ray` manifests to the folder. The [reposync][repo-sync] will sync the manifests to the namespace on the cluster and you will get `ray` installed in your namespace. - -Note: If you are using multi repo structure, you will have access to the entire git repo and you can add the manifests in the similar fashion in the required directory to install `ray`. - -### Create the manifests -- Open `cloudshell` and run the following commands: - ``` - git clone repo && cd repo - - cp -r templates/_namespace_template/app/* manifests/apps// - ``` - -- Replace NAMSESPACE with the name of the namespace in the newly copied files. - ``` - sed -i 's#NAMESPACE##g' manifests/apps//* - ``` - -### Review the manifests -- `kustomization.yaml` specifies which yaml files should be synced with the cluster for this namespace. It references to a helm chart to install `ray` -- `values.yaml` contains the overriding values for the kuberay helm chart. -- `fluentd_config.yaml` specifies Configmap that will be applied to the namespace. -- `serviceaccount.yaml`(optional) sepcifies the kubernetes service account. This service account can be used for [workload identity][workload-identity]. - -Note that these files are provided as a template with the reference architecture for installing ray cluster. You can modify these templates as needed. 
- -### Apply the manifest: -- Go to `cloudshell` where you cloned the repo and copied the new files. - ``` - git add . - - git commit -m "Installing ray in namespace " - - git push - ``` - -The changes are pushed to `dev` branch so `ray` is installed on `dev` GKE cluster. To apply these changes to `staging`, create a pull request from `dev` to `staging` branch and merge it. Similarly, in order to apply the changes to `prod` cluster, create a pull request from `staging` to `prod` branch and merge it. - -Go to Google Cloud Console, click on the navigation menu and click on Kubernetes Engine > Config. Now, click on the `PACKAGES` tab. The [repo-sync][repo-sync] objects should show `Sync status` as `Synced` with green tick against it. - -### Verify the raycluster is in ready state in the namespace. -- Open cloudshell and run these commands: - ``` - gcloud config set project - - gcloud container fleet memberships get-credentials - - kubectl get raycluster -n - ``` -- This should show result similar to the following: - - ``` - NAME DESIRED WORKERS AVAILABLE WORKERS STATUS AGE - ray-cluster-kuberay 4m9s - ``` - -### Update kuberay operator to manage ray in your namespace - -This section is meant for platform admins or the group that has admin level permissions on the GKE clusters. The steps mentioned in these docs must be executed by them. - -[Kuberay][kuberay] operator manages `ray` on Kubernetes. You need to configure kuberay operator so that it manages `ray` in your namespace. `kuberay` was installed via [rootsync][root-sync] from the folder `manifests/clusters` by platform-admin so they should be performing the following step. -- Go to `cloudshell` where you cloned the repo. - -- Open `manifests/clusters/kuberay/values.yaml` -- add the namespace under `watchNamespace` tag. e.g. - ``` - watchNamespace: - - - ``` -- Commit and push the changes - ``` - git add . 
- - git commit -m "Updating kuberay operator to watch the namespace " - - git push - ``` -To apply these changes to `staging`, create a pull request from `dev` to `staging` branch and merge it. Similarly, in order to apply the changes to `prod` cluster, create a pull request from `staging` to `prod` branch and merge it. - -[kuberay][kuberay] operator will start managing the `ray` in your namespace on all the clusters. - -### Verify the ray head and worker has been started in your namespace. -- Open `cloudshell` and run these commands: - ``` - gcloud config set project - - gcloud container fleet memberships get-credentials - ``` -- Run `kubectl get raycluster -n ``` . This should show result similar to the following indicating the raycluster is now ready: - - ``` - NAME DESIRED WORKERS AVAILABLE WORKERS STATUS AGE - ray-cluster-kuberay 1 1 ready 29m - - ``` - -- Run `kubectl get pods -n ``` . This should show result similar to the following: - - ``` - NAME READY STATUS RESTARTS AGE - ray-cluster-kuberay-head-sp6dg 2/2 Running 0 3m21s - ray-cluster-kuberay-worker-workergroup-rzpjw 2/2 Running 0 3m21s - ``` - -## Contributing - -* [Contributing guidelines][contributing-guidelines] -* [Code of conduct][code-of-conduct] - - - -[contributing-guidelines]: CONTRIBUTING.md -[code-of-conduct]: code-of-conduct.md -[repo-sync]: https://cloud.google.com/anthos-config-management/docs/reference/rootsync-reposync-fields -[root-sync]: https://cloud.google.com/anthos-config-management/docs/reference/rootsync-reposync-fields -[personal-access-token]: https://docs.github.com/en/authentication/keeping-your-account-and-data-secure/managing-your-personal-access-tokens -[machine-user-account]: https://docs.github.com/en/get-started/learning-about-github/types-of-github-accounts -[kuberay]: https://ray-project.github.io/kuberay/ -[workload-identity]: https://cloud.google.com/kubernetes-engine/docs/how-to/workload-identity -[cluster-setup]: ../04_setup_clusters/README.md -[mono-repo]: 
../05_setup_teams/README.md#mono-repo-vs-multi-repos -[team-setup]: ../05_setup_teams - - diff --git a/ml-platform/README.md b/ml-platform/README.md index 59e712b18..90c3a165f 100644 --- a/ml-platform/README.md +++ b/ml-platform/README.md @@ -1,45 +1,138 @@ - -# Reference architecture demonstrating how to build your ML platform on GKE. - -## Purpose - -This tutorial demonstrates repeatable patterns to setup a multi environment ML platform on private [Google Kubernetes Engine](https://cloud.google.com/kubernetes-engine/docs/concepts/kubernetes-engine-overview) (GKE) that can be extended for end-to-end MLOps. - -It addresses following personae and provides means to automate and simplify their CUJs. - -### Platform Admin - -**CUJ 1** : Provide templates with built-in standard practices to stamp out GKE platforms to be used by ML Engineers/Data Scientist. - -**CUJ 2** : Provide space for the ML teams on GKE cluster to run their workloads and the permissions following the principle of least privilege. - -**CUJ 3** : Provide secure methods to the ML teams and the Operators to connect to the private GKE clusters. 
+ +## Requirements + +| Name | Version | +|------|---------| +| [github](#requirement\_github) | 6.0.1 | +| [google](#requirement\_google) | 5.19.0 | +| [google-beta](#requirement\_google-beta) | 5.19.0 | +| [null](#requirement\_null) | 3.2.2 | + +## Providers + +| Name | Version | +|------|---------| +| [github](#provider\_github) | 6.0.1 | +| [google](#provider\_google) | 5.19.0 | +| [google-beta](#provider\_google-beta) | 5.19.0 | +| [null](#provider\_null) | 3.2.2 | + +## Modules + +| Name | Source | Version | +|------|--------|---------| +| [cloud-nat](#module\_cloud-nat) | ./modules/cloud-nat | n/a | +| [create-vpc](#module\_create-vpc) | ./modules/network | n/a | +| [gcp-project](#module\_gcp-project) | ./modules/projects | n/a | +| [gke](#module\_gke) | ./modules/cluster | n/a | +| [node\_pool-ondemand](#module\_node\_pool-ondemand) | ./modules/node-pools | n/a | +| [node\_pool-reserved](#module\_node\_pool-reserved) | ./modules/node-pools | n/a | +| [node\_pool-spot](#module\_node\_pool-spot) | ./modules/node-pools | n/a | +| [reservation](#module\_reservation) | ./modules/vm-reservations | n/a | + +## Resources + +| Name | Type | +|------|------| +| [github_branch.branch](https://registry.terraform.io/providers/integrations/github/6.0.1/docs/resources/branch) | resource | +| [github_branch_default.default_branch](https://registry.terraform.io/providers/integrations/github/6.0.1/docs/resources/branch_default) | resource | +| [github_branch_protection_v3.branch_protection](https://registry.terraform.io/providers/integrations/github/6.0.1/docs/resources/branch_protection_v3) | resource | +| [github_repository.acm_repo](https://registry.terraform.io/providers/integrations/github/6.0.1/docs/resources/repository) | resource | +| [google-beta_google_gke_hub_feature.configmanagement_acm_feature](https://registry.terraform.io/providers/hashicorp/google-beta/5.19.0/docs/resources/google_gke_hub_feature) | resource | +| 
[google-beta_google_gke_hub_feature_membership.feature_member](https://registry.terraform.io/providers/hashicorp/google-beta/5.19.0/docs/resources/google_gke_hub_feature_membership) | resource | +| [google-beta_google_gke_hub_membership.membership](https://registry.terraform.io/providers/hashicorp/google-beta/5.19.0/docs/resources/google_gke_hub_membership) | resource | +| [google_project_service.project_services-an](https://registry.terraform.io/providers/hashicorp/google/5.19.0/docs/resources/project_service) | resource | +| [google_project_service.project_services-anc](https://registry.terraform.io/providers/hashicorp/google/5.19.0/docs/resources/project_service) | resource | +| [google_project_service.project_services-com](https://registry.terraform.io/providers/hashicorp/google/5.19.0/docs/resources/project_service) | resource | +| [google_project_service.project_services-con](https://registry.terraform.io/providers/hashicorp/google/5.19.0/docs/resources/project_service) | resource | +| [google_project_service.project_services-cr](https://registry.terraform.io/providers/hashicorp/google/5.19.0/docs/resources/project_service) | resource | +| [google_project_service.project_services-gate](https://registry.terraform.io/providers/hashicorp/google/5.19.0/docs/resources/project_service) | resource | +| [google_project_service.project_services-gkecon](https://registry.terraform.io/providers/hashicorp/google/5.19.0/docs/resources/project_service) | resource | +| [google_project_service.project_services-gkeh](https://registry.terraform.io/providers/hashicorp/google/5.19.0/docs/resources/project_service) | resource | +| [google_project_service.project_services-iam](https://registry.terraform.io/providers/hashicorp/google/5.19.0/docs/resources/project_service) | resource | +| [null_resource.create_git_cred_cms](https://registry.terraform.io/providers/hashicorp/null/3.2.2/docs/resources/resource) | resource | +| 
[null_resource.create_git_cred_ns](https://registry.terraform.io/providers/hashicorp/null/3.2.2/docs/resources/resource) | resource | +| [null_resource.create_namespace](https://registry.terraform.io/providers/hashicorp/null/3.2.2/docs/resources/resource) | resource | +| [null_resource.install_kuberay_operator](https://registry.terraform.io/providers/hashicorp/null/3.2.2/docs/resources/resource) | resource | +| [null_resource.install_ray_cluster](https://registry.terraform.io/providers/hashicorp/null/3.2.2/docs/resources/resource) | resource | +| [null_resource.manage_ray_ns](https://registry.terraform.io/providers/hashicorp/null/3.2.2/docs/resources/resource) | resource | + +## Inputs + +| Name | Description | Type | Default | Required | +|------|-------------|------|---------|:--------:| +| [billing\_account](#input\_billing\_account) | GCP billing account | `string` | `null` | no | +| [cluster\_name](#input\_cluster\_name) | Name of the GKE cluster | `string` | `"gke-ml"` | no | +| [configsync\_repo\_name](#input\_configsync\_repo\_name) | Name of the GitHub repo that will be synced to the cluster with Config sync. | `string` | `"config-sync-repo"` | no | +| [create\_namespace](#input\_create\_namespace) | Setup a namespace to demo. | `number` | `1` | no | +| [create\_projects](#input\_create\_projects) | Flag to create GCP projects | `number` | `0` | no | +| [env](#input\_env) | List of environments | `set(string)` |
[
"dev"
]
| no | +| [folder\_id](#input\_folder\_id) | Folder Id where the GCP projects will be created | `string` | `null` | no | +| [github\_email](#input\_github\_email) | GitHub user email. | `string` | n/a | yes | +| [github\_org](#input\_github\_org) | GitHub org. | `string` | n/a | yes | +| [github\_token](#input\_github\_token) | GitHub token. It is a token with write permissions as it will create a repo in the GitHub org. | `string` | n/a | yes | +| [github\_user](#input\_github\_user) | GitHub user name. | `string` | n/a | yes | +| [install\_kuberay](#input\_install\_kuberay) | Flag to install kuberay operator. | `number` | `1` | no | +| [install\_ray\_in\_ns](#input\_install\_ray\_in\_ns) | Flag to install ray cluster in the namespace created with the demo. | `number` | `1` | no | +| [namespace](#input\_namespace) | Name of the namespace to demo. | `string` | `"ml-team"` | no | +| [network\_name](#input\_network\_name) | VPC network where GKE cluster will be created | `string` | `"ml-vpc"` | no | +| [ondemand\_taints](#input\_ondemand\_taints) | Taints to be applied to the on-demand node pool. |
list(object({
key = string
value = any
effect = string
}))
|
[
{
"effect": "NO_SCHEDULE",
"key": "ondemand",
"value": true
}
]
| no | +| [org\_id](#input\_org\_id) | The GCP org ID | `string` | `null` | no | +| [project\_id](#input\_project\_id) | The GCP project where the resources will be created. It is a map with environments as keys and project\_ids as values | `map` | n/a | yes | +| [project\_name](#input\_project\_name) | GCP project name | `string` | `null` | no | +| [reserved\_taints](#input\_reserved\_taints) | Taints to be applied to the reserved node pool. |
list(object({
key = string
value = any
effect = string
}))
|
[
{
"effect": "NO_SCHEDULE",
"key": "reserved",
"value": true
}
]
| no | +| [routing\_mode](#input\_routing\_mode) | VPC routing mode. | `string` | `"GLOBAL"` | no | +| [secret\_for\_rootsync](#input\_secret\_for\_rootsync) | Create git-cred in config-management-system namespace. | `number` | `1` | no | +| [spot\_taints](#input\_spot\_taints) | Taints to be applied to the spot node pool. |
list(object({
key = string
value = any
effect = string
}))
|
[
{
"effect": "NO_SCHEDULE",
"key": "spot",
"value": true
}
]
| no | +| [subnet\_01\_description](#input\_subnet\_01\_description) | Description of the first subnet. | `string` | `"subnet 01"` | no | +| [subnet\_01\_ip](#input\_subnet\_01\_ip) | CIDR of the first subnet. | `string` | `"10.40.0.0/22"` | no | +| [subnet\_01\_name](#input\_subnet\_01\_name) | Name of the first subnet in the VPC network. | `string` | `"ml-vpc-subnet-01"` | no | +| [subnet\_01\_region](#input\_subnet\_01\_region) | Region of the first subnet. | `string` | `"us-central1"` | no | +| [subnet\_02\_description](#input\_subnet\_02\_description) | Description of the second subnet. | `string` | `"subnet 02"` | no | +| [subnet\_02\_ip](#input\_subnet\_02\_ip) | CIDR of the second subnet. | `string` | `"10.12.0.0/22"` | no | +| [subnet\_02\_name](#input\_subnet\_02\_name) | Name of the second subnet in the VPC network. | `string` | `"gke-vpc-subnet-02"` | no | +| [subnet\_02\_region](#input\_subnet\_02\_region) | Region of the second subnet. | `string` | `"us-west2"` | no | + +## Outputs + +| Name | Description | +|------|-------------| +| [project\_ids](#output\_project\_ids) | n/a | + + +## Platform Principles + +This reference architecture demonstrates how to build a GKE platform that facilitates Machine Learning. The reference architecture is based on the following principles: + + - The platform admin will create the GKE platform using an IaC tool like [Terraform][terraform]. The IaC will come with reusable modules that can be reused to create more resources as the demand grows. + - The platform will be based on [GitOps][gitops]. + - After the GKE platform has been created, cluster scoped resources on it will be created through [Config Sync][config-sync] by the admins. + - Platform admins will create a namespace per application and provide the application team members full access to it.
+ - The namespace scoped resources will be created by the Application/ML teams either via [Config Sync][config-sync] or through a deployment tool like [Cloud Deploy][cloud-deploy]. + +## CUJ and Personae addressed in the reference architecture + +### Persona : Platform Admin + +**CUJ 1** : Provide templates with built-in standard practices to stamp out GKE platforms to be used by ML Engineers, Data Scientists and Application teams. + +**CUJ 2** : Provide GKE clusters. + +**CUJ 2** : Provide space for the teams on the GKE cluster to run their workloads and the permissions following the principle of least privilege. + +**CUJ 3** : Provide secure methods to the ML Engineers, Data Scientists, Application teams and the Operators to connect to the private GKE clusters. **CUJ 4** : Enforcing security policies on the underlying platform. -### ML Engineers +### Persona : ML Engineers **CUJ 1** : Use ML tools like `ray` to perform their day to day tasks like data pre-processing, ML training etc. **CUJ 2** : Use a development environment like Jupyter Notebook for faster inner loop of ML development. **[TBD]** -### Operators +### Persona : Operators **CUJ 1**: Act as a bridge between the Platform admins and the ML Engineers by providing and maintaining software needed by the ML engineers so they can focus on their job. @@ -52,29 +145,100 @@ It addresses following personae and provides means to automate and simplify thei ## Prerequistes 1. This tutorial has been tested on [Cloud Shell](https://shell.cloud.google.com) which comes preinstalled with [Google Cloud SDK](https://cloud.google.com/sdk) is required to complete this tutorial. - -## Deploy resources. - -Follow these steps in order to build the platform and use it. - -- Run Terraform in [01_gcp_project folder][projects]. This module creates GCP projects for your ML environments. This is an optional module. If you already have created GCP projects, directly run 02_gke module. - -- Run Terraform in [02_gke folder][gke].
This modules creates private GKE clusters for each environment. - -- Run Terraform in [03_configsync folder][configsync]. This modules enables Config management on GKE clusters, creates a repository in GitHub and creates a [root-sync][root-sync] on the clusters connected to the repo. - -- Run steps in [04_setup_clusters][setup-clusters]. This modules walks through how as platform admin you can set up cluster level software to the ML teams. - -- Run steps in [05_setup_teams][setup-teams]. This modules walks through how as platform admin you can set up spaces for ML teams on the cluster and transfer ownership to operators to maintain that space. - -- Run steps in [06_operating_teams][operating-teams]. This module walks through how as an operator you will provide the software required by ML engineers. - - -[projects]: ./01_gcp_project/README.md -[gke]: ./02_gke/README.md -[configsync]: ./03_configsync/README.md -[setup-clusters]: ./04_setup_clusters/README.md -[setup-teams]: ./05_setup_teams/README.md -[operating-teams]: ./06_operating_teams/README.md +2. Familiarity with [Google Kubernetes Engine][gke], [Terraform][terraform], [root-sync][root-sync] , [repo-sync][repo-sync] , [Git][git], [GitHub][github] + +# Workflow + +This reference architecture can be implemented in one of the following ways: + +- Deploy a single env reference architecture. +- Deploy a multi env reference architecture in single [GCP project][gcp-project] +- Deploy a multi env reference architecture with each env in its own [GCP project][gcp-project] + +## Deploy a single env reference architecture +This is the quick-start deployment. It can be used to quickly set up an environment and start playing with it to get an understanding of the flow. Single env reference architecture can be deployed with the provided default values. + +### Configuration +- You can either create a new GCP project or use an existing one. Skip this step if you choose to use an already existing project. 
+ - To create a new project, open `cloudshell` and run the following command: + ``` + gcloud projects create YOUR_PROJECT_ID + ``` + - Associate a billing account with the project: + ``` + gcloud beta billing projects link YOUR_PROJECT_ID \ + --billing-account YOUR_BILLING_ACCOUNT + ``` +- Set the PROJECT_ID environment variable in `cloudshell` : + ``` + echo 'export PROJECT_ID="YOUR_PROJECT_ID"' >> ~/.bashrc && source ~/.bashrc + ``` + + Replace YOUR_PROJECT_ID with the id of the project that you created in the previous step or the id of an already existing project that you want to use. + +- Update ~/.bashrc to automatically point to the required project when a new instance of the `cloudshell` is created: + ``` + echo gcloud config set project $PROJECT_ID >> ~/.bashrc && source ~/.bashrc + ``` + +- Create a GCS bucket in the project for storing TF state. + - To create a new bucket, run the following command in `cloudshell` : + ``` + echo "export STATE_BUCKET=${PROJECT_ID}-tf-state" >> ~/.bashrc && source ~/.bashrc + + gcloud storage buckets create gs://${STATE_BUCKET} + ``` + +- Store github configurations in environment variables, replacing the YOUR_* placeholders with your own values: + ``` + echo "export GITHUB_USER=YOUR_GITHUB_USER" >> ~/.bashrc + echo "export GITHUB_ORG=YOUR_GITHUB_ORG" >> ~/.bashrc + echo "export GITHUB_EMAIL=YOUR_GITHUB_EMAIL" >> ~/.bashrc + source ~/.bashrc + ``` + +- Create a [Personal Access Token][personal-access-token] in [GitHub][github]: + + Note: It is recommended to use a [machine user account][machine-user-account] for this but you can use a personal user account just to try this reference architecture. + - Go to https://github.com/settings/tokens and log in using your credentials + - Click "Generate new token" >> "Generate new token (classic)". + - You will be directed to a screen to create the new token. Provide the note and expiration. 
+ - Select the following two scopes: + - [x] repo + - [x] delete_repo + - Click "Generate token" + - Store the token safely. + +### Run Terraform + +- Clone the repo and change dir + ``` + git clone https://github.com/GoogleCloudPlatform/ai-on-gke + + cd ai-on-gke/ml-platform + ``` + +- Perform variable replacement. 
+ ``` + sed -i "s/YOUR_STATE_BUCKET/${STATE_BUCKET}/g" backend.tf + + sed -i "s/YOUR_PROJECT_ID/${PROJECT_ID}/g" mlenv.auto.tfvars + ``` + +Typically, you would want to have dev, staging and production environments created in separate projects. To have such isolation, pass `env` input variable as `[ "dev", "staging", "prod" ]`. This will create one project each for the dev, staging and prod environments. You can update the input variable `env` based on how many environments/projects you want to create. + +However, if you want to use a single project for multiple environments, you can create just one project by passing a single element in the `env` input variable list, e.g. `[ "dev" ]` or `[ "my-playground" ]`. + + +[gitops]: https://about.gitlab.com/topics/gitops/ [repo-sync]: https://cloud.google.com/anthos-config-management/docs/reference/rootsync-reposync-fields -[root-sync]: https://cloud.google.com/anthos-config-management/docs/reference/rootsync-reposync-fields \ No newline at end of file +[root-sync]: https://cloud.google.com/anthos-config-management/docs/reference/rootsync-reposync-fields +[config-sync]: https://cloud.google.com/anthos-config-management/docs/config-sync-overview +[cloud-deploy]: https://cloud.google.com/deploy?hl=en +[terraform]: https://www.terraform.io/ +[gke]: https://cloud.google.com/kubernetes-engine?hl=en +[git]: https://git-scm.com/ +[github]: https://github.com/ +[gcp-project]: https://cloud.google.com/resource-manager/docs/creating-managing-projects +[personal-access-token]: https://docs.github.com/en/authentication/keeping-your-account-and-data-secure/managing-your-personal-access-tokens +[machine-user-account]: https://docs.github.com/en/get-started/learning-about-github/types-of-github-accounts \ No newline at end of file diff --git a/ml-platform/02_gke/backend.tf b/ml-platform/backend.tf similarity index 96% rename from ml-platform/02_gke/backend.tf rename to ml-platform/backend.tf index 97deced77..a676d7219 100644 --- 
a/ml-platform/02_gke/backend.tf 
+++ b/ml-platform/backend.tf @@ -14,7 +14,8 @@ terraform { backend "gcs" { - prefix = "02_gke" + prefix = "terraform" bucket = "YOUR_STATE_BUCKET" } } + diff --git a/ml-platform/03_configsync/create_cluster_yamls.sh b/ml-platform/create_cluster_yamls.sh similarity index 53% rename from ml-platform/03_configsync/create_cluster_yamls.sh rename to ml-platform/create_cluster_yamls.sh index 3c659a198..627e8fd7a 100755 --- a/ml-platform/03_configsync/create_cluster_yamls.sh +++ b/ml-platform/create_cluster_yamls.sh @@ -28,49 +28,30 @@ sleep $sleep_total random=$(echo $RANDOM | md5sum | head -c 20; echo) log="$(pwd)/log" flag=0 -#github_token=${7} -#echo "${github_token}" >> log -#echo "${TF_VAR_github_token}" >> log -#ls -lrt >> log -#ls -lrt ../ >> log -#TIMESTAMP=$(date "+%Y%m%d%H%M%S") -download_acm_repo_name=$(echo ${acm_repo_name} | awk -F "/" '{print $2}')-${random} + +download_acm_repo_name="/tmp/$(echo ${acm_repo_name} | awk -F "/" '{print $2}')-${random}" git config --global user.name ${github_user} git config --global user.email ${github_emai} git clone https://${github_user}:${TF_VAR_github_token}@github.com/${acm_repo_name} ${download_acm_repo_name} -echo "Download repo is ${download_acm_repo_name}" >> ${log} -echo "ls -lrt before going into download repo is $(ls -lrt)" >> ${log} -cd ${download_acm_repo_name} -echo "ls -lrt in download repo is $(ls -lrt)" >> ${log} -if [ ! -d "manifests" ] && [ ! -d "templates" ]; then - echo "copying files" >> ${log} - cp -r ../templates/acm-template/* . + +if [ ! -d "${download_acm_repo_name}/manifests" ] && [ ! 
-d "${download_acm_repo_name}/templates" ]; then + echo "copying files" + cp -r templates/acm-template/* ${download_acm_repo_name} flag=1 fi -cd manifests/clusters -if [ ${flag} -eq 0 ]; then - echo "not copying files" >> ${log} +cd ${download_acm_repo_name}/manifests/clusters +if [ "${flag}" -eq 0 ]; then + echo "not copying files" fi -echo "In directory $(pwd)" >> ${log} -echo "level0 $(ls -lrt)" >> ${log} -echo "level1 $(ls -lrt ../)" >> ${log} -echo "level2 $(ls -lrt ../../)" >> ${log} -echo "level3 $(ls -lrt ../../../)" >> ${log} -echo "level4 $(ls -lrt ../../../../ )" >> ${log} -echo "env is ${cluster_env}" >> ${log} - -cp ../../templates/_cluster-template/cluster.yaml ./${cluster_name}-cluster.yaml -cp ../../templates/_cluster-template/selector.yaml ./${cluster_env}-selector.yaml -#cp ../../templates/_cluster-template/connect-gateway-rbac.yaml ./${cluster_name}-connect-gateway-rbac.yaml +cp ../../templates/_cluster_template/cluster.yaml ./${cluster_name}-cluster.yaml +cp ../../templates/_cluster_template/selector.yaml ./${cluster_env}-selector.yaml find . -type f -name ${cluster_name}-cluster.yaml -exec sed -i "s/CLUSTER_NAME/${cluster_name}/g" {} + find . -type f -name ${cluster_name}-cluster.yaml -exec sed -i "s/ENV/${cluster_env}/g" {} + find . -type f -name ${cluster_env}-selector.yaml -exec sed -i "s/ENV/${cluster_env}/g" {} + -#find . -type f -name ${cluster_name}-connect-gateway-rbac.yaml -exec sed -i "s/CLUSTER_NAME/${cluster_name}/g" {} + -#find . -type f -name ${cluster_name}-connect-gateway-rbac.yaml -exec sed -i "s/ENV/${cluster_env}/g" {} + -cp ../../templates/_cluster-template/kuberay . +#cp ../../templates/_cluster_template/kuberay . git add ../../. git config --global user.name ${github_user} @@ -78,5 +59,5 @@ git config --global user.email ${github_email} git commit -m "Adding ${cluster_name} cluster to the ${cluster_env} environment." git push origin -cd .. 
+cd - rm -rf ${download_acm_repo_name} diff --git a/ml-platform/create_git_cred.sh b/ml-platform/create_git_cred.sh new file mode 100755 index 000000000..da5a92104 --- /dev/null +++ b/ml-platform/create_git_cred.sh @@ -0,0 +1,38 @@ +#!/bin/bash +# +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +gke_cluster=${1} +project_id=${2} +git_user=${3} +namespace=${4} +index=${5} +sleep_time=60 +sleep_index=$((${index}+1)) +sleep_total=$((${sleep_time}*${sleep_index})) +sleep $sleep_total +gcloud container fleet memberships get-credentials ${gke_cluster} --project ${project_id} +ns_exists=$(kubectl get ns ${namespace} -o name | awk -F '/' '{print $2}') + +while [ "${ns_exists}" != "${namespace}" ] +do +sleep 10 +ns_exists=$(kubectl get ns ${namespace} -o name | awk -F '/' '{print $2}') +done +secret_exists=$(kubectl get secret git-creds -n ${namespace} -o name) +if [[ "${secret_exists}" == "secret/git-creds" ]]; then + exit 0 +else + kubectl create secret generic git-creds --namespace="${namespace}" --from-literal=username="${git_user}" --from-literal=token="${TF_VAR_github_token}" +fi \ No newline at end of file diff --git a/ml-platform/create_namespace.sh b/ml-platform/create_namespace.sh new file mode 100755 index 000000000..7a4e28d06 --- /dev/null +++ b/ml-platform/create_namespace.sh @@ -0,0 +1,57 @@ +#!/bin/bash +# +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file 
except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +configsync_repo_name=${1} +github_email=${2} +github_org=${3} +github_user=${4} +namespace=${5} + +logfile=$(pwd)/log +random=$(echo $RANDOM | md5sum | head -c 20; echo) +download_acm_repo_name="/tmp/$(echo ${configsync_repo_name} | awk -F "/" '{print $2}')-${random}" +git config --global user.name ${github_user} +git config --global user.email ${github_email} +git clone https://${github_user}:${TF_VAR_github_token}@github.com/${configsync_repo_name} ${download_acm_repo_name} +cd ${download_acm_repo_name}/manifests/clusters + +if [ -d "${namespace}" ]; then + exit 0 +fi +chars_in_namespace=$(echo -n ${namespace} | wc -c) +#adding 4 for number of chars in "dev-" +chars_in_reposync_name=$(expr $chars_in_namespace + 4) +mkdir ${namespace} || exit 1 +cp -r ../../templates/_cluster_template/team/* ${namespace} +sed -i "s?NAMESPACE?$namespace?g" ${namespace}/* +sed -ni '/#END OF SINGLE ENV DECLARATION/q;p' ${namespace}/reposync.yaml +sed -i "s?GIT_REPO?https://github.com/$configsync_repo_name?g" ${namespace}/reposync.yaml +sed -i "s?<NUMBER_OF_CHARACTERS_IN_REPOSYNC_NAME>?$chars_in_reposync_name?g" ${namespace}/reposync.yaml + +mkdir ../apps/${namespace} +touch ../apps/${namespace}/.gitkeep + +cat <<EOF >>kustomization.yaml +- ./${namespace} +EOF +cd .. +git add . +git config --global user.name ${github_user} +git config --global user.email ${github_email} +git commit -m "Adding manifests to create a new namespace." 
+git push origin + +cd - +rm -rf ${download_acm_repo_name} \ No newline at end of file diff --git a/ml-platform/install_kuberay_operator.sh b/ml-platform/install_kuberay_operator.sh new file mode 100755 index 000000000..08bb41410 --- /dev/null +++ b/ml-platform/install_kuberay_operator.sh @@ -0,0 +1,47 @@ +#!/bin/bash +# +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +configsync_repo_name=${1} +github_email=${2} +github_org=${3} +github_user=${4} + +random=$(echo $RANDOM | md5sum | head -c 20; echo) +download_acm_repo_name="/tmp/$(echo ${configsync_repo_name} | awk -F "/" '{print $2}')-${random}" +git config --global user.name ${github_user} +git config --global user.email ${github_email} +git clone https://${github_user}:${TF_VAR_github_token}@github.com/${configsync_repo_name} ${download_acm_repo_name} +cd ${download_acm_repo_name}/manifests/clusters +if [ -f "kustomization.yaml" ]; then + exit 0 +fi +yamlfiles=$(find . -type f -name "*.yaml") +cp ../../templates/_cluster_template/kustomization.yaml . +for yamlfile in `echo ${yamlfiles}` +do +cat <<EOF >>kustomization.yaml + +- ${yamlfile} +EOF +done +cp -r ../../templates/_cluster_template/kuberay . +git add . +git config --global user.name ${github_user} +git config --global user.email ${github_email} +git commit -m "Adding manifests to install kuberay operator." 
+git push origin + +cd - +rm -rf ${download_acm_repo_name} \ No newline at end of file diff --git a/ml-platform/install_ray_cluster.sh b/ml-platform/install_ray_cluster.sh new file mode 100755 index 000000000..d7d62c8ba --- /dev/null +++ b/ml-platform/install_ray_cluster.sh @@ -0,0 +1,48 @@ +#!/bin/bash +# +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +configsync_repo_name=${1} +github_email=${2} +github_org=${3} +github_user=${4} +namespace=${5} + +random=$(echo $RANDOM | md5sum | head -c 20; echo) +download_acm_repo_name="/tmp/$(echo ${configsync_repo_name} | awk -F "/" '{print $2}')-${random}" +git config --global user.name ${github_user} +git config --global user.email ${github_emai} +git clone https://${github_user}:${TF_VAR_github_token}@github.com/${configsync_repo_name} ${download_acm_repo_name} +cd ${download_acm_repo_name}/manifests/apps +if [ ! -d "${namespace}" ]; then + echo "${namespace} folder doesnt exist in the configsync repo" + exit 1 +fi + +if [ -f "${namespace}/kustomization.yaml" ]; then + echo "${namespace} is already set up" + exit 0 +fi + +cp -r ../../templates/_namespace_template/app/* ${namespace}/ +sed -i "s?NAMESPACE?${namespace}?g" ${namespace}/* + +git add . +git config --global user.name ${github_user} +git config --global user.email ${github_email} +git commit -m "Installing ray cluster in ${namespace} namespace." 
+git push origin + +cd - +rm -rf ${download_acm_repo_name} \ No newline at end of file diff --git a/ml-platform/main.tf b/ml-platform/main.tf new file mode 100644 index 000000000..3f5b53b42 --- /dev/null +++ b/ml-platform/main.tf @@ -0,0 +1,390 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +#TODO: Add a validation that the value if default_env must be one of the values in env list +module "gcp-project" { + count = var.create_projects + source = "./modules/projects" + org_id = var.org_id + folder_id = var.folder_id + env = var.env + billing_account = var.billing_account + project_name = var.project_name +} + + +locals { + #parsed_project_id = length(keys("${var.project_id}")) == 0 ? data.terraform_remote_state.gcp-projects[0].outputs.project_ids : var.project_id + #var.create_projects == 1 ? {for k, v in "${module.gcp-project.project_ids}" : k => v.project_id} : "" + parsed_project_id = var.create_projects == 0 ? 
var.project_id : { for k, v in "${module.gcp-project.project_ids}" : k => v.project_id } + parsed_gke_info = module.gke + parsed_gke_info_without_default_env = { for k, v in "${local.parsed_gke_info}" : k => v if k != var.default_env } + project_id_list = [for k, v in "${module.gke}" : v.gke_project_id] + gke_project_map = { for k, v in "${module.gke}" : v.cluster_name => v.gke_project_id } +} + +resource "google_project_service" "project_services-cr" { + for_each = local.parsed_project_id + project = each.value + service = "cloudresourcemanager.googleapis.com" + disable_on_destroy = false + disable_dependent_services = false + depends_on = [module.gcp-project] +} + +resource "google_project_service" "project_services-an" { + for_each = local.parsed_project_id + project = each.value + service = "anthos.googleapis.com" + disable_on_destroy = false + disable_dependent_services = false + depends_on = [module.gcp-project, google_project_service.project_services-cr] +} +resource "google_project_service" "project_services-anc" { + for_each = local.parsed_project_id + project = each.value + service = "anthosconfigmanagement.googleapis.com" + disable_on_destroy = false + disable_dependent_services = false + depends_on = [module.gcp-project, google_project_service.project_services-cr] +} +resource "google_project_service" "project_services-con" { + for_each = local.parsed_project_id + project = each.value + service = "container.googleapis.com" + disable_on_destroy = false + disable_dependent_services = false + depends_on = [module.gcp-project, google_project_service.project_services-cr] +} +resource "google_project_service" "project_services-com" { + for_each = local.parsed_project_id + project = each.value + service = "compute.googleapis.com" + disable_on_destroy = false + disable_dependent_services = false + depends_on = [module.gcp-project, google_project_service.project_services-cr] +} +resource "google_project_service" "project_services-gkecon" { + for_each = 
local.parsed_project_id + project = each.value + service = "gkeconnect.googleapis.com" + disable_on_destroy = false + disable_dependent_services = false + depends_on = [module.gcp-project, google_project_service.project_services-cr] +} +resource "google_project_service" "project_services-gkeh" { + for_each = local.parsed_project_id + project = each.value + service = "gkehub.googleapis.com" + disable_on_destroy = false + disable_dependent_services = false + depends_on = [module.gcp-project, google_project_service.project_services-cr] +} +resource "google_project_service" "project_services-iam" { + for_each = local.parsed_project_id + project = each.value + service = "iam.googleapis.com" + disable_on_destroy = false + disable_dependent_services = false + depends_on = [module.gcp-project, google_project_service.project_services-cr] +} + +resource "google_project_service" "project_services-gate" { + for_each = local.parsed_project_id + project = each.value + service = "connectgateway.googleapis.com" + disable_on_destroy = false + disable_dependent_services = false + depends_on = [module.gcp-project, google_project_service.project_services-cr] +} + +module "create-vpc" { + for_each = local.parsed_project_id + source = "./modules/network" + project_id = each.value + network_name = format("%s-%s", var.network_name, each.key) + routing_mode = var.routing_mode + subnet_01_name = format("%s-%s", var.subnet_01_name, each.key) + subnet_01_ip = var.subnet_01_ip + subnet_01_region = var.subnet_01_region + subnet_02_name = format("%s-%s", var.subnet_02_name, each.key) + subnet_02_ip = var.subnet_02_ip + subnet_02_region = var.subnet_02_region + #default_route_name = format("%s-%s","default-route",each.key) + depends_on = [module.gcp-project, google_project_service.project_services-com] +} + +resource "google_gke_hub_feature" "configmanagement_acm_feature" { + count = length(distinct(values(local.parsed_project_id))) + name = "configmanagement" + project = 
distinct(values(local.parsed_project_id))[count.index] + location = "global" + provider = google-beta + depends_on = [google_project_service.project_services-gkeh, google_project_service.project_services-anc, google_project_service.project_services-an, google_project_service.project_services-com, google_project_service.project_services-gkecon] +} + +module "gke" { + for_each = local.parsed_project_id + source = "./modules/cluster" + cluster_name = format("%s-%s", var.cluster_name, each.key) + network = module.create-vpc[each.key].vpc + subnet = module.create-vpc[each.key].subnet-1 + project_id = each.value + region = var.subnet_01_region + zone = "${var.subnet_01_region}-a" + master_auth_networks_ipcidr = var.subnet_01_ip + depends_on = [google_gke_hub_feature.configmanagement_acm_feature, google_project_service.project_services-con, google_project_service.project_services-com] + env = each.key +} +module "reservation" { + for_each = local.parsed_project_id + source = "./modules/vm-reservations" + cluster_name = module.gke[each.key].cluster_name + zone = "${var.subnet_01_region}-a" + project_id = each.value + depends_on = [module.gke] +} +module "node_pool-reserved" { + for_each = local.parsed_project_id + source = "./modules/node-pools" + node_pool_name = "reservation" + project_id = each.value + cluster_name = module.gke[each.key].cluster_name + region = "${var.subnet_01_region}" + taints = var.reserved_taints + resource_type = "reservation" + reservation_name = module.reservation[each.key].reservation_name + depends_on = [module.reservation] +} + +module "node_pool-ondemand" { + for_each = local.parsed_project_id + source = "./modules/node-pools" + node_pool_name = "ondemand" + project_id = each.value + cluster_name = module.gke[each.key].cluster_name + region = "${var.subnet_01_region}" + taints = var.ondemand_taints + resource_type = "ondemand" + depends_on = [module.gke] +} + +module "node_pool-spot" { + for_each = local.parsed_project_id + source = 
"./modules/node-pools" + node_pool_name = "spot" + project_id = each.value + cluster_name = module.gke[each.key].cluster_name + region = "${var.subnet_01_region}" + taints = var.spot_taints + resource_type = "spot" + depends_on = [module.gke] +} + +module "cloud-nat" { + for_each = local.parsed_project_id + source = "./modules/cloud-nat" + project_id = each.value + region = split("/", module.create-vpc[each.key].subnet-1)[3] + name = format("%s-%s", "nat-for-acm", each.key) + network = module.create-vpc[each.key].vpc + create_router = true + router = format("%s-%s", "router-for-acm", each.key) + depends_on = [module.create-vpc, google_project_service.project_services-com] +} + + + +//data "terraform_remote_state" "gke-clusters" { +// backend = "gcs" +// config = { +// bucket = var.lookup_state_bucket +// prefix = "02_gke" +// } +//} +// +//locals { +// parsed_gke_info = module.gke +// project_id_list = [for k,v in "${module.gke}" : v.gke_project_id] +//} + +//resource "google_gke_hub_feature" "configmanagement_acm_feature" { +// count = length(distinct(local.project_id_list)) +// name = "configmanagement" +// project = distinct(local.project_id_list)[count.index] +// location = "global" +// provider = google-beta +//} + +resource "google_gke_hub_membership" "membership" { + provider = google-beta + for_each = local.parsed_gke_info + project = each.value["gke_project_id"] + membership_id = each.value["cluster_name"] + endpoint { + gke_cluster { + resource_link = format("%s/%s", "//container.googleapis.com", each.value["cluster_id"]) + } + } + lifecycle { + ignore_changes = [ + labels + ] + } + depends_on = [google_gke_hub_feature.configmanagement_acm_feature, google_project_service.project_services-gkeh, google_project_service.project_services-gkecon] +} + +resource "github_repository" "acm_repo" { + name = var.configsync_repo_name + description = "Repo for Config Sync" + visibility = "private" + has_issues = false + has_projects = false + has_wiki = false + + 
allow_merge_commit = true + allow_squash_merge = true + allow_rebase_merge = true + delete_branch_on_merge = false + auto_init = true + vulnerability_alerts = true +} +//Create a branch for each env +resource "github_branch" "branch" { + for_each = local.parsed_gke_info + repository = split("/", github_repository.acm_repo.full_name)[1] + branch = each.key + depends_on = [github_repository.acm_repo] +} +//Set default branch as the lowest env +resource "github_branch_default" "default_branch" { + repository = split("/", github_repository.acm_repo.full_name)[1] + #branch = tostring(keys(local.parsed_gke_info)[0]) + branch = var.default_env + #rename = true + depends_on = [github_branch.branch] +} +#Protect branches other than the default branch +resource "github_branch_protection_v3" "branch_protection" { + for_each = length(keys(local.parsed_project_id)) > 1 ? local.parsed_gke_info_without_default_env : {} + repository = split("/", github_repository.acm_repo.full_name)[1] + branch = each.key + required_pull_request_reviews { + required_approving_review_count = 1 + require_code_owner_reviews = true + } + restrictions { + + } + + depends_on = [github_branch.branch] +} + +resource "google_gke_hub_feature_membership" "feature_member" { + provider = google-beta + for_each = local.parsed_gke_info + project = each.value["gke_project_id"] + location = "global" + feature = "configmanagement" + membership = google_gke_hub_membership.membership[each.key].membership_id + configmanagement { + version = "1.17.0" + config_sync { + source_format = "unstructured" + git { + sync_repo = "https://github.com/${github_repository.acm_repo.full_name}.git" + sync_branch = each.value["env"] + policy_dir = "manifests/clusters" + secret_type = "token" + } + } + policy_controller { + enabled = true + template_library_installed = true + referential_rules_enabled = true + } + } + + provisioner "local-exec" { + command = "${path.module}/create_cluster_yamls.sh ${var.github_org} 
${github_repository.acm_repo.full_name} ${var.github_user} ${var.github_email} ${each.value["env"]} ${each.value["cluster_name"]} ${index(keys(local.parsed_gke_info), each.key)}" + } + + depends_on = [google_project_service.project_services-gkecon, google_project_service.project_services-gkeh, google_project_service.project_services-an, google_project_service.project_services-anc] +} + +resource "null_resource" "create_git_cred_cms" { + for_each = var.secret_for_rootsync == 1 ? local.gke_project_map : {} + triggers = { + timestamp = timestamp() + } + provisioner "local-exec" { + command = "${path.module}/create_git_cred.sh ${each.key} ${each.value} ${var.github_user} config-management-system ${index(keys(local.gke_project_map), each.key)}" + } + depends_on = [google_gke_hub_feature_membership.feature_member, module.gke, module.node_pool-reserved, module.node_pool-ondemand, module.node_pool-spot, module.cloud-nat] +} + +resource "null_resource" "install_kuberay_operator" { + count = var.install_kuberay + triggers = { + timestamp = timestamp() + } + provisioner "local-exec" { + command = "${path.module}/install_kuberay_operator.sh ${github_repository.acm_repo.full_name} ${var.github_email} ${var.github_org} ${var.github_user}" + } + depends_on = [google_gke_hub_feature_membership.feature_member, null_resource.create_git_cred_cms] +} + +resource "null_resource" "create_namespace" { + count = var.create_namespace + triggers = { + timestamp = timestamp() + } + provisioner "local-exec" { + command = "${path.module}/create_namespace.sh ${github_repository.acm_repo.full_name} ${var.github_email} ${var.github_org} ${var.github_user} ${var.namespace}" + } + depends_on = [google_gke_hub_feature_membership.feature_member, null_resource.install_kuberay_operator] +} + +resource "null_resource" "create_git_cred_ns" { + count = var.create_namespace + triggers = { + timestamp = timestamp() + } + provisioner "local-exec" { + command = "${path.module}/create_git_cred.sh 
${local.parsed_gke_info[var.default_env].cluster_name} ${local.parsed_gke_info[var.default_env].gke_project_id} ${var.github_user} ${var.namespace}" + } + depends_on = [ google_gke_hub_feature_membership.feature_member, null_resource.create_namespace ] +} + +resource "null_resource" "install_ray_cluster" { + count = var.install_ray_in_ns + triggers = { + timestamp = timestamp() + } + provisioner "local-exec" { + command = "${path.module}/install_ray_cluster.sh ${github_repository.acm_repo.full_name} ${var.github_email} ${var.github_org} ${var.github_user} ${var.namespace}" + } + depends_on = [google_gke_hub_feature_membership.feature_member, null_resource.create_git_cred_ns] +} + +resource "null_resource" "manage_ray_ns" { + count = var.install_ray_in_ns + triggers = { + timestamp = timestamp() + } + provisioner "local-exec" { + command = "${path.module}/manage_ray_ns.sh ${github_repository.acm_repo.full_name} ${var.github_email} ${var.github_org} ${var.github_user} ${var.namespace}" + } + depends_on = [google_gke_hub_feature_membership.feature_member, null_resource.create_git_cred_ns, null_resource.install_ray_cluster] +} \ No newline at end of file diff --git a/ml-platform/manage_ray_ns.sh b/ml-platform/manage_ray_ns.sh new file mode 100755 index 000000000..021559fee --- /dev/null +++ b/ml-platform/manage_ray_ns.sh @@ -0,0 +1,43 @@ +#!/bin/bash +# +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+configsync_repo_name=${1} +github_email=${2} +github_org=${3} +github_user=${4} +namespace=${5} + +random=$(echo $RANDOM | md5sum | head -c 20; echo) +download_acm_repo_name="/tmp/$(echo ${configsync_repo_name} | awk -F "/" '{print $2}')-${random}" +git config --global user.name ${github_user} +git config --global user.email ${github_emai} +git clone https://${github_user}:${TF_VAR_github_token}@github.com/${configsync_repo_name} ${download_acm_repo_name} +cd ${download_acm_repo_name}/manifests/clusters/kuberay +ns_exists=$(grep ${namespace} values.yaml | wc -l) +if [ "${ns_exists}" -ne 0 ]; then + echo "namespace already present in values.yaml" + exit 0 +fi + +sed -i "s/watchNamespace:/watchNamespace:\n - ${namespace}/g" values.yaml + +git add . +git config --global user.name ${github_user} +git config --global user.email ${github_email} +git commit -m "Installing ray cluster in ${namespace} namespace." +git push origin + +cd - +rm -rf ${download_acm_repo_name} \ No newline at end of file diff --git a/ml-platform/mlenv.auto.tfvars b/ml-platform/mlenv.auto.tfvars new file mode 100644 index 000000000..3a7fe5c74 --- /dev/null +++ b/ml-platform/mlenv.auto.tfvars @@ -0,0 +1,9 @@ +project_id = {"dev":"YOUR_PROJECT_ID"} +default_env = "dev" +github_user = "YOUR_GITHUB_USER" +github_email = "YOUR_GITHUB_EMAIL" +github_org = "YOUR_GITHUB_ORG" +#github_token = "DO NOT ADD TOKEN HERE. 
PASS IT AS `export TF_VAR_github_token="YOUR TOKEN"` ON CLOUDSHELL" +#folder_id = "YOUR_FOLDER_ID" +#org_id = "YOUR_GCP_ORG_ID" +#billing_account = "YOUR_BILLING_ACCOUNT" \ No newline at end of file diff --git a/ml-platform/02_gke/modules/cloud-nat/README.md b/ml-platform/modules/cloud-nat/README.md similarity index 99% rename from ml-platform/02_gke/modules/cloud-nat/README.md rename to ml-platform/modules/cloud-nat/README.md index e498d7958..6952d4e9f 100644 --- a/ml-platform/02_gke/modules/cloud-nat/README.md +++ b/ml-platform/modules/cloud-nat/README.md @@ -1,4 +1,3 @@ - # Terraform Google Cloud NAT Module This module handles opinionated Google Cloud Platform Cloud NAT creation and configuration. diff --git a/ml-platform/02_gke/modules/cloud-nat/main.tf b/ml-platform/modules/cloud-nat/main.tf similarity index 100% rename from ml-platform/02_gke/modules/cloud-nat/main.tf rename to ml-platform/modules/cloud-nat/main.tf diff --git a/ml-platform/02_gke/modules/cloud-nat/outputs.tf b/ml-platform/modules/cloud-nat/outputs.tf similarity index 99% rename from ml-platform/02_gke/modules/cloud-nat/outputs.tf rename to ml-platform/modules/cloud-nat/outputs.tf index acd7f8ce6..86bf7c39d 100644 --- a/ml-platform/02_gke/modules/cloud-nat/outputs.tf +++ b/ml-platform/modules/cloud-nat/outputs.tf @@ -31,3 +31,4 @@ output "router_name" { description = "Cloud NAT router name" value = local.router } + diff --git a/ml-platform/02_gke/modules/cloud-nat/variables.tf b/ml-platform/modules/cloud-nat/variables.tf similarity index 100% rename from ml-platform/02_gke/modules/cloud-nat/variables.tf rename to ml-platform/modules/cloud-nat/variables.tf diff --git a/ml-platform/modules/cloud-nat/versions.tf b/ml-platform/modules/cloud-nat/versions.tf new file mode 100644 index 000000000..a6e8142dd --- /dev/null +++ b/ml-platform/modules/cloud-nat/versions.tf @@ -0,0 +1,50 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use 
this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +//terraform { +// required_providers { +// +// google = { +// source = "hashicorp/google" +// #version = ">= 4.51, < 5.0" +// version = "4.72.1" +// } +// +// random = { +// source = "hashicorp/random" +// version = "~> 2.2" +// } +// } +// +//} +terraform { + required_providers { + google = { + source = "hashicorp/google" + version = "5.19.0" + } + google-beta = { + source = "hashicorp/google-beta" + version = "5.19.0" + } + github = { + source = "integrations/github" + version = "6.0.1" + } + random = { + source = "hashicorp/random" + version = "2.2" + } + } +} diff --git a/ml-platform/02_gke/modules/cluster/gke.tf b/ml-platform/modules/cluster/gke.tf similarity index 87% rename from ml-platform/02_gke/modules/cluster/gke.tf rename to ml-platform/modules/cluster/gke.tf index 418068752..b08e92b9b 100644 --- a/ml-platform/02_gke/modules/cluster/gke.tf +++ b/ml-platform/modules/cluster/gke.tf @@ -19,14 +19,15 @@ data "google_project" "project" { } resource "google_container_cluster" "gke_batch" { - provider = google-beta - name = var.cluster_name - project = var.project_id - location = var.region - network = var.network - subnetwork = var.subnet - node_locations = ["${var.region}-a", "${var.region}-b", "${var.region}-c"] - initial_node_count = 2 + provider = google-beta + deletion_protection = false + name = var.cluster_name + project = var.project_id + location = var.region + network = var.network + subnetwork = var.subnet + node_locations = ["${var.region}-a", "${var.region}-b", "${var.region}-c"] + 
initial_node_count = 2 workload_identity_config { workload_pool = "${var.project_id}.svc.id.goog" } @@ -104,7 +105,7 @@ resource "google_container_cluster" "gke_batch" { } } release_channel { - channel = "RAPID" + channel = "STABLE" } private_cluster_config { enable_private_nodes = true diff --git a/ml-platform/02_gke/modules/cluster/outputs.tf b/ml-platform/modules/cluster/outputs.tf similarity index 99% rename from ml-platform/02_gke/modules/cluster/outputs.tf rename to ml-platform/modules/cluster/outputs.tf index 57bd8a0de..b26d3be8e 100644 --- a/ml-platform/02_gke/modules/cluster/outputs.tf +++ b/ml-platform/modules/cluster/outputs.tf @@ -30,4 +30,4 @@ output "gke_project_id" { output "env" { value = var.env -} +} \ No newline at end of file diff --git a/ml-platform/02_gke/modules/cluster/variables.tf b/ml-platform/modules/cluster/variables.tf similarity index 99% rename from ml-platform/02_gke/modules/cluster/variables.tf rename to ml-platform/modules/cluster/variables.tf index 5d76462c4..6eccda35b 100644 --- a/ml-platform/02_gke/modules/cluster/variables.tf +++ b/ml-platform/modules/cluster/variables.tf @@ -55,4 +55,4 @@ variable "env" { type = string description = "environment" -} +} \ No newline at end of file diff --git a/ml-platform/02_gke/modules/node-pools/versions.tf b/ml-platform/modules/cluster/versions.tf similarity index 71% rename from ml-platform/02_gke/modules/node-pools/versions.tf rename to ml-platform/modules/cluster/versions.tf index fc374eab1..d4aada15b 100644 --- a/ml-platform/02_gke/modules/node-pools/versions.tf +++ b/ml-platform/modules/cluster/versions.tf @@ -12,15 +12,28 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+//terraform { +// required_providers { +// google-beta = { +// source = "hashicorp/google-beta" +// version = "4.72.1" +// } +// google = { +// source = "hashicorp/google" +// version = "4.72.1" +// } +// } +//} + terraform { required_providers { - google-beta = { - source = "hashicorp/google-beta" - version = "4.72.1" - } google = { source = "hashicorp/google" - version = "4.72.1" + version = "5.19.0" + } + google-beta = { + source = "hashicorp/google-beta" + version = "5.19.0" } } } diff --git a/ml-platform/02_gke/modules/network/README.md b/ml-platform/modules/network/README.md similarity index 99% rename from ml-platform/02_gke/modules/network/README.md rename to ml-platform/modules/network/README.md index 14e1591d8..6de9bdc13 100644 --- a/ml-platform/02_gke/modules/network/README.md +++ b/ml-platform/modules/network/README.md @@ -1,4 +1,3 @@ - ## Requirements | Name | Version | diff --git a/ml-platform/02_gke/modules/network/outputs.tf b/ml-platform/modules/network/outputs.tf similarity index 99% rename from ml-platform/02_gke/modules/network/outputs.tf rename to ml-platform/modules/network/outputs.tf index 13026f645..bf9d36dad 100644 --- a/ml-platform/02_gke/modules/network/outputs.tf +++ b/ml-platform/modules/network/outputs.tf @@ -25,4 +25,4 @@ output "subnet-1" { output "subnet-2" { value = google_compute_subnetwork.subnet-2.id description = "subnet2." -} +} \ No newline at end of file diff --git a/ml-platform/02_gke/modules/network/variables.tf b/ml-platform/modules/network/variables.tf similarity index 92% rename from ml-platform/02_gke/modules/network/variables.tf rename to ml-platform/modules/network/variables.tf index db344133d..c7c12296c 100644 --- a/ml-platform/02_gke/modules/network/variables.tf +++ b/ml-platform/modules/network/variables.tf @@ -16,28 +16,23 @@ variable "project_id" { description = "Id of the GCP project where VPC is to be created." type = string } - variable "network_name" { description = "Name of the VPC network." 
type = string } - variable "routing_mode" { description = "The network routing mode." type = string default = "GLOBAL" } - variable "subnet_01_name" { description = "Name of first subnet." type = string } - variable "subnet_01_ip" { description = "IP range of first subnet." type = string } - variable "subnet_01_region" { description = "Region of first subnet." type = string @@ -47,13 +42,15 @@ variable "subnet_02_name" { description = "Name of the second subnet." type = string } - variable "subnet_02_ip" { description = "IP range of second subnet." type = string } - variable "subnet_02_region" { description = "Region of second subnet." type = string } +//variable "default_route_name" { +// description = "Name of the default route to internet." +// type = string +//} diff --git a/ml-platform/02_gke/modules/vm-reservations/versions.tf b/ml-platform/modules/network/versions.tf similarity index 80% rename from ml-platform/02_gke/modules/vm-reservations/versions.tf rename to ml-platform/modules/network/versions.tf index fc374eab1..e2e5241f2 100644 --- a/ml-platform/02_gke/modules/vm-reservations/versions.tf +++ b/ml-platform/modules/network/versions.tf @@ -12,15 +12,20 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+//terraform { +// required_providers { +// google = { +// source = "hashicorp/google" +// version = ">= 4.28.0" +// } +// } +//} + terraform { required_providers { - google-beta = { - source = "hashicorp/google-beta" - version = "4.72.1" - } google = { source = "hashicorp/google" - version = "4.72.1" + version = "5.19.0" } } } diff --git a/ml-platform/02_gke/modules/network/vpc.tf b/ml-platform/modules/network/vpc.tf similarity index 85% rename from ml-platform/02_gke/modules/network/vpc.tf rename to ml-platform/modules/network/vpc.tf index a80166be5..37266b5d2 100644 --- a/ml-platform/02_gke/modules/network/vpc.tf +++ b/ml-platform/modules/network/vpc.tf @@ -36,3 +36,11 @@ resource "google_compute_subnetwork" "subnet-2" { network = google_compute_network.vpc-network.id private_ip_google_access = true } + +//resource "google_compute_route" "default-route" { +//name = var.default_route_name +//dest_range = "0.0.0.0/0" +//network = google_compute_network.vpc-network.id +//priority = 1000 +//next_hop_gateway = "default-internet-gateway" +//} diff --git a/ml-platform/02_gke/modules/node-pools/nodepools.tf b/ml-platform/modules/node-pools/nodepools.tf similarity index 90% rename from ml-platform/02_gke/modules/node-pools/nodepools.tf rename to ml-platform/modules/node-pools/nodepools.tf index 402e45695..72b07a239 100644 --- a/ml-platform/02_gke/modules/node-pools/nodepools.tf +++ b/ml-platform/modules/node-pools/nodepools.tf @@ -19,7 +19,14 @@ resource "google_container_node_pool" "node-pool" { location = var.region node_config { machine_type = var.machine_type - taint = var.taints + dynamic "taint" { + for_each = var.taints + content { + key = taint.value.key + value = taint.value.value + effect = taint.value.effect + } + } labels = { "resource-type" : var.resource_type } @@ -61,4 +68,4 @@ resource "google_container_node_pool" "node-pool" { network_config { enable_private_nodes = true } -} +} \ No newline at end of file diff --git 
a/ml-platform/02_gke/modules/node-pools/variables.tf b/ml-platform/modules/node-pools/variables.tf similarity index 98% rename from ml-platform/02_gke/modules/node-pools/variables.tf rename to ml-platform/modules/node-pools/variables.tf index 973d7a1fe..6a2f20e56 100644 --- a/ml-platform/02_gke/modules/node-pools/variables.tf +++ b/ml-platform/modules/node-pools/variables.tf @@ -16,19 +16,16 @@ variable "node_pool_name" { type = string description = "Name of the node pool" } - variable "project_id" { type = string description = "The GCP project where the resources will be created" default = "" } - variable "cluster_name" { type = string description = "GKE cluster name" default = "" } - variable "region" { type = string description = "The GCP zone where the reservation will be created" @@ -56,6 +53,7 @@ variable "resource_type" { default = "ondemand" } + variable "accelerator" { type = string description = "The GPU accelerator to use." @@ -67,7 +65,6 @@ variable "accelerator_count" { description = "The number of accelerators per machine." 
default = 2 } - variable "machine_reservation_count" { type = number description = "Number of machines reserved instances with GPUs" @@ -75,7 +72,7 @@ variable "machine_reservation_count" { } variable "autoscaling" { - type = map(any) + type = map default = { "total_min_node_count" : 0, "total_max_node_count" : 24, "location_policy" : "ANY" } } @@ -83,4 +80,4 @@ variable "reservation_name" { description = "reservation name to which the nodepool will be associated" type = string default = "" -} +} \ No newline at end of file diff --git a/ml-platform/02_gke/providers.tf b/ml-platform/modules/node-pools/versions.tf similarity index 71% rename from ml-platform/02_gke/providers.tf rename to ml-platform/modules/node-pools/versions.tf index fc374eab1..d4aada15b 100644 --- a/ml-platform/02_gke/providers.tf +++ b/ml-platform/modules/node-pools/versions.tf @@ -12,15 +12,28 @@ # See the License for the specific language governing permissions and # limitations under the License. +//terraform { +// required_providers { +// google-beta = { +// source = "hashicorp/google-beta" +// version = "4.72.1" +// } +// google = { +// source = "hashicorp/google" +// version = "4.72.1" +// } +// } +//} + terraform { required_providers { - google-beta = { - source = "hashicorp/google-beta" - version = "4.72.1" - } google = { source = "hashicorp/google" - version = "4.72.1" + version = "5.19.0" + } + google-beta = { + source = "hashicorp/google-beta" + version = "5.19.0" } } } diff --git a/ml-platform/01_gcp_project/modules/projects/outputs.tf b/ml-platform/modules/projects/outputs.tf similarity index 99% rename from ml-platform/01_gcp_project/modules/projects/outputs.tf rename to ml-platform/modules/projects/outputs.tf index 431fe53dd..e087e6c85 100644 --- a/ml-platform/01_gcp_project/modules/projects/outputs.tf +++ b/ml-platform/modules/projects/outputs.tf @@ -14,4 +14,4 @@ output "project_ids" { value = "${google_project.project_under_folder}" == {} ? 
"${google_project.project_under_org}" : "${google_project.project_under_folder}" -} +} \ No newline at end of file diff --git a/ml-platform/01_gcp_project/modules/projects/projects.tf b/ml-platform/modules/projects/projects.tf similarity index 99% rename from ml-platform/01_gcp_project/modules/projects/projects.tf rename to ml-platform/modules/projects/projects.tf index 76f7d1ef3..2b5c6b020 100644 --- a/ml-platform/01_gcp_project/modules/projects/projects.tf +++ b/ml-platform/modules/projects/projects.tf @@ -93,4 +93,4 @@ resource "google_project_service" "project_services-6" { disable_on_destroy = true disable_dependent_services = true depends_on = [google_project.project_under_folder, google_project.project_under_org] -} +} \ No newline at end of file diff --git a/ml-platform/01_gcp_project/modules/projects/variables.tf b/ml-platform/modules/projects/variables.tf similarity index 100% rename from ml-platform/01_gcp_project/modules/projects/variables.tf rename to ml-platform/modules/projects/variables.tf diff --git a/ml-platform/02_gke/modules/cluster/versions.tf b/ml-platform/modules/projects/versions.tf similarity index 80% rename from ml-platform/02_gke/modules/cluster/versions.tf rename to ml-platform/modules/projects/versions.tf index fc374eab1..e2e5241f2 100644 --- a/ml-platform/02_gke/modules/cluster/versions.tf +++ b/ml-platform/modules/projects/versions.tf @@ -12,15 +12,20 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+//terraform { +// required_providers { +// google = { +// source = "hashicorp/google" +// version = ">= 4.28.0" +// } +// } +//} + terraform { required_providers { - google-beta = { - source = "hashicorp/google-beta" - version = "4.72.1" - } google = { source = "hashicorp/google" - version = "4.72.1" + version = "5.19.0" } } } diff --git a/ml-platform/02_gke/modules/vm-reservations/outputs.tf b/ml-platform/modules/vm-reservations/outputs.tf similarity index 99% rename from ml-platform/02_gke/modules/vm-reservations/outputs.tf rename to ml-platform/modules/vm-reservations/outputs.tf index 11ffcc6d8..5a4562e1a 100644 --- a/ml-platform/02_gke/modules/vm-reservations/outputs.tf +++ b/ml-platform/modules/vm-reservations/outputs.tf @@ -14,4 +14,4 @@ output "reservation_name" { value = split("/", google_compute_reservation.machine_reservation.id)[5] -} +} \ No newline at end of file diff --git a/ml-platform/02_gke/modules/vm-reservations/reservations.tf b/ml-platform/modules/vm-reservations/reservations.tf similarity index 99% rename from ml-platform/02_gke/modules/vm-reservations/reservations.tf rename to ml-platform/modules/vm-reservations/reservations.tf index 03438d0f7..177b0d384 100644 --- a/ml-platform/02_gke/modules/vm-reservations/reservations.tf +++ b/ml-platform/modules/vm-reservations/reservations.tf @@ -27,4 +27,4 @@ resource "google_compute_reservation" "machine_reservation" { } } } -} +} \ No newline at end of file diff --git a/ml-platform/02_gke/modules/vm-reservations/variables.tf b/ml-platform/modules/vm-reservations/variables.tf similarity index 99% rename from ml-platform/02_gke/modules/vm-reservations/variables.tf rename to ml-platform/modules/vm-reservations/variables.tf index 7ca5e5af3..3a8e3482d 100644 --- a/ml-platform/02_gke/modules/vm-reservations/variables.tf +++ b/ml-platform/modules/vm-reservations/variables.tf @@ -17,19 +17,16 @@ variable "project_id" { description = "The GCP project where the resources will be created" default = "" } - 
variable "cluster_name" { type = string description = "GKE cluster name" default = "" } - variable "zone" { type = string description = "The GCP zone where the reservation will be created" default = "us-central1-a" } - variable "machine_type" { type = string description = "The machine type to use." @@ -47,7 +44,6 @@ variable "accelerator_count" { description = "The number of accelerators per machine." default = 2 } - variable "machine_reservation_count" { type = number description = "Number of machines reserved instances with GPUs" diff --git a/ml-platform/modules/vm-reservations/versions.tf b/ml-platform/modules/vm-reservations/versions.tf new file mode 100644 index 000000000..7f4362ad6 --- /dev/null +++ b/ml-platform/modules/vm-reservations/versions.tf @@ -0,0 +1,39 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +//terraform { +// required_providers { +// google-beta = { +// source = "hashicorp/google-beta" +// version = "4.72.1" +// } +// google = { +// source = "hashicorp/google" +// version = "4.72.1" +// } +// } +//} + +terraform { + required_providers { + google = { + source = "hashicorp/google" + version = "5.19.0" + } + google-beta = { + source = "hashicorp/google-beta" + version = "5.19.0" + } + } +} \ No newline at end of file diff --git a/ml-platform/01_gcp_project/outputs.tf b/ml-platform/outputs.tf similarity index 72% rename from ml-platform/01_gcp_project/outputs.tf rename to ml-platform/outputs.tf index 11352c942..f9f8ea6f3 100644 --- a/ml-platform/01_gcp_project/outputs.tf +++ b/ml-platform/outputs.tf @@ -12,6 +12,10 @@ # See the License for the specific language governing permissions and # limitations under the License. +//output "project_ids" { +// value = {for k, v in "${module.gcp-project.project_ids}" : k => v.project_id} +//} + output "project_ids" { - value = { for k, v in "${module.gcp-project.project_ids}" : k => v.project_id } -} + value = var.create_projects == 1 ? 
{ for k, v in "${module.gcp-project.project_ids}" : k => v.project_id } : "" +} \ No newline at end of file diff --git a/ml-platform/03_configsync/templates/acm-template/manifests/apps/.gitkeep b/ml-platform/templates/acm-template/manifests/apps/.gitkeep similarity index 100% rename from ml-platform/03_configsync/templates/acm-template/manifests/apps/.gitkeep rename to ml-platform/templates/acm-template/manifests/apps/.gitkeep diff --git a/ml-platform/03_configsync/templates/acm-template/manifests/clusters/.gitkeep b/ml-platform/templates/acm-template/manifests/clusters/.gitkeep similarity index 100% rename from ml-platform/03_configsync/templates/acm-template/manifests/clusters/.gitkeep rename to ml-platform/templates/acm-template/manifests/clusters/.gitkeep diff --git a/ml-platform/03_configsync/templates/acm-template/templates/_cluster_template/cluster.yaml b/ml-platform/templates/acm-template/templates/_cluster_template/cluster.yaml similarity index 100% rename from ml-platform/03_configsync/templates/acm-template/templates/_cluster_template/cluster.yaml rename to ml-platform/templates/acm-template/templates/_cluster_template/cluster.yaml diff --git a/ml-platform/03_configsync/templates/acm-template/templates/_cluster_template/config-selector.yaml b/ml-platform/templates/acm-template/templates/_cluster_template/config-selector.yaml similarity index 100% rename from ml-platform/03_configsync/templates/acm-template/templates/_cluster_template/config-selector.yaml rename to ml-platform/templates/acm-template/templates/_cluster_template/config-selector.yaml diff --git a/ml-platform/03_configsync/templates/acm-template/templates/_cluster_template/kuberay/kustomization.yaml b/ml-platform/templates/acm-template/templates/_cluster_template/kuberay/kustomization.yaml similarity index 100% rename from ml-platform/03_configsync/templates/acm-template/templates/_cluster_template/kuberay/kustomization.yaml rename to 
ml-platform/templates/acm-template/templates/_cluster_template/kuberay/kustomization.yaml diff --git a/ml-platform/03_configsync/templates/acm-template/templates/_cluster_template/kuberay/rayclusters.yaml b/ml-platform/templates/acm-template/templates/_cluster_template/kuberay/rayclusters.yaml similarity index 100% rename from ml-platform/03_configsync/templates/acm-template/templates/_cluster_template/kuberay/rayclusters.yaml rename to ml-platform/templates/acm-template/templates/_cluster_template/kuberay/rayclusters.yaml diff --git a/ml-platform/03_configsync/templates/acm-template/templates/_cluster_template/kuberay/rayjobs.yaml b/ml-platform/templates/acm-template/templates/_cluster_template/kuberay/rayjobs.yaml similarity index 100% rename from ml-platform/03_configsync/templates/acm-template/templates/_cluster_template/kuberay/rayjobs.yaml rename to ml-platform/templates/acm-template/templates/_cluster_template/kuberay/rayjobs.yaml diff --git a/ml-platform/03_configsync/templates/acm-template/templates/_cluster_template/kuberay/rayservices.yaml b/ml-platform/templates/acm-template/templates/_cluster_template/kuberay/rayservices.yaml similarity index 100% rename from ml-platform/03_configsync/templates/acm-template/templates/_cluster_template/kuberay/rayservices.yaml rename to ml-platform/templates/acm-template/templates/_cluster_template/kuberay/rayservices.yaml diff --git a/ml-platform/03_configsync/templates/acm-template/templates/_cluster_template/kuberay/rbac.yaml b/ml-platform/templates/acm-template/templates/_cluster_template/kuberay/rbac.yaml similarity index 100% rename from ml-platform/03_configsync/templates/acm-template/templates/_cluster_template/kuberay/rbac.yaml rename to ml-platform/templates/acm-template/templates/_cluster_template/kuberay/rbac.yaml diff --git a/ml-platform/03_configsync/templates/acm-template/templates/_cluster_template/kuberay/values.yaml b/ml-platform/templates/acm-template/templates/_cluster_template/kuberay/values.yaml 
similarity index 99% rename from ml-platform/03_configsync/templates/acm-template/templates/_cluster_template/kuberay/values.yaml rename to ml-platform/templates/acm-template/templates/_cluster_template/kuberay/values.yaml index 7226bf446..626a6cb2a 100644 --- a/ml-platform/03_configsync/templates/acm-template/templates/_cluster_template/kuberay/values.yaml +++ b/ml-platform/templates/acm-template/templates/_cluster_template/kuberay/values.yaml @@ -85,8 +85,7 @@ singleNamespaceInstall: true # The KubeRay operator will watch the custom resources in the namespaces listed in the "watchNamespace" parameter. watchNamespace: -# - ml-team -# - ds-team + # Environment variables env: diff --git a/ml-platform/03_configsync/templates/acm-template/templates/_cluster_template/kustomization.yaml b/ml-platform/templates/acm-template/templates/_cluster_template/kustomization.yaml similarity index 100% rename from ml-platform/03_configsync/templates/acm-template/templates/_cluster_template/kustomization.yaml rename to ml-platform/templates/acm-template/templates/_cluster_template/kustomization.yaml diff --git a/ml-platform/03_configsync/templates/acm-template/templates/_cluster_template/selector.yaml b/ml-platform/templates/acm-template/templates/_cluster_template/selector.yaml similarity index 100% rename from ml-platform/03_configsync/templates/acm-template/templates/_cluster_template/selector.yaml rename to ml-platform/templates/acm-template/templates/_cluster_template/selector.yaml diff --git a/ml-platform/03_configsync/templates/acm-template/templates/_cluster_template/team/kustomization.yaml b/ml-platform/templates/acm-template/templates/_cluster_template/team/kustomization.yaml similarity index 100% rename from ml-platform/03_configsync/templates/acm-template/templates/_cluster_template/team/kustomization.yaml rename to ml-platform/templates/acm-template/templates/_cluster_template/team/kustomization.yaml diff --git 
a/ml-platform/03_configsync/templates/acm-template/templates/_cluster_template/team/namespace.yaml b/ml-platform/templates/acm-template/templates/_cluster_template/team/namespace.yaml similarity index 97% rename from ml-platform/03_configsync/templates/acm-template/templates/_cluster_template/team/namespace.yaml rename to ml-platform/templates/acm-template/templates/_cluster_template/team/namespace.yaml index 832e04dc4..08474cb90 100644 --- a/ml-platform/03_configsync/templates/acm-template/templates/_cluster_template/team/namespace.yaml +++ b/ml-platform/templates/acm-template/templates/_cluster_template/team/namespace.yaml @@ -17,4 +17,4 @@ kind: Namespace metadata: name: NAMESPACE labels: - app: APP_NAME \ No newline at end of file + app: NAMESPACE \ No newline at end of file diff --git a/ml-platform/03_configsync/templates/acm-template/templates/_cluster_template/team/network-policy.yaml b/ml-platform/templates/acm-template/templates/_cluster_template/team/network-policy.yaml similarity index 100% rename from ml-platform/03_configsync/templates/acm-template/templates/_cluster_template/team/network-policy.yaml rename to ml-platform/templates/acm-template/templates/_cluster_template/team/network-policy.yaml diff --git a/ml-platform/03_configsync/templates/acm-template/templates/_cluster_template/team/rbac.yaml b/ml-platform/templates/acm-template/templates/_cluster_template/team/rbac.yaml similarity index 100% rename from ml-platform/03_configsync/templates/acm-template/templates/_cluster_template/team/rbac.yaml rename to ml-platform/templates/acm-template/templates/_cluster_template/team/rbac.yaml diff --git a/ml-platform/03_configsync/templates/acm-template/templates/_cluster_template/team/reposync.yaml b/ml-platform/templates/acm-template/templates/_cluster_template/team/reposync.yaml similarity index 99% rename from ml-platform/03_configsync/templates/acm-template/templates/_cluster_template/team/reposync.yaml rename to 
ml-platform/templates/acm-template/templates/_cluster_template/team/reposync.yaml index 191a5b7f0..73149d513 100644 --- a/ml-platform/03_configsync/templates/acm-template/templates/_cluster_template/team/reposync.yaml +++ b/ml-platform/templates/acm-template/templates/_cluster_template/team/reposync.yaml @@ -50,7 +50,7 @@ roleRef: name: cluster-admin apiGroup: rbac.authorization.k8s.io --- - +#END OF SINGLE ENV DECLARATION #ROOT_SOURCE/namespaces/NAMESPACE/repo-sync.yaml apiVersion: configsync.gke.io/v1beta1 kind: RepoSync diff --git a/ml-platform/03_configsync/templates/acm-template/templates/_namespace_template/app/fluentd_config.yaml b/ml-platform/templates/acm-template/templates/_namespace_template/app/fluentd_config.yaml similarity index 100% rename from ml-platform/03_configsync/templates/acm-template/templates/_namespace_template/app/fluentd_config.yaml rename to ml-platform/templates/acm-template/templates/_namespace_template/app/fluentd_config.yaml diff --git a/ml-platform/03_configsync/templates/acm-template/templates/_namespace_template/app/kustomization.yaml b/ml-platform/templates/acm-template/templates/_namespace_template/app/kustomization.yaml similarity index 100% rename from ml-platform/03_configsync/templates/acm-template/templates/_namespace_template/app/kustomization.yaml rename to ml-platform/templates/acm-template/templates/_namespace_template/app/kustomization.yaml diff --git a/ml-platform/03_configsync/templates/acm-template/templates/_namespace_template/app/serviceaccount.yaml b/ml-platform/templates/acm-template/templates/_namespace_template/app/serviceaccount.yaml similarity index 100% rename from ml-platform/03_configsync/templates/acm-template/templates/_namespace_template/app/serviceaccount.yaml rename to ml-platform/templates/acm-template/templates/_namespace_template/app/serviceaccount.yaml diff --git a/ml-platform/03_configsync/templates/acm-template/templates/_namespace_template/app/values.yaml 
b/ml-platform/templates/acm-template/templates/_namespace_template/app/values.yaml similarity index 100% rename from ml-platform/03_configsync/templates/acm-template/templates/_namespace_template/app/values.yaml rename to ml-platform/templates/acm-template/templates/_namespace_template/app/values.yaml diff --git a/ml-platform/02_gke/variables.tf b/ml-platform/variables.tf similarity index 56% rename from ml-platform/02_gke/variables.tf rename to ml-platform/variables.tf index 05765c043..157c49d6b 100644 --- a/ml-platform/02_gke/variables.tf +++ b/ml-platform/variables.tf @@ -12,12 +12,51 @@ # See the License for the specific language governing permissions and # limitations under the License. +variable "org_id" { + type = string + description = "The GCP orig id" + default = null +} + +variable "env" { + type = set(string) + description = "List of environments" + default = ["dev"] +} + +variable "default_env" { + type = string + description = "Lowest environments" + default = "dev" +} + +variable "folder_id" { + type = string + description = "Folder Id where the GCP projects will be created" + default = null +} + +variable "billing_account" { + type = string + description = "GCP billing account" + default = null +} + +variable "project_name" { + type = string + description = "GCP project name" + default = null +} + +variable "create_projects" { + type = number + description = "Flag to create GCP projects" + default = 0 +} + variable "project_id" { - type = map(any) - description = "The GCP project where the resources will be created. It is a map with environments a skeys and project_ids s values" - default = {} - #Below is an example of not null project_id variable - #default = { "dev" : "project_id1", "staging" : "project_id2", "prod" : "project_id3" } + type = map + description = "The GCP project where the resources will be created. 
It is a map with environments as keys and project_ids s values" } variable "network_name" { @@ -25,31 +64,26 @@ variable "network_name" { description = "VPC network where GKE cluster will be created" type = string } - variable "routing_mode" { default = "GLOBAL" description = "VPC routing mode." type = string } - variable "subnet_01_name" { default = "ml-vpc-subnet-01" description = "Name of the first subnet in the VPC network." type = string } - variable "subnet_01_ip" { default = "10.40.0.0/22" description = "CIDR of the first subnet." type = string } - variable "subnet_01_region" { default = "us-central1" description = "Region of the first subnet." type = string } - variable "subnet_01_description" { default = "subnet 01" description = "Description of the first subnet." @@ -60,37 +94,33 @@ variable "subnet_02_name" { description = "Name of the second subnet in the VPC network." type = string } - variable "subnet_02_ip" { default = "10.12.0.0/22" description = "CIDR of the second subnet." type = string } - variable "subnet_02_region" { default = "us-west2" description = "Region of the second subnet." type = string } - variable "subnet_02_description" { default = "subnet 02" description = "Description of the second subnet." type = string } - -variable "lookup_state_bucket" { - description = "GCS bucket to look up TF state from previous steps." - type = string - default = "YOUR_STATE_BUCKET" -} +// +//variable "lookup_state_bucket" { +// description = "GCS bucket to look up TF state from previous steps." +// type = string +// default = "YOUR_STATE_BUCKET" +//} variable "cluster_name" { description = "Name of the GKE cluster" default = "gke-ml" type = string } - variable "reserved_taints" { description = "Taints to be applied to the reserved node pool." 
type = list(object({ @@ -132,3 +162,56 @@ variable "spot_taints" { effect = "NO_SCHEDULE" }] } + +variable "configsync_repo_name" { + type = string + description = "Name of the GitHub repo that will be synced to the cluster with Config sync." + default = "config-sync-repo" +} + +variable "github_user" { + description = "GitHub user name." + type = string +} +variable "github_email" { + description = "GitHub user email." + type = string +} +variable "github_org" { + type = string + description = "GitHub org." +} +variable "github_token" { + type = string + description = "GitHub token. It is a token with write permissions as it will create a repo in the GitHub org." +} + +variable "secret_for_rootsync" { + type = number + description = "Create git-cred in config-management-system namespace." + default = 1 +} + +variable "create_namespace" { + type = number + description = "Setup a namespace to demo." + default = 1 +} + +variable "namespace" { + type = string + description = "Name of the namespace to demo." + default = "ml-team" +} + +variable "install_kuberay" { + type = number + description = "Flag to install kuberay operator." + default = 1 +} + +variable "install_ray_in_ns" { + type = number + description = "Flag to install ray cluster in the namespace created with the demo." 
+ default = 1 +} \ No newline at end of file diff --git a/ml-platform/03_configsync/providers.tf b/ml-platform/versions.tf similarity index 81% rename from ml-platform/03_configsync/providers.tf rename to ml-platform/versions.tf index 6ba18fc39..4f0c767da 100644 --- a/ml-platform/03_configsync/providers.tf +++ b/ml-platform/versions.tf @@ -14,23 +14,24 @@ terraform { required_providers { - google-beta = { - source = "hashicorp/google-beta" - version = "4.72.1" - } google = { source = "hashicorp/google" - version = "4.72.1" + version = "5.19.0" } - kubernetes = { - source = "hashicorp/kubernetes" - version = "2.21.1" + google-beta = { + source = "hashicorp/google-beta" + version = "5.19.0" } github = { - source = "hashicorp/github" - version = ">= 4.3.0" + source = "integrations/github" + version = "6.0.1" + } + null = { + source = "hashicorp/null" + version = "3.2.2" } } + } provider "github" { From 63f6272c51cbe88891bfb31270eab95a17108447 Mon Sep 17 00:00:00 2001 From: Aaron Rueth Date: Wed, 13 Mar 2024 11:44:04 -0700 Subject: [PATCH 07/39] MLP restructure (#347) * Updated folder structure * Added steps to for Terraform apply and destory * Enabled image streaming * Moved acm-templates to the correct folder * Modified git token command to remove new lines * Cleaned up main.tf and fixed issue with create_cluster_yamls.sh * Cleaned up create_git_cred.sh * Cleaned up install_kuberay_operator.sh * Cleaned up create_namespace.sh and fixed and issue with the templates * Cleaned up install_ray_cluster.sh * Removed unnecessary outputs * Bringing in changes from PR #332 --- ml-platform/README.md | 320 ++++++++++-------- ml-platform/docs/images/configsync.png | Bin 0 -> 36630 bytes ml-platform/mlenv.auto.tfvars | 9 - ml-platform/terraform/README.md | 112 ++++++ ml-platform/{ => terraform}/backend.tf | 0 ml-platform/{ => terraform}/main.tf | 238 +++++++++---- ml-platform/terraform/mlp.auto.tfvars | 9 + .../modules/cloud-nat/README.md | 0 .../{ => 
terraform}/modules/cloud-nat/main.tf | 0 .../modules/cloud-nat/outputs.tf | 0 .../modules/cloud-nat/variables.tf | 0 .../modules/cloud-nat/versions.tf | 0 .../{ => terraform}/modules/cluster/gke.tf | 9 +- .../modules/cluster/outputs.tf | 0 .../modules/cluster/variables.tf | 0 .../modules/cluster/versions.tf | 0 .../{ => terraform}/modules/network/README.md | 0 .../modules/network/outputs.tf | 0 .../modules/network/variables.tf | 0 .../modules/network/versions.tf | 0 .../{ => terraform}/modules/network/vpc.tf | 0 .../modules/node-pools/nodepools.tf | 5 +- .../modules/node-pools/variables.tf | 0 .../modules/node-pools/versions.tf | 0 .../modules/projects/outputs.tf | 0 .../modules/projects/projects.tf | 0 .../modules/projects/variables.tf | 0 .../modules/projects/versions.tf | 0 .../modules/vm-reservations/outputs.tf | 0 .../modules/vm-reservations/reservations.tf | 0 .../modules/vm-reservations/variables.tf | 0 .../modules/vm-reservations/versions.tf | 0 ml-platform/{ => terraform}/outputs.tf | 8 - .../scripts}/create_cluster_yamls.sh | 20 +- .../scripts}/create_git_cred.sh | 22 +- .../scripts}/create_namespace.sh | 19 +- .../scripts}/install_kuberay_operator.sh | 18 +- .../scripts}/install_ray_cluster.sh | 15 +- .../{ => terraform/scripts}/manage_ray_ns.sh | 11 +- .../acm-template/manifests/apps/.gitkeep | 0 .../acm-template/manifests/clusters/.gitkeep | 0 .../templates/_cluster_template/cluster.yaml | 0 .../_cluster_template/config-selector.yaml | 0 .../kuberay/kustomization.yaml | 0 .../kuberay/rayclusters.yaml | 0 .../_cluster_template/kuberay/rayjobs.yaml | 0 .../kuberay/rayservices.yaml | 0 .../_cluster_template/kuberay/rbac.yaml | 0 .../_cluster_template/kuberay/values.yaml | 0 .../_cluster_template/kustomization.yaml | 0 .../templates/_cluster_template/selector.yaml | 0 .../_cluster_template/team/kustomization.yaml | 0 .../_cluster_template/team/namespace.yaml | 0 .../team/network-policy.yaml | 0 .../_cluster_template/team/rbac.yaml | 0 
.../_cluster_template/team/reposync.yaml | 39 ++- .../app/fluentd_config.yaml | 0 .../app/kustomization.yaml | 0 .../app/serviceaccount.yaml | 0 .../_namespace_template/app/values.yaml | 0 ml-platform/{ => terraform}/variables.tf | 25 +- ml-platform/{ => terraform}/versions.tf | 0 62 files changed, 592 insertions(+), 287 deletions(-) create mode 100644 ml-platform/docs/images/configsync.png delete mode 100644 ml-platform/mlenv.auto.tfvars create mode 100644 ml-platform/terraform/README.md rename ml-platform/{ => terraform}/backend.tf (100%) rename ml-platform/{ => terraform}/main.tf (62%) create mode 100644 ml-platform/terraform/mlp.auto.tfvars rename ml-platform/{ => terraform}/modules/cloud-nat/README.md (100%) rename ml-platform/{ => terraform}/modules/cloud-nat/main.tf (100%) rename ml-platform/{ => terraform}/modules/cloud-nat/outputs.tf (100%) rename ml-platform/{ => terraform}/modules/cloud-nat/variables.tf (100%) rename ml-platform/{ => terraform}/modules/cloud-nat/versions.tf (100%) rename ml-platform/{ => terraform}/modules/cluster/gke.tf (96%) rename ml-platform/{ => terraform}/modules/cluster/outputs.tf (100%) rename ml-platform/{ => terraform}/modules/cluster/variables.tf (100%) rename ml-platform/{ => terraform}/modules/cluster/versions.tf (100%) rename ml-platform/{ => terraform}/modules/network/README.md (100%) rename ml-platform/{ => terraform}/modules/network/outputs.tf (100%) rename ml-platform/{ => terraform}/modules/network/variables.tf (100%) rename ml-platform/{ => terraform}/modules/network/versions.tf (100%) rename ml-platform/{ => terraform}/modules/network/vpc.tf (100%) rename ml-platform/{ => terraform}/modules/node-pools/nodepools.tf (97%) rename ml-platform/{ => terraform}/modules/node-pools/variables.tf (100%) rename ml-platform/{ => terraform}/modules/node-pools/versions.tf (100%) rename ml-platform/{ => terraform}/modules/projects/outputs.tf (100%) rename ml-platform/{ => terraform}/modules/projects/projects.tf (100%) rename 
ml-platform/{ => terraform}/modules/projects/variables.tf (100%) rename ml-platform/{ => terraform}/modules/projects/versions.tf (100%) rename ml-platform/{ => terraform}/modules/vm-reservations/outputs.tf (100%) rename ml-platform/{ => terraform}/modules/vm-reservations/reservations.tf (100%) rename ml-platform/{ => terraform}/modules/vm-reservations/variables.tf (100%) rename ml-platform/{ => terraform}/modules/vm-reservations/versions.tf (100%) rename ml-platform/{ => terraform}/outputs.tf (69%) rename ml-platform/{ => terraform/scripts}/create_cluster_yamls.sh (74%) rename ml-platform/{ => terraform/scripts}/create_git_cred.sh (61%) rename ml-platform/{ => terraform/scripts}/create_namespace.sh (78%) rename ml-platform/{ => terraform/scripts}/install_kuberay_operator.sh (82%) rename ml-platform/{ => terraform/scripts}/install_ray_cluster.sh (76%) rename ml-platform/{ => terraform/scripts}/manage_ray_ns.sh (86%) rename ml-platform/{ => terraform}/templates/acm-template/manifests/apps/.gitkeep (100%) rename ml-platform/{ => terraform}/templates/acm-template/manifests/clusters/.gitkeep (100%) rename ml-platform/{ => terraform}/templates/acm-template/templates/_cluster_template/cluster.yaml (100%) rename ml-platform/{ => terraform}/templates/acm-template/templates/_cluster_template/config-selector.yaml (100%) rename ml-platform/{ => terraform}/templates/acm-template/templates/_cluster_template/kuberay/kustomization.yaml (100%) rename ml-platform/{ => terraform}/templates/acm-template/templates/_cluster_template/kuberay/rayclusters.yaml (100%) rename ml-platform/{ => terraform}/templates/acm-template/templates/_cluster_template/kuberay/rayjobs.yaml (100%) rename ml-platform/{ => terraform}/templates/acm-template/templates/_cluster_template/kuberay/rayservices.yaml (100%) rename ml-platform/{ => terraform}/templates/acm-template/templates/_cluster_template/kuberay/rbac.yaml (100%) rename ml-platform/{ => 
terraform}/templates/acm-template/templates/_cluster_template/kuberay/values.yaml (100%) rename ml-platform/{ => terraform}/templates/acm-template/templates/_cluster_template/kustomization.yaml (100%) rename ml-platform/{ => terraform}/templates/acm-template/templates/_cluster_template/selector.yaml (100%) rename ml-platform/{ => terraform}/templates/acm-template/templates/_cluster_template/team/kustomization.yaml (100%) rename ml-platform/{ => terraform}/templates/acm-template/templates/_cluster_template/team/namespace.yaml (100%) rename ml-platform/{ => terraform}/templates/acm-template/templates/_cluster_template/team/network-policy.yaml (100%) rename ml-platform/{ => terraform}/templates/acm-template/templates/_cluster_template/team/rbac.yaml (100%) rename ml-platform/{ => terraform}/templates/acm-template/templates/_cluster_template/team/reposync.yaml (81%) rename ml-platform/{ => terraform}/templates/acm-template/templates/_namespace_template/app/fluentd_config.yaml (100%) rename ml-platform/{ => terraform}/templates/acm-template/templates/_namespace_template/app/kustomization.yaml (100%) rename ml-platform/{ => terraform}/templates/acm-template/templates/_namespace_template/app/serviceaccount.yaml (100%) rename ml-platform/{ => terraform}/templates/acm-template/templates/_namespace_template/app/values.yaml (100%) rename ml-platform/{ => terraform}/variables.tf (96%) rename ml-platform/{ => terraform}/versions.tf (100%) diff --git a/ml-platform/README.md b/ml-platform/README.md index 90c3a165f..2edc62bae 100644 --- a/ml-platform/README.md +++ b/ml-platform/README.md @@ -1,114 +1,14 @@ - -## Requirements - -| Name | Version | -|------|---------| -| [github](#requirement\_github) | 6.0.1 | -| [google](#requirement\_google) | 5.19.0 | -| [google-beta](#requirement\_google-beta) | 5.19.0 | -| [null](#requirement\_null) | 3.2.2 | - -## Providers - -| Name | Version | -|------|---------| -| [github](#provider\_github) | 6.0.1 | -| [google](#provider\_google) | 
5.19.0 | -| [google-beta](#provider\_google-beta) | 5.19.0 | -| [null](#provider\_null) | 3.2.2 | - -## Modules - -| Name | Source | Version | -|------|--------|---------| -| [cloud-nat](#module\_cloud-nat) | ./modules/cloud-nat | n/a | -| [create-vpc](#module\_create-vpc) | ./modules/network | n/a | -| [gcp-project](#module\_gcp-project) | ./modules/projects | n/a | -| [gke](#module\_gke) | ./modules/cluster | n/a | -| [node\_pool-ondemand](#module\_node\_pool-ondemand) | ./modules/node-pools | n/a | -| [node\_pool-reserved](#module\_node\_pool-reserved) | ./modules/node-pools | n/a | -| [node\_pool-spot](#module\_node\_pool-spot) | ./modules/node-pools | n/a | -| [reservation](#module\_reservation) | ./modules/vm-reservations | n/a | - -## Resources - -| Name | Type | -|------|------| -| [github_branch.branch](https://registry.terraform.io/providers/integrations/github/6.0.1/docs/resources/branch) | resource | -| [github_branch_default.default_branch](https://registry.terraform.io/providers/integrations/github/6.0.1/docs/resources/branch_default) | resource | -| [github_branch_protection_v3.branch_protection](https://registry.terraform.io/providers/integrations/github/6.0.1/docs/resources/branch_protection_v3) | resource | -| [github_repository.acm_repo](https://registry.terraform.io/providers/integrations/github/6.0.1/docs/resources/repository) | resource | -| [google-beta_google_gke_hub_feature.configmanagement_acm_feature](https://registry.terraform.io/providers/hashicorp/google-beta/5.19.0/docs/resources/google_gke_hub_feature) | resource | -| [google-beta_google_gke_hub_feature_membership.feature_member](https://registry.terraform.io/providers/hashicorp/google-beta/5.19.0/docs/resources/google_gke_hub_feature_membership) | resource | -| [google-beta_google_gke_hub_membership.membership](https://registry.terraform.io/providers/hashicorp/google-beta/5.19.0/docs/resources/google_gke_hub_membership) | resource | -| 
[google_project_service.project_services-an](https://registry.terraform.io/providers/hashicorp/google/5.19.0/docs/resources/project_service) | resource | -| [google_project_service.project_services-anc](https://registry.terraform.io/providers/hashicorp/google/5.19.0/docs/resources/project_service) | resource | -| [google_project_service.project_services-com](https://registry.terraform.io/providers/hashicorp/google/5.19.0/docs/resources/project_service) | resource | -| [google_project_service.project_services-con](https://registry.terraform.io/providers/hashicorp/google/5.19.0/docs/resources/project_service) | resource | -| [google_project_service.project_services-cr](https://registry.terraform.io/providers/hashicorp/google/5.19.0/docs/resources/project_service) | resource | -| [google_project_service.project_services-gate](https://registry.terraform.io/providers/hashicorp/google/5.19.0/docs/resources/project_service) | resource | -| [google_project_service.project_services-gkecon](https://registry.terraform.io/providers/hashicorp/google/5.19.0/docs/resources/project_service) | resource | -| [google_project_service.project_services-gkeh](https://registry.terraform.io/providers/hashicorp/google/5.19.0/docs/resources/project_service) | resource | -| [google_project_service.project_services-iam](https://registry.terraform.io/providers/hashicorp/google/5.19.0/docs/resources/project_service) | resource | -| [null_resource.create_git_cred_cms](https://registry.terraform.io/providers/hashicorp/null/3.2.2/docs/resources/resource) | resource | -| [null_resource.create_git_cred_ns](https://registry.terraform.io/providers/hashicorp/null/3.2.2/docs/resources/resource) | resource | -| [null_resource.create_namespace](https://registry.terraform.io/providers/hashicorp/null/3.2.2/docs/resources/resource) | resource | -| [null_resource.install_kuberay_operator](https://registry.terraform.io/providers/hashicorp/null/3.2.2/docs/resources/resource) | resource | -| 
[null_resource.install_ray_cluster](https://registry.terraform.io/providers/hashicorp/null/3.2.2/docs/resources/resource) | resource | -| [null_resource.manage_ray_ns](https://registry.terraform.io/providers/hashicorp/null/3.2.2/docs/resources/resource) | resource | - -## Inputs - -| Name | Description | Type | Default | Required | -|------|-------------|------|---------|:--------:| -| [billing\_account](#input\_billing\_account) | GCP billing account | `string` | `null` | no | -| [cluster\_name](#input\_cluster\_name) | Name of the GKE cluster | `string` | `"gke-ml"` | no | -| [configsync\_repo\_name](#input\_configsync\_repo\_name) | Name of the GitHub repo that will be synced to the cluster with Config sync. | `string` | `"config-sync-repo"` | no | -| [create\_namespace](#input\_create\_namespace) | Setup a namespace to demo. | `number` | `1` | no | -| [create\_projects](#input\_create\_projects) | Flag to create GCP projects | `number` | `0` | no | -| [env](#input\_env) | List of environments | `set(string)` |
[
"dev"
]
| no | -| [folder\_id](#input\_folder\_id) | Folder Id where the GCP projects will be created | `string` | `null` | no | -| [github\_email](#input\_github\_email) | GitHub user email. | `string` | n/a | yes | -| [github\_org](#input\_github\_org) | GitHub org. | `string` | n/a | yes | -| [github\_token](#input\_github\_token) | GitHub token. It is a token with write permissions as it will create a repo in the GitHub org. | `string` | n/a | yes | -| [github\_user](#input\_github\_user) | GitHub user name. | `string` | n/a | yes | -| [install\_kuberay](#input\_install\_kuberay) | Flag to install kuberay operator. | `number` | `1` | no | -| [install\_ray\_in\_ns](#input\_install\_ray\_in\_ns) | Flag to install ray cluster in the namespace created with the demo. | `number` | `1` | no | -| [namespace](#input\_namespace) | Name of the namespace to demo. | `string` | `"ml-team"` | no | -| [network\_name](#input\_network\_name) | VPC network where GKE cluster will be created | `string` | `"ml-vpc"` | no | -| [ondemand\_taints](#input\_ondemand\_taints) | Taints to be applied to the on-demand node pool. |
list(object({
key = string
value = any
effect = string
}))
|
[
{
"effect": "NO_SCHEDULE",
"key": "ondemand",
"value": true
}
]
| no | -| [org\_id](#input\_org\_id) | The GCP orig id | `string` | `null` | no | -| [project\_id](#input\_project\_id) | The GCP project where the resources will be created. It is a map with environments as keys and project\_ids s values | `map` | n/a | yes | -| [project\_name](#input\_project\_name) | GCP project name | `string` | `null` | no | -| [reserved\_taints](#input\_reserved\_taints) | Taints to be applied to the reserved node pool. |
list(object({
key = string
value = any
effect = string
}))
|
[
{
"effect": "NO_SCHEDULE",
"key": "reserved",
"value": true
}
]
| no | -| [routing\_mode](#input\_routing\_mode) | VPC routing mode. | `string` | `"GLOBAL"` | no | -| [secret\_for\_rootsync](#input\_secret\_for\_rootsync) | Create git-cred in config-management-system namespace. | `number` | `1` | no | -| [spot\_taints](#input\_spot\_taints) | Taints to be applied to the spot node pool. |
list(object({
key = string
value = any
effect = string
}))
|
[
{
"effect": "NO_SCHEDULE",
"key": "spot",
"value": true
}
]
| no | -| [subnet\_01\_description](#input\_subnet\_01\_description) | Description of the first subnet. | `string` | `"subnet 01"` | no | -| [subnet\_01\_ip](#input\_subnet\_01\_ip) | CIDR of the first subnet. | `string` | `"10.40.0.0/22"` | no | -| [subnet\_01\_name](#input\_subnet\_01\_name) | Name of the first subnet in the VPC network. | `string` | `"ml-vpc-subnet-01"` | no | -| [subnet\_01\_region](#input\_subnet\_01\_region) | Region of the first subnet. | `string` | `"us-central1"` | no | -| [subnet\_02\_description](#input\_subnet\_02\_description) | Description of the second subnet. | `string` | `"subnet 02"` | no | -| [subnet\_02\_ip](#input\_subnet\_02\_ip) | CIDR of the second subnet. | `string` | `"10.12.0.0/22"` | no | -| [subnet\_02\_name](#input\_subnet\_02\_name) | Name of the second subnet in the VPC network. | `string` | `"gke-vpc-subnet-02"` | no | -| [subnet\_02\_region](#input\_subnet\_02\_region) | Region of the second subnet. | `string` | `"us-west2"` | no | - -## Outputs - -| Name | Description | -|------|-------------| -| [project\_ids](#output\_project\_ids) | n/a | - +# Machine learning platform (MLP) on GKE reference architecture for enabling Machine Learning Operations (MLOps) ## Platform Principles This reference architecture demonstrates how to build a GKE platform that facilitates Machine Learning. The reference architecture is based on the following principles: - - The platform admin will create the GKE platform using IaC tool like [Terraform][terraform]. The IaC will come with re-usuable modules that can be referred to create more resources as the demand grows. - - The platform will be based on [GitOps][gitops]. - - After the GKE platform has been created, cluster scoped resources on it will be created through [Config Sync][config-sync] by the admins. - - Platform admins will create a namespace per application and provide the application team member full access to it. 
- - The namespace scoped resources will be created by the Application/ML teams either via [Config Sync][config-sync] or through a deployment tool like [Cloud Deploy][cloud-deploy] +- The platform admin will create the GKE platform using an IaC tool like [Terraform][terraform]. The IaC will come with reusable modules that can be referenced to create more resources as the demand grows. +- The platform will be based on [GitOps][gitops]. +- After the GKE platform has been created, cluster scoped resources on it will be created through [Config Sync][config-sync] by the admins. +- Platform admins will create a namespace per application and provide the application team member full access to it. +- The namespace scoped resources will be created by the Application/ML teams either via [Config Sync][config-sync] or through a deployment tool like [Cloud Deploy][cloud-deploy] ## CUJ and Personae addressed in the reference architecture @@ -124,8 +24,6 @@ This reference architecture demonstrates how to build a GKE platform that facili **CUJ 4** : Enforcing security policies on the underlying platform. - - ### Persona : ML Engineers **CUJ 1** : Use ML tools like `ray` to perform their day to day tasks like data pre-processing, ML training etc. @@ -145,7 +43,7 @@ This reference architecture demonstrates how to build a GKE platform that facili ## Prerequistes 1. This tutorial has been tested on [Cloud Shell](https://shell.cloud.google.com) which comes preinstalled with [Google Cloud SDK](https://cloud.google.com/sdk) is required to complete this tutorial. -2. Familiarity with [Google Kubernetes Engine][gke], [Terraform][terraform], [root-sync][root-sync] , [repo-sync][repo-sync] , [Git][git], [GitHub][github] +2. 
Familiarity with [Google Kubernetes Engine][gke], [Terraform][terraform], [root-sync][root-sync] , [repo-sync][repo-sync] , [Git][git], [GitHub][github] # Workflow @@ -156,79 +54,209 @@ This reference architecture can be implemented in one of the following ways: - Deploy a multi env reference architecture with each env in its own [GCP project][gcp-project] ## Deploy a single env reference architecture + This is the quick-start deployment. It can be used to quickly set up an environment and start playing with it to get an understanding of the flow. Single env reference architecture can be deployed with the provided default values. ### Configuration -- You can either create a new GCP project or use an existing one. Skip this step if you choose to use an already existing project. - - To create a new project, open `cloudshell` and run the following command: - ``` - gcloud projects create - ``` - - Associate billing account to the project: - ``` - gcloud beta billing projects link \ - --billing-account - ``` + +- You can either create a new GCP project or use an existing one. Skip this step if you choose to use an already existing project. + - To create a new project, open `cloudshell` and run the following command: + ``` + gcloud projects create + ``` + - Associate billing account to the project: + ``` + gcloud beta billing projects link \ + --billing-account + ``` - Set up PROJECT_ID in environment variable in `cloudshell` : + ``` export PROJECT_ID="" >> ~/.bashrc ``` - - Replace with the id of the project that you created in the previous step or the id of an already existing project that you want to use. - + + Replace with the id of the project that you created in the previous step or the id of an already existing project that you want to use. 
+ + **If you are using an already existing project, get `roles/owner` role on the project** + - Update ~/bashrc to automatically point to the required project when a new instance of the `cloudshell` is created: + ``` echo gcloud config set project $PROJECT_ID >> ~/.bashrc && source ~/.bashrc ``` - Create a GCS bucket in the project for storing TF state. - - To create a new bucket, run the following command in `cloudshell` : - ``` - export STATE_BUCKET="${PROJECT_ID}-tf-state" >> ~/.bashrc && source ~/.bashrc - - gcloud storage buckets create gs://${STATE_BUCKET} - ``` + + - To create a new bucket, run the following command in `cloudshell` : + + ``` + export STATE_BUCKET="${PROJECT_ID}-tf-state" >> ~/.bashrc && source ~/.bashrc + + gcloud storage buckets create gs://${STATE_BUCKET} + ``` - Store github configurations in environment variables: ``` - export GITHUB_USER= >> ~/.bashrc - export GITHUB_ORG= >> ~/.bashrc - export GITHUB_EMAIL= >> ~/.bashrc + export GITHUB_USER= >> ~/.bashrc + export GITHUB_ORG= >> ~/.bashrc + export GITHUB_EMAIL= >> ~/.bashrc source ~/.bashrc ``` - - Create a [Personal Access Token][personal-access-token] in [GitHub][github]: - + Note: It is recommended to use a [machine user account][machine-user-account] for this but you can use a personal user account just to try this reference architecture. + - Go to https://github.com/settings/tokens and login using your credentials - Click "Generate new token" >> "Generate new token (classic)". - You will be directed to a screen to created the new token. Provide the note and expiration. - Choose the following two access: - - [x] repo - - [x] delete_repo + - [x] repo - Full control of private repositories + - [x] delete_repo - Delete repositories - Click "Generate token" - - Store the token safely. + - Store the token in a secure file. 
+ + ``` + # Create a secure directory + mkdir -p ${HOME}/secrets/ + chmod go-rwx ${HOME}/secrets + + # Create a secure file + touch ${HOME}/secrets/mlp-github-token + chmod go-rwx ${HOME}/secrets/mlp-github-token + + # Put the token in the secure file using your preferred editor + nano ${HOME}/secrets/mlp-github-token + ``` ### Run Terraform -- Clone the repo and change dir +- Clone the repository and change directory to the `ml-platform` directory + ``` git clone https://github.com/GoogleCloudPlatform/ai-on-gke - cd ml-platform - ``` + ``` + +- Set environment variables + + ``` + export MLP_BASE_DIR=$(pwd) && \ + echo "export MLP_BASE_DIR=${MLP_BASE_DIR}" >> ${HOME}/.bashrc + ``` + +- Set the configuration variables + + ``` + sed -i "s/YOUR_STATE_BUCKET/${STATE_BUCKET}/g" ${MLP_BASE_DIR}/terraform/backend.tf + sed -i "s/YOUR_GITHUB_EMAIL/${GITHUB_EMAIL}/g" ${MLP_BASE_DIR}/terraform/mlp.auto.tfvars + sed -i "s/YOUR_GITHUB_ORG/${GITHUB_ORG}/g" ${MLP_BASE_DIR}/terraform/mlp.auto.tfvars + sed -i "s/YOUR_GITHUB_USER/${GITHUB_USER}/g" ${MLP_BASE_DIR}/terraform/mlp.auto.tfvars + sed -i "s/YOUR_PROJECT_ID/${PROJECT_ID}/g" ${MLP_BASE_DIR}/terraform/mlp.auto.tfvars + ``` + +- Create the resources + + ``` + cd ${MLP_BASE_DIR}/terraform && \ + terraform init && \ + terraform plan -input=false -var github_token="$(tr --delete '\n' < ${HOME}/secrets/mlp-github-token)" -out=tfplan && \ + terraform apply -input=false tfplan && \ + rm tfplan + ``` + +### Review the resources + +#### GKE clusters and ConfigSync + +- Go to Google Cloud Console, click on the navigation menu and click on Kubernetes Engine > Clusters. You should see three clusters. + +- Go to Google Cloud Console, click on the navigation menu and click on Kubernetes Engine > Config. If you haven't enabled GKE Enterprise in the project earlier, click the `LEARN AND ENABLE` button and then `ENABLE GKE ENTERPRISE`. You should see a RootSync and RepoSync object. 
+ ![configsync](docs/images/configsync.png) + +#### Software installed via RootSync and RepoSync + +Open `cloudshell` to execute the following commands: + +- Store your GKE cluster name in env variable: + + `export GKE_CLUSTER=` + +- Get cluster credentials: + + ``` + gcloud container fleet memberships get-credentials ${GKE_CLUSTER} + ``` + +- Fetch kuberay operator CRDs + + ``` + kubectl get crd | grep ray + ``` + + The output will be similar to the following: -- Perform variable replacement. ``` - sed -i "s/YOUR_STATE_BUCKET/${STATE_BUCKET}/g" backend.tf - - sed -i "s/YOUR_PROJECT_ID/${PROJECT_ID}/g" terraform.tfvars + rayclusters.ray.io 2024-02-12T21:19:06Z + rayjobs.ray.io 2024-02-12T21:19:09Z + rayservices.ray.io 2024-02-12T21:19:12Z ``` -Typically, you would want to have dev, staging and production environments created in separate projects. To have such isolation, pass `env` input variable as `[ "dev", "staging", "prod" ]`. This will create one project for dev, staging and prod environments. You can update the input variable `env` based on how many environments/projects you want to create. +- Fetch kuberay operator pod -However, if you want to use a single project for multiple environments, you can create just one project by passing one element to `env` input variable list e.g [ "dev" ] or ["my-playground"] etc. 
+ ``` + kubectl get pods + ``` + + The output will be similar to the following: + + ``` + NAME READY STATUS RESTARTS AGE + kuberay-operator-56b8d98766-2nvht 1/1 Running 0 6m26s + ``` + +- Check the namespace `ml-team` created: + + ``` + kubectl get ns | grep ml-team + ``` +- Check the RepoSync object created in the `ml-team` namespace: + ``` + kubectl get reposync -n ml-team + ``` +- Check the `raycluster` in `ml-team` namespace + + ``` + kubectl get raycluster -n ml-team + ``` + + The output will be similar to the following: + + ``` + NAME DESIRED WORKERS AVAILABLE WORKERS STATUS AGE + ray-cluster-kuberay 1 1 ready 29m + ``` + +- Check the head and worker pods of kuberay in the `ml-team` namespace + ``` + kubectl get pods -n ml-team + ``` + The output will be similar to the following: + ``` + NAME READY STATUS RESTARTS AGE + ray-cluster-kuberay-head-sp6dg 2/2 Running 0 3m21s + ray-cluster-kuberay-worker-workergroup-rzpjw 2/2 Running 0 3m21s + ``` + +### Cleanup + +- Destroy the resources + + ``` + cd ${MLP_BASE_DIR}/terraform && \ + terraform init && \ + terraform destroy -auto-approve -var github_token="$(tr --delete '\n' < ${HOME}/secrets/mlp-github-token)" + ``` [gitops]: https://about.gitlab.com/topics/gitops/ [repo-sync]: https://cloud.google.com/anthos-config-management/docs/reference/rootsync-reposync-fields @@ -241,4 +269,4 @@ However, if you want to use a single project for multiple environments, you can [github]: https://github.com/ [gcp-project]: https://cloud.google.com/resource-manager/docs/creating-managing-projects [personal-access-token]: https://docs.github.com/en/authentication/keeping-your-account-and-data-secure/managing-your-personal-access-tokens -[machine-user-account]: https://docs.github.com/en/get-started/learning-about-github/types-of-github-accounts \ No newline at end of file +[machine-user-account]: https://docs.github.com/en/get-started/learning-about-github/types-of-github-accounts diff --git a/ml-platform/docs/images/configsync.png 
b/ml-platform/docs/images/configsync.png new file mode 100644 index 0000000000000000000000000000000000000000..75ed75ca4823226adb9767ffe476abb75bd87813 GIT binary patch literal 36630 zcmd3tWmFqoyZ5mc_u@|Q7Aq79?(VdO;;umh6nBT9#oZ|`1&UiqffBU1yB2qQ)B8E+ ztn=YM-`}++tXT;&vuE#XF8lu`Qe9OJ8-pAJ0RaJ9L0(!D0RbfmzTb)V68<+3N7Wwz z0i)DLN=jWpN{Uw9#nIBn-U0zZKJrTns(R8KG4%M_cP|tnUkSYfeFB}9@d5#3LPA@T zfHoNqIe+fU4VT z+nbBeB0De-D;xxE3^@kHKnp~9BFl(i-$k+njd838JaGgvb;K7xom=+kKd7rCh=T^6 zmj=8Lc=wLeRBRvbpPzqketrORAgJRFIk$ry*oK(LFJoR4SRn*x=Q%G`VvRExiR((l z5J#LXDX6Ax=PN)mw>h;0^LwZfjK`|cUg5}PAt1<5#qzxUj$VdiAv#1XnIj%?X5PV3 zUT!0t9G7PwvK^tGGi5d4TbgvjKO#`h6rM!8wP#oov&t$@QtagNGh&j7x0kwRzYNo1Te=^&ixgWZMx|qhA=p6*eM$nzY6Exp_^)cwVewS_y~<_WSce zrN~K?f{HKCLq-2I?06}8tEoI!8zJho@&mZ1KF$kSHTiYMzpF^6X03{aUSDL4peJod zzg;Z0iqs4$x_x*>V)KB*R7qCU6RJPe|{WnnnlK zqY$lsp^9(y*eg47A~Xskkbi0+aNY)E@!ST#U=T;f5GS%mGBD{ zQP8c~En}zp!ZiNHhafjvnpZ&}V`diQ`))*aY)H^X9)1=gNzZ#HLU4e?3EoAZ#0f!6 z05=zo1RiyPBv_tkBItrn>nDb(Bu6~SD})zP8tQM75zy(-8wutSh$Lmy$r@?C2Kt4n zN|EFrs566U_QS*So5!q316ga4DiJBV&yAUW1wF2*e@4$j;0$+N6aI{;kMXfrZuR92 zy$|7^u=~~26BZ=Aw*?{A082E&Q*opOYX)k3Kt%#l5lXrAe!OK7(f8mBx|(=3Qpzxh ziSaKkRZ=5m(y(Y_xb!GyTs_0e`4ba>50w;wQqLz^skruDT7y8T&2>emdjrXS|pRIgobAm^^ zcQ!mW;#NmD+D?LR;!xg7L>tSTFi8`Jgj|LY^qTYvN|LMznq;vkO)&!rkRydnahzYD z($=Ma&J>WRWFAbwQh*hyYZBN{l_zV9EHa>;5Pi%{QNpOjNS%_sYpdZHGm!u2O;+UkJ zWIWH1E!vyZX)JEk|Ac;oa>RG^<&DO)__S0CpzTwW{#B!W&0!g(0Zfxao#VYj!=lOe z0~e{Y&z{JhoSr#8IS+`pN@vL6kZPstJl8#nUDUmgyTKR&FQi^HNhV6>_iBW?hvLh4 z%251p`eFaW6!8k3gZ636F#6yh@+Qe| zIe26ne}D7+#yEffZQ&c#4a=<;Jbf^AAUw^u_ttI0E!s^FEO*g(LA+ZvRq@+9Pi@L- zpLzdy!twXBv9UFtN$q&vxbaW*AMv0xI~}{5pAoj3wvCHr6T8*iur!-2`*vtc9|~XH zI|W`={_h}+%=vT-b_&>hIzrv-WZCcyT@!;LG%Cw`sGU$fS8}w&x z0z@+F&%-y1&ex+M=PI!8e#yartI+0Emc=hc zQ!0}V_hu)Owp^Fz)wSKap^YHs^}P+<)ww^6$3)xjw=etkph7)u)875iX{*rD7e1D#xSM&RDGkbO}rL+ za~6#jSr*Kub#?f4;bvVC>BBd3Y`k~iJ}asOdlE5ilmFCN(nU0wA3)SLI-qz?$<5m>L0`jRyc_aDh&+ny$BQ4w9vlmv9sMQd zD;yIXgVmiA0TU%lUTw`sPnsz@(*}PG4UO^*>&n;rsF!o9I~=?Zu51opFJ9kT^^9I$ 
z-5)xZ@s_<23=oRn_)9~Z{+55xkY}TkDDs)o2@i#-dE3^ggvatPz1e#x-Xd=adPR``53(deoLco{FF3nn;_S zoq6?p%5(l0tkj@&UcU0nzXDk3Tk=r3K3BCPzjFpt$r=?A^O69s{r(XWjm$p$iP|u# z9ejOOFnsj!p+>)L-}h4Ub;aewyB#~LY9T{0-(b(e(~_l;FusK*t!8`2Y};l##}<%( zTX|o8;dfWRo$nko)zKcp$NC+^o!$-JCLUI5I-?!SY0I77?cP$?T7SAc<8O^0*5>7A zD(4-1$_-o#_ogQEMy8Bw)i`*~dgbDi!bW?r4mAj)cdU>9sk^4n>#>yb!-T}4u7ZdV zrdSYmsSx6c)kY_@UaE{fYoR<5s*NsaS)z6a8X`!3M8I>x3|O+*Wr^OOG(|uw$aiK& z(9=_z`L!tYu23yQh6*8TO~K1ty@mLpuVm-?c>k#sHR&tgYVMN}PQ#1SHz3Moc;+f# zp`&1_qJqE%-$z4047Wi*f$t&0KjiQa0s>MV(tm%0l9Y%1-}?xie}4q>zI%^=Ada9Q zEusAx@h}@TO90ur~UU$ z7lmmUD1I*^jw1c9TQ}OzbaRFa)_*^mK3+i_R-`Nq{MTdQUv{H`*N0o2Hx8Rc|IbU% zCIomejfmbKMxt7WqeTS#`$meW;>tsC_X-go@87nc47667O!QR$$8FLZqiia_{r~sy zG1ham{~0?z60X=oEro~|*v2lE!(h?tpz7mY6(_y=F-pY{V^*82PMOX4o*z%@NA=zB_hYhKZ z#E##$h{Jp43*~V6A1@|TC!XPBTrATlT+~!z-}<2M{A0r17eBCx_GCNLv9&bU4|q8t zbXXgFFk5G~&pkiagu()~%JrT0Js;tB*iN@{$K-6U)$`o?VnO?*Tt`*pQk&nnJHWC$ zy}^3PZN@E5`0OXcxZ)v=sL$sH*Zose(A=ol^I-#|XJmC)-*sf`!zUZYnQ|>(lEj2AT=xsJeQQvQ+8nyaA->)ox_~iEc{cEo(FV%L_l#_~@ z&V9d|HB_;)@8>X~R8rsTC67PFsfK4%ofER~`QoELnN$<|J}P?u%P3nhMk9l@B&|1Y zN+)GELcul_0k-FT+%c27;B!8*W9a|rcKvWZDnzL)^ryvaL@@c!_Z@1v$CjNOZ@$R} z-rlKn_{nn@tP5io>xVfq-K}M=WT+GrwB{o8>sg(ACM0UESsjyq$B8}d+K*X0+?@V_ z@17jU9CbZEK_DI?Xr9|O}ub@y}diASxUFc*(26>l9*}pI416ioE+uKuY6pqyUS5Q$N_(LLZEh^ zHF3zWDLMV}wAvF)^^!Az1incA?vg$DTa{|-69t7~lZowep<&To*1odlSUWl)$`|dG zE6%q+fzYcBJdDr?fGggdsGiS#Ss{c?Fi9XBr}YSPvaI*81w^mO$P0y{DZ2_`(g$*>zN;xh9j^|G}9iA*k@F#{aP*|1{Tmi-bD< zJ+baixDBIkoD$&c^|ERs6ugP2@g?9+BLRuQuEzqt8yZ*cH9)L74xVq+1%L&hR!VC7j39p&{~R8z`LsOh{LT38)0pVqZ< z3#axfd$hkw|ALAGE5sZjsG6i9a%1A|eRsQ=)VAQ*wM4NFzF809QtGu9yWJ!dJFB~x zP-?3-XuE@~y{>6rNAm@=$H*f?==xLCePtu$Ae4K4kCz|CK6R~yQQZ0J2bYz>ZN0DK za#{}(?ojuG*j2>)3OuR8Z=g@ysDfgO{&2P6_~)=;^s3NhI{o44{&=bD>5`NGxc$8K zsCi{M8y=1gyJnyMv>o5QlixM2{xJactVj~~t=B*UH0aZjgG5uq>N4UBqnne7dWAeiP2=0uA_U9dO;ttNQ&QDL2us%IWU*0hi+1D5JZdbfOc0;H=>1rN zwP1IXQI}t5Y*CH4uUvL__e3Bn^=^$D{1)#HCO)QF<#_rmfX3Z*EwqaK6xyJ0PY-+_ 
zSKYFXv2r!}QR3q`;P9nAS;cVuVSM8bVZ`&}vA=J6Pwv5quA}Dg;}ASg7~}Cf`ZnX! z{}x1XBrqC1m({qJj0~b3#r8$pX9(G08pJX8|3Hy@iIphZq4bHOx5GjNfR)&tyJt70 zGG&&GzhmHcyH#uzgvc~cV*ia!Dy-$*v`*iGmOir3BYdnUT7Bbk6UN^WAdm2eWXW?U zd*{Rq@3(@l-0dVyQKy}DB<>FD zUQ;LK1^zPG6!2>h#Ta1lN5dW;%a`(%o?@%0T)zPTF@oRIBgvqx=H*Sw%1B8C<|*wm zv?^|mcx2&M{+241E=Awifu2J(cB_86eo4ghF@hT06A7cSS0wId@@qC}wca;844n;X9$u zoguc9dl4D^N}jug5G=q-IGPa2fZ z9g*MTI~G7^b9;iic|#IAyZQ<^2fpdW5%5~c>CXX0dkuRH?8Rv2G^!+XyJ3tz|15!# zN?MTs2n@`{yj0{e_;Bjn<2RgpjpO}JdQdyhw{flShpHmzTZYQQhKH!BMtZ+CLr>}oouXQKakWZcf&W(;Q4a$29@N!ArIKNe~2s&EtS zsJXV&q1=I0wG?#=lO%J9st^@E5O)(@q+emNN{?Txwg(t(r?QcN=FQ>$@feAk+(R&l6%pV@}@8<>fk@wl>WrT9g1cL zrJKn^ah##A2lY8U6QTPTN*^c^FJshkjl);-1`P9_ECb@eHcGXbq;JfHRs>;(wnMo2 zkNILyT4X-qBfw?Hm%>Z*xv=U9CiT&i!_X_QY+?!1rq|@@{qOfapPs0*4#c7$=G-Rm z{vr*M#u-Z>Uoig}A{jWl%O}f1bjY`0zd@xyN>`fY66F|S{U*Y=cWl`>7~5wj7EJ0( zYk-9~{USa>+z=@SCtv7u+PD_Eg=}n!p2K%d0#Y{VVY-+yjXCx7vC}@*z_s32NCnzs zx9|jSY0X5ghOr7DUf9jT05hi)2KPpO&MPR@$TRFIm(G?57N-gz%rxu$$wmCBG)Zlv zr!JYN5W7g}c8!!-1*I>=D{O%7j_*wjFyulZ&aP01SCcfkD0V#J$(UMO%Q;ThF+BLx`$3=I9TEuO9g^PFE*~6FuE^5e@P9tngUD z59OYeE|cOC9u>LPcl@k*CGd7!Ms|YxY@BDF7g&^E2)T-dAV}U86oTiuLRwzlEtjg% zi1o%+*X{Co*?MiM#oT^u98sQR2aBp2MK(#LRxNMp_IFFp*j&T<J4DmCfL{J*-Nt z`p8xYfWWQftGta9$(1sta=_k{X7gH$bp79uARtx)akbAd&ZBYaM(pe?I^63n-V?d* z=aJ!MhHCf|^eEg98%weXa2H(iT+wIqW+@0qV=>Y3iR2s$_@|SP^29XIpfVnbLDmXJ ziT8dNjCmH|^e>n6DZhF#N{MgQ7`%&miFNRbUK*27C6Xr&^3GidL<@LGMhz}^!6!1RHF}VFNyCmxaz3?5G8A*v1h8!#y7O!QfXdf;)D4f zzW$gNm~!DX2$SQ)P5AOt3RU8zPd&{XJi%A;QoG~&%3uD&kI^D5Tiz(ZD?}7YRV4>| zpOx_N{k;6VbHSvuZK2hyzl}RB=+h^PQ6-7`9EoA64nNCpbh@KxZh{HMQ>cxgStn_+ zjVYawb)`Jr&%C3yMSS%)SuVZ6F4q3}8H5J)DYQz zHpLh~$T&bSNf_gCSRp`_>@Ji$@*K6J7fl8i-A-2Wv8XTZDFbHGctSabr&EL&izyzYX3@e8EUBueGO>{5? 
z4msMPj+5*Y&a1{S*a>lS=(32!y#n#*73c{ha=Gi;y?7irk=<2p(k{z%CTP5Wj7d$zv;I`d{FDa=_i%_27j*WmoSW?(Z<3YX-#V8+T81e6j<&wETT{o&G&@>O}cHwg8jt%^1{DL0RMtV~U=7irG zGKcm#Jo{eJKSKbc?;co{E)aT_dAN;UTv`?mN#m-#IL0%>4UWN~rs_}+@ZT#=y~FpC zJ@d>%Qy7RJB%sa`kl4cmI>yY>`>Z1vH%k9a57eE<6ih@2WhuDT3n3BKUb`iGwR-@d zJRl;C&K4>?I&2Dkqy;+oZW$#xR!8Ldunbwg_NUIjMV1 z_MWOE^~A^9{d<%1@V$NL-qeH#fldj=u8P)ov3- zG%U{#d*DKzm=irK4oJD+$4`Cq3Sw_o6fCqF;mN367Mjc@%b%Q$c|1siw}O|bU|ds? zyf=ZzzRyz7|E8xUjT(yzK*%@Ne>>3=6j2!| zov+?qwP#M~9(^*{E=MT4V9m=wubzy6jRIqk?H*U7e3c(^H^f5WWsmO^qWPAs{}Y0? zX~g@OH5Qq)oX#CUcNsH;6nNR;b$P}-9&g;Zu3V4z9*Bd<7mLU3hGj)dHDC0S*~@}r zkAZRy_(57{6%8sMLEg=)vx{u<>%l~O%=lD6goFtAJ0=BhNG**=l9}VxaK*b%hke%O zwg_PYv80#QEHdZ#Ihof&RL&)J#WDfm0h|WCl1fvaYbsfcf%M}9P^fzdmpro z#x-)}D2b%;>G{~KuZcv$kKn1CAl@g~xv~!M5>YEnr;E9UXRtc=VvgF4(uzi-F_}k} zADC9Wp$kkX-nepkburzag>@x#mo9hgj5antK)flWIYn%qfj>$Tj3it*z7uH$%MqF< z0Vf730o`SCt9C`E=N;$COA5K=gU(ZS$d<%wW78syO|v|wJ(&a{|R;~t$}g7HjZ3p-%$EXrD+jOT&q2e z_)CS(`3{CTcywac+9#8j;vazwdUY}lO2>lAiv1Y%$4Q3L_Ec&u$myC)%k)f)0_&xB zUz`8+Gud-Q3ZurcLbm(bV?rf=ZvQM1JQ8*S*E(~IBNip8Y%CI(bh1rub2-Q-Az3Z+X)S*~ zPXL~<=g)VJ_TUFf#5xcRkj6RzkX}k99r4Y>c9|uuP&zC#m;oP2M`CowTJ+LB-srWKe*0TZq+=<4hSUkfc+Ta7;=MF5(KqC*8PN1JUnN zWK#i|?MDHm@qePl&D3?#BZTIjfLGhr#EvN^n7SlsdA`bTv z;GkdU6+ei2##}t^V6vLvi!v&Yf_y3&`eyWHW|>*-sFVEKSOH<~^hXxWuK2h{Z-zKs zuVgTd{b>E)$VS;etB{OqF_28Vzu}egY5$|<(`&KhZ{nz*QnQ*VYyz@uzi zgo+8QfI*0>qI&oo>l3x@Eg>`Ir8rBi#B;-Ze5F;$9Gbw1jw({yf<1nNL<^G}(d7#_ z61Pv;-cGFTV)p@CAxe9ewalhWOrJiO|h+bD*NJ*#ixDH<^@W@(bkvQZM34 zbB3}8g2NE|5Q}@vemaMQoN=wA736*1jdyD7K7)AygzaB(;Br8vPBGW>y&H{#2+!_xEU@q`LvKH^Q=wVksj{9Vjm?-R z@t7P1YwU1blLcX(KG!t2CfV9eU?WbZm4BrN>Z2 zc{%7rDfc=*rq_UAN%@QuaDuhhL>wBVilx=t!UUrQ-jZTV=No@+z+PIX#4?zG&H|o! 
zy$+#ds?=p-k^QTFEjTz#6ZG|j8}TfpSr)ptC;Yh^G!V8D_aEpQa^R`YFYS}ry(sSw zbl2zK=6FP_7QWfNC=%I7CW5YNm$Z}46##}8ipS7N&j@P5p z!}xHPB*mCOua69%bqQsT%BiceP>O147re-5ztaD*avT(T=r}rX ztFbvt%Gco*KTe~kL%%sNbOP>7wQWAa)cE=iwng*xj4*zTovBu-+ytXVdb*y z&~~)Fpw{M~X_RmMnCKq2`C=`@*-IKqjEeFL?`5xzcZ@f@ps=t?!~tPlmCjL1yQr%Q zV+9*QujO1;LEI_spnan6U!5Xwk$iXRP~g` z*kopS=L2_CWL)(F&G=~djPdxmf}2)Vs$9Z@n>EI0hB)KobkLeoU`m^-N~p4w(I4%Dkyd^hhBq!Z2+W}WAxL@fLQHGr$8EVFDCbUTJ*QZy|bu< zNt}_!6;VURt(WIzOGiR%z7+j!`+HHO9PN5!z4B~Yht-6GbhiRTAd?J{_>Txf z%!6cEZhiy+Sg^mQ5p9Ki)D)+Ur}9K$kq;D93Q=y5n#fU`?GqT}h-@?LaImZJv(WdJ zktRT+U1Ng>@c? z7&dYtRt)WilSy)0v1SpZpic4iaSO~(|EaHiz(7oeMv(tCjme$DiZjL!wzIcfBJZ$WF0QB%u$XL9WVd1>18O9d(dJye>;4f>1*KC^ zNHE>_{jh18n;);peYTt7hq+hdgDrMMtoRGnqRgx zEWy0KiVJe-<|1ag_Xi#N4fA+^IFHB_T(l`th4)vDz7}a9#P|22jR@D44{Cr85W@m;dDo;sE@3p7I1mt?U1Tc_3nf zg94$5uRZ_$^wtbWE==z&6ea0u}p z7x|Y_{xfv#w1ATD?@4PDN&f9V;U}Soj}-8w}{#lO_|8frs}5T_4B1cbLD7KZVy zh}o`>J{fp_e_HZVS8#_|`o=Ln2O_d>^>W6@LrwJ7e9s-|w!di!S9zKb>xWyv@8q;Kc90wSJwJNz zyKKr|!>=aP$qARk9KO8&aC%!laCz`ZUN=rp#{Xlgd94H0g7Tg7b;GFW!CxWJ>6f|( z2b{2QSPLb|`7Dm@-*(iruvlx-w+Qd;f9(5Sxm5XHs_Un85 zcE@9>&BFuU8TLI2(t!OXKr*_u)I_!pijF()VIw@F!VVj;{O&v>yjl(3e`TU}_8$Q+ zE~jNFK-wu@2ane)9&nnX=^oB*_-1;;Wx985)sNQuqb+jOp8dY?^Z&zG7__;&VX`;I zA6@0<`d+0s*U8&m%~^TC6~<~kdgRE0GEaRs;N;KPOHWAyQtbkBQ0g2HEbm&AFP zy*Vq>G>R3a2nT(YvIVZYEGBEI?#Xp<@%`;b@|*Uf&D5F6HE$*=r94*u>3a6#_qwod zUXOa6GV!z)PO~^JM|1Razw+E_Q<6TQB6_>|=dUc~Nrwv$`F49czLCtebMbzsLIh3g za1Y4TX*Kv)q^joOc%@U6scT)^oG7q}W~khxP# z&7*Gfcn23+7ZsSZH*>wSe_%q#7mxz@xMy83?puO` z8_NTLQG$P5R^U1c??{zEgQbYkEhf=I<1M)?o1KMHpiXUS*cK#HJ0XV^f zva)smt3&7Q_Ui**JkJmSK?iq_aqq0)RCbo*x>7F2H5ngoNC)6iTUiO_!|7t_|v@P45jkR`?Hy(b`{)3>AD`t+)Clt zKE9wk4!-;ck&>p(6z`K`;?72fBCItxFSmvQo^g|aUm}A4F6giL?AMB!@CJ@(bkH2hN2$^#uKg9~0gPb9NK;d`_u)8HK4wbP zy`HPejxAU7b{)Y{W|4jwaCU$ye*nHLZGgkngjW`*|xZi2pRg3!qd0lSAWZ zj!BOS?Z@kA{2#_CK)WyA&Y36O3TrICuB;8Zhflm#hM|Rr)F)B}@~N#TzOpsrN+=?V6(0>fw}||yJ;iZtywkQ#LB(p)&v4JU4?#G35v(teRFYxK 
zmg|40N+~?cD-ds85piOY7wUyfN4~e@v2w{b2xoDQn}0kOF89F-mh-a#py=N1)>+|M zfkuN_i8Vi^W4OD$Opu2YBo|sIv#LTr!TkFI(U$bRw%KmetTG|4SbL@$j^BQNa05M) zV){nGL%sMnbZwEhkucJ=GWG-xgVvi%g{UkgW(?$X7*95cUgQR#e^nSKdWMZTz^TCk z-p3JG4bVJ$*cJeu3S``jTogS$-rsZiW$}{Up$)Y8ax&ui2yVpiJOBlF+m)@>`Kg`& zal#Wjv0-VqO9DiAz&l?P`%qN70l;b z0&DSkN?6@UaY1rEucOldVYg_iELrmYj|WB_f#KC--+6YBZ%xd(^{oQr$x9{^nq#5OXwJ?S4|0t#D7Ck}%{-$l z-{pX9Mb)-k7=M*N;w; zT5;;kbJt$(%HAw@@N|6@gmM_7UK{G9tVaPAM>kca6-rsbnr1|YFw8fSS!IpGCEg+T zUUcS`j_c(GP;Jl4;?@wa7ny_hrPf}RYyBCD8u?V;BtG3CRJxLm0NmK{6{iA;vB_2B zy8Z*r!YQ|(#8*GH<7I*R~E$t>3W)G8dR^yT=s0e^Q>37_k zlF5Wr%CaBu=;KM<2FQBKFzb!ArVvf~(6CqFpwqEXy1-n7%lDi0=*Npp1NXTfG*X{X zsXyn%iPF_1`k?G?@({SgnJQ1YY%&0Nx#1$rI#AEC>f1qp>7sPw`_wSM1u9Bq}>$Q%O(6JCjM{Ncw*r}9`F5AQET z6}aIliR|Vwk#NZ37aq2><_U_%S7u%cfm{V3A}hLSPR_e6<6FZR_L;EEDHTa4geT$7 zuqBSKXI9o@Os?dWx<9QzQG~q&$xlKz49wu9mtg`gdF(!|)+WCK{I#T0ogtuu0mB7! z`jI8^+X(HWj;FW=57iF*0J7AwRqCbi_CYd*Ch0Tkx)WGaO94w>H`*pxjX6 z>;UI`g_ao{a!=<=*Y4RMZ=C(~J7cvXU|{&0gQ>@eXa2J3VGVI*sc9TY=zrH@Na~;# zf}4Q6Ex%3*^pYkSAD7*&Y#?RJwkISP^5dBZ;$c=rEB}GeEC#VaNt69DZp^d2cSuMj z$v~j>G0j#S^%WgKB#aGGr9Q1=7CWIPt7U!DLM`b>#$JHPx7G?42ayMZW~_t}3tZqy zAx%wY%+Ja4UK?v!RqBqO&ui_qlul>_2Bxh<{%$)pQeX@mB}(*NJRGU4Eae(0C-8&> zuK}Ifq0%G~&@*BW1A>P2b<8HjkWkPp0rx2c_&8I1{yZ((>-4cQ4@(S02x zFP0-i|Av)wx{;o(ABCBA%T$(k3nFUFn|Y!f=0;#)zSVcGCLm@J>)I1?29Jv>4{8@N ztnvat?#* z?+fKPg<~d4`5_?nMnm+tNe?*fJis9CExY;dC_$xdKp}}i;lAv+R?miFWPl!fpU$3q zek=texDlSF=hI|iA7!PK{CAC$h@L!ZtI{cZnA46^JPk}NASpF5V~di>ob?0FDD%*R zNqY9Ac7}o0?pfLJ2eWBO6upqq_r8w{s_^3#(zHXbMt4C#Wq7|RA-!q~)85;Hppchc zR3v^Ad$eGZH80+uF&N=UJH;uV*cOY9(NWyzZJ22cADg=#M^edTBUFUm6z~hMIp2f? 
zEgP@mT_!Usi_mF?vc^=7_FeE~7KCx+lfV@@LJdyo-XeB92N=pEk>p@{JrTLnE)W<& zSV7IGFOXSoR~g#R=gMLsnAozz70dfl`VpSxQi~GcSrLqKYmqvy(h4N8Sj9MG!IX{a z;gbtyEv)rnS=e<|no5dANn(Prl`O0wY{kiB2*#oDmf+In_X+RK&*!MnPQ@zjk>D%r zitncVg6ST_7py>Qi@JG(0VXi5U=db}#0tj&&Z5v~^@ zuj}WF{HKQ?&KLMq%@wo39H+R5)p5(6!5IV`kw%dQMDK=}^d~W+SRpKMmFR*O`ll%s zqE=~p0Wp^K=T6$=-Q$C1RpirE#uD#Kvd1Fu$z%$Z`UhuA_HYE=+D}0zvMwR$955D}Zj})gP@f z&q#Bo_YxCbOi})R6JvWDOI^Bhi>@uz^~-rA!e^>n$+!PPx1_XK5OTgU-VdlE{4v$k z()79e`dvN1Tc(IxfW zw05lc+tJ2j=XaXRL;d8+x(Qh02kFQ`522D^Ovq~mZlw0P;Bmg`mt2Dt`~tLl59zDP z=IjO5Q)x##giw^pd7U-}Hd0x%1W-g(@ehjLvANw>{Ql}gVw4{QN^W>0__<%D|Iw@= zm|1I{`EDhcc(%xM&|_To2_-SxY|Bgwe-6?Us}&$q<5>;To`Nad7dhmMMu-jz@aqzy z(Lp)W{7MwoOKkxGA^o#i*RZmH*W4dnogZ>C#sP zIK_&<#1`skjRqzZd<_9`+Ew434C$`2hJ^LcwBZrB?ukrAC0`(6ebf7+a&L-G*`|*F z(+zfy7|>msZXCaVup13G8<`RfS#gi_B&LKoV-X*a{IdUy-riU;prZC{+*tu+u zOPsly%jwI-0F%JUxE~elQOt46cnbYsB5-tlQJ+|gjReS0%r3ej7_)=hsz4^;VT;rM zbdt!KW!#ye7hR4P`18~tz}n3gP_sS5XbgurC`uevQ=Su47nVc(+RjVKq772oSy4P8p^ z$NOtXK41?;3s<#g2VZpZ+q>RY;F_KjSB~?86D2lbjbJQ>R+0`+WV5!D0l5Y-r$sCw zr3#ZWs1Lt~{=0J_o4fu5^B#U={~deG#H#u3Ow9_zfhc2k&Cv)tmv4+JWwzkZySU{s zI^net!bKSpgH4?=iZ$V(Fn+uAJRX1?g^?{A${2EYlj8 z5R1R0X_0QQKNXu+oVvl2*G&|u;vEo&(K`TceJ{xuy} zXP2*_c^{>-)(#6qy9MN+5l)HlLc->eUhYkrs%|E7BuD@oK4Tuol$-oP90Vef$O)eTjZBgDcyoQe(e9fgoI5gu>k3BCNkRCd2a*ercujwee;8Ia zc2x%h90(B*^<%PGJC;H3F7`G+OS^(s6g#hayP2;9l!Ulgq=Y^&3?=X-N)VZk68z);%`KTyy z*X*{~ab-lEV=T6s<-LIJnn<2Q6yYXWlU&N|gRAsj>^6vsS)rAwA$@hYVM6CiqJawje5Fn>nkL&N*j=+|9ufGbXCSBe*TZD*V6d-U6t~uF)SwL_$Jf zE7F}J-JQ}Qf^-Th2_oTRwj4f4D-=ZnsM2@9N_x6T5n zfhHx99TcxE?o2f!YgqoELA+hb0ONofjcr%$@=4>$^cuG-w)FKUI91J*D$WuZE9r^x zFU=aJ`7bfpI)o<+vka_!%=7Wa`u``vAqW8!O*Z^L6U+bp`2$HN_C|t$`VZ{>PuJ$q z0U(IR4&DmF{K*Xb*YCsv;OZx3BN6}m%FuWIKmYIcADe*Tb%qb|!C(0+@Mag~KkU!| z{~PiDS$6*-waVRK6E`OQ|62+Ot1cHDgX#RIdJg0OgyOeY^(1>s*|U=k)u@Ck(fM(y0Y*PlmbZ-5WrLZ14kL zs%Ga7F-ATctmtQR_Uy&%&}<1Y8`foXWBC(cLaDJ??WoAppuee?YU~MBLi&yHQF=Qq zxOCRvp7bVkZo@$PRK5``-F{=$IG+-tKnhRQ_&KfZya*as8y%GG0pN0>#I;Ev 
zq4ScrWloH;t=oQu%8cIIX{T0Hj_TTjnqE{K=Io(!cr^SiPk?tj&DnH5WwrnPpFLy>c^d=$6l9#6HU)R;yoWzY>>MKj<-49 z)4tzcHI7jk7Pi{WNo}`&K48o{09K`w{{t!sB{meFYb1{ixH)S9&oxO`+yvVnI{XA$ zcE-}MCU;IdRe(>gP33c|0XFtZLw4y&%dRf+12mC3IVY=h*81 z#Odz(+>Z0T3?TLuoJJ(6T&Pe(lNGYWeq@Dm_dKbYVxqfqhhpGb&VWU$B^Sp``3gW> zo@8y7KE_XOfXcYDO4gtx^1RC60e<3n7FaTsu770#4DznNPjOJb1W5D^nEUI&MpO_^ zUF3u7TC&}Mjlk`Zh*@MH-}Bicz!W(GFuas!EBzDxKrSF&ST3e2r>2Sr$aQ-%#;{)G zn0*F1fUaHi(m9NfHX<~NWVp)k()}<`%n+aEx?cKP&dke;c_$rzn_k5I7!~d5zT(i- zuZ!pM=_A*VNRlNM-A_X%8So|Sr|9d?-W*noPMT_^0>Y@q+I5^;d|e*S)EDL<6=#6k z`ObREE060l8(`V4xdGs;#~RPiWJbM@hjWF18aoIe3q9&g1vcLX#kZl}Sok8Mx2rIp zA*9{O^s0 z0U$_*xz#X%1A;|5DdKf9s{1pPzR_3(d+IU*!gejx z5AuXu5ySOt3q#{W@Z21r+AI?cV{Oa|OZ)`<*kTCX_L#&a7(?bMlgw8(KsisHPIpB| zQ^cya&{C7V@VY(rhuX~Nxpxa!7hHGg7Mm%~ocq}(?E!m{xjR;%S~4O;sdZoiIM&d9 z@i(bI=uw%L+k#$}N^EP6=v&@^jcu)LXXz{XDl`zljRN-jH>EvhfDjVd&UT#t_x z?h{+IEXVwqfPbOvwjtb$A{}hfXo{2yu8sJt=mBA;?S~bVGr` z*K|DAEz{`^8uWZ9&%qYY2W*ORF45;QJp1LpoG~T8tWNtA+7Oz6&BMGW9sgO##Ya{s z7h@$lOvpGR-x*rh4G{uei)KUY{yxsWpNiD zf2q$r{fWF*pw|b_Uj|X_oma2phUL@KLQ$2?Fw=I`dktw%_0e*aDOaIPv{Un==hbOm zzou#FQSo*$hL?jN!FIwUuNjkh5ohDEK9LMkBR^@25GUN`L9#I^Mm4MhZJb~?Wk@o8 zv?srpB2IHfJiyvmuZ<-y*}o)!(Nsj;XXuzw+ot_l!zy}iM640v$2e`*Za`sqEYloJ zXjq*;slk`*FWwUOMT{4rmOse=@F{vQAqVBTykjP)XBx}H)qrCq)=ye;-lj?y> znM;b^{!WpCFGgoJV0KD(p)wE z^u${@+(JoB_FUZ3_5%S=1sSaQ6YQ9ZLaFF@4yW|P&O4r1Jc5Vp<9)Oo70PO;I?O%% z1#<@h4M!T5Q>U2DWL73$8kmh_!()eMyNttJdtWvX%Jdq==QaQ#k&++Gs2Wc2W8ZW* zSVEa%y@Y|!b^+cm&9^~d0jmSI1=rs8%i5HhZdfkst0|}J&NKf(?%kI<&)+=J3)4G= z7=MJ|<9RlHcyIvPO6hF!TGVx%z1iE(zQtoZT5DZ#T+uBNQTTZ=Xc-w9A^{5W)eJ4h z$aXEOK4L*=lmkTE-i6;kE(MeJU4RgTBUB2L5@U{GzL)r^bnwlNC-Dzk~xH?>?3=lK!!p)+YjUPD?HuHuTFf*Fc@2jmz{t z`k@h{!_y#kmNzXeOM3M}pRl-#UJ6{xKxwy@0tq!1ZlY&rMhh;RA(jKTAtlhcH1-Uo zlT?^*Fh4<3nZT+O!?y}**-sOF!E_#4k>bpLrEl#M@hUf50?L&{!94srCRy%{m_pLF zr+J^XO(bWO`7X-G5e*MVW13rsvuT?c)22!Au>%sp45e<{Tq?%rdh+bWMz4Nv9(-ZzFBNQW!PwH3>*m1MAQ={2Q@2gnwyt}M3 zo2tMn>b)cWLwA&=>0X#E0o`4_(OF~=&2Z>-6F{cD5cACNnNyfWeUX+ZB@W<`ZYxBO 
zVwrJUl^~Ez+ssk9DgVicSFp!|D?;t*;;*SmkrBUE7?biN4eLk84-;^Ix#(2lgmC+^ zlYYWvo&jc0J8~QRMVmFo8pyK~tL>AYKudH}7oJzF*v#jf5?jb?N6-?QAP|OtMe^d4 zoUJ`~_+o(<>1#b$E$;72d0&?~A5-*3Jmp4dIc+%v{98pBJUMxc-)eCy+9?@|sBioc z@6pS6jQ-OPdcniyrE|8uG2BTqwAc_}S5LVQH*iw4Z16>dE8%V6B}G4ZI=8Zo`+{>r^&k z#JPVkoFJM5aS_!rQAoAf9y_UYgd~WPpvOGr@FkXiz;yjtRK#F)ASbH(#jYrVkKZlW zq1BzRRJ_!D8qT`KT6%!g8K3P!W#ZlnA8%JUiaqHJp?Q|uzr`fg=P-IeLpwM}wad>m z8D$*&Oj)&=@3;-(d6%n>*3Hs!ZH@x#S(K%4U;8|%W};Dpp|R=Wy~wq!rg5!F2>C2g z{b~7mWQ)Gbx63L0CqxgrsNpaK>%4~7ppWdeGRkLWJ!UMo3ItgHVWJ8J((OKi5K9yS&0Ie9F4QMw7HeO$idS;>~DV^hj~p6mU`OOZ?Ofw%N*_Dqj1nJJu%PPdWj;2kZ2+( zs^c5ttY#$|7$}J1oln+cRCjuS6`u*)lI3rz%kq@Is zi+RFK2QEr5?~*1(VnhJB1`8dX$E=Y<2$zq!+f_GPpp3y|4JB*c+46@QF&ds(o!d^H zP-w;t=^@$4@O3m$1<7O@0*M1_f#RxUJfbalj&BKiK&5d1soq}C*TWB;FIVHx4#Oni!ey?#E{_eTKB7mPKTNy-5#EN-+7K{V1eXw#4Bbk z@vt-EZWz7ll}N0;yr@LNT0hGuk*@b?@J$H~dTc2FtMn1L74F*$?#q{8sajV;h~oD` zu}ipQtF@TEX=#JYXxMDSX*S>3S$o7O`7tZKj3Kh`s~aeJAspC&kry#e=FTkpjEq1W zlEpmC+G4~bqp?_q1jIau86B@%OmLn}py43XH)mR`(5DrOWjm zBNv0^@~^j^03!I|C1`A}X6JkA@{Go0Xo`8JHF?k!`LdPh>uOnS^gJY^kJ9B-Mx<}f zDU->$;w$$vvDaziq)nN#3V<~?u{6PuOm6)l%9JB{c`#q!>^{J?dDVWH5Zb0eX?--r zIJGjJ?QIvMpfKUHW(p9dy4{Xpvhb?* zr5tSmc~-Z4kbcw0S2h}?F1uH}CM*>`>>Jx@FI_l}|agvO^$ zC!KjkBEp*?w;|X}EjnQoIK~=LC-L@l7C`7sE%0CucVXdT@&4E3_n$5xS?eZ%pca#z zft~HnaOd%!_6`VJSypd!}th!d|%&Vs?+yB)=a;@+g?2v>&} z4xej>#io_%#UqcG_6_XGO;l}Siv=wwwva~WRTq6u;~%%q(AYC4lnQ=;WH=tklbyXoT>T-85> zebiU{3p#4CGwJi4DPbc<@hw+KS8gBmEnORl)30F>+kR z5EDgpXx$W+4>Q<9qhquMvUWWUQ@-DU8mf5p;n9Ul!o!x`!Ytw0&TGkfx62C!p+W)) zHxa&xJo&USYKLS+L}_xytsYDakV6pq3@1E0#NhR#z@!(#vw=~8=P27Bw4LT1JO$$t zU}aeJ8lmeT3#1{Zwg(|vy)6CLCSwT8;D=D1N30rb;g61%iMkMc1pCP7)>Jtup1A>C zT$U(AtPBQs!20{uN-vf~<@;M3EP@Tig5#!zb=`1?ceSg#f}B6)_bIs3+?a#%fySjX znkq6H(Uz8~;4e$Zb0B=DjwRq#^XJDIt@ck_T;^YmQcoT`$_Iz$Nz$!z(WT8ydLlhd z8HuHOK_dm(L>+`3?J++d-eRiz_7nllK|$Jgm$9XJrBAYhEri}GoSea?g+F&`I>~7z zbhz>sIytWSb&&<<{H73CB46>mp*OK0kWaOK7nw(%LZ_k~6AGvU3k=;9!5#9MLQ-KK z@ihy!9TYQ_AE`w*YUJO!l|iA{!WFByh~Y%bVqz6;RI}btyME)lEGo&%GU>R>{gtJo 
z8?k27SsZd3UleUGQ*VCwJ5JAhN3`C6n>MYl)_hxIc4P!9eXTzp$Y+FS$!hW`WBTGP zUYdWDJ|-pNx3Wsy8>!x7r>tGk=z&>WVIdZlWWS#fj@YR+a%x>Q`#H%c$>$-Fp7KS3fxYx!OaRoZ!vd zks0oVljz(suCmNl=y)3p!%pWQq0nu*`8hYvk=H?y?R#A8^L?TAxUm*>{oc#- z!R9=wPMVMt?!ypdjr3qJ`WAS^vl2jdn}{ky05s7lJqdLqUzlLWC%)s>dKilTVP;4t zfB>_G?((YOJ5+M+5Sb?^lae&?(P&_tFlL1HStuQ1+ofiT)A(Ojjt~i8eGq&3c0THF z9uTmf3urQUCZ*C1#b*EZVkHI~0&ekZ2Wil&{_+A4jVPj~fnb z5~!;AH57laH-Et0`A@y&$az7Cqg2j!`+t|ai5mc7HysY+cV0EC_Y$=w|H1f;Rq_ zSNa=J&|jXa<?>9k?_ag$hlgUQ7M@f2fdV%`i+Js{%VzVbAyDuE?8tZ6V*C6*YFMxp z!#Hjr8;pPxKu@mn4P>*jsif<8-Bplzfebal{&iU#OmA1>f4NtB%D56|qC^T}cN?;8 zzf|mxd{?K%SO;AweF3QsL_r<^yw$e%WtmL<;^z|v=ritr7W6OC@081bl_y0Zy$p>R zZ1qF6o78gBqF@-!EVrs&hIayg`vH=^%D3E8l#-ivzc(JFx}S891g3C1*p~IPCusn@ zdw$MtP-{kep8@1L*5Yy5X1jsaKEWdJ6)05oy!d)m_m+p@+_rwoI3l=@1q@GzLO9~G zb9W#`uZBs#!#R(^Ve_Py;SfvPc{vba?5Ilp+l?u<294SH(0A~>sAm(^L*OY?yb|2pvz@5F*N*ycMKm+fzbQ~yj3PP`3 z8=&w#kh7cNe2*3=05E(rflV6*KM@d|``&=7&H+~FFqC_T`mzJhYE3>rg}`K3%*6^% zJTi>cBoQ-C92houoj?HTH8V84l3;L!sS|(o04!yB^LFF8q4I#=hLS+4)U5#!io^kk zSAYF&@YRmM4(<>5zd9#A%c&*>jF;SI%i zqBL5Ie+H*6>2xhQyI$lxBX z$v9h_$YTr&G+74GRZ?r29xXK=^w56)jejTUhjK~Hp68Sxg4IpBFJ${a%pfnfN1$=5 zXnbd{J6ZlTgQWha83enoMQH9XW{_oI1{qYIUjt(MeGt-`6S@CPws2%JOk4gxB+;KS z@@rTEhwh+?T~+$~`z;XeI4Dm;GzKpu}wSCNb5zH~MBonql7|Wczg&)b(R&|jEMOS8!aWcijORDMnFrmKpCvc=DXtg56kx?OXS;MFT z!^q8v`(gcM-LI_KQ8^xqZT!qsL=tfZXED{X4+FZLp#;*MW36$d3S=7+Eke3xw8w!W zG?M&ROAi~7TREPlK1nsX@;vFLYS5&P+Rjb-Wnr6BjjP=5X+TIRzi{8#B-%Fv>IlR6v)IKrFFlj9OF;2mZdv3s?GzT(=F)ismHo2kkm5o?_d<3$&T_8A^;^1sl-bqHERBPk24j_kfPDdJs zB(on%ZL-`BaM{DfTZ$lfZ{gxU18%+SLwD6g6-^u8K?nxDJ*jbB0=w@gxC|Skn%`E5 z(Ihaw*SDGjh&oh&`$gfbkhouj?a5--yGtX(F@a>pC*8AZrh>dA{U`nh zU_Mmlb_68XiQz{(ECk{~bU9)OJ`7;I{gcc~@84E!bpb|V9zeP~*68d-8k=MT*5vWl zXoDg$h~Caq%obxS=^qe;X+k zjRrit@J|psU3ZXRnQ62I2awdT@9=In6%= z^C~cTKS1A8nqi0RV8Y9dVD)x*%X$di_>l!Jmg^}Gs__Y>c*~kq7HPHeYq)yw=%oF# zfQq3ACqPs5-qn8z!R@_%?Bb)pin7PMsSS0L*=XXEv#Meba)!2q;_a?5mi9b-$TD$% z{@v^6s~Vh;?U9@y9XwyLA*`A92+__|)+nE@lp`pi-Kg)$8T#or5g;a>D%T7KjD?q( 
z#PqUQJaZOR^L1pO2PhV0t#y|Wd4Nr*j!9nP1WYtr&}dkvE)xGM3=@1)nGg-HyYo8} zQPySFrG-{wv1>v2)WwCoLo?1cu^0UT6kM2c6|vf&({MtfR0?tlJ3*( z8G(#G>Pgu)=6pQFfp24ftM}?DwyWPK4F8;0*~8nd&4OpRKuUm#MvR}#d5B%on*SJ; z8)%lqyj(B#wu4}(;4y6UL}4OW@GPq{7Uk_9j2fzjJ<{5A7M`-<#mw=V>42d^1ZsOY7-iSsQFOW$*;%5AlR4yN<4x ze#`@u5EPTMk49K}viPC;-VSgC&BGFR-X-;b^sa_-7d#IO1e|{j7c_!M)f^1{)VKH@ zoP=G9sWpQ-GWX2TR@p>d{?>h_VtkD<@DyNV|&ReJ}@v0KZhzNUy2DUUhbj z57g2z4JJj^vdZp(@jNi1=%W*>r*Q|egEM(vwm)bmW{ikMbU^zQSFSK7i4v*Xx^hOD z$Wv~#I)YMVs3pLwy!g)&{e^V;PK2Z*&&KQ0Mai@x8*-*~|08wocfQsJ+k?A^k$fVt67({y>qGYbV8=bB8q_&5bS^Bv2nx#3z zn*?1UB*eCaIRn2y-`R7R2-4%TEs0pu#mc?&^lKSo z7e?yOI)^j?q5H=e`O%&V9IrQzROmBOP|y2D3|%WEQ!iRWWxq=A{U>En22A0jvxrl~ zHXlPoJpAbNKFYRQ?M>zPn{=-I?6bS$-QL3fy({rPK{t--BJZu?ijE|WR$rA^!IhyB zUYTG$;rVq>lvTm%&WV``Y*wSHTg0WQ=oqk~6eS@A5}Yy4t-eT#fssBPtoFp#$y`b1 zHA2qxW?DWb{&J~DoS}L>5XIxn(c`UPOcN0V-}AQto_c7!qv<0jrrR2x`=ganht<5Z zzg%zUz$W{am0!3zAv53;h34}YVX1BNm(WCgjx1@$EZ&zh-JzX&NQ!4P(NOy0xHFNH zlfe%aZ)4F}*#7ays3!;z#d*f1@Zzj34!C=n#i8eL2+v2WT25XW32*9IyY_pYn-PRu z>s*-cr~{Q1J{mdJ)POCy&HEWw$>k8pc^UBET^S1tUk1XgplWg^1iZ-lmY&S$O2b2n zTLto?83QF76zTO6(y4Me92phlZ%MKP%RVDE=cgOF>^5xuxF(QK$GrymCEOqMF%fJl z$mMS^;^K_8Fr9)cp#}rA2NUcq3sZoG0Fih0MpIH#80<}YoqCs3W9}OKO&9Aw``qq$ zO}785IuwmCtHRe_n*yA?+~R>|GSx6x420 zbbsZ4uYKSZyy8Q~Be|{NgP}OP651JIV0O7=oW9x{fH}+zRcE~^-M8e4AZG86uzjCG zhQ(J&qwjJ?j+R)YC+zcq&YgfDp*2ucJaYF`vK(VH$Ncd#1-i9%tf`HdI?-N`0-Q(O z8D)g8E1BhM<3c9DeokTORc{`|{ZZI|N z>`62Y^06Yna+uF1g@?E}^`vpVuU*tRB|2%eHTyA&{~)gemY@4V4Fk=S8Gafln65~a zARIi({GyPyB~d+^O;}5q^&K{Wse|y@T;Zb2OjseSxxjfc6hJLOWV~umOhC!r13SP3 zj!q>na?{PB++neVRbG~idx7+=^V8g~tN*MT)#cMElO^Z;xT1U3B}ip65mgzZIY>(a zn1+d7e(3)?E}|FhOHF{a@qSn@7l+Xy$Ls!AP5~9ZJVJ!=!`!527AUQ|6&E}Nk`t=$ z)FCO7++PaUL0U7-r#ht14vN-@#Z4T-n-4|Owq6YAe<2*TMdym)^?QH66@F@JhE-VE zl~U*W@uA*YSH!}0lkg!U)kj}6es0MCPBAQGN{ES%w*%NV-c*FX+*tyVjzMWF0V08N zsA=j2DtTY&4CbFdgh|(kXN4Q`)&1TILvM0CFl>Dljc<$5yTbwkb*45 zeI>|qe}Gs-8?;SkI(pd}r^AD^3SheEoFp5Aa2{J9p?eqv8AZXH^)5YKQ={EkT6F>W zIsRcE3{cs{oQ7!@dJG5jCgfU!F%EO1je#N=f#a$mmF5egpU7){e 
z8nc0}{XHPMr~>$sHag|D`Oa+npo-`*igr4Si_*32>FL?i`Y-Nl)b(B;)vB~NTP+$RC#m)Qhm1;iw$;# z!x^yH-}h%!hDx>QeJJ~=;&10QYgt>Q6Xjcn$<@BQwr$TiVqw+Q^s`BWZ7bTEFhyqq zapmnXmaC@a#d#{Bg62^sODC=6=?>ByQ?jZP5(1&(;9dwNV>7Xcs)|3yIujkLIGiGy zg#i(PVGkaL6eolAQ-uaj8tPH&2<;K7FQLBcG?Sz))Fj9fI`+e_eieCXrm;mkqrF_o zm2kCic^N@wf0Y%Q#fxkP+)ULBSYw;0=A6a_2#lG_zIF*!V~qPlZOPK2In(8wvvHeI zMv3_l`v5sE#YJa*qEHDq1Y*TG>)YkmE2`hZ&@fCst`*$edSAx{3Fe#!$sCm;_RQc* zn75jA&GO@5GYLHpOwCSRF7o93;$nyC;3e%-J~9(lXY$4>qYR)jr%-bQtra)79Ozf< zh>XQ;$Rz0IzEE=nyRH%%_ZG30;%SF&=88#NF&D>-an@EwL8FH2-I!>F;&1FH-5ida zHH2A)cqC(ZeNxBk+^V@pUEYtHAeUnr(VEVVa+GkKvv)kVw}H9UJ7q@3861nsz&4fSjH$RPVKeBae_$&rn_pp z3X3CvLW7Y1y%1)qDxnQ!}XeWJ02zCt`1bu7?^o!{BCr6pTY=%TvF~<+-~h z4O?7fWDWWY0FabN5%nW%*MM)3vtcxO5!`)W-xdsXvTpZ186y&sWy8j=>5Ldftv|oDQNio>c=y0bJ)TN2e6~OQD_921*Joz zgVP0*ml*a}KWXWj=F1&1--VK8M(nw99Y$kJfvDY<5F&vEd66$w4tYJ@nGvI=w0EWk zxiuVrz`@?vq(hduKgLx`sH@JUKD3wP#5{T4^=+)ac#GM}TEiqK=1rEt3kQ!J9-0Re ze!3A}6AGmqGcvBxmH!iY2g3nCqZ@kVfBc(7rZ1TBg_kRnLPxv?AzXL_fYw8@>{&@U9VSh7#hL1ZO)13aZ z`TUjgVF|*?RkBFBIo~A-hvL$5HPKUn#8{s)EFBSW3Rc2{x=C*`SjrPFTENnfWP`L zTIN50_pX74d}%#T`*%+?D*n*j-WF_&a%$UIWMd_R{vByH|>9 z+A7h%ee`p$bm{T0?=@UQ)4!k!|QGUOmyvH3(z~d;qIgm2DbN%ec0f|5U zO015jOBI-BmC5hQMW65<3093r(f$C5BDDL(U-oLD2~W_tCE&$;v*ilS1Ole(Bya%F z>f@BWPp{I8klMt83=BROShc!6>ESe8IRLhci4Yu?j^+4_gY0i=&da`3gs)o{f&81D z11O{bT|GH&O>f5WKe>Y=_;!$7bO>EqU|59R)?-v++lxcB6&ua`q0>B1d*B%mpF7{Rc#Hb) z19GG~69#2z6*ldcXxm0b=Tf6RMN_^XR7tQsu(YC)D!ufzTf+v$}l(a;gH zb&^Pdrs4u3U8^pX(3)2_D_Z9Z@3YtgAhG#0ldT;%cBjTLR;{C`qyzUC_Joov06FKt zaVi_{(>xrjx+#;Y0Zz|y;m*!XVHijed-V7kkO0*Jzph$~>P~trLBfv>5ZuQeE@8UX z0lz`bGZoEYhaIIt)otP^`pa^V`qa5WMl+}hrH#&d>Y?EK00rMhBS;fF_$K0yma`A! 
zbupx`$xXA@L`~IMx|1c`S5&1iCqzWzJ8*lT2S%ldOi|E3EoEUIbE$A69w+s6Lhn15o%Qmjmu=oFt4WJI3%d+r8Y5^LYCl?2%gE5*1 zGZ~oLFjdWiu)4KAlQU>68U~rz6>Gi@cDNcC z6ZB`BW3>g?6`^h9brjsU`U~}{sdqs?#+5aSq6IQ-ibH z&qE`tZYFOuy${j=E68*ZJLy48UqeGHI1mP11Etmi8rbIuk?v+J^XBK3#$F;yPZO9n zIA-DpXQ6#Lq`K8{b2=rJjOehbbDWGqcytegVD%!v?S!qDFm%_q>oysq} zmx4}j4GqP_JZYH$F_5|`d1}u+RocC#&Y(e8rIyA_ZBZ6K42CcPSKu=dgVIc9)$js8 z17xl!k=B!?6X83$M9d*rA`u+(( z${X>(=S~Ojlf|ohXo8Sal8yntWeV}P#PM)>!CNH4AzcpIa9|%K1$>Tu~%9b-Vou zNMFNekhD9YN@_i9R>8a9;j0VIF&aoV;c7QEqJ0E(=X(-HUSPvk8D3?X)&z3|j4bA8 znDCvTBJ*658EN;}{Ol>^DqaQ*Q|Qj&?j37RK0&w_2|6sT zP}2*}`H?B*OokHccM63K?hkVceVvs14~1hnp`!91~8P($l7cT zWhe`*msVbA4VLfrU7$}r_C7F3a~dvboSRMEV%MHv6&bnmjN-2ybatv%ad~$nQwIoC zF--C})uMVZD9w6o@w_vf5Nw@N5{_VZ-a-xtr@T?CAZ|$^=W-ZStn{5A05SsErTNof z>0n9d*_<1nOWvY&=z2tRtIX$+#I*m9(Ul)il<5mAFestT3wZKzB4=q4z&W26I@VFortQc z<5V~gsRAF#!BZ1f@zP0d$97PAM83kj^O2kZYbbt|G^X9#sGVpF&=kJPe^_mKtD?mQ zofc-=8QC?)R&oUBZp|>R%cIV>_cJk^{a6wgf$Vv6%LeP}<&qX4>52w6p4jtod+@>R z1NV&up*Z0JTW{}|XQc-B6D=$S@hsFoXFnIms)s}n<@l%!I8Oy0suECO8h9HB8;!4#(l8NY@+R(bbE^(#AW#xoKUI75ljv`Z8&&HIj5cV9KXgY-d< zuR+`nRL{N@@R%gLHQHQGeCe6`DE6p_QIhZpf~n4i4&ol8x7mf(bjvt)Xbp33_!t|u zeb{b8Bk)uuauCNUJ?sP-q2tQf&c4yrM03c(y#Y7gwDhgE-C#B=;cBkxuh*buKA_Nh za6okF+6^U3TB6*40ULq_c8X`y7e9f=T*GmQ^=~_XNWx35k))2kln<6?yusQ3jq)v> zlrvXMbE!sd_*Nu|hx@)p#ti@Dl;VEnMYkEEPtyX0E^k?;C2tMLwxra~Zx_8&^L_fVL~}T+30gg(M@_)l_I|}+w#opp zp~MNF*8Dr>8*g;}g+$&k&d+_12~w4QF`H}P4U!kTP{=+e!9EdD^`Ln6e#j|UGWX?f z-NTyCw;df~X1BtY)3Ybyr-;3G`U{hOyQzU`N!1)=vsA@+`$6K7>`jc_v=&tc5<~(8 z^lS@CwJGKDmA-8p1b&8u(_(1&Mhv-3q_2i9(lFMrGdxj+!3s?>=^b%uHyN6zI#| zLf2r|)k7+qcmjFjBvHzftQIHCUk55a52Q@PsbixbegzbZUBFAo0lRZ;Wga|WbSVf)l_1~lF65B3`;W5}0`oGvN2!gd>ots% z>e}nF^44LseE79Cb;fA@k%Rh1Hpbp*)4?$8;MZymq;gQwwS=#UAIA&)6syirxKQKq ztulT(x;A6>-2w>_cwhS+KgGZ9`LS{Z%~FjaX$%}EWJ;^0`_`GZcV#O%&SpbFkE3n` zQFYOC)`zCGZmzi&l!9r1M^bQP-)1?Gysv1GteYuxy?u-d8ZF53xE=)S(fzEqCYMjp zU1DNXIx#n7{C1)l$TH;#yV_BZctSEnUu-bHVQh?+1DxPyE=y7KI& 
zPrB!Rh(p&lbT0=!`ev7FL}Yl}hL=*LapWG8F4+ijmUr>7Df(R-aj_n@o5TiXon#q;m4)Ppi8u8HN-o&hL{fk8r^ z>{9ouNhzX-tNQf(KGvuxr41I-bAGV$yxi@-JR|pG3~GZelxuj%YN=|}0yNqY)^C>! z-|h(*%Ffn}Jo9SZDl-;&^E|MvN@(IWIt>9x(@I3XX}4uV=r@3*ZVkVqBj&Cq(g& zcK0FoM7fXfR~E=`OO0Nr#q4{z0`Z-^@>bgyC0Av>b}h^E0t*krMxg(cb*XwGqgVO3 zFD0ZCCTOj0MXUgB;*_v<>yFJGzErJL#v%3Xq5Gy6}-lNzEO&IjiU&m!3RH2 zb!w9k>NzB_%J}-xUp9txQW0_68*50FpH^>PQ?4J(HV8=*+%yo`H&2^HHCdK3P~l|; z8$U0@f?=oSKk9e9x%#50lSH5zvxfy5!wTG1bs;wl z3~I0Kby9YC&T*O3=(P$$HP$EjwiC(~1p;m?H<#Yk>>7f~;*TQxhmK#@QT23Ev>y;Rv?nJD{z z1~CzLdmD^U^A;yj84hHYx`E;+Bx^cX&NOq0-U(!P+rzZu0u$DMdog}yq6r~}6aMQ< zm(g1QZGnXcj1xY8dlm9|V@FLiAi@3ZL-0RPsL=ICtd*x({(k+5RKZ|Lvqd=AzkWF} zoF62B$%OOk9?pNi<$pbanLKevc7+w}gMYp8-yRGdwiT%@o;CUH1BUFyN2mD>>0`!nz;-FE-$ri~s-t literal 0 HcmV?d00001 diff --git a/ml-platform/mlenv.auto.tfvars b/ml-platform/mlenv.auto.tfvars deleted file mode 100644 index 3a7fe5c74..000000000 --- a/ml-platform/mlenv.auto.tfvars +++ /dev/null @@ -1,9 +0,0 @@ -project_id = {"dev":"YOUR_PROJECT_ID"} -default_env = "dev" -github_user = "YOUR_GITHUB_USER" -github_email = "YOUR_GITHUB_EMAIL" -github_org = "YOUR_GITHUB_ORG" -#github_token = "DO NOT ADD TOKEN HERE. 
PASS IT AS `export TF_VAR_github_token="YOUR TOKEN"` ON CLOUDSHELL" -#folder_id = "YOUR_FOLDER_ID" -#org_id = "YOUR_GCP_ORG_ID" -#billing_account = "YOUR_BILLING_ACCOUNT" \ No newline at end of file diff --git a/ml-platform/terraform/README.md b/ml-platform/terraform/README.md new file mode 100644 index 000000000..1c765d049 --- /dev/null +++ b/ml-platform/terraform/README.md @@ -0,0 +1,112 @@ +## Requirements + +| Name | Version | +|------|---------| +| [github](#requirement\_github) | 6.0.1 | +| [google](#requirement\_google) | 5.19.0 | +| [google-beta](#requirement\_google-beta) | 5.19.0 | +| [null](#requirement\_null) | 3.2.2 | + +## Providers + +| Name | Version | +|------|---------| +| [github](#provider\_github) | 6.0.1 | +| [google](#provider\_google) | 5.19.0 | +| [google-beta](#provider\_google-beta) | 5.19.0 | +| [null](#provider\_null) | 3.2.2 | + +## Modules + +| Name | Source | Version | +|------|--------|---------| +| [cloud-nat](#module\_cloud-nat) | ./modules/cloud-nat | n/a | +| [create-vpc](#module\_create-vpc) | ./modules/network | n/a | +| [gcp-project](#module\_gcp-project) | ./modules/projects | n/a | +| [gke](#module\_gke) | ./modules/cluster | n/a | +| [node\_pool-ondemand](#module\_node\_pool-ondemand) | ./modules/node-pools | n/a | +| [node\_pool-reserved](#module\_node\_pool-reserved) | ./modules/node-pools | n/a | +| [node\_pool-spot](#module\_node\_pool-spot) | ./modules/node-pools | n/a | +| [reservation](#module\_reservation) | ./modules/vm-reservations | n/a | + +## Resources + +| Name | Type | +|------|------| +| [github_branch.branch](https://registry.terraform.io/providers/integrations/github/6.0.1/docs/resources/branch) | resource | +| [github_branch_default.default_branch](https://registry.terraform.io/providers/integrations/github/6.0.1/docs/resources/branch_default) | resource | +| 
[github_branch_protection_v3.branch_protection](https://registry.terraform.io/providers/integrations/github/6.0.1/docs/resources/branch_protection_v3) | resource | +| [github_repository.acm_repo](https://registry.terraform.io/providers/integrations/github/6.0.1/docs/resources/repository) | resource | +| [google-beta_google_gke_hub_feature.configmanagement_acm_feature](https://registry.terraform.io/providers/hashicorp/google-beta/5.19.0/docs/resources/google_gke_hub_feature) | resource | +| [google-beta_google_gke_hub_feature_membership.feature_member](https://registry.terraform.io/providers/hashicorp/google-beta/5.19.0/docs/resources/google_gke_hub_feature_membership) | resource | +| [google-beta_google_gke_hub_membership.membership](https://registry.terraform.io/providers/hashicorp/google-beta/5.19.0/docs/resources/google_gke_hub_membership) | resource | +| [google_project_service.project_services-an](https://registry.terraform.io/providers/hashicorp/google/5.19.0/docs/resources/project_service) | resource | +| [google_project_service.project_services-anc](https://registry.terraform.io/providers/hashicorp/google/5.19.0/docs/resources/project_service) | resource | +| [google_project_service.project_services-com](https://registry.terraform.io/providers/hashicorp/google/5.19.0/docs/resources/project_service) | resource | +| [google_project_service.project_services-con](https://registry.terraform.io/providers/hashicorp/google/5.19.0/docs/resources/project_service) | resource | +| [google_project_service.project_services-cr](https://registry.terraform.io/providers/hashicorp/google/5.19.0/docs/resources/project_service) | resource | +| [google_project_service.project_services-gate](https://registry.terraform.io/providers/hashicorp/google/5.19.0/docs/resources/project_service) | resource | +| [google_project_service.project_services-gkecon](https://registry.terraform.io/providers/hashicorp/google/5.19.0/docs/resources/project_service) | resource | +| 
[google_project_service.project_services-gkeh](https://registry.terraform.io/providers/hashicorp/google/5.19.0/docs/resources/project_service) | resource | +| [google_project_service.project_services-iam](https://registry.terraform.io/providers/hashicorp/google/5.19.0/docs/resources/project_service) | resource | +| [null_resource.create_git_cred_cms](https://registry.terraform.io/providers/hashicorp/null/3.2.2/docs/resources/resource) | resource | +| [null_resource.create_git_cred_ns](https://registry.terraform.io/providers/hashicorp/null/3.2.2/docs/resources/resource) | resource | +| [null_resource.create_namespace](https://registry.terraform.io/providers/hashicorp/null/3.2.2/docs/resources/resource) | resource | +| [null_resource.install_kuberay_operator](https://registry.terraform.io/providers/hashicorp/null/3.2.2/docs/resources/resource) | resource | +| [null_resource.install_ray_cluster](https://registry.terraform.io/providers/hashicorp/null/3.2.2/docs/resources/resource) | resource | +| [null_resource.manage_ray_ns](https://registry.terraform.io/providers/hashicorp/null/3.2.2/docs/resources/resource) | resource | + +## Inputs + +| Name | Description | Type | Default | Required | +|------|-------------|------|---------|:--------:| +| [billing\_account](#input\_billing\_account) | GCP billing account | `string` | `null` | no | +| [cluster\_name](#input\_cluster\_name) | Name of the GKE cluster | `string` | `"gke-ml"` | no | +| [configsync\_repo\_name](#input\_configsync\_repo\_name) | Name of the GitHub repo that will be synced to the cluster with Config sync. | `string` | `"config-sync-repo"` | no | +| [create\_namespace](#input\_create\_namespace) | Setup a namespace to demo. | `number` | `1` | no | +| [create\_projects](#input\_create\_projects) | Flag to create GCP projects | `number` | `0` | no | +| [env](#input\_env) | List of environments | `set(string)` |
[
"dev"
]
| no | +| [folder\_id](#input\_folder\_id) | Folder Id where the GCP projects will be created | `string` | `null` | no | +| [github\_email](#input\_github\_email) | GitHub user email. | `string` | n/a | yes | +| [github\_org](#input\_github\_org) | GitHub org. | `string` | n/a | yes | +| [github\_token](#input\_github\_token) | GitHub token. It is a token with write permissions as it will create a repo in the GitHub org. | `string` | n/a | yes | +| [github\_user](#input\_github\_user) | GitHub user name. | `string` | n/a | yes | +| [install\_kuberay](#input\_install\_kuberay) | Flag to install kuberay operator. | `number` | `1` | no | +| [install\_ray\_in\_ns](#input\_install\_ray\_in\_ns) | Flag to install ray cluster in the namespace created with the demo. | `number` | `1` | no | +| [namespace](#input\_namespace) | Name of the namespace to demo. | `string` | `"ml-team"` | no | +| [network\_name](#input\_network\_name) | VPC network where GKE cluster will be created | `string` | `"ml-vpc"` | no | +| [ondemand\_taints](#input\_ondemand\_taints) | Taints to be applied to the on-demand node pool. |
list(object({
key = string
value = any
effect = string
}))
|
[
{
"effect": "NO_SCHEDULE",
"key": "ondemand",
"value": true
}
]
| no | +| [org\_id](#input\_org\_id) | The GCP org id | `string` | `null` | no | +| [project\_id](#input\_project\_id) | The GCP project where the resources will be created. It is a map with environments as keys and project\_ids as values | `map` | n/a | yes | +| [project\_name](#input\_project\_name) | GCP project name | `string` | `null` | no | +| [reserved\_taints](#input\_reserved\_taints) | Taints to be applied to the reserved node pool. |
list(object({
key = string
value = any
effect = string
}))
|
[
{
"effect": "NO_SCHEDULE",
"key": "reserved",
"value": true
}
]
| no | +| [routing\_mode](#input\_routing\_mode) | VPC routing mode. | `string` | `"GLOBAL"` | no | +| [secret\_for\_rootsync](#input\_secret\_for\_rootsync) | Create git-cred in config-management-system namespace. | `number` | `1` | no | +| [spot\_taints](#input\_spot\_taints) | Taints to be applied to the spot node pool. |
list(object({
key = string
value = any
effect = string
}))
|
[
{
"effect": "NO_SCHEDULE",
"key": "spot",
"value": true
}
]
| no | +| [subnet\_01\_description](#input\_subnet\_01\_description) | Description of the first subnet. | `string` | `"subnet 01"` | no | +| [subnet\_01\_ip](#input\_subnet\_01\_ip) | CIDR of the first subnet. | `string` | `"10.40.0.0/22"` | no | +| [subnet\_01\_name](#input\_subnet\_01\_name) | Name of the first subnet in the VPC network. | `string` | `"ml-vpc-subnet-01"` | no | +| [subnet\_01\_region](#input\_subnet\_01\_region) | Region of the first subnet. | `string` | `"us-central1"` | no | +| [subnet\_02\_description](#input\_subnet\_02\_description) | Description of the second subnet. | `string` | `"subnet 02"` | no | +| [subnet\_02\_ip](#input\_subnet\_02\_ip) | CIDR of the second subnet. | `string` | `"10.12.0.0/22"` | no | +| [subnet\_02\_name](#input\_subnet\_02\_name) | Name of the second subnet in the VPC network. | `string` | `"gke-vpc-subnet-02"` | no | +| [subnet\_02\_region](#input\_subnet\_02\_region) | Region of the second subnet. | `string` | `"us-west2"` | no | + +## Outputs + +| Name | Description | +|------|-------------| +| [project\_ids](#output\_project\_ids) | n/a | + +[gitops]: https://about.gitlab.com/topics/gitops/ +[repo-sync]: https://cloud.google.com/anthos-config-management/docs/reference/rootsync-reposync-fields +[root-sync]: https://cloud.google.com/anthos-config-management/docs/reference/rootsync-reposync-fields +[config-sync]: https://cloud.google.com/anthos-config-management/docs/config-sync-overview +[cloud-deploy]: https://cloud.google.com/deploy?hl=en +[terraform]: https://www.terraform.io/ +[gke]: https://cloud.google.com/kubernetes-engine?hl=en +[git]: https://git-scm.com/ +[github]: https://github.com/ +[gcp-project]: https://cloud.google.com/resource-manager/docs/creating-managing-projects +[personal-access-token]: https://docs.github.com/en/authentication/keeping-your-account-and-data-secure/managing-your-personal-access-tokens +[machine-user-account]: 
https://docs.github.com/en/get-started/learning-about-github/types-of-github-accounts \ No newline at end of file diff --git a/ml-platform/backend.tf b/ml-platform/terraform/backend.tf similarity index 100% rename from ml-platform/backend.tf rename to ml-platform/terraform/backend.tf diff --git a/ml-platform/main.tf b/ml-platform/terraform/main.tf similarity index 62% rename from ml-platform/main.tf rename to ml-platform/terraform/main.tf index 3f5b53b42..71dc012a3 100644 --- a/ml-platform/main.tf +++ b/ml-platform/terraform/main.tf @@ -12,6 +12,14 @@ # See the License for the specific language governing permissions and # limitations under the License. +locals { + parsed_project_id = var.create_projects == 0 ? var.project_id : { for k, v in "${module.gcp-project.project_ids}" : k => v.project_id } + parsed_gke_info = module.gke + parsed_gke_info_without_default_env = { for k, v in "${local.parsed_gke_info}" : k => v if k != var.default_env } + project_id_list = [for k, v in "${module.gke}" : v.gke_project_id] + gke_project_map = { for k, v in "${module.gke}" : v.cluster_name => v.gke_project_id } +} + #TODO: Add a validation that the value if default_env must be one of the values in env list module "gcp-project" { count = var.create_projects @@ -23,15 +31,13 @@ module "gcp-project" { project_name = var.project_name } - -locals { - #parsed_project_id = length(keys("${var.project_id}")) == 0 ? data.terraform_remote_state.gcp-projects[0].outputs.project_ids : var.project_id - #var.create_projects == 1 ? {for k, v in "${module.gcp-project.project_ids}" : k => v.project_id} : "" - parsed_project_id = var.create_projects == 0 ? 
var.project_id : { for k, v in "${module.gcp-project.project_ids}" : k => v.project_id } - parsed_gke_info = module.gke - parsed_gke_info_without_default_env = { for k, v in "${local.parsed_gke_info}" : k => v if k != var.default_env } - project_id_list = [for k, v in "${module.gke}" : v.gke_project_id] - gke_project_map = { for k, v in "${module.gke}" : v.cluster_name => v.gke_project_id } +resource "google_project_service" "containerfilesystem_googleapis_com" { + for_each = local.parsed_project_id + project = each.value + service = "containerfilesystem.googleapis.com" + disable_on_destroy = false + disable_dependent_services = false + depends_on = [module.gcp-project] } resource "google_project_service" "project_services-cr" { @@ -51,6 +57,7 @@ resource "google_project_service" "project_services-an" { disable_dependent_services = false depends_on = [module.gcp-project, google_project_service.project_services-cr] } + resource "google_project_service" "project_services-anc" { for_each = local.parsed_project_id project = each.value @@ -59,6 +66,7 @@ resource "google_project_service" "project_services-anc" { disable_dependent_services = false depends_on = [module.gcp-project, google_project_service.project_services-cr] } + resource "google_project_service" "project_services-con" { for_each = local.parsed_project_id project = each.value @@ -67,6 +75,7 @@ resource "google_project_service" "project_services-con" { disable_dependent_services = false depends_on = [module.gcp-project, google_project_service.project_services-cr] } + resource "google_project_service" "project_services-com" { for_each = local.parsed_project_id project = each.value @@ -75,6 +84,7 @@ resource "google_project_service" "project_services-com" { disable_dependent_services = false depends_on = [module.gcp-project, google_project_service.project_services-cr] } + resource "google_project_service" "project_services-gkecon" { for_each = local.parsed_project_id project = each.value @@ -83,6 +93,7 @@ 
resource "google_project_service" "project_services-gkecon" { disable_dependent_services = false depends_on = [module.gcp-project, google_project_service.project_services-cr] } + resource "google_project_service" "project_services-gkeh" { for_each = local.parsed_project_id project = each.value @@ -91,6 +102,7 @@ resource "google_project_service" "project_services-gkeh" { disable_dependent_services = false depends_on = [module.gcp-project, google_project_service.project_services-cr] } + resource "google_project_service" "project_services-iam" { for_each = local.parsed_project_id project = each.value @@ -121,17 +133,27 @@ module "create-vpc" { subnet_02_name = format("%s-%s", var.subnet_02_name, each.key) subnet_02_ip = var.subnet_02_ip subnet_02_region = var.subnet_02_region - #default_route_name = format("%s-%s","default-route",each.key) - depends_on = [module.gcp-project, google_project_service.project_services-com] + + depends_on = [ + module.gcp-project, + google_project_service.project_services-com + ] } resource "google_gke_hub_feature" "configmanagement_acm_feature" { - count = length(distinct(values(local.parsed_project_id))) - name = "configmanagement" - project = distinct(values(local.parsed_project_id))[count.index] - location = "global" - provider = google-beta - depends_on = [google_project_service.project_services-gkeh, google_project_service.project_services-anc, google_project_service.project_services-an, google_project_service.project_services-com, google_project_service.project_services-gkecon] + count = length(distinct(values(local.parsed_project_id))) + name = "configmanagement" + project = distinct(values(local.parsed_project_id))[count.index] + location = "global" + provider = google-beta + + depends_on = [ + google_project_service.project_services-gkeh, + google_project_service.project_services-anc, + google_project_service.project_services-an, + google_project_service.project_services-com, + google_project_service.project_services-gkecon + ] 
} module "gke" { @@ -147,6 +169,7 @@ module "gke" { depends_on = [google_gke_hub_feature.configmanagement_acm_feature, google_project_service.project_services-con, google_project_service.project_services-com] env = each.key } + module "reservation" { for_each = local.parsed_project_id source = "./modules/vm-reservations" @@ -155,13 +178,14 @@ module "reservation" { project_id = each.value depends_on = [module.gke] } + module "node_pool-reserved" { for_each = local.parsed_project_id source = "./modules/node-pools" node_pool_name = "reservation" project_id = each.value cluster_name = module.gke[each.key].cluster_name - region = "${var.subnet_01_region}" + region = var.subnet_01_region taints = var.reserved_taints resource_type = "reservation" reservation_name = module.reservation[each.key].reservation_name @@ -174,7 +198,7 @@ module "node_pool-ondemand" { node_pool_name = "ondemand" project_id = each.value cluster_name = module.gke[each.key].cluster_name - region = "${var.subnet_01_region}" + region = var.subnet_01_region taints = var.ondemand_taints resource_type = "ondemand" depends_on = [module.gke] @@ -186,7 +210,7 @@ module "node_pool-spot" { node_pool_name = "spot" project_id = each.value cluster_name = module.gke[each.key].cluster_name - region = "${var.subnet_01_region}" + region = var.subnet_01_region taints = var.spot_taints resource_type = "spot" depends_on = [module.gke] @@ -204,29 +228,6 @@ module "cloud-nat" { depends_on = [module.create-vpc, google_project_service.project_services-com] } - - -//data "terraform_remote_state" "gke-clusters" { -// backend = "gcs" -// config = { -// bucket = var.lookup_state_bucket -// prefix = "02_gke" -// } -//} -// -//locals { -// parsed_gke_info = module.gke -// project_id_list = [for k,v in "${module.gke}" : v.gke_project_id] -//} - -//resource "google_gke_hub_feature" "configmanagement_acm_feature" { -// count = length(distinct(local.project_id_list)) -// name = "configmanagement" -// project = 
distinct(local.project_id_list)[count.index] -// location = "global" -// provider = google-beta -//} - resource "google_gke_hub_membership" "membership" { provider = google-beta for_each = local.parsed_gke_info @@ -237,12 +238,18 @@ resource "google_gke_hub_membership" "membership" { resource_link = format("%s/%s", "//container.googleapis.com", each.value["cluster_id"]) } } + lifecycle { ignore_changes = [ labels ] } - depends_on = [google_gke_hub_feature.configmanagement_acm_feature, google_project_service.project_services-gkeh, google_project_service.project_services-gkecon] + + depends_on = [ + google_gke_hub_feature.configmanagement_acm_feature, + google_project_service.project_services-gkeh, + google_project_service.project_services-gkecon + ] } resource "github_repository" "acm_repo" { @@ -260,22 +267,20 @@ resource "github_repository" "acm_repo" { auto_init = true vulnerability_alerts = true } -//Create a branch for each env + resource "github_branch" "branch" { for_each = local.parsed_gke_info repository = split("/", github_repository.acm_repo.full_name)[1] branch = each.key depends_on = [github_repository.acm_repo] } -//Set default branch as the lowest env + resource "github_branch_default" "default_branch" { repository = split("/", github_repository.acm_repo.full_name)[1] - #branch = tostring(keys(local.parsed_gke_info)[0]) - branch = var.default_env - #rename = true + branch = var.default_env depends_on = [github_branch.branch] } -#Protect branches other than the default branch + resource "github_branch_protection_v3" "branch_protection" { for_each = length(keys(local.parsed_project_id)) > 1 ? 
local.parsed_gke_info_without_default_env : {} repository = split("/", github_repository.acm_repo.full_name)[1] @@ -285,7 +290,6 @@ resource "github_branch_protection_v3" "branch_protection" { require_code_owner_reviews = true } restrictions { - } depends_on = [github_branch.branch] @@ -299,7 +303,7 @@ resource "google_gke_hub_feature_membership" "feature_member" { feature = "configmanagement" membership = google_gke_hub_membership.membership[each.key].membership_id configmanagement { - version = "1.17.0" + version = var.config_management_version config_sync { source_format = "unstructured" git { @@ -316,75 +320,163 @@ resource "google_gke_hub_feature_membership" "feature_member" { } } + depends_on = [ + google_project_service.project_services-gkecon, + google_project_service.project_services-gkeh, + google_project_service.project_services-an, + google_project_service.project_services-anc + ] +} + +resource "null_resource" "create_cluster_yamls" { + for_each = local.parsed_gke_info + triggers = { + md5_script = filemd5("${path.module}/scripts/create_cluster_yamls.sh") + md5_files = md5(join("", [for f in fileset("${path.module}/templates/acm-template", "**") : md5("${path.module}/templates/acm-template/${f}")])) + } + provisioner "local-exec" { - command = "${path.module}/create_cluster_yamls.sh ${var.github_org} ${github_repository.acm_repo.full_name} ${var.github_user} ${var.github_email} ${each.value["env"]} ${each.value["cluster_name"]} ${index(keys(local.parsed_gke_info), each.key)}" + command = "${path.module}/scripts/create_cluster_yamls.sh ${var.github_org} ${github_repository.acm_repo.full_name} ${var.github_user} ${var.github_email} ${each.value["env"]} ${each.value["cluster_name"]} ${index(keys(local.parsed_gke_info), each.key)}" + environment = { + GIT_TOKEN = var.github_token + } } - depends_on = [google_project_service.project_services-gkecon, google_project_service.project_services-gkeh, google_project_service.project_services-an, 
google_project_service.project_services-anc] + depends_on = [google_gke_hub_feature_membership.feature_member] } resource "null_resource" "create_git_cred_cms" { for_each = var.secret_for_rootsync == 1 ? local.gke_project_map : {} triggers = { - timestamp = timestamp() + md5_script = filemd5("${path.module}/scripts/create_git_cred.sh") + md5_credentials = md5(join("", [var.github_user, var.github_token])) } + provisioner "local-exec" { - command = "${path.module}/create_git_cred.sh ${each.key} ${each.value} ${var.github_user} config-management-system ${index(keys(local.gke_project_map), each.key)}" + command = "${path.module}/scripts/create_git_cred.sh ${each.key} ${each.value} ${var.github_user} config-management-system ${index(keys(local.gke_project_map), each.key)}" + environment = { + GIT_TOKEN = var.github_token + } } - depends_on = [google_gke_hub_feature_membership.feature_member, module.gke, module.node_pool-reserved, module.node_pool-ondemand, module.node_pool-spot, module.cloud-nat] + + depends_on = [ + google_gke_hub_feature_membership.feature_member, + module.gke, + module.node_pool-reserved, + module.node_pool-ondemand, + module.node_pool-spot, + module.cloud-nat + ] } resource "null_resource" "install_kuberay_operator" { count = var.install_kuberay triggers = { - timestamp = timestamp() + md5_script = filemd5("${path.module}/scripts/install_kuberay_operator.sh") + md5_files = md5(join("", [for f in fileset("${path.module}/templates/acm-template/templates/_cluster_template/kuberay", "**") : md5("${path.module}/templates/acm-template/templates/_cluster_template/kuberay/${f}")])) } + provisioner "local-exec" { - command = "${path.module}/install_kuberay_operator.sh ${github_repository.acm_repo.full_name} ${var.github_email} ${var.github_org} ${var.github_user}" + command = "${path.module}/scripts/install_kuberay_operator.sh ${github_repository.acm_repo.full_name} ${var.github_email} ${var.github_org} ${var.github_user}" + environment = { + GIT_TOKEN = 
var.github_token + } } - depends_on = [google_gke_hub_feature_membership.feature_member, null_resource.create_git_cred_cms] + + depends_on = [ + google_gke_hub_feature_membership.feature_member, + null_resource.create_git_cred_cms + ] +} + +resource "google_service_account" "namespace_default" { + account_id = "wi-${var.namespace}-default" + display_name = "${var.namespace} Default Workload Identity Service Account" + project = local.parsed_project_id[var.default_env] +} + +resource "google_service_account_iam_member" "wi_cymbal_bank_backend_workload_identity_user" { + member = "serviceAccount:${local.parsed_project_id[var.default_env]}.svc.id.goog[${var.namespace}/${var.namespace}-default]" + role = "roles/iam.workloadIdentityUser" + service_account_id = google_service_account.namespace_default.id } resource "null_resource" "create_namespace" { count = var.create_namespace triggers = { - timestamp = timestamp() + md5_script = filemd5("${path.module}/scripts/create_namespace.sh") + md5_files = md5(join("", [for f in fileset("${path.module}/templates/acm-template/templates/_cluster_template/team", "**") : md5("${path.module}/templates/acm-template/templates/_cluster_template/team/${f}")])) } + provisioner "local-exec" { - command = "${path.module}/create_namespace.sh ${github_repository.acm_repo.full_name} ${var.github_email} ${var.github_org} ${var.github_user} ${var.namespace}" + command = "${path.module}/scripts/create_namespace.sh ${github_repository.acm_repo.full_name} ${var.github_email} ${var.github_org} ${var.github_user} ${var.namespace} ${var.default_env}" + environment = { + GIT_TOKEN = var.github_token + } } - depends_on = [google_gke_hub_feature_membership.feature_member, null_resource.install_kuberay_operator] + + depends_on = [ + google_gke_hub_feature_membership.feature_member, + null_resource.install_kuberay_operator + ] } resource "null_resource" "create_git_cred_ns" { count = var.create_namespace triggers = { - timestamp = timestamp() + md5_script 
= filemd5("${path.module}/scripts/create_git_cred.sh") + md5_credentials = md5(join("", [var.github_user, var.github_token])) } + provisioner "local-exec" { - command = "${path.module}/create_git_cred.sh ${local.parsed_gke_info[var.default_env].cluster_name} ${local.parsed_gke_info[var.default_env].gke_project_id} ${var.github_user} ${var.namespace}" + command = "${path.module}/scripts/create_git_cred.sh ${local.parsed_gke_info[var.default_env].cluster_name} ${local.parsed_gke_info[var.default_env].gke_project_id} ${var.github_user} ${var.namespace}" + environment = { + GIT_TOKEN = var.github_token + } } - depends_on = [ google_gke_hub_feature_membership.feature_member, null_resource.create_namespace ] + + depends_on = [ + google_gke_hub_feature_membership.feature_member, + null_resource.create_namespace + ] } resource "null_resource" "install_ray_cluster" { count = var.install_ray_in_ns triggers = { - timestamp = timestamp() + md5_script = filemd5("${path.module}/scripts/install_ray_cluster.sh") + md5_files = md5(join("", [for f in fileset("${path.module}/templates/acm-template//templates/_namespace_template/app", "**") : md5("${path.module}/templates/acm-template//templates/_namespace_template/app/${f}")])) } + provisioner "local-exec" { - command = "${path.module}/install_ray_cluster.sh ${github_repository.acm_repo.full_name} ${var.github_email} ${var.github_org} ${var.github_user} ${var.namespace}" + command = "${path.module}/scripts/install_ray_cluster.sh ${github_repository.acm_repo.full_name} ${var.github_email} ${var.github_org} ${var.github_user} ${var.namespace} ${google_service_account.namespace_default.email}" + environment = { + GIT_TOKEN = var.github_token + } } - depends_on = [google_gke_hub_feature_membership.feature_member, null_resource.create_git_cred_ns] + + depends_on = [ + google_gke_hub_feature_membership.feature_member, + null_resource.create_git_cred_ns + ] } resource "null_resource" "manage_ray_ns" { count = var.install_ray_in_ns triggers 
= { - timestamp = timestamp() + md5_script = filemd5("${path.module}/scripts/manage_ray_ns.sh") } + provisioner "local-exec" { - command = "${path.module}/manage_ray_ns.sh ${github_repository.acm_repo.full_name} ${var.github_email} ${var.github_org} ${var.github_user} ${var.namespace}" + command = "${path.module}/scripts/manage_ray_ns.sh ${github_repository.acm_repo.full_name} ${var.github_email} ${var.github_org} ${var.github_user} ${var.namespace}" + environment = { + GIT_TOKEN = var.github_token + } } - depends_on = [google_gke_hub_feature_membership.feature_member, null_resource.create_git_cred_ns, null_resource.install_ray_cluster] -} \ No newline at end of file + + depends_on = [ + google_gke_hub_feature_membership.feature_member, + null_resource.create_git_cred_ns, + null_resource.install_ray_cluster + ] +} diff --git a/ml-platform/terraform/mlp.auto.tfvars b/ml-platform/terraform/mlp.auto.tfvars new file mode 100644 index 000000000..142b9691f --- /dev/null +++ b/ml-platform/terraform/mlp.auto.tfvars @@ -0,0 +1,9 @@ +project_id = { "dev" : "YOUR_PROJECT_ID" } +default_env = "dev" +github_user = "YOUR_GITHUB_USER" +github_email = "YOUR_GITHUB_EMAIL" +github_org = "YOUR_GITHUB_ORG" +#github_token = "DO NOT ADD TOKEN HERE. 
PASS IT AS `export TF_VAR_github_token="YOUR TOKEN"` ON CLOUDSHELL" +#folder_id = "YOUR_FOLDER_ID" +#org_id = "YOUR_GCP_ORG_ID" +#billing_account = "YOUR_BILLING_ACCOUNT" diff --git a/ml-platform/modules/cloud-nat/README.md b/ml-platform/terraform/modules/cloud-nat/README.md similarity index 100% rename from ml-platform/modules/cloud-nat/README.md rename to ml-platform/terraform/modules/cloud-nat/README.md diff --git a/ml-platform/modules/cloud-nat/main.tf b/ml-platform/terraform/modules/cloud-nat/main.tf similarity index 100% rename from ml-platform/modules/cloud-nat/main.tf rename to ml-platform/terraform/modules/cloud-nat/main.tf diff --git a/ml-platform/modules/cloud-nat/outputs.tf b/ml-platform/terraform/modules/cloud-nat/outputs.tf similarity index 100% rename from ml-platform/modules/cloud-nat/outputs.tf rename to ml-platform/terraform/modules/cloud-nat/outputs.tf diff --git a/ml-platform/modules/cloud-nat/variables.tf b/ml-platform/terraform/modules/cloud-nat/variables.tf similarity index 100% rename from ml-platform/modules/cloud-nat/variables.tf rename to ml-platform/terraform/modules/cloud-nat/variables.tf diff --git a/ml-platform/modules/cloud-nat/versions.tf b/ml-platform/terraform/modules/cloud-nat/versions.tf similarity index 100% rename from ml-platform/modules/cloud-nat/versions.tf rename to ml-platform/terraform/modules/cloud-nat/versions.tf diff --git a/ml-platform/modules/cluster/gke.tf b/ml-platform/terraform/modules/cluster/gke.tf similarity index 96% rename from ml-platform/modules/cluster/gke.tf rename to ml-platform/terraform/modules/cluster/gke.tf index b08e92b9b..ab1b44141 100644 --- a/ml-platform/modules/cluster/gke.tf +++ b/ml-platform/terraform/modules/cluster/gke.tf @@ -104,6 +104,13 @@ resource "google_container_cluster" "gke_batch" { ] } } + node_pool_defaults { + node_config_defaults { + gcfs_config { + enabled = true + } + } + } release_channel { channel = "STABLE" } @@ -121,5 +128,3 @@ resource "google_container_cluster" 
"gke_batch" { } ip_allocation_policy {} } - - diff --git a/ml-platform/modules/cluster/outputs.tf b/ml-platform/terraform/modules/cluster/outputs.tf similarity index 100% rename from ml-platform/modules/cluster/outputs.tf rename to ml-platform/terraform/modules/cluster/outputs.tf diff --git a/ml-platform/modules/cluster/variables.tf b/ml-platform/terraform/modules/cluster/variables.tf similarity index 100% rename from ml-platform/modules/cluster/variables.tf rename to ml-platform/terraform/modules/cluster/variables.tf diff --git a/ml-platform/modules/cluster/versions.tf b/ml-platform/terraform/modules/cluster/versions.tf similarity index 100% rename from ml-platform/modules/cluster/versions.tf rename to ml-platform/terraform/modules/cluster/versions.tf diff --git a/ml-platform/modules/network/README.md b/ml-platform/terraform/modules/network/README.md similarity index 100% rename from ml-platform/modules/network/README.md rename to ml-platform/terraform/modules/network/README.md diff --git a/ml-platform/modules/network/outputs.tf b/ml-platform/terraform/modules/network/outputs.tf similarity index 100% rename from ml-platform/modules/network/outputs.tf rename to ml-platform/terraform/modules/network/outputs.tf diff --git a/ml-platform/modules/network/variables.tf b/ml-platform/terraform/modules/network/variables.tf similarity index 100% rename from ml-platform/modules/network/variables.tf rename to ml-platform/terraform/modules/network/variables.tf diff --git a/ml-platform/modules/network/versions.tf b/ml-platform/terraform/modules/network/versions.tf similarity index 100% rename from ml-platform/modules/network/versions.tf rename to ml-platform/terraform/modules/network/versions.tf diff --git a/ml-platform/modules/network/vpc.tf b/ml-platform/terraform/modules/network/vpc.tf similarity index 100% rename from ml-platform/modules/network/vpc.tf rename to ml-platform/terraform/modules/network/vpc.tf diff --git a/ml-platform/modules/node-pools/nodepools.tf 
b/ml-platform/terraform/modules/node-pools/nodepools.tf similarity index 97% rename from ml-platform/modules/node-pools/nodepools.tf rename to ml-platform/terraform/modules/node-pools/nodepools.tf index 72b07a239..d8535b8ce 100644 --- a/ml-platform/modules/node-pools/nodepools.tf +++ b/ml-platform/terraform/modules/node-pools/nodepools.tf @@ -18,6 +18,9 @@ resource "google_container_node_pool" "node-pool" { cluster = var.cluster_name location = var.region node_config { + gcfs_config { + enabled = true + } machine_type = var.machine_type dynamic "taint" { for_each = var.taints @@ -68,4 +71,4 @@ resource "google_container_node_pool" "node-pool" { network_config { enable_private_nodes = true } -} \ No newline at end of file +} diff --git a/ml-platform/modules/node-pools/variables.tf b/ml-platform/terraform/modules/node-pools/variables.tf similarity index 100% rename from ml-platform/modules/node-pools/variables.tf rename to ml-platform/terraform/modules/node-pools/variables.tf diff --git a/ml-platform/modules/node-pools/versions.tf b/ml-platform/terraform/modules/node-pools/versions.tf similarity index 100% rename from ml-platform/modules/node-pools/versions.tf rename to ml-platform/terraform/modules/node-pools/versions.tf diff --git a/ml-platform/modules/projects/outputs.tf b/ml-platform/terraform/modules/projects/outputs.tf similarity index 100% rename from ml-platform/modules/projects/outputs.tf rename to ml-platform/terraform/modules/projects/outputs.tf diff --git a/ml-platform/modules/projects/projects.tf b/ml-platform/terraform/modules/projects/projects.tf similarity index 100% rename from ml-platform/modules/projects/projects.tf rename to ml-platform/terraform/modules/projects/projects.tf diff --git a/ml-platform/modules/projects/variables.tf b/ml-platform/terraform/modules/projects/variables.tf similarity index 100% rename from ml-platform/modules/projects/variables.tf rename to ml-platform/terraform/modules/projects/variables.tf diff --git 
a/ml-platform/modules/projects/versions.tf b/ml-platform/terraform/modules/projects/versions.tf similarity index 100% rename from ml-platform/modules/projects/versions.tf rename to ml-platform/terraform/modules/projects/versions.tf diff --git a/ml-platform/modules/vm-reservations/outputs.tf b/ml-platform/terraform/modules/vm-reservations/outputs.tf similarity index 100% rename from ml-platform/modules/vm-reservations/outputs.tf rename to ml-platform/terraform/modules/vm-reservations/outputs.tf diff --git a/ml-platform/modules/vm-reservations/reservations.tf b/ml-platform/terraform/modules/vm-reservations/reservations.tf similarity index 100% rename from ml-platform/modules/vm-reservations/reservations.tf rename to ml-platform/terraform/modules/vm-reservations/reservations.tf diff --git a/ml-platform/modules/vm-reservations/variables.tf b/ml-platform/terraform/modules/vm-reservations/variables.tf similarity index 100% rename from ml-platform/modules/vm-reservations/variables.tf rename to ml-platform/terraform/modules/vm-reservations/variables.tf diff --git a/ml-platform/modules/vm-reservations/versions.tf b/ml-platform/terraform/modules/vm-reservations/versions.tf similarity index 100% rename from ml-platform/modules/vm-reservations/versions.tf rename to ml-platform/terraform/modules/vm-reservations/versions.tf diff --git a/ml-platform/outputs.tf b/ml-platform/terraform/outputs.tf similarity index 69% rename from ml-platform/outputs.tf rename to ml-platform/terraform/outputs.tf index f9f8ea6f3..633bb7f1e 100644 --- a/ml-platform/outputs.tf +++ b/ml-platform/terraform/outputs.tf @@ -11,11 +11,3 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. - -//output "project_ids" { -// value = {for k, v in "${module.gcp-project.project_ids}" : k => v.project_id} -//} - -output "project_ids" { - value = var.create_projects == 1 ? 
{ for k, v in "${module.gcp-project.project_ids}" : k => v.project_id } : "" -} \ No newline at end of file diff --git a/ml-platform/create_cluster_yamls.sh b/ml-platform/terraform/scripts/create_cluster_yamls.sh similarity index 74% rename from ml-platform/create_cluster_yamls.sh rename to ml-platform/terraform/scripts/create_cluster_yamls.sh index 627e8fd7a..8b46a24b9 100755 --- a/ml-platform/create_cluster_yamls.sh +++ b/ml-platform/terraform/scripts/create_cluster_yamls.sh @@ -22,17 +22,20 @@ cluster_env=${5} cluster_name=${6} index=${7} sleep_time=20 -sleep_index=$((${index}+1)) -sleep_total=$((${sleep_time}*${sleep_index})) +sleep_index=$((${index} + 1)) +sleep_total=$((${sleep_time} * ${sleep_index})) sleep $sleep_total -random=$(echo $RANDOM | md5sum | head -c 20; echo) +random=$( + echo $RANDOM | md5sum | head -c 20 + echo +) log="$(pwd)/log" flag=0 download_acm_repo_name="/tmp/$(echo ${acm_repo_name} | awk -F "/" '{print $2}')-${random}" git config --global user.name ${github_user} git config --global user.email ${github_emai} -git clone https://${github_user}:${TF_VAR_github_token}@github.com/${acm_repo_name} ${download_acm_repo_name} +git clone https://${github_user}:${GIT_TOKEN}@github.com/${acm_repo_name} ${download_acm_repo_name} || exit 1 if [ ! -d "${download_acm_repo_name}/manifests" ] && [ ! -d "${download_acm_repo_name}/templates" ]; then echo "copying files" @@ -47,11 +50,9 @@ fi cp ../../templates/_cluster_template/cluster.yaml ./${cluster_name}-cluster.yaml cp ../../templates/_cluster_template/selector.yaml ./${cluster_env}-selector.yaml -find . -type f -name ${cluster_name}-cluster.yaml -exec sed -i "s/CLUSTER_NAME/${cluster_name}/g" {} + -find . -type f -name ${cluster_name}-cluster.yaml -exec sed -i "s/ENV/${cluster_env}/g" {} + -find . -type f -name ${cluster_env}-selector.yaml -exec sed -i "s/ENV/${cluster_env}/g" {} + - -#cp ../../templates/_cluster_template/kuberay . +find . 
-type f -name ${cluster_name}-cluster.yaml -exec sed -i "s/CLUSTER_NAME/${cluster_name}/g" {} + +find . -type f -name ${cluster_name}-cluster.yaml -exec sed -i "s/ENV/${cluster_env}/g" {} + +find . -type f -name ${cluster_env}-selector.yaml -exec sed -i "s/ENV/${cluster_env}/g" {} + git add ../../. git config --global user.name ${github_user} @@ -59,5 +60,4 @@ git config --global user.email ${github_email} git commit -m "Adding ${cluster_name} cluster to the ${cluster_env} environment." git push origin -cd - rm -rf ${download_acm_repo_name} diff --git a/ml-platform/create_git_cred.sh b/ml-platform/terraform/scripts/create_git_cred.sh similarity index 61% rename from ml-platform/create_git_cred.sh rename to ml-platform/terraform/scripts/create_git_cred.sh index da5a92104..d48907d45 100755 --- a/ml-platform/create_git_cred.sh +++ b/ml-platform/terraform/scripts/create_git_cred.sh @@ -13,26 +13,28 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
+ gke_cluster=${1} project_id=${2} git_user=${3} namespace=${4} index=${5} sleep_time=60 -sleep_index=$((${index}+1)) -sleep_total=$((${sleep_time}*${sleep_index})) +sleep_index=$((${index} + 1)) +sleep_total=$((${sleep_time} * ${sleep_index})) sleep $sleep_total -gcloud container fleet memberships get-credentials ${gke_cluster} --project ${project_id} -ns_exists=$(kubectl get ns ${namespace} -o name | awk -F '/' '{print $2}') -while [ "${ns_exists}" != "${namespace}" ] -do -sleep 10 +gcloud container fleet memberships get-credentials ${gke_cluster} --project ${project_id} + ns_exists=$(kubectl get ns ${namespace} -o name | awk -F '/' '{print $2}') +while [ "${ns_exists}" != "${namespace}" ]; do + sleep 10 + ns_exists=$(kubectl get ns ${namespace} -o name | awk -F '/' '{print $2}') done + secret_exists=$(kubectl get secret git-creds -n ${namespace} -o name) if [[ "${secret_exists}" == "secret/git-creds" ]]; then - exit 0 + kubectl create secret generic git-creds --namespace="${namespace}" --save-config --dry-run=client --from-literal=username="${git_user}" --from-literal=token="${GIT_TOKEN}" -o yaml | kubectl apply -f - else - kubectl create secret generic git-creds --namespace="${namespace}" --from-literal=username="${git_user}" --from-literal=token="${TF_VAR_github_token}" -fi \ No newline at end of file + kubectl create secret generic git-creds --namespace="${namespace}" --save-config --from-literal=username="${git_user}" --from-literal=token="${GIT_TOKEN}" +fi diff --git a/ml-platform/create_namespace.sh b/ml-platform/terraform/scripts/create_namespace.sh similarity index 78% rename from ml-platform/create_namespace.sh rename to ml-platform/terraform/scripts/create_namespace.sh index 7a4e28d06..938c62b4e 100755 --- a/ml-platform/create_namespace.sh +++ b/ml-platform/terraform/scripts/create_namespace.sh @@ -13,30 +13,38 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
# See the License for the specific language governing permissions and # limitations under the License. + configsync_repo_name=${1} github_email=${2} github_org=${3} github_user=${4} namespace=${5} +cluster_env=${6} logfile=$(pwd)/log -random=$(echo $RANDOM | md5sum | head -c 20; echo) +random=$( + echo $RANDOM | md5sum | head -c 20 + echo +) download_acm_repo_name="/tmp/$(echo ${configsync_repo_name} | awk -F "/" '{print $2}')-${random}" git config --global user.name ${github_user} git config --global user.email ${github_emai} -git clone https://${github_user}:${TF_VAR_github_token}@github.com/${configsync_repo_name} ${download_acm_repo_name} +git clone https://${github_user}:${GIT_TOKEN}@github.com/${configsync_repo_name} ${download_acm_repo_name} || exit 1 cd ${download_acm_repo_name}/manifests/clusters if [ -d "${namespace}" ]; then exit 0 fi + +#TODO: This most likely needs to be fixed for multiple environments chars_in_namespace=$(echo -n ${namespace} | wc -c) -#adding 4 for number of chars in "dev-" -chars_in_reposync_name=$(expr $chars_in_namespace + 4) +chars_in_cluster_env=$(echo -n ${cluster_env} | wc -c) +chars_in_reposync_name=$(expr $chars_in_namespace + ${chars_in_cluster_env} + 1) mkdir ${namespace} || exit 1 cp -r ../../templates/_cluster_template/team/* ${namespace} sed -i "s?NAMESPACE?$namespace?g" ${namespace}/* sed -ni '/#END OF SINGLE ENV DECLARATION/q;p' ${namespace}/reposync.yaml +sed -i "s?ENV?$cluster_env?g" ${namespace}/reposync.yaml sed -i "s?GIT_REPO?https://github.com/$configsync_repo_name?g" ${namespace}/reposync.yaml sed -i "s??$chars_in_reposync_name?g" ${namespace}/reposync.yaml @@ -53,5 +61,4 @@ git config --global user.email ${github_email} git commit -m "Adding manifests to create a new namespace." 
git push origin -cd - -rm -rf ${download_acm_repo_name} \ No newline at end of file +rm -rf ${download_acm_repo_name} diff --git a/ml-platform/install_kuberay_operator.sh b/ml-platform/terraform/scripts/install_kuberay_operator.sh similarity index 82% rename from ml-platform/install_kuberay_operator.sh rename to ml-platform/terraform/scripts/install_kuberay_operator.sh index 08bb41410..11f0397aa 100755 --- a/ml-platform/install_kuberay_operator.sh +++ b/ml-platform/terraform/scripts/install_kuberay_operator.sh @@ -13,29 +13,34 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. + configsync_repo_name=${1} github_email=${2} github_org=${3} github_user=${4} -random=$(echo $RANDOM | md5sum | head -c 20; echo) +random=$( + echo $RANDOM | md5sum | head -c 20 + echo +) download_acm_repo_name="/tmp/$(echo ${configsync_repo_name} | awk -F "/" '{print $2}')-${random}" git config --global user.name ${github_user} git config --global user.email ${github_emai} -git clone https://${github_user}:${TF_VAR_github_token}@github.com/${configsync_repo_name} ${download_acm_repo_name} +git clone https://${github_user}:${GIT_TOKEN}@github.com/${configsync_repo_name} ${download_acm_repo_name} || exit 1 cd ${download_acm_repo_name}/manifests/clusters if [ -f "kustomization.yaml" ]; then exit 0 fi + yamlfiles=$(find . -type f -name "*.yaml") cp ../../templates/_cluster_template/kustomization.yaml . -for yamlfile in `echo ${yamlfiles}` -do -cat <>kustomization.yaml +for yamlfile in $(echo ${yamlfiles}); do + cat <>kustomization.yaml - ${yamlfile} EOF done + cp -r ../../templates/_cluster_template/kuberay . git add . git config --global user.name ${github_user} @@ -43,5 +48,4 @@ git config --global user.email ${github_email} git commit -m "Adding manifests to install kuberay operator." 
git push origin -cd - -rm -rf ${download_acm_repo_name} \ No newline at end of file +rm -rf ${download_acm_repo_name} diff --git a/ml-platform/install_ray_cluster.sh b/ml-platform/terraform/scripts/install_ray_cluster.sh similarity index 76% rename from ml-platform/install_ray_cluster.sh rename to ml-platform/terraform/scripts/install_ray_cluster.sh index d7d62c8ba..01ef1231c 100755 --- a/ml-platform/install_ray_cluster.sh +++ b/ml-platform/terraform/scripts/install_ray_cluster.sh @@ -13,17 +13,23 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. + configsync_repo_name=${1} github_email=${2} github_org=${3} github_user=${4} namespace=${5} +google_service_account=${6} +kubernetes_service_account="${namespace}-default" -random=$(echo $RANDOM | md5sum | head -c 20; echo) +random=$( + echo $RANDOM | md5sum | head -c 20 + echo +) download_acm_repo_name="/tmp/$(echo ${configsync_repo_name} | awk -F "/" '{print $2}')-${random}" git config --global user.name ${github_user} git config --global user.email ${github_emai} -git clone https://${github_user}:${TF_VAR_github_token}@github.com/${configsync_repo_name} ${download_acm_repo_name} +git clone https://${github_user}:${GIT_TOKEN}@github.com/${configsync_repo_name} ${download_acm_repo_name} || exit 1 cd ${download_acm_repo_name}/manifests/apps if [ ! -d "${namespace}" ]; then echo "${namespace} folder doesnt exist in the configsync repo" @@ -37,6 +43,8 @@ fi cp -r ../../templates/_namespace_template/app/* ${namespace}/ sed -i "s?NAMESPACE?${namespace}?g" ${namespace}/* +sed -i "s?GOOGLE_SERVICE_ACCOUNT?$google_service_account?g" ${namespace}/* +sed -i "s?KUBERNETES_SERVICE_ACCOUNT?$kubernetes_service_account?g" ${namespace}/* git add . 
git config --global user.name ${github_user} @@ -44,5 +52,4 @@ git config --global user.email ${github_email} git commit -m "Installing ray cluster in ${namespace} namespace." git push origin -cd - -rm -rf ${download_acm_repo_name} \ No newline at end of file +rm -rf ${download_acm_repo_name} diff --git a/ml-platform/manage_ray_ns.sh b/ml-platform/terraform/scripts/manage_ray_ns.sh similarity index 86% rename from ml-platform/manage_ray_ns.sh rename to ml-platform/terraform/scripts/manage_ray_ns.sh index 021559fee..a1ca8b2cb 100755 --- a/ml-platform/manage_ray_ns.sh +++ b/ml-platform/terraform/scripts/manage_ray_ns.sh @@ -13,17 +13,21 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. + configsync_repo_name=${1} github_email=${2} github_org=${3} github_user=${4} namespace=${5} -random=$(echo $RANDOM | md5sum | head -c 20; echo) +random=$( + echo $RANDOM | md5sum | head -c 20 + echo +) download_acm_repo_name="/tmp/$(echo ${configsync_repo_name} | awk -F "/" '{print $2}')-${random}" git config --global user.name ${github_user} git config --global user.email ${github_emai} -git clone https://${github_user}:${TF_VAR_github_token}@github.com/${configsync_repo_name} ${download_acm_repo_name} +git clone https://${github_user}:${GIT_TOKEN}@github.com/${configsync_repo_name} ${download_acm_repo_name} || exit 1 cd ${download_acm_repo_name}/manifests/clusters/kuberay ns_exists=$(grep ${namespace} values.yaml | wc -l) if [ "${ns_exists}" -ne 0 ]; then @@ -39,5 +43,4 @@ git config --global user.email ${github_email} git commit -m "Installing ray cluster in ${namespace} namespace." 
git push origin -cd - -rm -rf ${download_acm_repo_name} \ No newline at end of file +rm -rf ${download_acm_repo_name} diff --git a/ml-platform/templates/acm-template/manifests/apps/.gitkeep b/ml-platform/terraform/templates/acm-template/manifests/apps/.gitkeep similarity index 100% rename from ml-platform/templates/acm-template/manifests/apps/.gitkeep rename to ml-platform/terraform/templates/acm-template/manifests/apps/.gitkeep diff --git a/ml-platform/templates/acm-template/manifests/clusters/.gitkeep b/ml-platform/terraform/templates/acm-template/manifests/clusters/.gitkeep similarity index 100% rename from ml-platform/templates/acm-template/manifests/clusters/.gitkeep rename to ml-platform/terraform/templates/acm-template/manifests/clusters/.gitkeep diff --git a/ml-platform/templates/acm-template/templates/_cluster_template/cluster.yaml b/ml-platform/terraform/templates/acm-template/templates/_cluster_template/cluster.yaml similarity index 100% rename from ml-platform/templates/acm-template/templates/_cluster_template/cluster.yaml rename to ml-platform/terraform/templates/acm-template/templates/_cluster_template/cluster.yaml diff --git a/ml-platform/templates/acm-template/templates/_cluster_template/config-selector.yaml b/ml-platform/terraform/templates/acm-template/templates/_cluster_template/config-selector.yaml similarity index 100% rename from ml-platform/templates/acm-template/templates/_cluster_template/config-selector.yaml rename to ml-platform/terraform/templates/acm-template/templates/_cluster_template/config-selector.yaml diff --git a/ml-platform/templates/acm-template/templates/_cluster_template/kuberay/kustomization.yaml b/ml-platform/terraform/templates/acm-template/templates/_cluster_template/kuberay/kustomization.yaml similarity index 100% rename from ml-platform/templates/acm-template/templates/_cluster_template/kuberay/kustomization.yaml rename to 
ml-platform/terraform/templates/acm-template/templates/_cluster_template/kuberay/kustomization.yaml diff --git a/ml-platform/templates/acm-template/templates/_cluster_template/kuberay/rayclusters.yaml b/ml-platform/terraform/templates/acm-template/templates/_cluster_template/kuberay/rayclusters.yaml similarity index 100% rename from ml-platform/templates/acm-template/templates/_cluster_template/kuberay/rayclusters.yaml rename to ml-platform/terraform/templates/acm-template/templates/_cluster_template/kuberay/rayclusters.yaml diff --git a/ml-platform/templates/acm-template/templates/_cluster_template/kuberay/rayjobs.yaml b/ml-platform/terraform/templates/acm-template/templates/_cluster_template/kuberay/rayjobs.yaml similarity index 100% rename from ml-platform/templates/acm-template/templates/_cluster_template/kuberay/rayjobs.yaml rename to ml-platform/terraform/templates/acm-template/templates/_cluster_template/kuberay/rayjobs.yaml diff --git a/ml-platform/templates/acm-template/templates/_cluster_template/kuberay/rayservices.yaml b/ml-platform/terraform/templates/acm-template/templates/_cluster_template/kuberay/rayservices.yaml similarity index 100% rename from ml-platform/templates/acm-template/templates/_cluster_template/kuberay/rayservices.yaml rename to ml-platform/terraform/templates/acm-template/templates/_cluster_template/kuberay/rayservices.yaml diff --git a/ml-platform/templates/acm-template/templates/_cluster_template/kuberay/rbac.yaml b/ml-platform/terraform/templates/acm-template/templates/_cluster_template/kuberay/rbac.yaml similarity index 100% rename from ml-platform/templates/acm-template/templates/_cluster_template/kuberay/rbac.yaml rename to ml-platform/terraform/templates/acm-template/templates/_cluster_template/kuberay/rbac.yaml diff --git a/ml-platform/templates/acm-template/templates/_cluster_template/kuberay/values.yaml b/ml-platform/terraform/templates/acm-template/templates/_cluster_template/kuberay/values.yaml similarity index 100% rename 
from ml-platform/templates/acm-template/templates/_cluster_template/kuberay/values.yaml rename to ml-platform/terraform/templates/acm-template/templates/_cluster_template/kuberay/values.yaml diff --git a/ml-platform/templates/acm-template/templates/_cluster_template/kustomization.yaml b/ml-platform/terraform/templates/acm-template/templates/_cluster_template/kustomization.yaml similarity index 100% rename from ml-platform/templates/acm-template/templates/_cluster_template/kustomization.yaml rename to ml-platform/terraform/templates/acm-template/templates/_cluster_template/kustomization.yaml diff --git a/ml-platform/templates/acm-template/templates/_cluster_template/selector.yaml b/ml-platform/terraform/templates/acm-template/templates/_cluster_template/selector.yaml similarity index 100% rename from ml-platform/templates/acm-template/templates/_cluster_template/selector.yaml rename to ml-platform/terraform/templates/acm-template/templates/_cluster_template/selector.yaml diff --git a/ml-platform/templates/acm-template/templates/_cluster_template/team/kustomization.yaml b/ml-platform/terraform/templates/acm-template/templates/_cluster_template/team/kustomization.yaml similarity index 100% rename from ml-platform/templates/acm-template/templates/_cluster_template/team/kustomization.yaml rename to ml-platform/terraform/templates/acm-template/templates/_cluster_template/team/kustomization.yaml diff --git a/ml-platform/templates/acm-template/templates/_cluster_template/team/namespace.yaml b/ml-platform/terraform/templates/acm-template/templates/_cluster_template/team/namespace.yaml similarity index 100% rename from ml-platform/templates/acm-template/templates/_cluster_template/team/namespace.yaml rename to ml-platform/terraform/templates/acm-template/templates/_cluster_template/team/namespace.yaml diff --git a/ml-platform/templates/acm-template/templates/_cluster_template/team/network-policy.yaml 
b/ml-platform/terraform/templates/acm-template/templates/_cluster_template/team/network-policy.yaml similarity index 100% rename from ml-platform/templates/acm-template/templates/_cluster_template/team/network-policy.yaml rename to ml-platform/terraform/templates/acm-template/templates/_cluster_template/team/network-policy.yaml diff --git a/ml-platform/templates/acm-template/templates/_cluster_template/team/rbac.yaml b/ml-platform/terraform/templates/acm-template/templates/_cluster_template/team/rbac.yaml similarity index 100% rename from ml-platform/templates/acm-template/templates/_cluster_template/team/rbac.yaml rename to ml-platform/terraform/templates/acm-template/templates/_cluster_template/team/rbac.yaml diff --git a/ml-platform/templates/acm-template/templates/_cluster_template/team/reposync.yaml b/ml-platform/terraform/templates/acm-template/templates/_cluster_template/team/reposync.yaml similarity index 81% rename from ml-platform/templates/acm-template/templates/_cluster_template/team/reposync.yaml rename to ml-platform/terraform/templates/acm-template/templates/_cluster_template/team/reposync.yaml index 73149d513..4b50dcee5 100644 --- a/ml-platform/templates/acm-template/templates/_cluster_template/team/reposync.yaml +++ b/ml-platform/terraform/templates/acm-template/templates/_cluster_template/team/reposync.yaml @@ -15,6 +15,44 @@ #ROOT_SOURCE/namespaces/NAMESPACE/repo-sync.yaml apiVersion: configsync.gke.io/v1beta1 kind: RepoSync +metadata: + name: ENV-NAMESPACE + namespace: NAMESPACE + annotations: + configmanagement.gke.io/cluster-selector: ENV +spec: + sourceType: git + # Since this is for a namespace repository, the format is unstructured + sourceFormat: unstructured + git: + repo: "GIT_REPO" + revision: "ENV" + #branch: NAMESPACE_BRANCH + dir: "manifests/apps/NAMESPACE" + auth: token + secretRef: + name: git-creds +--- +#ROOT_REPO/namespaces/NAMESPACE/sync-rolebinding.yaml +apiVersion: rbac.authorization.k8s.io/v1 +kind: RoleBinding +metadata: + 
name: ENV-rb-NAMESPACE + namespace: NAMESPACE + annotations: + configmanagement.gke.io/cluster-selector: ENV +subjects: +- kind: ServiceAccount + name: ns-reconciler-NAMESPACE-ENV-NAMESPACE- + namespace: config-management-system +roleRef: + kind: ClusterRole + name: cluster-admin + apiGroup: rbac.authorization.k8s.io +--- +#END OF SINGLE ENV DECLARATION +apiVersion: configsync.gke.io/v1beta1 +kind: RepoSync metadata: name: dev-NAMESPACE namespace: NAMESPACE @@ -50,7 +88,6 @@ roleRef: name: cluster-admin apiGroup: rbac.authorization.k8s.io --- -#END OF SINGLE ENV DECLARATION #ROOT_SOURCE/namespaces/NAMESPACE/repo-sync.yaml apiVersion: configsync.gke.io/v1beta1 kind: RepoSync diff --git a/ml-platform/templates/acm-template/templates/_namespace_template/app/fluentd_config.yaml b/ml-platform/terraform/templates/acm-template/templates/_namespace_template/app/fluentd_config.yaml similarity index 100% rename from ml-platform/templates/acm-template/templates/_namespace_template/app/fluentd_config.yaml rename to ml-platform/terraform/templates/acm-template/templates/_namespace_template/app/fluentd_config.yaml diff --git a/ml-platform/templates/acm-template/templates/_namespace_template/app/kustomization.yaml b/ml-platform/terraform/templates/acm-template/templates/_namespace_template/app/kustomization.yaml similarity index 100% rename from ml-platform/templates/acm-template/templates/_namespace_template/app/kustomization.yaml rename to ml-platform/terraform/templates/acm-template/templates/_namespace_template/app/kustomization.yaml diff --git a/ml-platform/templates/acm-template/templates/_namespace_template/app/serviceaccount.yaml b/ml-platform/terraform/templates/acm-template/templates/_namespace_template/app/serviceaccount.yaml similarity index 100% rename from ml-platform/templates/acm-template/templates/_namespace_template/app/serviceaccount.yaml rename to ml-platform/terraform/templates/acm-template/templates/_namespace_template/app/serviceaccount.yaml diff --git 
a/ml-platform/templates/acm-template/templates/_namespace_template/app/values.yaml b/ml-platform/terraform/templates/acm-template/templates/_namespace_template/app/values.yaml similarity index 100% rename from ml-platform/templates/acm-template/templates/_namespace_template/app/values.yaml rename to ml-platform/terraform/templates/acm-template/templates/_namespace_template/app/values.yaml diff --git a/ml-platform/variables.tf b/ml-platform/terraform/variables.tf similarity index 96% rename from ml-platform/variables.tf rename to ml-platform/terraform/variables.tf index 157c49d6b..c7b2f92de 100644 --- a/ml-platform/variables.tf +++ b/ml-platform/terraform/variables.tf @@ -64,63 +64,67 @@ variable "network_name" { description = "VPC network where GKE cluster will be created" type = string } + variable "routing_mode" { default = "GLOBAL" description = "VPC routing mode." type = string } + variable "subnet_01_name" { default = "ml-vpc-subnet-01" description = "Name of the first subnet in the VPC network." type = string } + variable "subnet_01_ip" { default = "10.40.0.0/22" description = "CIDR of the first subnet." type = string } + variable "subnet_01_region" { default = "us-central1" description = "Region of the first subnet." type = string } + variable "subnet_01_description" { default = "subnet 01" description = "Description of the first subnet." type = string } + variable "subnet_02_name" { default = "gke-vpc-subnet-02" description = "Name of the second subnet in the VPC network." type = string } + variable "subnet_02_ip" { default = "10.12.0.0/22" description = "CIDR of the second subnet." type = string } + variable "subnet_02_region" { default = "us-west2" description = "Region of the second subnet." type = string } + variable "subnet_02_description" { default = "subnet 02" description = "Description of the second subnet." type = string } -// -//variable "lookup_state_bucket" { -// description = "GCS bucket to look up TF state from previous steps." 
-// type = string -// default = "YOUR_STATE_BUCKET" -//} variable "cluster_name" { description = "Name of the GKE cluster" default = "gke-ml" type = string } + variable "reserved_taints" { description = "Taints to be applied to the reserved node pool." type = list(object({ @@ -173,14 +177,17 @@ variable "github_user" { description = "GitHub user name." type = string } + variable "github_email" { description = "GitHub user email." type = string } + variable "github_org" { type = string description = "GitHub org." } + variable "github_token" { type = string description = "GitHub token. It is a token with write permissions as it will create a repo in the GitHub org." @@ -214,4 +221,10 @@ variable "install_ray_in_ns" { type = number description = "Flag to install ray cluster in the namespace created with the demo." default = 1 +} + +variable "config_management_version" { + type = string + description = "Version of Config Management to enable" + default = "1.17.1" } \ No newline at end of file diff --git a/ml-platform/versions.tf b/ml-platform/terraform/versions.tf similarity index 100% rename from ml-platform/versions.tf rename to ml-platform/terraform/versions.tf From 1b16ad0f877eda6e5e6e6e3e4c3c5828191c6603 Mon Sep 17 00:00:00 2001 From: arueth Date: Wed, 13 Mar 2024 19:44:49 +0000 Subject: [PATCH 08/39] Changed the Configuration flow --- ml-platform/README.md | 107 ++++++++++++++++++++---------------------- 1 file changed, 50 insertions(+), 57 deletions(-) diff --git a/ml-platform/README.md b/ml-platform/README.md index 2edc62bae..16d4fe02d 100644 --- a/ml-platform/README.md +++ b/ml-platform/README.md @@ -42,66 +42,27 @@ This reference architecture demonstrates how to build a GKE platform that facili ## Prerequistes -1. This tutorial has been tested on [Cloud Shell](https://shell.cloud.google.com) which comes preinstalled with [Google Cloud SDK](https://cloud.google.com/sdk) is required to complete this tutorial. -2. 
Familiarity with [Google Kubernetes Engine][gke], [Terraform][terraform], [root-sync][root-sync] , [repo-sync][repo-sync] , [Git][git], [GitHub][github] +- This guide is meant to be run on [Cloud Shell](https://shell.cloud.google.com) which comes preinstalled with the [Google Cloud SDK](https://cloud.google.com/sdk) and other tools that are required to complete this tutorial. +- Familiarity with the following + - [Google Kubernetes Engine][gke] + - [Terraform][terraform] + - [git][git] + - [Google Configuration Management root-sync][root-sync] + - [Google Configuration Management repo-sync][repo-sync] + - [GitHub][github] -# Workflow +## Deploy a single environment reference architecture -This reference architecture can be implemented in one of the following ways: +This is the quick-start deployment guide. It can be used to set up an environment to familiarize yourself with the architecture and get an understanding of the concepts. -- Deploy a single env reference architecture. -- Deploy a multi env reference architecture in single [GCP project][gcp-project] -- Deploy a multi env reference architecture with each env in its own [GCP project][gcp-project] +### Requirements -## Deploy a single env reference architecture - -This is the quick-start deployment. It can be used to quickly set up an environment and start playing with it to get an understanding of the flow. Single env reference architecture can be deployed with the provided default values. +- New Google Cloud Project, preferably with no APIs enabled +- `roles/owner` IAM permissions on the project +- GitHub Personal Access Token, steps to create the token are provided below ### Configuration -- You can either create a new GCP project or use an existing one. Skip this step if you choose to use an already existing project.
- - To create a new project, open `cloudshell` and run the following command: - ``` - gcloud projects create - ``` - - Associate billing account to the project: - ``` - gcloud beta billing projects link \ - --billing-account - ``` -- Set up PROJECT_ID in environment variable in `cloudshell` : - - ``` - export PROJECT_ID="" >> ~/.bashrc - ``` - - Replace with the id of the project that you created in the previous step or the id of an already existing project that you want to use. - - **If you are using an already existing project, get `roles/owner` role on the project** - -- Update ~/bashrc to automatically point to the required project when a new instance of the `cloudshell` is created: - - ``` - echo gcloud config set project $PROJECT_ID >> ~/.bashrc && source ~/.bashrc - ``` - -- Create a GCS bucket in the project for storing TF state. - - - To create a new bucket, run the following command in `cloudshell` : - - ``` - export STATE_BUCKET="${PROJECT_ID}-tf-state" >> ~/.bashrc && source ~/.bashrc - - gcloud storage buckets create gs://${STATE_BUCKET} - ``` - -- Store github configurations in environment variables: - ``` - export GITHUB_USER= >> ~/.bashrc - export GITHUB_ORG= >> ~/.bashrc - export GITHUB_EMAIL= >> ~/.bashrc - source ~/.bashrc - ``` - Create a [Personal Access Token][personal-access-token] in [GitHub][github]: Note: It is recommended to use a [machine user account][machine-user-account] for this but you can use a personal user account just to try this reference architecture. @@ -128,6 +89,37 @@ This is the quick-start deployment. 
It can be used to quickly set up an environm nano ${HOME}/secrets/mlp-github-token ``` +- Set the project environment variables in Cloud Shell + + Replace the following values: + + - `<PROJECT_ID>` is the ID of your existing Google Cloud project + + ``` + export MLP_PROJECT_ID="<PROJECT_ID>" + export MLP_STATE_BUCKET="${MLP_PROJECT_ID}-tf-state" + ``` + +- Set the GitHub environment variables in Cloud Shell + + Replace the following values: + + - `<GITHUB_ORGANIZATION>` is the GitHub organization or user namespace to use for the repositories + - `<GITHUB_USER>` is the GitHub account to use for authentication + - `<GITHUB_EMAIL>` is the email address to use for commits + + ``` + export MLP_GITHUB_ORG="<GITHUB_ORGANIZATION>" + export MLP_GITHUB_USER="<GITHUB_USER>" + export MLP_GITHUB_EMAIL="<GITHUB_EMAIL>" + ``` + +- Create a Cloud Storage bucket to store the Terraform state + + ``` + gcloud storage buckets create gs://${MLP_STATE_BUCKET} --project ${MLP_PROJECT_ID} + ``` + ### Run Terraform - Clone the repository and change directory to the `ml-platform` directory @@ -147,10 +139,11 @@ This is the quick-start deployment.
It can be used to quickly set up an environm - Set the configuration variables ``` - sed -i "s/YOUR_STATE_BUCKET/${STATE_BUCKET}/g" ${MLP_BASE_DIR}/terraform/backend.tf - sed -i "s/YOUR_GITHUB_EMAIL/${GITHUB_EMAIL}/g" ${MLP_BASE_DIR}/terraform/mlp.auto.tfvars - sed -i "s/YOUR_GITHUB_ORG/${GITHUB_ORG}/g" ${MLP_BASE_DIR}/terraform/mlp.auto.tfvars - sed -i "s/YOUR_GITHUB_USER/${GITHUB_USER}/g" ${MLP_BASE_DIR}/terraform/mlp.auto.tfvars + sed -i "s/YOUR_STATE_BUCKET/${MLP_STATE_BUCKET}/g" ${MLP_BASE_DIR}/terraform/backend.tf + + sed -i "s/YOUR_GITHUB_EMAIL/${MLP_GITHUB_EMAIL}/g" ${MLP_BASE_DIR}/terraform/mlp.auto.tfvars + sed -i "s/YOUR_GITHUB_ORG/${MLP_GITHUB_ORG}/g" ${MLP_BASE_DIR}/terraform/mlp.auto.tfvars + sed -i "s/YOUR_GITHUB_USER/${MLP_GITHUB_USER}/g" ${MLP_BASE_DIR}/terraform/mlp.auto.tfvars sed -i "s/YOUR_PROJECT_ID/${PROJECT_ID}/g" ${MLP_BASE_DIR}/terraform/mlp.auto.tfvars ``` From 6f7807ec71c607f8d267a7ce7184a84e0d29b259 Mon Sep 17 00:00:00 2001 From: arueth Date: Wed, 13 Mar 2024 20:59:21 +0000 Subject: [PATCH 09/39] Fixed terraform fmt issues --- ml-platform/terraform/mlp.auto.tfvars | 10 +++++----- ml-platform/terraform/modules/node-pools/variables.tf | 4 ++-- ml-platform/terraform/variables.tf | 4 ++-- 3 files changed, 9 insertions(+), 9 deletions(-) diff --git a/ml-platform/terraform/mlp.auto.tfvars b/ml-platform/terraform/mlp.auto.tfvars index 142b9691f..1b58c802a 100644 --- a/ml-platform/terraform/mlp.auto.tfvars +++ b/ml-platform/terraform/mlp.auto.tfvars @@ -1,8 +1,8 @@ -project_id = { "dev" : "YOUR_PROJECT_ID" } -default_env = "dev" -github_user = "YOUR_GITHUB_USER" -github_email = "YOUR_GITHUB_EMAIL" -github_org = "YOUR_GITHUB_ORG" +project_id = { "dev" : "YOUR_PROJECT_ID" } +default_env = "dev" +github_user = "YOUR_GITHUB_USER" +github_email = "YOUR_GITHUB_EMAIL" +github_org = "YOUR_GITHUB_ORG" #github_token = "DO NOT ADD TOKEN HERE. 
PASS IT AS `export TF_VAR_github_token="YOUR TOKEN"` ON CLOUDSHELL" #folder_id = "YOUR_FOLDER_ID" #org_id = "YOUR_GCP_ORG_ID" diff --git a/ml-platform/terraform/modules/node-pools/variables.tf b/ml-platform/terraform/modules/node-pools/variables.tf index 6a2f20e56..61fb2504b 100644 --- a/ml-platform/terraform/modules/node-pools/variables.tf +++ b/ml-platform/terraform/modules/node-pools/variables.tf @@ -72,7 +72,7 @@ variable "machine_reservation_count" { } variable "autoscaling" { - type = map + type = map(any) default = { "total_min_node_count" : 0, "total_max_node_count" : 24, "location_policy" : "ANY" } } @@ -80,4 +80,4 @@ variable "reservation_name" { description = "reservation name to which the nodepool will be associated" type = string default = "" -} \ No newline at end of file +} diff --git a/ml-platform/terraform/variables.tf b/ml-platform/terraform/variables.tf index c7b2f92de..a93e61381 100644 --- a/ml-platform/terraform/variables.tf +++ b/ml-platform/terraform/variables.tf @@ -55,7 +55,7 @@ variable "create_projects" { } variable "project_id" { - type = map + type = map(any) description = "The GCP project where the resources will be created. 
It is a map with environments as keys and project_ids s values" } @@ -227,4 +227,4 @@ variable "config_management_version" { type = string description = "Version of Config Management to enable" default = "1.17.1" -} \ No newline at end of file +} From f6a42456ceac5f3f707566e70fd7ec31cc15e819 Mon Sep 17 00:00:00 2001 From: arueth Date: Thu, 14 Mar 2024 15:15:02 +0000 Subject: [PATCH 10/39] Enabled managed prometheus --- ml-platform/terraform/modules/cluster/gke.tf | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/ml-platform/terraform/modules/cluster/gke.tf b/ml-platform/terraform/modules/cluster/gke.tf index ab1b44141..377e7ee1c 100644 --- a/ml-platform/terraform/modules/cluster/gke.tf +++ b/ml-platform/terraform/modules/cluster/gke.tf @@ -104,6 +104,11 @@ resource "google_container_cluster" "gke_batch" { ] } } + monitoring_config { + managed_prometheus { + enabled = true + } + } node_pool_defaults { node_config_defaults { gcfs_config { From 816ddf62332c3ed9f098e4b7f21761dab2cd70ae Mon Sep 17 00:00:00 2001 From: arueth Date: Thu, 14 Mar 2024 15:44:23 +0000 Subject: [PATCH 11/39] Enabled logging and monitoring --- ml-platform/terraform/modules/cluster/gke.tf | 21 ++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/ml-platform/terraform/modules/cluster/gke.tf b/ml-platform/terraform/modules/cluster/gke.tf index 377e7ee1c..726793de3 100644 --- a/ml-platform/terraform/modules/cluster/gke.tf +++ b/ml-platform/terraform/modules/cluster/gke.tf @@ -104,7 +104,28 @@ resource "google_container_cluster" "gke_batch" { ] } } + logging_config { + enable_components = [ + "APISERVER", + "CONTROLLER_MANAGER", + "SCHEDULER", + "SYSTEM_COMPONENTS", + "WORKLOADS" + ] + } monitoring_config { + enable_components = [ + "APISERVER", + "CONTROLLER_MANAGER", + "DAEMONSET", + "DEPLOYMENT", + "HPA", + "POD", + "SCHEDULER", + "STATEFULSET", + "STORAGE", + "SYSTEM_COMPONENTS" + ] managed_prometheus { enabled = true } From 86d1b5f17b8f79b6ebedd6010d404a66d1d56b90 Mon Sep 17 
00:00:00 2001 From: arueth Date: Thu, 14 Mar 2024 16:09:19 +0000 Subject: [PATCH 12/39] Alphabetized and standardized variables.tf --- ml-platform/terraform/variables.tf | 239 +++++++++++++++-------------- 1 file changed, 120 insertions(+), 119 deletions(-) diff --git a/ml-platform/terraform/variables.tf b/ml-platform/terraform/variables.tf index a93e61381..af2b8a131 100644 --- a/ml-platform/terraform/variables.tf +++ b/ml-platform/terraform/variables.tf @@ -12,219 +12,220 @@ # See the License for the specific language governing permissions and # limitations under the License. -variable "org_id" { - type = string - description = "The GCP orig id" +variable "billing_account" { default = null -} - -variable "env" { - type = set(string) - description = "List of environments" - default = ["dev"] -} - -variable "default_env" { + description = "GCP billing account" type = string - description = "Lowest environments" - default = "dev" } -variable "folder_id" { +variable "cluster_name" { + default = "gke-ml" + description = "Name of the GKE cluster" type = string - description = "Folder Id where the GCP projects will be created" - default = null } -variable "billing_account" { +variable "config_management_version" { + default = "1.17.1" + description = "Version of Config Management to enable" type = string - description = "GCP billing account" - default = null } -variable "project_name" { +variable "configsync_repo_name" { + default = "config-sync-repo" + description = "Name of the GitHub repo that will be synced to the cluster with Config sync." type = string - description = "GCP project name" - default = null } -variable "create_projects" { +variable "create_namespace" { + description = "Setup a namespace to demo." + default = 1 type = number - description = "Flag to create GCP projects" - default = 0 } -variable "project_id" { - type = map(any) - description = "The GCP project where the resources will be created. 
It is a map with environments as keys and project_ids s values" +variable "create_projects" { + default = 0 + description = "Flag to create GCP projects" + type = number } -variable "network_name" { - default = "ml-vpc" - description = "VPC network where GKE cluster will be created" +variable "default_env" { + default = "dev" + description = "Lowest environments" type = string } -variable "routing_mode" { - default = "GLOBAL" - description = "VPC routing mode." - type = string +variable "env" { + default = ["dev"] + description = "List of environments" + type = set(string) } -variable "subnet_01_name" { - default = "ml-vpc-subnet-01" - description = "Name of the first subnet in the VPC network." +variable "folder_id" { + default = null + description = "Folder Id where the GCP projects will be created" type = string } -variable "subnet_01_ip" { - default = "10.40.0.0/22" - description = "CIDR of the first subnet." +variable "github_email" { + description = "GitHub user email." type = string } -variable "subnet_01_region" { - default = "us-central1" - description = "Region of the first subnet." +variable "github_org" { + description = "GitHub org." type = string } -variable "subnet_01_description" { - default = "subnet 01" - description = "Description of the first subnet." +variable "github_token" { + description = "GitHub token. It is a token with write permissions as it will create a repo in the GitHub org." type = string } -variable "subnet_02_name" { - default = "gke-vpc-subnet-02" - description = "Name of the second subnet in the VPC network." +variable "github_user" { + description = "GitHub user name." type = string } -variable "subnet_02_ip" { - default = "10.12.0.0/22" - description = "CIDR of the second subnet." - type = string +variable "install_kuberay" { + default = 1 + description = "Flag to install kuberay operator." + type = number } -variable "subnet_02_region" { - default = "us-west2" - description = "Region of the second subnet." 
- type = string +variable "install_ray_in_ns" { + default = 1 + description = "Flag to install ray cluster in the namespace created with the demo." + type = number } -variable "subnet_02_description" { - default = "subnet 02" - description = "Description of the second subnet." +variable "namespace" { + default = "ml-team" + description = "Name of the namespace to demo." type = string } -variable "cluster_name" { - description = "Name of the GKE cluster" - default = "gke-ml" +variable "network_name" { + default = "ml-vpc" + description = "VPC network where GKE cluster will be created" type = string } -variable "reserved_taints" { - description = "Taints to be applied to the reserved node pool." +variable "ondemand_taints" { + default = [{ + key = "ondemand" + value = true + effect = "NO_SCHEDULE" + }] + description = "Taints to be applied to the on-demand node pool." type = list(object({ key = string value = any effect = string })) +} + +variable "org_id" { + default = null + description = "The GCP orig id" + type = string +} + +variable "project_id" { + description = "The GCP project where the resources will be created. It is a map with environments as keys and project_ids s values" + type = map(any) +} + +variable "project_name" { + default = null + description = "GCP project name" + type = string +} + +variable "reserved_taints" { default = [{ key = "reserved" value = true effect = "NO_SCHEDULE" }] -} - -variable "ondemand_taints" { - description = "Taints to be applied to the on-demand node pool." + description = "Taints to be applied to the reserved node pool." type = list(object({ key = string value = any effect = string })) +} + +variable "routing_mode" { + default = "GLOBAL" + description = "VPC routing mode." + type = string +} + +variable "secret_for_rootsync" { + default = 1 + description = "Create git-cred in config-management-system namespace." 
+ type = number + +} + +variable "spot_taints" { default = [{ - key = "ondemand" + key = "spot" value = true effect = "NO_SCHEDULE" }] -} - -variable "spot_taints" { description = "Taints to be applied to the spot node pool." type = list(object({ key = string value = any effect = string })) - default = [{ - key = "spot" - value = true - effect = "NO_SCHEDULE" - }] } -variable "configsync_repo_name" { +variable "subnet_01_description" { + default = "subnet 01" + description = "Description of the first subnet." type = string - description = "Name of the GitHub repo that will be synced to the cluster with Config sync." - default = "config-sync-repo" } -variable "github_user" { - description = "GitHub user name." +variable "subnet_01_ip" { + default = "10.40.0.0/22" + description = "CIDR of the first subnet." type = string } -variable "github_email" { - description = "GitHub user email." +variable "subnet_01_name" { + default = "ml-vpc-subnet-01" + description = "Name of the first subnet in the VPC network." type = string } -variable "github_org" { +variable "subnet_01_region" { + default = "us-central1" + description = "Region of the first subnet." type = string - description = "GitHub org." } -variable "github_token" { +variable "subnet_02_description" { + default = "subnet 02" + description = "Description of the second subnet." type = string - description = "GitHub token. It is a token with write permissions as it will create a repo in the GitHub org." } -variable "secret_for_rootsync" { - type = number - description = "Create git-cred in config-management-system namespace." - default = 1 -} - -variable "create_namespace" { - type = number - description = "Setup a namespace to demo." - default = 1 -} - -variable "namespace" { +variable "subnet_02_ip" { + default = "10.12.0.0/22" + description = "CIDR of the second subnet." type = string - description = "Name of the namespace to demo." 
- default = "ml-team" -} - -variable "install_kuberay" { - type = number - description = "Flag to install kuberay operator." - default = 1 } -variable "install_ray_in_ns" { - type = number - description = "Flag to install ray cluster in the namespace created with the demo." - default = 1 +variable "subnet_02_name" { + default = "gke-vpc-subnet-02" + description = "Name of the second subnet in the VPC network." + type = string } -variable "config_management_version" { +variable "subnet_02_region" { + default = "us-west2" + description = "Region of the second subnet." type = string - description = "Version of Config Management to enable" - default = "1.17.1" } From 8bf2696a26fd5fc56f28c1ff902faeb397597993 Mon Sep 17 00:00:00 2001 From: arueth Date: Thu, 14 Mar 2024 16:11:07 +0000 Subject: [PATCH 13/39] Renamed cluster resource from gke_batch to mlp --- ml-platform/terraform/modules/cluster/gke.tf | 2 +- .../terraform/modules/cluster/outputs.tf | 17 +++++++++-------- 2 files changed, 10 insertions(+), 9 deletions(-) diff --git a/ml-platform/terraform/modules/cluster/gke.tf b/ml-platform/terraform/modules/cluster/gke.tf index 726793de3..e774a2be4 100644 --- a/ml-platform/terraform/modules/cluster/gke.tf +++ b/ml-platform/terraform/modules/cluster/gke.tf @@ -18,7 +18,7 @@ data "google_project" "project" { project_id = var.project_id } -resource "google_container_cluster" "gke_batch" { +resource "google_container_cluster" "mlp" { provider = google-beta deletion_protection = false name = var.cluster_name diff --git a/ml-platform/terraform/modules/cluster/outputs.tf b/ml-platform/terraform/modules/cluster/outputs.tf index b26d3be8e..fd87e1c6f 100644 --- a/ml-platform/terraform/modules/cluster/outputs.tf +++ b/ml-platform/terraform/modules/cluster/outputs.tf @@ -12,22 +12,23 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+output "cluster_id" { + value = google_container_cluster.mlp.id +} + + output "cluster_location" { - value = google_container_cluster.gke_batch.location + value = google_container_cluster.mlp.location } output "cluster_name" { - value = google_container_cluster.gke_batch.name + value = google_container_cluster.mlp.name } -output "cluster_id" { - value = google_container_cluster.gke_batch.id +output "env" { + value = var.env } output "gke_project_id" { value = var.project_id } - -output "env" { - value = var.env -} \ No newline at end of file From bccea3e8176a4544f4eb0ae435b16634f3a163dd Mon Sep 17 00:00:00 2001 From: arueth Date: Thu, 14 Mar 2024 16:58:34 +0000 Subject: [PATCH 14/39] Fixed typos in README --- ml-platform/README.md | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/ml-platform/README.md b/ml-platform/README.md index 16d4fe02d..4138ce497 100644 --- a/ml-platform/README.md +++ b/ml-platform/README.md @@ -4,7 +4,7 @@ This reference architecture demonstrates how to build a GKE platform that facilitates Machine Learning. The reference architecture is based on the following principles: -- The platform admin will create the GKE platform using IaC tool like [Terraform][terraform]. The IaC will come with re-usuable modules that can be referred to create more resources as the demand grows. +- The platform admin will create the GKE platform using IaC tool like [Terraform][terraform]. The IaC will come with re-usable modules that can be referred to create more resources as the demand grows. - The platform will be based on [GitOps][gitops]. - After the GKE platform has been created, cluster scoped resources on it will be created through [Config Sync][config-sync] by the admins. - Platform admins will create a namespace per application and provide the application team member full access to it. 
@@ -40,7 +40,7 @@ This reference architecture demonstrates how to build a GKE platform that facili **CUJ 4**: Operationalizing the models. **[TBD]** -## Prerequistes +## Prerequisites - This guide is meant to be run on [Cloud Shell](https://shell.cloud.google.com) which comes preinstalled with the [Google Cloud SDK](https://cloud.google.com/sdk) and other tools that are required to complete this tutorial. - Familiarity with following @@ -48,7 +48,7 @@ This reference architecture demonstrates how to build a GKE platform that facili - [Terraform][terraform] - [git][git] - [Google Configuration Management root-sync][root-sync] - - [Google Configuration Managementrepo-sync][repo-sync] + - [Google Configuration Management repo-sync][repo-sync] - [GitHub][github] ## Deploy a single environment reference architecture @@ -85,7 +85,7 @@ This is the quick-start deployment guide. It can be used to set up an environmen touch ${HOME}/secrets/mlp-github-token chmod go-rwx ${HOME}/secrets/mlp-github-token - # Put the token in the secure file using your prefered editor + # Put the token in the secure file using your preferred editor nano ${HOME}/secrets/mlp-github-token ``` @@ -166,9 +166,9 @@ This is the quick-start deployment guide. It can be used to set up an environmen - Go to Google Cloud Console, click on the navigation menu and click on Kubernetes Engine > Config. If you haven't enabled GKE Enterprise in the project earlier, Click `LEARN AND ENABLE` button and then `ENABLE GKE ENTERPRISE`. You should see a RootSync and RepoSync object. 
![configsync](docs/images/configsync.png) -#### Software installed via RepoSync and Reposync +#### Software installed via RepoSync and RootSync -Open `cloudshell` to execute the following commands: +Open Cloud Shell to execute the following commands: - Store your GKE cluster name in env variable: @@ -180,7 +180,7 @@ Open `cloudshell` to execute the following commands: gcloud container fleet memberships get-credentials ${GKE_CLUSTER} ``` -- Fetch kuberay operator CRDs +- Fetch KubeRay operator CRDs ``` kubectl get crd | grep ray @@ -194,7 +194,7 @@ Open `cloudshell` to execute the following commands: rayservices.ray.io 2024-02-12T21:19:12Z ``` -- Fetch kuberay operator pod +- Fetch KubeRay operator pod ``` kubectl get pods @@ -230,9 +230,9 @@ Open `cloudshell` to execute the following commands: ray-cluster-kuberay 1 1 ready 29m ``` -- Check the head and worker pods of kuberay`in`ml-team` namespace +- Check the head and worker pods of kuberay in `ml-team` namespace ``` - kubectl get pods -n -n ml-team + kubectl get pods -n ml-team ``` The output will be similar to the following: ``` From 3df4a62d13580ebfc0b3738be76b2d8568341ae2 Mon Sep 17 00:00:00 2001 From: arueth Date: Thu, 14 Mar 2024 17:00:35 +0000 Subject: [PATCH 15/39] Fixed guide path --- ml-platform/README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ml-platform/README.md b/ml-platform/README.md index 4138ce497..be3c50b89 100644 --- a/ml-platform/README.md +++ b/ml-platform/README.md @@ -122,11 +122,11 @@ This is the quick-start deployment guide. 
It can be used to set up an environmen ### Run Terraform -- Clone the repository and change directory to the `ml-platform` directory +- Clone the repository and change directory to the guide directory ``` git clone https://github.com/GoogleCloudPlatform/ai-on-gke - cd ml-platform + cd ai-on-gke/ml-platform ``` - Set environment variables From d16c0552af441b8dd6e7a9a5ca22878899f26336 Mon Sep 17 00:00:00 2001 From: arueth Date: Thu, 14 Mar 2024 17:02:34 +0000 Subject: [PATCH 16/39] Changed README to reference a single cluster --- ml-platform/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ml-platform/README.md b/ml-platform/README.md index be3c50b89..d5ccd4265 100644 --- a/ml-platform/README.md +++ b/ml-platform/README.md @@ -161,7 +161,7 @@ This is the quick-start deployment guide. It can be used to set up an environmen #### GKE clusters and ConfigSync -- Go to Google Cloud Console, click on the navigation menu and click on Kubernetes Engine > Clusters. You should see three clusters. +- Go to Google Cloud Console, click on the navigation menu and click on Kubernetes Engine > Clusters. You should see one cluster. - Go to Google Cloud Console, click on the navigation menu and click on Kubernetes Engine > Config. If you haven't enabled GKE Enterprise in the project earlier, Click `LEARN AND ENABLE` button and then `ENABLE GKE ENTERPRISE`. You should see a RootSync and RepoSync object. 
![configsync](docs/images/configsync.png) From 53cf5bd9dcc27323a1d80d0d7a9a7d92aa662d62 Mon Sep 17 00:00:00 2001 From: arueth Date: Thu, 14 Mar 2024 18:44:13 +0000 Subject: [PATCH 17/39] Formatted for Terraform files readability and supportability --- ml-platform/terraform/backend.tf | 1 - ml-platform/terraform/main.tf | 474 +++++++++++------- ml-platform/terraform/mlp.auto.tfvars | 10 +- .../terraform/modules/cloud-nat/main.tf | 62 +-- .../terraform/modules/cloud-nat/outputs.tf | 1 - .../terraform/modules/cloud-nat/variables.tf | 143 +++--- .../terraform/modules/cloud-nat/versions.tf | 24 +- ml-platform/terraform/modules/cluster/gke.tf | 111 ++-- .../terraform/modules/cluster/outputs.tf | 1 - .../terraform/modules/cluster/variables.tf | 43 +- .../terraform/modules/cluster/versions.tf | 13 - .../terraform/modules/network/outputs.tf | 16 +- .../terraform/modules/network/variables.tf | 37 +- .../terraform/modules/network/versions.tf | 9 - ml-platform/terraform/modules/network/vpc.tf | 24 +- .../terraform/modules/node-pools/nodepools.tf | 62 +-- .../terraform/modules/node-pools/variables.tf | 95 ++-- .../terraform/modules/node-pools/versions.tf | 13 - .../terraform/modules/projects/outputs.tf | 2 +- .../terraform/modules/projects/projects.tf | 86 ++-- .../terraform/modules/projects/variables.tf | 24 +- .../terraform/modules/projects/versions.tf | 9 - .../modules/vm-reservations/outputs.tf | 2 +- .../modules/vm-reservations/reservations.tf | 9 +- .../modules/vm-reservations/variables.tf | 50 +- .../modules/vm-reservations/versions.tf | 15 +- ml-platform/terraform/versions.tf | 9 +- 27 files changed, 707 insertions(+), 638 deletions(-) diff --git a/ml-platform/terraform/backend.tf b/ml-platform/terraform/backend.tf index a676d7219..959028bb0 100644 --- a/ml-platform/terraform/backend.tf +++ b/ml-platform/terraform/backend.tf @@ -18,4 +18,3 @@ terraform { bucket = "YOUR_STATE_BUCKET" } } - diff --git a/ml-platform/terraform/main.tf b/ml-platform/terraform/main.tf index 
71dc012a3..b4ff7c00e 100644 --- a/ml-platform/terraform/main.tf +++ b/ml-platform/terraform/main.tf @@ -13,140 +13,183 @@ # limitations under the License. locals { - parsed_project_id = var.create_projects == 0 ? var.project_id : { for k, v in "${module.gcp-project.project_ids}" : k => v.project_id } + gke_project_map = { for k, v in "${module.gke}" : v.cluster_name => v.gke_project_id } parsed_gke_info = module.gke parsed_gke_info_without_default_env = { for k, v in "${local.parsed_gke_info}" : k => v if k != var.default_env } + parsed_project_id = var.create_projects == 0 ? var.project_id : { for k, v in "${module.gcp-project.project_ids}" : k => v.project_id } project_id_list = [for k, v in "${module.gke}" : v.gke_project_id] - gke_project_map = { for k, v in "${module.gke}" : v.cluster_name => v.gke_project_id } } #TODO: Add a validation that the value if default_env must be one of the values in env list module "gcp-project" { - count = var.create_projects - source = "./modules/projects" - org_id = var.org_id - folder_id = var.folder_id - env = var.env + count = var.create_projects + + source = "./modules/projects" + billing_account = var.billing_account + env = var.env + folder_id = var.folder_id + org_id = var.org_id project_name = var.project_name } resource "google_project_service" "containerfilesystem_googleapis_com" { - for_each = local.parsed_project_id + for_each = local.parsed_project_id + + depends_on = [module.gcp-project] + + disable_dependent_services = false + disable_on_destroy = false project = each.value service = "containerfilesystem.googleapis.com" - disable_on_destroy = false - disable_dependent_services = false - depends_on = [module.gcp-project] } resource "google_project_service" "project_services-cr" { - for_each = local.parsed_project_id + for_each = local.parsed_project_id + + depends_on = [module.gcp-project] + + disable_dependent_services = false + disable_on_destroy = false project = each.value service = 
"cloudresourcemanager.googleapis.com" - disable_on_destroy = false - disable_dependent_services = false - depends_on = [module.gcp-project] } resource "google_project_service" "project_services-an" { - for_each = local.parsed_project_id + for_each = local.parsed_project_id + + depends_on = [ + module.gcp-project, + google_project_service.project_services-cr + ] + + disable_dependent_services = false + disable_on_destroy = false project = each.value service = "anthos.googleapis.com" - disable_on_destroy = false - disable_dependent_services = false - depends_on = [module.gcp-project, google_project_service.project_services-cr] } resource "google_project_service" "project_services-anc" { - for_each = local.parsed_project_id + for_each = local.parsed_project_id + + depends_on = [ + module.gcp-project, + google_project_service.project_services-cr + ] + + disable_dependent_services = false + disable_on_destroy = false project = each.value service = "anthosconfigmanagement.googleapis.com" - disable_on_destroy = false - disable_dependent_services = false - depends_on = [module.gcp-project, google_project_service.project_services-cr] } resource "google_project_service" "project_services-con" { - for_each = local.parsed_project_id + for_each = local.parsed_project_id + + depends_on = [ + module.gcp-project, + google_project_service.project_services-cr + ] + + disable_dependent_services = false + disable_on_destroy = false project = each.value service = "container.googleapis.com" - disable_on_destroy = false - disable_dependent_services = false - depends_on = [module.gcp-project, google_project_service.project_services-cr] } resource "google_project_service" "project_services-com" { - for_each = local.parsed_project_id + for_each = local.parsed_project_id + + depends_on = [ + module.gcp-project, + google_project_service.project_services-cr + ] + + disable_dependent_services = false + disable_on_destroy = false project = each.value service = "compute.googleapis.com" - 
disable_on_destroy = false - disable_dependent_services = false - depends_on = [module.gcp-project, google_project_service.project_services-cr] } resource "google_project_service" "project_services-gkecon" { - for_each = local.parsed_project_id + for_each = local.parsed_project_id + + depends_on = [ + module.gcp-project, + google_project_service.project_services-cr + ] + + disable_dependent_services = false + disable_on_destroy = false project = each.value service = "gkeconnect.googleapis.com" - disable_on_destroy = false - disable_dependent_services = false - depends_on = [module.gcp-project, google_project_service.project_services-cr] } resource "google_project_service" "project_services-gkeh" { - for_each = local.parsed_project_id + for_each = local.parsed_project_id + + depends_on = [ + module.gcp-project, + google_project_service.project_services-cr + ] + + disable_dependent_services = false + disable_on_destroy = false project = each.value service = "gkehub.googleapis.com" - disable_on_destroy = false - disable_dependent_services = false - depends_on = [module.gcp-project, google_project_service.project_services-cr] } resource "google_project_service" "project_services-iam" { - for_each = local.parsed_project_id + for_each = local.parsed_project_id + + depends_on = [module.gcp-project, google_project_service.project_services-cr] + + disable_dependent_services = false + disable_on_destroy = false project = each.value service = "iam.googleapis.com" - disable_on_destroy = false - disable_dependent_services = false - depends_on = [module.gcp-project, google_project_service.project_services-cr] } resource "google_project_service" "project_services-gate" { - for_each = local.parsed_project_id + for_each = local.parsed_project_id + + depends_on = [ + module.gcp-project, + google_project_service.project_services-cr + ] + + disable_dependent_services = false + disable_on_destroy = false project = each.value service = "connectgateway.googleapis.com" - 
disable_on_destroy = false - disable_dependent_services = false - depends_on = [module.gcp-project, google_project_service.project_services-cr] } module "create-vpc" { - for_each = local.parsed_project_id - source = "./modules/network" - project_id = each.value + for_each = local.parsed_project_id + + source = "./modules/network" + + depends_on = [ + module.gcp-project, + google_project_service.project_services-com + ] + network_name = format("%s-%s", var.network_name, each.key) + project_id = each.value routing_mode = var.routing_mode - subnet_01_name = format("%s-%s", var.subnet_01_name, each.key) subnet_01_ip = var.subnet_01_ip + subnet_01_name = format("%s-%s", var.subnet_01_name, each.key) subnet_01_region = var.subnet_01_region - subnet_02_name = format("%s-%s", var.subnet_02_name, each.key) subnet_02_ip = var.subnet_02_ip + subnet_02_name = format("%s-%s", var.subnet_02_name, each.key) subnet_02_region = var.subnet_02_region - - depends_on = [ - module.gcp-project, - google_project_service.project_services-com - ] } resource "google_gke_hub_feature" "configmanagement_acm_feature" { - count = length(distinct(values(local.parsed_project_id))) - name = "configmanagement" - project = distinct(values(local.parsed_project_id))[count.index] - location = "global" provider = google-beta + count = length(distinct(values(local.parsed_project_id))) + depends_on = [ google_project_service.project_services-gkeh, google_project_service.project_services-anc, @@ -154,85 +197,123 @@ resource "google_gke_hub_feature" "configmanagement_acm_feature" { google_project_service.project_services-com, google_project_service.project_services-gkecon ] + + location = "global" + name = "configmanagement" + project = distinct(values(local.parsed_project_id))[count.index] } module "gke" { - for_each = local.parsed_project_id - source = "./modules/cluster" + for_each = local.parsed_project_id + + source = "./modules/cluster" + + depends_on = [ + 
google_gke_hub_feature.configmanagement_acm_feature, + google_project_service.project_services-con, + google_project_service.project_services-com + ] + cluster_name = format("%s-%s", var.cluster_name, each.key) + env = each.key + master_auth_networks_ipcidr = var.subnet_01_ip network = module.create-vpc[each.key].vpc - subnet = module.create-vpc[each.key].subnet-1 project_id = each.value region = var.subnet_01_region + subnet = module.create-vpc[each.key].subnet-1 zone = "${var.subnet_01_region}-a" - master_auth_networks_ipcidr = var.subnet_01_ip - depends_on = [google_gke_hub_feature.configmanagement_acm_feature, google_project_service.project_services-con, google_project_service.project_services-com] - env = each.key } module "reservation" { - for_each = local.parsed_project_id - source = "./modules/vm-reservations" + for_each = local.parsed_project_id + + source = "./modules/vm-reservations" + + depends_on = [module.gke] + cluster_name = module.gke[each.key].cluster_name - zone = "${var.subnet_01_region}-a" project_id = each.value - depends_on = [module.gke] + zone = "${var.subnet_01_region}-a" } module "node_pool-reserved" { - for_each = local.parsed_project_id - source = "./modules/node-pools" + for_each = local.parsed_project_id + + source = "./modules/node-pools" + + depends_on = [module.reservation] + + cluster_name = module.gke[each.key].cluster_name node_pool_name = "reservation" project_id = each.value - cluster_name = module.gke[each.key].cluster_name region = var.subnet_01_region - taints = var.reserved_taints - resource_type = "reservation" reservation_name = module.reservation[each.key].reservation_name - depends_on = [module.reservation] + resource_type = "reservation" + taints = var.reserved_taints } module "node_pool-ondemand" { - for_each = local.parsed_project_id - source = "./modules/node-pools" + for_each = local.parsed_project_id + + source = "./modules/node-pools" + + depends_on = [module.gke] + + cluster_name = 
module.gke[each.key].cluster_name node_pool_name = "ondemand" project_id = each.value - cluster_name = module.gke[each.key].cluster_name region = var.subnet_01_region - taints = var.ondemand_taints resource_type = "ondemand" - depends_on = [module.gke] + taints = var.ondemand_taints } module "node_pool-spot" { - for_each = local.parsed_project_id - source = "./modules/node-pools" + for_each = local.parsed_project_id + + source = "./modules/node-pools" + + depends_on = [module.gke] + + cluster_name = module.gke[each.key].cluster_name node_pool_name = "spot" project_id = each.value - cluster_name = module.gke[each.key].cluster_name region = var.subnet_01_region - taints = var.spot_taints resource_type = "spot" - depends_on = [module.gke] + taints = var.spot_taints } module "cloud-nat" { - for_each = local.parsed_project_id - source = "./modules/cloud-nat" - project_id = each.value - region = split("/", module.create-vpc[each.key].subnet-1)[3] + for_each = local.parsed_project_id + + source = "./modules/cloud-nat" + + depends_on = [ + module.create-vpc, + google_project_service.project_services-com + ] + + create_router = true name = format("%s-%s", "nat-for-acm", each.key) network = module.create-vpc[each.key].vpc - create_router = true + project_id = each.value + region = split("/", module.create-vpc[each.key].subnet-1)[3] router = format("%s-%s", "router-for-acm", each.key) - depends_on = [module.create-vpc, google_project_service.project_services-com] } resource "google_gke_hub_membership" "membership" { - provider = google-beta - for_each = local.parsed_gke_info - project = each.value["gke_project_id"] + provider = google-beta + + for_each = local.parsed_gke_info + + depends_on = [ + google_gke_hub_feature.configmanagement_acm_feature, + google_project_service.project_services-gkeh, + google_project_service.project_services-gkecon + ] + membership_id = each.value["cluster_name"] + project = each.value["gke_project_id"] + endpoint { gke_cluster { resource_link = 
format("%s/%s", "//container.googleapis.com", each.value["cluster_id"]) @@ -244,96 +325,100 @@ resource "google_gke_hub_membership" "membership" { labels ] } - - depends_on = [ - google_gke_hub_feature.configmanagement_acm_feature, - google_project_service.project_services-gkeh, - google_project_service.project_services-gkecon - ] } resource "github_repository" "acm_repo" { - name = var.configsync_repo_name - description = "Repo for Config Sync" - visibility = "private" - has_issues = false - has_projects = false - has_wiki = false - allow_merge_commit = true - allow_squash_merge = true allow_rebase_merge = true - delete_branch_on_merge = false + allow_squash_merge = true auto_init = true + delete_branch_on_merge = false + description = "Repo for Config Sync" + has_issues = false + has_projects = false + has_wiki = false + name = var.configsync_repo_name + visibility = "private" vulnerability_alerts = true } resource "github_branch" "branch" { - for_each = local.parsed_gke_info - repository = split("/", github_repository.acm_repo.full_name)[1] - branch = each.key + for_each = local.parsed_gke_info + depends_on = [github_repository.acm_repo] + + branch = each.key + repository = split("/", github_repository.acm_repo.full_name)[1] } resource "github_branch_default" "default_branch" { - repository = split("/", github_repository.acm_repo.full_name)[1] - branch = var.default_env depends_on = [github_branch.branch] + + branch = var.default_env + repository = split("/", github_repository.acm_repo.full_name)[1] } resource "github_branch_protection_v3" "branch_protection" { - for_each = length(keys(local.parsed_project_id)) > 1 ? local.parsed_gke_info_without_default_env : {} + for_each = length(keys(local.parsed_project_id)) > 1 ? 
local.parsed_gke_info_without_default_env : {} + + depends_on = [github_branch.branch] + repository = split("/", github_repository.acm_repo.full_name)[1] branch = each.key + required_pull_request_reviews { - required_approving_review_count = 1 require_code_owner_reviews = true + required_approving_review_count = 1 } + restrictions { } - - depends_on = [github_branch.branch] } resource "google_gke_hub_feature_membership" "feature_member" { - provider = google-beta - for_each = local.parsed_gke_info - project = each.value["gke_project_id"] - location = "global" + provider = google-beta + + for_each = local.parsed_gke_info + + depends_on = [ + google_project_service.project_services-gkecon, + google_project_service.project_services-gkeh, + google_project_service.project_services-an, + google_project_service.project_services-anc + ] + feature = "configmanagement" + location = "global" membership = google_gke_hub_membership.membership[each.key].membership_id + project = each.value["gke_project_id"] + configmanagement { version = var.config_management_version + config_sync { source_format = "unstructured" + git { - sync_repo = "https://github.com/${github_repository.acm_repo.full_name}.git" - sync_branch = each.value["env"] policy_dir = "manifests/clusters" secret_type = "token" + sync_branch = each.value["env"] + sync_repo = "https://github.com/${github_repository.acm_repo.full_name}.git" } } + policy_controller { enabled = true - template_library_installed = true referential_rules_enabled = true + template_library_installed = true + } } - - depends_on = [ - google_project_service.project_services-gkecon, - google_project_service.project_services-gkeh, - google_project_service.project_services-an, - google_project_service.project_services-anc - ] } resource "null_resource" "create_cluster_yamls" { for_each = local.parsed_gke_info - triggers = { - md5_script = filemd5("${path.module}/scripts/create_cluster_yamls.sh") - md5_files = md5(join("", [for f in 
fileset("${path.module}/templates/acm-template", "**") : md5("${path.module}/templates/acm-template/${f}")])) - } + + depends_on = [google_gke_hub_feature_membership.feature_member] provisioner "local-exec" { command = "${path.module}/scripts/create_cluster_yamls.sh ${var.github_org} ${github_repository.acm_repo.full_name} ${var.github_user} ${var.github_email} ${each.value["env"]} ${each.value["cluster_name"]} ${index(keys(local.parsed_gke_info), each.key)}" @@ -342,22 +427,14 @@ resource "null_resource" "create_cluster_yamls" { } } - depends_on = [google_gke_hub_feature_membership.feature_member] + triggers = { + md5_files = md5(join("", [for f in fileset("${path.module}/templates/acm-template", "**") : md5("${path.module}/templates/acm-template/${f}")])) + md5_script = filemd5("${path.module}/scripts/create_cluster_yamls.sh") + } } resource "null_resource" "create_git_cred_cms" { for_each = var.secret_for_rootsync == 1 ? local.gke_project_map : {} - triggers = { - md5_script = filemd5("${path.module}/scripts/create_git_cred.sh") - md5_credentials = md5(join("", [var.github_user, var.github_token])) - } - - provisioner "local-exec" { - command = "${path.module}/scripts/create_git_cred.sh ${each.key} ${each.value} ${var.github_user} config-management-system ${index(keys(local.gke_project_map), each.key)}" - environment = { - GIT_TOKEN = var.github_token - } - } depends_on = [ google_gke_hub_feature_membership.feature_member, @@ -367,14 +444,27 @@ resource "null_resource" "create_git_cred_cms" { module.node_pool-spot, module.cloud-nat ] + + provisioner "local-exec" { + command = "${path.module}/scripts/create_git_cred.sh ${each.key} ${each.value} ${var.github_user} config-management-system ${index(keys(local.gke_project_map), each.key)}" + environment = { + GIT_TOKEN = var.github_token + } + } + + triggers = { + md5_credentials = md5(join("", [var.github_user, var.github_token])) + md5_script = filemd5("${path.module}/scripts/create_git_cred.sh") + } } resource 
"null_resource" "install_kuberay_operator" { count = var.install_kuberay - triggers = { - md5_script = filemd5("${path.module}/scripts/install_kuberay_operator.sh") - md5_files = md5(join("", [for f in fileset("${path.module}/templates/acm-template/templates/_cluster_template/kuberay", "**") : md5("${path.module}/templates/acm-template/templates/_cluster_template/kuberay/${f}")])) - } + + depends_on = [ + google_gke_hub_feature_membership.feature_member, + null_resource.create_git_cred_cms + ] provisioner "local-exec" { command = "${path.module}/scripts/install_kuberay_operator.sh ${github_repository.acm_repo.full_name} ${var.github_email} ${var.github_org} ${var.github_user}" @@ -383,10 +473,10 @@ resource "null_resource" "install_kuberay_operator" { } } - depends_on = [ - google_gke_hub_feature_membership.feature_member, - null_resource.create_git_cred_cms - ] + triggers = { + md5_files = md5(join("", [for f in fileset("${path.module}/templates/acm-template/templates/_cluster_template/kuberay", "**") : md5("${path.module}/templates/acm-template/templates/_cluster_template/kuberay/${f}")])) + md5_script = filemd5("${path.module}/scripts/install_kuberay_operator.sh") + } } resource "google_service_account" "namespace_default" { @@ -403,10 +493,11 @@ resource "google_service_account_iam_member" "wi_cymbal_bank_backend_workload_id resource "null_resource" "create_namespace" { count = var.create_namespace - triggers = { - md5_script = filemd5("${path.module}/scripts/create_namespace.sh") - md5_files = md5(join("", [for f in fileset("${path.module}/templates/acm-template/templates/_cluster_template/team", "**") : md5("${path.module}/templates/acm-template/templates/_cluster_template/team/${f}")])) - } + + depends_on = [ + google_gke_hub_feature_membership.feature_member, + null_resource.install_kuberay_operator + ] provisioner "local-exec" { command = "${path.module}/scripts/create_namespace.sh ${github_repository.acm_repo.full_name} ${var.github_email} 
${var.github_org} ${var.github_user} ${var.namespace} ${var.default_env}" @@ -415,18 +506,19 @@ resource "null_resource" "create_namespace" { } } - depends_on = [ - google_gke_hub_feature_membership.feature_member, - null_resource.install_kuberay_operator - ] + triggers = { + md5_files = md5(join("", [for f in fileset("${path.module}/templates/acm-template/templates/_cluster_template/team", "**") : md5("${path.module}/templates/acm-template/templates/_cluster_template/team/${f}")])) + md5_script = filemd5("${path.module}/scripts/create_namespace.sh") + } } resource "null_resource" "create_git_cred_ns" { count = var.create_namespace - triggers = { - md5_script = filemd5("${path.module}/scripts/create_git_cred.sh") - md5_credentials = md5(join("", [var.github_user, var.github_token])) - } + + depends_on = [ + google_gke_hub_feature_membership.feature_member, + null_resource.create_namespace + ] provisioner "local-exec" { command = "${path.module}/scripts/create_git_cred.sh ${local.parsed_gke_info[var.default_env].cluster_name} ${local.parsed_gke_info[var.default_env].gke_project_id} ${var.github_user} ${var.namespace}" @@ -435,18 +527,19 @@ resource "null_resource" "create_git_cred_ns" { } } - depends_on = [ - google_gke_hub_feature_membership.feature_member, - null_resource.create_namespace - ] + triggers = { + md5_credentials = md5(join("", [var.github_user, var.github_token])) + md5_script = filemd5("${path.module}/scripts/create_git_cred.sh") + } } resource "null_resource" "install_ray_cluster" { count = var.install_ray_in_ns - triggers = { - md5_script = filemd5("${path.module}/scripts/install_ray_cluster.sh") - md5_files = md5(join("", [for f in fileset("${path.module}/templates/acm-template//templates/_namespace_template/app", "**") : md5("${path.module}/templates/acm-template//templates/_namespace_template/app/${f}")])) - } + + depends_on = [ + google_gke_hub_feature_membership.feature_member, + null_resource.create_git_cred_ns + ] provisioner "local-exec" { 
command = "${path.module}/scripts/install_ray_cluster.sh ${github_repository.acm_repo.full_name} ${var.github_email} ${var.github_org} ${var.github_user} ${var.namespace} ${google_service_account.namespace_default.email}" @@ -455,17 +548,20 @@ resource "null_resource" "install_ray_cluster" { } } - depends_on = [ - google_gke_hub_feature_membership.feature_member, - null_resource.create_git_cred_ns - ] + triggers = { + md5_files = md5(join("", [for f in fileset("${path.module}/templates/acm-template//templates/_namespace_template/app", "**") : md5("${path.module}/templates/acm-template//templates/_namespace_template/app/${f}")])) + md5_script = filemd5("${path.module}/scripts/install_ray_cluster.sh") + } } resource "null_resource" "manage_ray_ns" { count = var.install_ray_in_ns - triggers = { - md5_script = filemd5("${path.module}/scripts/manage_ray_ns.sh") - } + + depends_on = [ + google_gke_hub_feature_membership.feature_member, + null_resource.create_git_cred_ns, + null_resource.install_ray_cluster + ] provisioner "local-exec" { command = "${path.module}/scripts/manage_ray_ns.sh ${github_repository.acm_repo.full_name} ${var.github_email} ${var.github_org} ${var.github_user} ${var.namespace}" @@ -474,9 +570,7 @@ resource "null_resource" "manage_ray_ns" { } } - depends_on = [ - google_gke_hub_feature_membership.feature_member, - null_resource.create_git_cred_ns, - null_resource.install_ray_cluster - ] + triggers = { + md5_script = filemd5("${path.module}/scripts/manage_ray_ns.sh") + } } diff --git a/ml-platform/terraform/mlp.auto.tfvars b/ml-platform/terraform/mlp.auto.tfvars index 1b58c802a..f6671a20a 100644 --- a/ml-platform/terraform/mlp.auto.tfvars +++ b/ml-platform/terraform/mlp.auto.tfvars @@ -1,9 +1,5 @@ -project_id = { "dev" : "YOUR_PROJECT_ID" } default_env = "dev" -github_user = "YOUR_GITHUB_USER" -github_email = "YOUR_GITHUB_EMAIL" github_org = "YOUR_GITHUB_ORG" -#github_token = "DO NOT ADD TOKEN HERE. 
PASS IT AS `export TF_VAR_github_token="YOUR TOKEN"` ON CLOUDSHELL" -#folder_id = "YOUR_FOLDER_ID" -#org_id = "YOUR_GCP_ORG_ID" -#billing_account = "YOUR_BILLING_ACCOUNT" +github_email = "YOUR_GITHUB_EMAIL" +github_user = "YOUR_GITHUB_USER" +project_id = { "dev" : "YOUR_PROJECT_ID" } diff --git a/ml-platform/terraform/modules/cloud-nat/main.tf b/ml-platform/terraform/modules/cloud-nat/main.tf index 8efd57188..1b549d755 100644 --- a/ml-platform/terraform/modules/cloud-nat/main.tf +++ b/ml-platform/terraform/modules/cloud-nat/main.tf @@ -14,25 +14,25 @@ resource "random_string" "name_suffix" { length = 6 - upper = false special = false + upper = false } locals { - # intermediate locals - default_name = "cloud-nat-${random_string.name_suffix.result}" - # locals for google_compute_router_nat - nat_ip_allocate_option = length(var.nat_ips) > 0 ? "MANUAL_ONLY" : "AUTO_ONLY" + default_name = "cloud-nat-${random_string.name_suffix.result}" name = var.name != "" ? var.name : local.default_name + nat_ip_allocate_option = length(var.nat_ips) > 0 ? "MANUAL_ONLY" : "AUTO_ONLY" router = var.create_router ? google_compute_router.router[0].name : var.router } resource "google_compute_router" "router" { - count = var.create_router ? 1 : 0 + count = var.create_router ? 1 : 0 + name = var.router + network = var.network project = var.project_id region = var.region - network = var.network + bgp { asn = var.router_asn keepalive_interval = var.router_keepalive_interval @@ -40,31 +40,22 @@ resource "google_compute_router" "router" { } resource "google_compute_router_nat" "main" { - project = var.project_id - region = var.region - name = local.name - router = local.router - nat_ip_allocate_option = local.nat_ip_allocate_option - nat_ips = var.nat_ips - source_subnetwork_ip_ranges_to_nat = var.source_subnetwork_ip_ranges_to_nat - min_ports_per_vm = var.min_ports_per_vm - max_ports_per_vm = var.enable_dynamic_port_allocation ? 
var.max_ports_per_vm : null - udp_idle_timeout_sec = var.udp_idle_timeout_sec - icmp_idle_timeout_sec = var.icmp_idle_timeout_sec - tcp_established_idle_timeout_sec = var.tcp_established_idle_timeout_sec - tcp_transitory_idle_timeout_sec = var.tcp_transitory_idle_timeout_sec - tcp_time_wait_timeout_sec = var.tcp_time_wait_timeout_sec - enable_endpoint_independent_mapping = var.enable_endpoint_independent_mapping enable_dynamic_port_allocation = var.enable_dynamic_port_allocation - - dynamic "subnetwork" { - for_each = var.subnetworks - content { - name = subnetwork.value.name - source_ip_ranges_to_nat = subnetwork.value.source_ip_ranges_to_nat - secondary_ip_range_names = contains(subnetwork.value.source_ip_ranges_to_nat, "LIST_OF_SECONDARY_IP_RANGES") ? subnetwork.value.secondary_ip_range_names : [] - } - } + enable_endpoint_independent_mapping = var.enable_endpoint_independent_mapping + icmp_idle_timeout_sec = var.icmp_idle_timeout_sec + max_ports_per_vm = var.enable_dynamic_port_allocation ? var.max_ports_per_vm : null + min_ports_per_vm = var.min_ports_per_vm + name = local.name + nat_ip_allocate_option = local.nat_ip_allocate_option + nat_ips = var.nat_ips + project = var.project_id + region = var.region + router = local.router + source_subnetwork_ip_ranges_to_nat = var.source_subnetwork_ip_ranges_to_nat + tcp_established_idle_timeout_sec = var.tcp_established_idle_timeout_sec + tcp_time_wait_timeout_sec = var.tcp_time_wait_timeout_sec + tcp_transitory_idle_timeout_sec = var.tcp_transitory_idle_timeout_sec + udp_idle_timeout_sec = var.udp_idle_timeout_sec dynamic "log_config" { for_each = var.log_config_enable == true ? 
[{ @@ -77,4 +68,13 @@ resource "google_compute_router_nat" "main" { filter = log_config.value.filter } } + + dynamic "subnetwork" { + for_each = var.subnetworks + content { + name = subnetwork.value.name + source_ip_ranges_to_nat = subnetwork.value.source_ip_ranges_to_nat + secondary_ip_range_names = contains(subnetwork.value.source_ip_ranges_to_nat, "LIST_OF_SECONDARY_IP_RANGES") ? subnetwork.value.secondary_ip_range_names : [] + } + } } diff --git a/ml-platform/terraform/modules/cloud-nat/outputs.tf b/ml-platform/terraform/modules/cloud-nat/outputs.tf index 86bf7c39d..acd7f8ce6 100644 --- a/ml-platform/terraform/modules/cloud-nat/outputs.tf +++ b/ml-platform/terraform/modules/cloud-nat/outputs.tf @@ -31,4 +31,3 @@ output "router_name" { description = "Cloud NAT router name" value = local.router } - diff --git a/ml-platform/terraform/modules/cloud-nat/variables.tf b/ml-platform/terraform/modules/cloud-nat/variables.tf index 84cd6fbbb..a329cfbfb 100644 --- a/ml-platform/terraform/modules/cloud-nat/variables.tf +++ b/ml-platform/terraform/modules/cloud-nat/variables.tf @@ -12,134 +12,135 @@ # See the License for the specific language governing permissions and # limitations under the License. -variable "project_id" { - type = string - description = "The project ID to deploy to" +variable "create_router" { + default = false + description = "Create router instead of using an existing one, uses 'router' variable for new resource name." + type = bool } -variable "region" { - type = string - description = "The region to deploy to" +variable "enable_dynamic_port_allocation" { + default = false + description = "Enable Dynamic Port Allocation. If minPorts is set, minPortsPerVm must be set to a power of two greater than or equal to 32." + type = bool +} + +variable "enable_endpoint_independent_mapping" { + default = null + description = "Specifies if endpoint independent mapping is enabled." 
+ type = bool } variable "icmp_idle_timeout_sec" { - type = string - description = "Timeout (in seconds) for ICMP connections. Defaults to 30s if not set. Changing this forces a new NAT to be created." default = "30" + description = "Timeout (in seconds) for ICMP connections. Defaults to 30s if not set. Changing this forces a new NAT to be created." + type = string } -variable "min_ports_per_vm" { +variable "log_config_enable" { + default = false + description = "Indicates whether or not to export logs" + type = bool +} + +variable "log_config_filter" { + default = "ALL" + description = "Specifies the desired filtering of logs on this NAT. Valid values are: \"ERRORS_ONLY\", \"TRANSLATIONS_ONLY\", \"ALL\"" type = string - description = "Minimum number of ports allocated to a VM from this NAT config. Defaults to 64 if not set. Changing this forces a new NAT to be created." - default = "64" } variable "max_ports_per_vm" { - type = string - description = "Maximum number of ports allocated to a VM from this NAT. This field can only be set when enableDynamicPortAllocation is enabled.This will be ignored if enable_dynamic_port_allocation is set to false." default = null + description = "Maximum number of ports allocated to a VM from this NAT. This field can only be set when enableDynamicPortAllocation is enabled.This will be ignored if enable_dynamic_port_allocation is set to false." + type = string } -variable "name" { +variable "min_ports_per_vm" { + default = "64" + description = "Minimum number of ports allocated to a VM from this NAT config. Defaults to 64 if not set. Changing this forces a new NAT to be created." type = string - description = "Defaults to 'cloud-nat-RANDOM_SUFFIX'. Changing this forces a new NAT to be created." +} + +variable "name" { default = "" + description = "Defaults to 'cloud-nat-RANDOM_SUFFIX'. Changing this forces a new NAT to be created." 
+ type = string } variable "nat_ips" { - type = list(string) - description = "List of self_links of external IPs. Changing this forces a new NAT to be created. Value of `nat_ip_allocate_option` is inferred based on nat_ips. If present set to MANUAL_ONLY, otherwise AUTO_ONLY." default = [] + description = "List of self_links of external IPs. Changing this forces a new NAT to be created. Value of `nat_ip_allocate_option` is inferred based on nat_ips. If present set to MANUAL_ONLY, otherwise AUTO_ONLY." + type = list(string) } variable "network" { - type = string - description = "VPN name, only if router is not passed in and is created by the module." default = "" -} - -variable "create_router" { - type = bool - description = "Create router instead of using an existing one, uses 'router' variable for new resource name." - default = false -} - -variable "router" { - type = string - description = "The name of the router in which this NAT will be configured. Changing this forces a new NAT to be created." -} - -variable "router_asn" { + description = "VPN name, only if router is not passed in and is created by the module." type = string - description = "Router ASN, only if router is not passed in and is created by the module." - default = "64514" } -variable "router_keepalive_interval" { +variable "project_id" { + description = "The project ID to deploy to" type = string - description = "Router keepalive_interval, only if router is not passed in and is created by the module." - default = "20" } -variable "source_subnetwork_ip_ranges_to_nat" { +variable "region" { + description = "The region to deploy to" type = string - description = "Defaults to ALL_SUBNETWORKS_ALL_IP_RANGES. How NAT should be configured per Subnetwork. Valid values include: ALL_SUBNETWORKS_ALL_IP_RANGES, ALL_SUBNETWORKS_ALL_PRIMARY_IP_RANGES, LIST_OF_SUBNETWORKS. Changing this forces a new NAT to be created." 
- default = "ALL_SUBNETWORKS_ALL_IP_RANGES" } -variable "tcp_established_idle_timeout_sec" { +variable "router" { + description = "The name of the router in which this NAT will be configured. Changing this forces a new NAT to be created." type = string - description = "Timeout (in seconds) for TCP established connections. Defaults to 1200s if not set. Changing this forces a new NAT to be created." - default = "1200" } -variable "tcp_transitory_idle_timeout_sec" { +variable "router_asn" { + default = "64514" + description = "Router ASN, only if router is not passed in and is created by the module." type = string - description = "Timeout (in seconds) for TCP transitory connections. Defaults to 30s if not set. Changing this forces a new NAT to be created." - default = "30" } -variable "tcp_time_wait_timeout_sec" { +variable "router_keepalive_interval" { + default = "20" + description = "Router keepalive_interval, only if router is not passed in and is created by the module." type = string - description = "Timeout (in seconds) for TCP connections that are in TIME_WAIT state. Defaults to 120s if not set." - default = "120" } -variable "udp_idle_timeout_sec" { +variable "source_subnetwork_ip_ranges_to_nat" { + default = "ALL_SUBNETWORKS_ALL_IP_RANGES" + description = "Defaults to ALL_SUBNETWORKS_ALL_IP_RANGES. How NAT should be configured per Subnetwork. Valid values include: ALL_SUBNETWORKS_ALL_IP_RANGES, ALL_SUBNETWORKS_ALL_PRIMARY_IP_RANGES, LIST_OF_SUBNETWORKS. Changing this forces a new NAT to be created." type = string - description = "Timeout (in seconds) for UDP connections. Defaults to 30s if not set. Changing this forces a new NAT to be created." 
- default = "30" } variable "subnetworks" { + default = [] description = "Specifies one or more subnetwork NAT configurations" type = list(object({ name = string, - source_ip_ranges_to_nat = list(string) secondary_ip_range_names = list(string) + source_ip_ranges_to_nat = list(string) })) - default = [] } -variable "log_config_enable" { - type = bool - description = "Indicates whether or not to export logs" - default = false -} -variable "log_config_filter" { +variable "tcp_established_idle_timeout_sec" { + default = "1200" + description = "Timeout (in seconds) for TCP established connections. Defaults to 1200s if not set. Changing this forces a new NAT to be created." type = string - description = "Specifies the desired filtering of logs on this NAT. Valid values are: \"ERRORS_ONLY\", \"TRANSLATIONS_ONLY\", \"ALL\"" - default = "ALL" } -variable "enable_dynamic_port_allocation" { - type = bool - description = "Enable Dynamic Port Allocation. If minPorts is set, minPortsPerVm must be set to a power of two greater than or equal to 32." - default = false +variable "tcp_time_wait_timeout_sec" { + default = "120" + description = "Timeout (in seconds) for TCP connections that are in TIME_WAIT state. Defaults to 120s if not set." + type = string +} +variable "tcp_transitory_idle_timeout_sec" { + default = "30" + description = "Timeout (in seconds) for TCP transitory connections. Defaults to 30s if not set. Changing this forces a new NAT to be created." + type = string } -variable "enable_endpoint_independent_mapping" { - type = bool - description = "Specifies if endpoint independent mapping is enabled." - default = null + +variable "udp_idle_timeout_sec" { + default = "30" + description = "Timeout (in seconds) for UDP connections. Defaults to 30s if not set. Changing this forces a new NAT to be created." 
+ type = string } diff --git a/ml-platform/terraform/modules/cloud-nat/versions.tf b/ml-platform/terraform/modules/cloud-nat/versions.tf index a6e8142dd..bb043b786 100644 --- a/ml-platform/terraform/modules/cloud-nat/versions.tf +++ b/ml-platform/terraform/modules/cloud-nat/versions.tf @@ -12,24 +12,12 @@ # See the License for the specific language governing permissions and # limitations under the License. -//terraform { -// required_providers { -// -// google = { -// source = "hashicorp/google" -// #version = ">= 4.51, < 5.0" -// version = "4.72.1" -// } -// -// random = { -// source = "hashicorp/random" -// version = "~> 2.2" -// } -// } -// -//} terraform { required_providers { + github = { + source = "integrations/github" + version = "6.0.1" + } google = { source = "hashicorp/google" version = "5.19.0" @@ -38,10 +26,6 @@ terraform { source = "hashicorp/google-beta" version = "5.19.0" } - github = { - source = "integrations/github" - version = "6.0.1" - } random = { source = "hashicorp/random" version = "2.2" diff --git a/ml-platform/terraform/modules/cluster/gke.tf b/ml-platform/terraform/modules/cluster/gke.tf index e774a2be4..3bbfb8099 100644 --- a/ml-platform/terraform/modules/cluster/gke.tf +++ b/ml-platform/terraform/modules/cluster/gke.tf @@ -19,91 +19,105 @@ data "google_project" "project" { } resource "google_container_cluster" "mlp" { - provider = google-beta + provider = google-beta + deletion_protection = false - name = var.cluster_name - project = var.project_id + initial_node_count = 2 location = var.region + name = var.cluster_name network = var.network - subnetwork = var.subnet node_locations = ["${var.region}-a", "${var.region}-b", "${var.region}-c"] - initial_node_count = 2 - workload_identity_config { - workload_pool = "${var.project_id}.svc.id.goog" - } + project = var.project_id + subnetwork = var.subnet + addons_config { gcp_filestore_csi_driver_config { enabled = true } + gcs_fuse_csi_driver_config { enabled = true } + 
gce_persistent_disk_csi_driver_config { enabled = true } } + cluster_autoscaling { - enabled = true autoscaling_profile = "OPTIMIZE_UTILIZATION" + enabled = true + + auto_provisioning_defaults { + oauth_scopes = [ + "https://www.googleapis.com/auth/cloud-platform" + ] + + management { + auto_repair = true + auto_upgrade = true + } + + upgrade_settings { + max_surge = 0 + max_unavailable = 1 + strategy = "SURGE" + } + } + resource_limits { resource_type = "cpu" minimum = 4 maximum = 600 } + resource_limits { resource_type = "memory" minimum = 16 maximum = 2400 } + resource_limits { - resource_type = "nvidia-tesla-t4" - maximum = 300 + resource_type = "nvidia-a100-80gb" + maximum = 30 } + resource_limits { resource_type = "nvidia-l4" maximum = 30 } + + resource_limits { + resource_type = "nvidia-tesla-t4" + maximum = 300 + } + resource_limits { resource_type = "nvidia-tesla-a100" maximum = 50 } + resource_limits { - resource_type = "nvidia-a100-80gb" + resource_type = "nvidia-tesla-k80" maximum = 30 } + resource_limits { - resource_type = "nvidia-tesla-v100" + resource_type = "nvidia-tesla-p4" maximum = 30 } + resource_limits { resource_type = "nvidia-tesla-p100" maximum = 30 } + resource_limits { - resource_type = "nvidia-tesla-p4" - maximum = 30 - } - resource_limits { - resource_type = "nvidia-tesla-k80" + resource_type = "nvidia-tesla-v100" maximum = 30 } - auto_provisioning_defaults { - management { - auto_repair = true - auto_upgrade = true - } - - upgrade_settings { - strategy = "SURGE" - max_surge = 0 - max_unavailable = 1 - } - - oauth_scopes = [ - "https://www.googleapis.com/auth/cloud-platform" - ] - } } + logging_config { enable_components = [ "APISERVER", @@ -113,6 +127,17 @@ resource "google_container_cluster" "mlp" { "WORKLOADS" ] } + + ip_allocation_policy { + } + + master_authorized_networks_config { + cidr_blocks { + cidr_block = var.master_auth_networks_ipcidr + display_name = "vpc-cidr" + } + } + monitoring_config { enable_components = [ 
"APISERVER", @@ -126,10 +151,12 @@ resource "google_container_cluster" "mlp" { "STORAGE", "SYSTEM_COMPONENTS" ] + managed_prometheus { enabled = true } } + node_pool_defaults { node_config_defaults { gcfs_config { @@ -137,20 +164,18 @@ resource "google_container_cluster" "mlp" { } } } - release_channel { - channel = "STABLE" - } + private_cluster_config { enable_private_nodes = true enable_private_endpoint = true master_ipv4_cidr_block = "172.16.0.32/28" } - master_authorized_networks_config { - cidr_blocks { - cidr_block = var.master_auth_networks_ipcidr - display_name = "vpc-cidr" - } + release_channel { + channel = "STABLE" + } + + workload_identity_config { + workload_pool = "${var.project_id}.svc.id.goog" } - ip_allocation_policy {} } diff --git a/ml-platform/terraform/modules/cluster/outputs.tf b/ml-platform/terraform/modules/cluster/outputs.tf index fd87e1c6f..9c813071a 100644 --- a/ml-platform/terraform/modules/cluster/outputs.tf +++ b/ml-platform/terraform/modules/cluster/outputs.tf @@ -16,7 +16,6 @@ output "cluster_id" { value = google_container_cluster.mlp.id } - output "cluster_location" { value = google_container_cluster.mlp.location } diff --git a/ml-platform/terraform/modules/cluster/variables.tf b/ml-platform/terraform/modules/cluster/variables.tf index 6eccda35b..d54153b0f 100644 --- a/ml-platform/terraform/modules/cluster/variables.tf +++ b/ml-platform/terraform/modules/cluster/variables.tf @@ -12,47 +12,46 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-variable "project_id" { - type = string - description = "The GCP project where the resources will be created" - default = "" -} variable "cluster_name" { - type = string - description = "GKE cluster name" default = "" -} - -variable "region" { + description = "GKE cluster name" type = string - description = "The GCP region where the GKE cluster will be created" - default = "us-central1" } -variable "zone" { +variable "env" { + description = "environment" type = string - description = "The GCP zone where the reservation will be created" - default = "us-central1-a" } variable "master_auth_networks_ipcidr" { - type = string description = "master authorized network" + type = string } variable "network" { - type = string description = "VPC network where the cluster will be created" + type = string } -variable "subnet" { +variable "project_id" { + default = "" + description = "The GCP project where the resources will be created" type = string - description = "subnetwork where the cluster will be created" +} +variable "region" { + default = "us-central1" + description = "The GCP region where the GKE cluster will be created" + type = string } -variable "env" { +variable "subnet" { + description = "subnetwork where the cluster will be created" type = string - description = "environment" +} -} \ No newline at end of file +variable "zone" { + default = "us-central1-a" + description = "The GCP zone where the reservation will be created" + type = string +} diff --git a/ml-platform/terraform/modules/cluster/versions.tf b/ml-platform/terraform/modules/cluster/versions.tf index d4aada15b..b19f861ad 100644 --- a/ml-platform/terraform/modules/cluster/versions.tf +++ b/ml-platform/terraform/modules/cluster/versions.tf @@ -12,19 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-//terraform { -// required_providers { -// google-beta = { -// source = "hashicorp/google-beta" -// version = "4.72.1" -// } -// google = { -// source = "hashicorp/google" -// version = "4.72.1" -// } -// } -//} - terraform { required_providers { google = { diff --git a/ml-platform/terraform/modules/network/outputs.tf b/ml-platform/terraform/modules/network/outputs.tf index bf9d36dad..d05cb77d0 100644 --- a/ml-platform/terraform/modules/network/outputs.tf +++ b/ml-platform/terraform/modules/network/outputs.tf @@ -12,17 +12,17 @@ # See the License for the specific language governing permissions and # limitations under the License. -output "vpc" { - value = google_compute_network.vpc-network.id - description = "VPC." -} - output "subnet-1" { - value = google_compute_subnetwork.subnet-1.id description = "subnet1." + value = google_compute_subnetwork.subnet-1.id } output "subnet-2" { - value = google_compute_subnetwork.subnet-2.id description = "subnet2." -} \ No newline at end of file + value = google_compute_subnetwork.subnet-2.id +} + +output "vpc" { + description = "VPC." + value = google_compute_network.vpc-network.id +} diff --git a/ml-platform/terraform/modules/network/variables.tf b/ml-platform/terraform/modules/network/variables.tf index c7c12296c..44a83c9bb 100644 --- a/ml-platform/terraform/modules/network/variables.tf +++ b/ml-platform/terraform/modules/network/variables.tf @@ -13,44 +13,47 @@ # limitations under the License. variable "project_id" { - description = "Id of the GCP project where VPC is to be created." type = string + description = "Id of the GCP project where VPC is to be created." } + variable "network_name" { - description = "Name of the VPC network." type = string + description = "Name of the VPC network." } + variable "routing_mode" { - description = "The network routing mode." - type = string default = "GLOBAL" -} -variable "subnet_01_name" { - description = "Name of first subnet." + description = "The network routing mode." 
type = string } + variable "subnet_01_ip" { - description = "IP range of first subnet." type = string + description = "IP range of first subnet." } -variable "subnet_01_region" { - description = "Region of first subnet." + +variable "subnet_01_name" { type = string + description = "Name of first subnet." } -variable "subnet_02_name" { - description = "Name of the second subnet." +variable "subnet_01_region" { type = string + description = "Region of first subnet." } + variable "subnet_02_ip" { + type = string description = "IP range of second subnet." +} + +variable "subnet_02_name" { type = string + description = "Name of the second subnet." } + variable "subnet_02_region" { - description = "Region of second subnet." type = string + description = "Region of second subnet." } -//variable "default_route_name" { -// description = "Name of the default route to internet." -// type = string -//} diff --git a/ml-platform/terraform/modules/network/versions.tf b/ml-platform/terraform/modules/network/versions.tf index e2e5241f2..466fd04d7 100644 --- a/ml-platform/terraform/modules/network/versions.tf +++ b/ml-platform/terraform/modules/network/versions.tf @@ -12,15 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. -//terraform { -// required_providers { -// google = { -// source = "hashicorp/google" -// version = ">= 4.28.0" -// } -// } -//} - terraform { required_providers { google = { diff --git a/ml-platform/terraform/modules/network/vpc.tf b/ml-platform/terraform/modules/network/vpc.tf index 37266b5d2..a573374b4 100644 --- a/ml-platform/terraform/modules/network/vpc.tf +++ b/ml-platform/terraform/modules/network/vpc.tf @@ -13,34 +13,26 @@ # limitations under the License. 
resource "google_compute_network" "vpc-network" { - project = var.project_id - name = var.network_name auto_create_subnetworks = false + name = var.network_name + project = var.project_id routing_mode = var.routing_mode } resource "google_compute_subnetwork" "subnet-1" { - project = var.project_id - name = var.subnet_01_name ip_cidr_range = var.subnet_01_ip - region = var.subnet_01_region + name = var.subnet_01_name network = google_compute_network.vpc-network.id private_ip_google_access = true + project = var.project_id + region = var.subnet_01_region } resource "google_compute_subnetwork" "subnet-2" { - project = var.project_id - name = var.subnet_02_name ip_cidr_range = var.subnet_02_ip - region = var.subnet_02_region + name = var.subnet_02_name network = google_compute_network.vpc-network.id private_ip_google_access = true + project = var.project_id + region = var.subnet_02_region } - -//resource "google_compute_route" "default-route" { -//name = var.default_route_name -//dest_range = "0.0.0.0/0" -//network = google_compute_network.vpc-network.id -//priority = 1000 -//next_hop_gateway = "default-internet-gateway" -//} diff --git a/ml-platform/terraform/modules/node-pools/nodepools.tf b/ml-platform/terraform/modules/node-pools/nodepools.tf index d8535b8ce..26ca6d92e 100644 --- a/ml-platform/terraform/modules/node-pools/nodepools.tf +++ b/ml-platform/terraform/modules/node-pools/nodepools.tf @@ -13,34 +13,39 @@ # limitations under the License. 
resource "google_container_node_pool" "node-pool" { - name = format("%s-%s", var.cluster_name, var.node_pool_name) - project = var.project_id cluster = var.cluster_name location = var.region + name = format("%s-%s", var.cluster_name, var.node_pool_name) + project = var.project_id + + autoscaling { + location_policy = var.autoscaling["location_policy"] + total_max_node_count = var.autoscaling["total_max_node_count"] + total_min_node_count = var.autoscaling["total_min_node_count"] + } + + network_config { + enable_private_nodes = true + } + node_config { - gcfs_config { - enabled = true - } machine_type = var.machine_type - dynamic "taint" { - for_each = var.taints - content { - key = taint.value.key - value = taint.value.value - effect = taint.value.effect - } - } + oauth_scopes = [ + "https://www.googleapis.com/auth/cloud-platform" + ] + labels = { "resource-type" : var.resource_type } + gcfs_config { + enabled = true + } + guest_accelerator { - type = var.accelerator count = var.accelerator_count + type = var.accelerator } - oauth_scopes = [ - "https://www.googleapis.com/auth/cloud-platform" - ] dynamic "reservation_affinity" { for_each = var.reservation_name != "" ? 
[1] : [] @@ -50,16 +55,15 @@ resource "google_container_node_pool" "node-pool" { values = [var.reservation_name] } } - } - autoscaling { - total_min_node_count = var.autoscaling["total_min_node_count"] - total_max_node_count = var.autoscaling["total_max_node_count"] - location_policy = var.autoscaling["location_policy"] - } - timeouts { - create = "30m" - update = "20m" + dynamic "taint" { + for_each = var.taints + content { + effect = taint.value.effect + key = taint.value.key + value = taint.value.value + } + } } lifecycle { @@ -68,7 +72,9 @@ resource "google_container_node_pool" "node-pool" { node_config[0].taint, ] } - network_config { - enable_private_nodes = true + + timeouts { + create = "30m" + update = "20m" } } diff --git a/ml-platform/terraform/modules/node-pools/variables.tf b/ml-platform/terraform/modules/node-pools/variables.tf index 61fb2504b..f9cf7e59a 100644 --- a/ml-platform/terraform/modules/node-pools/variables.tf +++ b/ml-platform/terraform/modules/node-pools/variables.tf @@ -12,72 +12,79 @@ # See the License for the specific language governing permissions and # limitations under the License. -variable "node_pool_name" { +variable "accelerator" { + default = "nvidia-l4" + description = "The GPU accelerator to use." type = string - description = "Name of the node pool" } -variable "project_id" { - type = string - description = "The GCP project where the resources will be created" - default = "" + +variable "accelerator_count" { + default = 2 + description = "The number of accelerators per machine." 
+ type = number +} + +variable "autoscaling" { + default = { + "total_min_node_count" : 0, + "total_max_node_count" : 24, + "location_policy" : "ANY" + } + type = map(any) } + variable "cluster_name" { - type = string - description = "GKE cluster name" default = "" -} -variable "region" { + description = "GKE cluster name" type = string - description = "The GCP zone where the reservation will be created" - default = "us-central1-a" +} + +variable "machine_reservation_count" { + default = 4 + description = "Number of machines reserved instances with GPUs" + type = number } variable "machine_type" { - type = string - description = "The machine type to use." default = "g2-standard-24" + description = "The machine type to use." + type = string } -variable "taints" { - description = "Taints to be applied to the on-demand node pool." - type = list(object({ - key = string - value = any - effect = string - })) +variable "node_pool_name" { + description = "Name of the node pool" + type = string } -variable "resource_type" { - description = "ondemand/spot/reserved." +variable "project_id" { + default = "" + description = "The GCP project where the resources will be created" type = string - default = "ondemand" } - -variable "accelerator" { +variable "region" { + default = "us-central1-a" + description = "The GCP zone where the reservation will be created" type = string - description = "The GPU accelerator to use." - default = "nvidia-l4" } -variable "accelerator_count" { - type = number - description = "The number of accelerators per machine." 
- default = 2 -} -variable "machine_reservation_count" { - type = number - description = "Number of machines reserved instances with GPUs" - default = 4 +variable "reservation_name" { + default = "" + description = "reservation name to which the nodepool will be associated" + type = string } -variable "autoscaling" { - type = map(any) - default = { "total_min_node_count" : 0, "total_max_node_count" : 24, "location_policy" : "ANY" } +variable "resource_type" { + default = "ondemand" + description = "ondemand/spot/reserved." + type = string } -variable "reservation_name" { - description = "reservation name to which the nodepool will be associated" - type = string - default = "" +variable "taints" { + description = "Taints to be applied to the on-demand node pool." + type = list(object({ + effect = string + key = string + value = any + })) } diff --git a/ml-platform/terraform/modules/node-pools/versions.tf b/ml-platform/terraform/modules/node-pools/versions.tf index d4aada15b..b19f861ad 100644 --- a/ml-platform/terraform/modules/node-pools/versions.tf +++ b/ml-platform/terraform/modules/node-pools/versions.tf @@ -12,19 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. -//terraform { -// required_providers { -// google-beta = { -// source = "hashicorp/google-beta" -// version = "4.72.1" -// } -// google = { -// source = "hashicorp/google" -// version = "4.72.1" -// } -// } -//} - terraform { required_providers { google = { diff --git a/ml-platform/terraform/modules/projects/outputs.tf b/ml-platform/terraform/modules/projects/outputs.tf index e087e6c85..431fe53dd 100644 --- a/ml-platform/terraform/modules/projects/outputs.tf +++ b/ml-platform/terraform/modules/projects/outputs.tf @@ -14,4 +14,4 @@ output "project_ids" { value = "${google_project.project_under_folder}" == {} ? 
"${google_project.project_under_org}" : "${google_project.project_under_folder}" -} \ No newline at end of file +} diff --git a/ml-platform/terraform/modules/projects/projects.tf b/ml-platform/terraform/modules/projects/projects.tf index 2b5c6b020..39250a5f4 100644 --- a/ml-platform/terraform/modules/projects/projects.tf +++ b/ml-platform/terraform/modules/projects/projects.tf @@ -17,80 +17,96 @@ resource "random_id" "random_project_id_suffix" { } resource "google_project" "project_under_folder" { - for_each = var.folder_id != null ? var.env : toset([]) + for_each = var.folder_id != null ? var.env : toset([]) + + billing_account = var.billing_account + folder_id = var.folder_id name = format("%s-%s", var.project_name, each.value) project_id = format("%s-%s-%s", var.project_name, random_id.random_project_id_suffix.hex, each.value) - folder_id = var.folder_id - billing_account = var.billing_account } resource "google_project" "project_under_org" { - for_each = var.folder_id == null ? var.env : toset([]) + for_each = var.folder_id == null ? var.env : toset([]) + + billing_account = var.billing_account name = format("%s-%s", var.project_name, each.value) - project_id = format("%s-%s-%s", var.project_name, random_id.random_project_id_suffix.hex, each.value) org_id = var.org_id - billing_account = var.billing_account + project_id = format("%s-%s-%s", var.project_name, random_id.random_project_id_suffix.hex, each.value) } resource "google_project_service" "project_services" { - for_each = var.folder_id == null ? google_project.project_under_org : google_project.project_under_folder + for_each = var.folder_id == null ? 
google_project.project_under_org : google_project.project_under_folder + + depends_on = [google_project.project_under_folder, google_project.project_under_org] + + disable_dependent_services = true + disable_on_destroy = true project = each.value.id service = "cloudresourcemanager.googleapis.com" - disable_on_destroy = true - disable_dependent_services = true - depends_on = [google_project.project_under_folder, google_project.project_under_org] } resource "google_project_service" "project_services-1" { - for_each = var.folder_id == null ? google_project.project_under_org : google_project.project_under_folder + for_each = var.folder_id == null ? google_project.project_under_org : google_project.project_under_folder + + depends_on = [google_project.project_under_folder, google_project.project_under_org] + + disable_dependent_services = true + disable_on_destroy = true project = each.value.id service = "iam.googleapis.com" - disable_on_destroy = true - disable_dependent_services = true - depends_on = [google_project.project_under_folder, google_project.project_under_org] } resource "google_project_service" "project_services-2" { - for_each = var.folder_id == null ? google_project.project_under_org : google_project.project_under_folder + for_each = var.folder_id == null ? google_project.project_under_org : google_project.project_under_folder + + depends_on = [google_project.project_under_folder, google_project.project_under_org] + + disable_dependent_services = true + disable_on_destroy = true project = each.value.id service = "container.googleapis.com" - disable_on_destroy = true - disable_dependent_services = true - depends_on = [google_project.project_under_folder, google_project.project_under_org] } resource "google_project_service" "project_services-3" { - for_each = var.folder_id == null ? google_project.project_under_org : google_project.project_under_folder + for_each = var.folder_id == null ? 
google_project.project_under_org : google_project.project_under_folder + + depends_on = [google_project.project_under_folder, google_project.project_under_org] + + disable_dependent_services = true + disable_on_destroy = true project = each.value.id service = "compute.googleapis.com" - disable_on_destroy = true - disable_dependent_services = true - depends_on = [google_project.project_under_folder, google_project.project_under_org] } resource "google_project_service" "project_services-4" { - for_each = var.folder_id == null ? google_project.project_under_org : google_project.project_under_folder + for_each = var.folder_id == null ? google_project.project_under_org : google_project.project_under_folder + + depends_on = [google_project.project_under_folder, google_project.project_under_org] + + disable_dependent_services = true + disable_on_destroy = true project = each.value.id service = "anthos.googleapis.com" - disable_on_destroy = true - disable_dependent_services = true - depends_on = [google_project.project_under_folder, google_project.project_under_org] } resource "google_project_service" "project_services-5" { - for_each = var.folder_id == null ? google_project.project_under_org : google_project.project_under_folder + for_each = var.folder_id == null ? google_project.project_under_org : google_project.project_under_folder + + depends_on = [google_project.project_under_folder, google_project.project_under_org] + + disable_dependent_services = true + disable_on_destroy = true project = each.value.id service = "anthosconfigmanagement.googleapis.com" - disable_on_destroy = true - disable_dependent_services = true - depends_on = [google_project.project_under_folder, google_project.project_under_org] } resource "google_project_service" "project_services-6" { - for_each = var.folder_id == null ? google_project.project_under_org : google_project.project_under_folder + for_each = var.folder_id == null ? 
google_project.project_under_org : google_project.project_under_folder + + depends_on = [google_project.project_under_folder, google_project.project_under_org] + + disable_dependent_services = true + disable_on_destroy = true project = each.value.id service = "gkehub.googleapis.com" - disable_on_destroy = true - disable_dependent_services = true - depends_on = [google_project.project_under_folder, google_project.project_under_org] -} \ No newline at end of file +} diff --git a/ml-platform/terraform/modules/projects/variables.tf b/ml-platform/terraform/modules/projects/variables.tf index 91f8cd0f9..378b5610b 100644 --- a/ml-platform/terraform/modules/projects/variables.tf +++ b/ml-platform/terraform/modules/projects/variables.tf @@ -12,32 +12,32 @@ # See the License for the specific language governing permissions and # limitations under the License. -variable "org_id" { - type = string - description = "The GCP orig id" +variable "billing_account" { default = "" + description = "GCP billing account" + type = string } variable "env" { - type = set(string) - description = "List of environments" default = ["dev"] + description = "List of environments" + type = set(string) } variable "folder_id" { - type = string - description = "Folder id where the GCP projects will be created" default = null + description = "Folder id where the GCP projects will be created" + type = string } -variable "billing_account" { - type = string - description = "GCP billing account" +variable "org_id" { default = "" + type = string + description = "The GCP orig id" } variable "project_name" { - type = string - description = "GCP project name" default = "" + description = "GCP project name" + type = string } diff --git a/ml-platform/terraform/modules/projects/versions.tf b/ml-platform/terraform/modules/projects/versions.tf index e2e5241f2..466fd04d7 100644 --- a/ml-platform/terraform/modules/projects/versions.tf +++ b/ml-platform/terraform/modules/projects/versions.tf @@ -12,15 +12,6 @@ # See 
the License for the specific language governing permissions and # limitations under the License. -//terraform { -// required_providers { -// google = { -// source = "hashicorp/google" -// version = ">= 4.28.0" -// } -// } -//} - terraform { required_providers { google = { diff --git a/ml-platform/terraform/modules/vm-reservations/outputs.tf b/ml-platform/terraform/modules/vm-reservations/outputs.tf index 5a4562e1a..11ffcc6d8 100644 --- a/ml-platform/terraform/modules/vm-reservations/outputs.tf +++ b/ml-platform/terraform/modules/vm-reservations/outputs.tf @@ -14,4 +14,4 @@ output "reservation_name" { value = split("/", google_compute_reservation.machine_reservation.id)[5] -} \ No newline at end of file +} diff --git a/ml-platform/terraform/modules/vm-reservations/reservations.tf b/ml-platform/terraform/modules/vm-reservations/reservations.tf index 177b0d384..d7a0c1ad3 100644 --- a/ml-platform/terraform/modules/vm-reservations/reservations.tf +++ b/ml-platform/terraform/modules/vm-reservations/reservations.tf @@ -13,18 +13,21 @@ # limitations under the License. 
resource "google_compute_reservation" "machine_reservation" { + name = format("%s-%s", var.cluster_name, "reservation") project = var.project_id specific_reservation_required = true - name = format("%s-%s", var.cluster_name, "reservation") zone = var.zone + specific_reservation { count = var.machine_reservation_count + instance_properties { machine_type = var.machine_type + guest_accelerators { - accelerator_type = var.accelerator accelerator_count = var.accelerator_count + accelerator_type = var.accelerator } } } -} \ No newline at end of file +} diff --git a/ml-platform/terraform/modules/vm-reservations/variables.tf b/ml-platform/terraform/modules/vm-reservations/variables.tf index 3a8e3482d..c534c75a6 100644 --- a/ml-platform/terraform/modules/vm-reservations/variables.tf +++ b/ml-platform/terraform/modules/vm-reservations/variables.tf @@ -12,40 +12,44 @@ # See the License for the specific language governing permissions and # limitations under the License. -variable "project_id" { +variable "accelerator" { + default = "nvidia-l4" + description = "The GPU accelerator to use." type = string - description = "The GCP project where the resources will be created" - default = "" } + +variable "accelerator_count" { + default = 2 + description = "The number of accelerators per machine." + type = number +} + variable "cluster_name" { - type = string - description = "GKE cluster name" default = "" -} -variable "zone" { + description = "GKE cluster name" type = string - description = "The GCP zone where the reservation will be created" - default = "us-central1-a" } + +variable "machine_reservation_count" { + default = 2 + description = "Number of machines reserved instances with GPUs" + type = number +} + variable "machine_type" { - type = string - description = "The machine type to use." default = "g2-standard-24" + description = "The machine type to use." 
+ type = string } -variable "accelerator" { +variable "project_id" { + default = "" + description = "The GCP project where the resources will be created" type = string - description = "The GPU accelerator to use." - default = "nvidia-l4" } -variable "accelerator_count" { - type = number - description = "The number of accelerators per machine." - default = 2 -} -variable "machine_reservation_count" { - type = number - description = "Number of machines reserved instances with GPUs" - default = 2 +variable "zone" { + default = "us-central1-a" + description = "The GCP zone where the reservation will be created" + type = string } diff --git a/ml-platform/terraform/modules/vm-reservations/versions.tf b/ml-platform/terraform/modules/vm-reservations/versions.tf index 7f4362ad6..b19f861ad 100644 --- a/ml-platform/terraform/modules/vm-reservations/versions.tf +++ b/ml-platform/terraform/modules/vm-reservations/versions.tf @@ -12,19 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-//terraform { -// required_providers { -// google-beta = { -// source = "hashicorp/google-beta" -// version = "4.72.1" -// } -// google = { -// source = "hashicorp/google" -// version = "4.72.1" -// } -// } -//} - terraform { required_providers { google = { @@ -36,4 +23,4 @@ terraform { version = "5.19.0" } } -} \ No newline at end of file +} diff --git a/ml-platform/terraform/versions.tf b/ml-platform/terraform/versions.tf index 4f0c767da..1b4d21b72 100644 --- a/ml-platform/terraform/versions.tf +++ b/ml-platform/terraform/versions.tf @@ -14,6 +14,10 @@ terraform { required_providers { + github = { + source = "integrations/github" + version = "6.0.1" + } google = { source = "hashicorp/google" version = "5.19.0" @@ -22,16 +26,11 @@ terraform { source = "hashicorp/google-beta" version = "5.19.0" } - github = { - source = "integrations/github" - version = "6.0.1" - } null = { source = "hashicorp/null" version = "3.2.2" } } - } provider "github" { From 44a23b08af1462125aab922e789b388eba0dd3b4 Mon Sep 17 00:00:00 2001 From: arueth Date: Thu, 14 Mar 2024 20:40:08 +0000 Subject: [PATCH 18/39] Fixed 'set configuration variables' environment variable for project ID --- ml-platform/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ml-platform/README.md b/ml-platform/README.md index d5ccd4265..335ff4bd3 100644 --- a/ml-platform/README.md +++ b/ml-platform/README.md @@ -144,7 +144,7 @@ This is the quick-start deployment guide. 
It can be used to set up an environmen sed -i "s/YOUR_GITHUB_EMAIL/${MLP_GITHUB_EMAIL}/g" ${MLP_BASE_DIR}/terraform/mlp.auto.tfvars sed -i "s/YOUR_GITHUB_ORG/${MLP_GITHUB_ORG}/g" ${MLP_BASE_DIR}/terraform/mlp.auto.tfvars sed -i "s/YOUR_GITHUB_USER/${MLP_GITHUB_USER}/g" ${MLP_BASE_DIR}/terraform/mlp.auto.tfvars - sed -i "s/YOUR_PROJECT_ID/${PROJECT_ID}/g" ${MLP_BASE_DIR}/terraform/mlp.auto.tfvars + sed -i "s/YOUR_PROJECT_ID/${MLP_PROJECT_ID}/g" ${MLP_BASE_DIR}/terraform/mlp.auto.tfvars ``` - Create the resources From ca9fa68e013aa8d764883b116b3f1e1e8c66cc98 Mon Sep 17 00:00:00 2001 From: arueth Date: Thu, 14 Mar 2024 21:02:00 +0000 Subject: [PATCH 19/39] Added serviceaccount.yaml to _namespace_template/app/kustomization.yaml --- .../templates/_namespace_template/app/kustomization.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/ml-platform/terraform/templates/acm-template/templates/_namespace_template/app/kustomization.yaml b/ml-platform/terraform/templates/acm-template/templates/_namespace_template/app/kustomization.yaml index 19240b9a8..5a137938a 100644 --- a/ml-platform/terraform/templates/acm-template/templates/_namespace_template/app/kustomization.yaml +++ b/ml-platform/terraform/templates/acm-template/templates/_namespace_template/app/kustomization.yaml @@ -18,6 +18,7 @@ namespace: NAMESPACE resources: - fluentd_config.yaml +- serviceaccount.yaml helmCharts: - name: ray-cluster From d0861e065a1e44bf774abd56f1638ab6140a74ab Mon Sep 17 00:00:00 2001 From: arueth Date: Thu, 14 Mar 2024 21:12:12 +0000 Subject: [PATCH 20/39] Enabled serviceusage.googleapis.com APIs --- ml-platform/terraform/main.tf | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/ml-platform/terraform/main.tf b/ml-platform/terraform/main.tf index b4ff7c00e..91215571b 100644 --- a/ml-platform/terraform/main.tf +++ b/ml-platform/terraform/main.tf @@ -44,6 +44,17 @@ resource "google_project_service" "containerfilesystem_googleapis_com" { service = 
"containerfilesystem.googleapis.com" } +resource "google_project_service" "serviceusage_googleapis_com" { + for_each = local.parsed_project_id + + depends_on = [module.gcp-project] + + disable_dependent_services = false + disable_on_destroy = false + project = each.value + service = "serviceusage.googleapis.com" +} + resource "google_project_service" "project_services-cr" { for_each = local.parsed_project_id From ba03eb25901442387e71eaca198ec56752a8c6e0 Mon Sep 17 00:00:00 2001 From: arueth Date: Thu, 14 Mar 2024 23:02:35 +0000 Subject: [PATCH 21/39] Fixed terraform fmt issues --- .../terraform/modules/cloud-nat/main.tf | 28 +++++++++---------- ml-platform/terraform/modules/cluster/gke.tf | 2 +- .../terraform/modules/node-pools/variables.tf | 2 +- 3 files changed, 16 insertions(+), 16 deletions(-) diff --git a/ml-platform/terraform/modules/cloud-nat/main.tf b/ml-platform/terraform/modules/cloud-nat/main.tf index 1b549d755..a85277dce 100644 --- a/ml-platform/terraform/modules/cloud-nat/main.tf +++ b/ml-platform/terraform/modules/cloud-nat/main.tf @@ -42,20 +42,20 @@ resource "google_compute_router" "router" { resource "google_compute_router_nat" "main" { enable_dynamic_port_allocation = var.enable_dynamic_port_allocation enable_endpoint_independent_mapping = var.enable_endpoint_independent_mapping - icmp_idle_timeout_sec = var.icmp_idle_timeout_sec - max_ports_per_vm = var.enable_dynamic_port_allocation ? 
var.max_ports_per_vm : null - min_ports_per_vm = var.min_ports_per_vm - name = local.name - nat_ip_allocate_option = local.nat_ip_allocate_option - nat_ips = var.nat_ips - project = var.project_id - region = var.region - router = local.router - source_subnetwork_ip_ranges_to_nat = var.source_subnetwork_ip_ranges_to_nat - tcp_established_idle_timeout_sec = var.tcp_established_idle_timeout_sec - tcp_time_wait_timeout_sec = var.tcp_time_wait_timeout_sec - tcp_transitory_idle_timeout_sec = var.tcp_transitory_idle_timeout_sec - udp_idle_timeout_sec = var.udp_idle_timeout_sec + icmp_idle_timeout_sec = var.icmp_idle_timeout_sec + max_ports_per_vm = var.enable_dynamic_port_allocation ? var.max_ports_per_vm : null + min_ports_per_vm = var.min_ports_per_vm + name = local.name + nat_ip_allocate_option = local.nat_ip_allocate_option + nat_ips = var.nat_ips + project = var.project_id + region = var.region + router = local.router + source_subnetwork_ip_ranges_to_nat = var.source_subnetwork_ip_ranges_to_nat + tcp_established_idle_timeout_sec = var.tcp_established_idle_timeout_sec + tcp_time_wait_timeout_sec = var.tcp_time_wait_timeout_sec + tcp_transitory_idle_timeout_sec = var.tcp_transitory_idle_timeout_sec + udp_idle_timeout_sec = var.udp_idle_timeout_sec dynamic "log_config" { for_each = var.log_config_enable == true ? 
[{ diff --git a/ml-platform/terraform/modules/cluster/gke.tf b/ml-platform/terraform/modules/cluster/gke.tf index 3bbfb8099..59bd768d2 100644 --- a/ml-platform/terraform/modules/cluster/gke.tf +++ b/ml-platform/terraform/modules/cluster/gke.tf @@ -151,7 +151,7 @@ resource "google_container_cluster" "mlp" { "STORAGE", "SYSTEM_COMPONENTS" ] - + managed_prometheus { enabled = true } diff --git a/ml-platform/terraform/modules/node-pools/variables.tf b/ml-platform/terraform/modules/node-pools/variables.tf index f9cf7e59a..f298cf5eb 100644 --- a/ml-platform/terraform/modules/node-pools/variables.tf +++ b/ml-platform/terraform/modules/node-pools/variables.tf @@ -77,7 +77,7 @@ variable "reservation_name" { variable "resource_type" { default = "ondemand" description = "ondemand/spot/reserved." - type = string + type = string } variable "taints" { From 30e88b9c788ef167f5e4c06ce8f11bd631f56741 Mon Sep 17 00:00:00 2001 From: arueth Date: Thu, 14 Mar 2024 23:20:06 +0000 Subject: [PATCH 22/39] Added shielded VMs --- ml-platform/terraform/modules/cluster/gke.tf | 26 +++++++++++++------ .../terraform/modules/node-pools/nodepools.tf | 5 ++++ 2 files changed, 23 insertions(+), 8 deletions(-) diff --git a/ml-platform/terraform/modules/cluster/gke.tf b/ml-platform/terraform/modules/cluster/gke.tf index 59bd768d2..4ec5a57c1 100644 --- a/ml-platform/terraform/modules/cluster/gke.tf +++ b/ml-platform/terraform/modules/cluster/gke.tf @@ -21,14 +21,15 @@ data "google_project" "project" { resource "google_container_cluster" "mlp" { provider = google-beta - deletion_protection = false - initial_node_count = 2 - location = var.region - name = var.cluster_name - network = var.network - node_locations = ["${var.region}-a", "${var.region}-b", "${var.region}-c"] - project = var.project_id - subnetwork = var.subnet + deletion_protection = false + enable_shielded_nodes = true + initial_node_count = 2 + location = var.region + name = var.cluster_name + network = var.network + node_locations = 
["${var.region}-a", "${var.region}-b", "${var.region}-c"] + project = var.project_id + subnetwork = var.subnet addons_config { gcp_filestore_csi_driver_config { @@ -157,11 +158,20 @@ resource "google_container_cluster" "mlp" { } } + node_config { + shielded_instance_config { + enable_integrity_monitoring = true + enable_secure_boot = true + } + } + node_pool_defaults { node_config_defaults { gcfs_config { enabled = true } + + } } diff --git a/ml-platform/terraform/modules/node-pools/nodepools.tf b/ml-platform/terraform/modules/node-pools/nodepools.tf index 26ca6d92e..79fd15029 100644 --- a/ml-platform/terraform/modules/node-pools/nodepools.tf +++ b/ml-platform/terraform/modules/node-pools/nodepools.tf @@ -56,6 +56,11 @@ resource "google_container_node_pool" "node-pool" { } } + shielded_instance_config { + enable_integrity_monitoring = true + enable_secure_boot = true + } + dynamic "taint" { for_each = var.taints content { From 3396d4f2b22528c3137099158aa709f390e39587 Mon Sep 17 00:00:00 2001 From: arueth Date: Fri, 15 Mar 2024 16:58:37 +0000 Subject: [PATCH 23/39] Added a dependency on the GKE cluster for the WI SA IAM binding --- ml-platform/terraform/main.tf | 2 ++ 1 file changed, 2 insertions(+) diff --git a/ml-platform/terraform/main.tf b/ml-platform/terraform/main.tf index 91215571b..b0d5e3dd9 100644 --- a/ml-platform/terraform/main.tf +++ b/ml-platform/terraform/main.tf @@ -497,6 +497,8 @@ resource "google_service_account" "namespace_default" { } resource "google_service_account_iam_member" "wi_cymbal_bank_backend_workload_identity_user" { + depends_on = [module.gke] + member = "serviceAccount:${local.parsed_project_id[var.default_env]}.svc.id.goog[${var.namespace}/${var.namespace}-default]" role = "roles/iam.workloadIdentityUser" service_account_id = google_service_account.namespace_default.id From f8e2d8b8f8f75126817591a8ef9b12a94305c832 Mon Sep 17 00:00:00 2001 From: kenthua Date: Fri, 15 Mar 2024 22:46:39 +0000 Subject: [PATCH 24/39] add shielded config 
for nap pools as well --- ml-platform/terraform/modules/cluster/gke.tf | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/ml-platform/terraform/modules/cluster/gke.tf b/ml-platform/terraform/modules/cluster/gke.tf index 4ec5a57c1..362fa532e 100644 --- a/ml-platform/terraform/modules/cluster/gke.tf +++ b/ml-platform/terraform/modules/cluster/gke.tf @@ -59,6 +59,11 @@ resource "google_container_cluster" "mlp" { auto_upgrade = true } + shielded_instance_config { + enable_integrity_monitoring = true + enable_secure_boot = true + } + upgrade_settings { max_surge = 0 max_unavailable = 1 From 0cf408bbd74d7c89c7bfa215e03457f4c769c715 Mon Sep 17 00:00:00 2001 From: Kent Hua <8052337+kenthua@users.noreply.github.com> Date: Fri, 15 Mar 2024 16:30:43 -0700 Subject: [PATCH 25/39] mlops platform kh (#363) * remove trailing spaces * move gpus to worker, map worker resources to node --- ml-platform/terraform/modules/cluster/gke.tf | 2 +- .../_namespace_template/app/values.yaml | 31 +++++++++---------- 2 files changed, 16 insertions(+), 17 deletions(-) diff --git a/ml-platform/terraform/modules/cluster/gke.tf b/ml-platform/terraform/modules/cluster/gke.tf index 362fa532e..03ebb693b 100644 --- a/ml-platform/terraform/modules/cluster/gke.tf +++ b/ml-platform/terraform/modules/cluster/gke.tf @@ -62,7 +62,7 @@ resource "google_container_cluster" "mlp" { shielded_instance_config { enable_integrity_monitoring = true enable_secure_boot = true - } + } upgrade_settings { max_surge = 0 diff --git a/ml-platform/terraform/templates/acm-template/templates/_namespace_template/app/values.yaml b/ml-platform/terraform/templates/acm-template/templates/_namespace_template/app/values.yaml index ba86e3191..fef2b1b17 100644 --- a/ml-platform/terraform/templates/acm-template/templates/_namespace_template/app/values.yaml +++ b/ml-platform/terraform/templates/acm-template/templates/_namespace_template/app/values.yaml @@ -55,6 +55,10 @@ head: block: 'true' # containerEnv specifies environment variables 
for the Ray container, # Follows standard K8s container env schema. + image: + repository: rayproject/ray + tag: 2.7.1-py310 + pullPolicy: IfNotPresent containerEnv: # - name: EXAMPLE_ENV # value: "1" @@ -76,24 +80,18 @@ head: # for further guidance. resources: limits: - cpu: "8" - nvidia.com/gpu: "1" + cpu: "4" # To avoid out-of-memory issues, never allocate less than 2G memory for the Ray head. - memory: "20G" + memory: "10G" ephemeral-storage: 20Gi requests: - cpu: "8" - nvidia.com/gpu: "1" - memory: "20G" + cpu: "4" + memory: "10G" ephemeral-storage: 10Gi annotations: {} nodeSelector: iam.gke.io/gke-metadata-server-enabled: "true" - cloud.google.com/gke-accelerator: "nvidia-l4" tolerations: - - key: "nvidia.com/gpu" - operator: "Exists" - effect: "NoSchedule" - key: "reserved" operator: "Exists" effect: "NoSchedule" @@ -143,6 +141,7 @@ worker: created-by: ray-on-gke rayStartParams: block: 'true' + resources: '"{\"accelerator_type_l4\": 2}"' initContainerImage: 'busybox:1.28' # Enable users to specify the image for init container. Users can pull the busybox image from their private repositories. # Security context for the init container. initContainerSecurityContext: {} @@ -167,14 +166,14 @@ worker: # for further guidance. 
resources: limits: - cpu: "1" - nvidia.com/gpu: "1" - memory: "20G" + cpu: "22" + nvidia.com/gpu: "2" + memory: "90G" ephemeral-storage: 20Gi requests: - cpu: "1" - nvidia.com/gpu: "1" - memory: "20G" + cpu: "22" + nvidia.com/gpu: "2" + memory: "90G" ephemeral-storage: 10Gi annotations: key: value From a8664144d22ad8bb6203226b452d8bb0bb71662a Mon Sep 17 00:00:00 2001 From: arueth Date: Fri, 22 Mar 2024 17:51:25 +0000 Subject: [PATCH 26/39] Added additional setup and cleanup steps --- ml-platform/README.md | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/ml-platform/README.md b/ml-platform/README.md index 335ff4bd3..f3aca0113 100644 --- a/ml-platform/README.md +++ b/ml-platform/README.md @@ -114,6 +114,18 @@ This is the quick-start deployment guide. It can be used to set up an environmen export MLP_GITHUB_EMAIL="" ``` +- Set the default `gcloud` project + + ``` + gcloud config set project ${MLP_PROJECT_ID} + ``` + +- Authorize `gcloud` + + ``` + gcloud auth login --activate --no-launch-browser --quiet --update-adc + ``` + - Create a Cloud Storage bucket to store the Terraform state ``` @@ -251,6 +263,12 @@ Open Cloud Shell to execute the following commands: terraform destroy -auto-approve -var github_token="$(tr --delete '\n' < ${HOME}/secrets/mlp-github-token)" ``` +- Delete the project + + ``` + gcloud projects delete ${MLP_PROJECT_ID} + ``` + [gitops]: https://about.gitlab.com/topics/gitops/ [repo-sync]: https://cloud.google.com/anthos-config-management/docs/reference/rootsync-reposync-fields [root-sync]: https://cloud.google.com/anthos-config-management/docs/reference/rootsync-reposync-fields From 696c94cc4fcdfba885007613039009743f2ad2bc Mon Sep 17 00:00:00 2001 From: arueth Date: Fri, 22 Mar 2024 18:47:57 +0000 Subject: [PATCH 27/39] Removed code for multiple environment and added additional service accounts --- ml-platform/README.md | 41 +- ml-platform/terraform/main.tf | 421 ++++++++---------- ml-platform/terraform/mlp.auto.tfvars | 10 +- 
.../terraform/modules/cloud-nat/versions.tf | 2 +- .../terraform/modules/projects/outputs.tf | 4 +- .../terraform/modules/projects/projects.tf | 108 +---- .../terraform/modules/projects/variables.tf | 14 +- .../terraform/modules/projects/versions.tf | 4 + .../terraform/scripts/create_cluster_yamls.sh | 7 +- .../terraform/scripts/create_git_cred.sh | 15 +- .../terraform/scripts/install_ray_cluster.sh | 12 +- .../app/kustomization.yaml | 3 +- ...ount.yaml => serviceaccount_ray_head.yaml} | 4 +- .../app/serviceaccount_ray_worker.yaml | 21 + .../_namespace_template/app/values.yaml | 2 + ml-platform/terraform/variables.tf | 15 +- 16 files changed, 301 insertions(+), 382 deletions(-) rename ml-platform/terraform/templates/acm-template/templates/_namespace_template/app/{serviceaccount.yaml => serviceaccount_ray_head.yaml} (85%) create mode 100644 ml-platform/terraform/templates/acm-template/templates/_namespace_template/app/serviceaccount_ray_worker.yaml diff --git a/ml-platform/README.md b/ml-platform/README.md index f3aca0113..672ed7df6 100644 --- a/ml-platform/README.md +++ b/ml-platform/README.md @@ -67,6 +67,22 @@ This is the quick-start deployment guide. It can be used to set up an environmen Note: It is recommended to use a [machine user account][machine-user-account] for this but you can use a personal user account just to try this reference architecture. + **Fine-grained personal access token** + + - Go to https://github.com/settings/tokens and login using your credentials + - Click "Generate new token" >> "Generate new token (Beta)". + - Enter a Token name. + - Select the expiration. + - Select the Resource owner. 
+ - Select All repositories + - Set the following Permissions: + - Repository permissions + - Administration: Read and write + - Content: Read and write + - Click "Generate token" + + **Personal access tokens (classic)** + - Go to https://github.com/settings/tokens and login using your credentials - Click "Generate new token" >> "Generate new token (classic)". - You will be directed to a screen to created the new token. Provide the note and expiration. @@ -74,20 +90,21 @@ This is the quick-start deployment guide. It can be used to set up an environmen - [x] repo - Full control of private repositories - [x] delete_repo - Delete repositories - Click "Generate token" - - Store the token in a secure file. - ``` - # Create a secure directory - mkdir -p ${HOME}/secrets/ - chmod go-rwx ${HOME}/secrets +- Store the token in a secure file. - # Create a secure file - touch ${HOME}/secrets/mlp-github-token - chmod go-rwx ${HOME}/secrets/mlp-github-token + ``` + # Create a secure directory + mkdir -p ${HOME}/secrets/ + chmod go-rwx ${HOME}/secrets + + # Create a secure file + touch ${HOME}/secrets/mlp-github-token + chmod go-rwx ${HOME}/secrets/mlp-github-token - # Put the token in the secure file using your preferred editor - nano ${HOME}/secrets/mlp-github-token - ``` + # Put the token in the secure file using your preferred editor + nano ${HOME}/secrets/mlp-github-token + ``` - Set the project environment variables in Cloud Shell @@ -165,7 +182,7 @@ This is the quick-start deployment guide. 
It can be used to set up an environmen cd ${MLP_BASE_DIR}/terraform && \ terraform init && \ terraform plan -input=false -var github_token="$(tr --delete '\n' < ${HOME}/secrets/mlp-github-token)" -out=tfplan && \ - terraform apply -input=false tfplan && \ + terraform apply -input=false tfplan rm tfplan ``` diff --git a/ml-platform/terraform/main.tf b/ml-platform/terraform/main.tf index b0d5e3dd9..a8321e00f 100644 --- a/ml-platform/terraform/main.tf +++ b/ml-platform/terraform/main.tf @@ -13,194 +13,140 @@ # limitations under the License. locals { - gke_project_map = { for k, v in "${module.gke}" : v.cluster_name => v.gke_project_id } - parsed_gke_info = module.gke - parsed_gke_info_without_default_env = { for k, v in "${local.parsed_gke_info}" : k => v if k != var.default_env } - parsed_project_id = var.create_projects == 0 ? var.project_id : { for k, v in "${module.gcp-project.project_ids}" : k => v.project_id } - project_id_list = [for k, v in "${module.gke}" : v.gke_project_id] + project = data.google_project.environment } -#TODO: Add a validation that the value if default_env must be one of the values in env list +# +# Project +########################################################################## module "gcp-project" { - count = var.create_projects - source = "./modules/projects" billing_account = var.billing_account - env = var.env + env = var.environment_name folder_id = var.folder_id org_id = var.org_id + project_id = var.environment_project_id project_name = var.project_name } -resource "google_project_service" "containerfilesystem_googleapis_com" { - for_each = local.parsed_project_id - - depends_on = [module.gcp-project] +data "google_project" "environment" { + project_id = module.gcp-project.project_id +} +resource "google_project_service" "containerfilesystem_googleapis_com" { disable_dependent_services = false disable_on_destroy = false - project = each.value + project = local.project.project_id service = "containerfilesystem.googleapis.com" } 
resource "google_project_service" "serviceusage_googleapis_com" { - for_each = local.parsed_project_id - - depends_on = [module.gcp-project] - disable_dependent_services = false disable_on_destroy = false - project = each.value + project = local.project.project_id service = "serviceusage.googleapis.com" } resource "google_project_service" "project_services-cr" { - for_each = local.parsed_project_id - - depends_on = [module.gcp-project] - disable_dependent_services = false disable_on_destroy = false - project = each.value + project = local.project.project_id service = "cloudresourcemanager.googleapis.com" } resource "google_project_service" "project_services-an" { - for_each = local.parsed_project_id - - depends_on = [ - module.gcp-project, - google_project_service.project_services-cr - ] - disable_dependent_services = false disable_on_destroy = false - project = each.value + project = local.project.project_id service = "anthos.googleapis.com" } resource "google_project_service" "project_services-anc" { - for_each = local.parsed_project_id - - depends_on = [ - module.gcp-project, - google_project_service.project_services-cr - ] - disable_dependent_services = false disable_on_destroy = false - project = each.value + project = local.project.project_id service = "anthosconfigmanagement.googleapis.com" } resource "google_project_service" "project_services-con" { - for_each = local.parsed_project_id - - depends_on = [ - module.gcp-project, - google_project_service.project_services-cr - ] - disable_dependent_services = false disable_on_destroy = false - project = each.value + project = local.project.project_id service = "container.googleapis.com" } resource "google_project_service" "project_services-com" { - for_each = local.parsed_project_id - - depends_on = [ - module.gcp-project, - google_project_service.project_services-cr - ] - disable_dependent_services = false disable_on_destroy = false - project = each.value + project = local.project.project_id service = 
"compute.googleapis.com" } resource "google_project_service" "project_services-gkecon" { - for_each = local.parsed_project_id - - depends_on = [ - module.gcp-project, - google_project_service.project_services-cr - ] - disable_dependent_services = false disable_on_destroy = false - project = each.value + project = local.project.project_id service = "gkeconnect.googleapis.com" } resource "google_project_service" "project_services-gkeh" { - for_each = local.parsed_project_id - - depends_on = [ - module.gcp-project, - google_project_service.project_services-cr - ] - disable_dependent_services = false disable_on_destroy = false - project = each.value + project = local.project.project_id service = "gkehub.googleapis.com" } resource "google_project_service" "project_services-iam" { - for_each = local.parsed_project_id - - depends_on = [module.gcp-project, google_project_service.project_services-cr] - disable_dependent_services = false disable_on_destroy = false - project = each.value + project = local.project.project_id service = "iam.googleapis.com" } resource "google_project_service" "project_services-gate" { - for_each = local.parsed_project_id - - depends_on = [ - module.gcp-project, - google_project_service.project_services-cr - ] - disable_dependent_services = false disable_on_destroy = false - project = each.value + project = local.project.project_id service = "connectgateway.googleapis.com" } +# +# Networking +########################################################################## module "create-vpc" { - for_each = local.parsed_project_id - source = "./modules/network" depends_on = [ - module.gcp-project, google_project_service.project_services-com ] - network_name = format("%s-%s", var.network_name, each.key) - project_id = each.value + network_name = format("%s-%s", var.network_name, var.environment_name) + project_id = local.project.project_id routing_mode = var.routing_mode subnet_01_ip = var.subnet_01_ip - subnet_01_name = format("%s-%s", 
var.subnet_01_name, each.key) + subnet_01_name = format("%s-%s", var.subnet_01_name, var.environment_name) subnet_01_region = var.subnet_01_region subnet_02_ip = var.subnet_02_ip - subnet_02_name = format("%s-%s", var.subnet_02_name, each.key) + subnet_02_name = format("%s-%s", var.subnet_02_name, var.environment_name) subnet_02_region = var.subnet_02_region } -resource "google_gke_hub_feature" "configmanagement_acm_feature" { - provider = google-beta +module "cloud-nat" { + source = "./modules/cloud-nat" - count = length(distinct(values(local.parsed_project_id))) + create_router = true + name = format("%s-%s", "nat-for-acm", var.environment_name) + network = module.create-vpc.vpc + project_id = local.project.project_id + region = split("/", module.create-vpc.subnet-1)[3] + router = format("%s-%s", "router-for-acm", var.environment_name) +} +# +# GKE +########################################################################## +resource "google_gke_hub_feature" "configmanagement_acm_feature" { depends_on = [ google_project_service.project_services-gkeh, google_project_service.project_services-anc, @@ -211,12 +157,10 @@ resource "google_gke_hub_feature" "configmanagement_acm_feature" { location = "global" name = "configmanagement" - project = distinct(values(local.parsed_project_id))[count.index] + project = local.project.project_id } module "gke" { - for_each = local.parsed_project_id - source = "./modules/cluster" depends_on = [ @@ -225,172 +169,88 @@ module "gke" { google_project_service.project_services-com ] - cluster_name = format("%s-%s", var.cluster_name, each.key) - env = each.key + cluster_name = format("%s-%s", var.cluster_name, var.environment_name) + env = var.environment_name master_auth_networks_ipcidr = var.subnet_01_ip - network = module.create-vpc[each.key].vpc - project_id = each.value + network = module.create-vpc.vpc + project_id = local.project.project_id region = var.subnet_01_region - subnet = module.create-vpc[each.key].subnet-1 + subnet = 
module.create-vpc.subnet-1 zone = "${var.subnet_01_region}-a" } module "reservation" { - for_each = local.parsed_project_id - source = "./modules/vm-reservations" - depends_on = [module.gke] - - cluster_name = module.gke[each.key].cluster_name - project_id = each.value + cluster_name = module.gke.cluster_name + project_id = local.project.project_id zone = "${var.subnet_01_region}-a" } module "node_pool-reserved" { - for_each = local.parsed_project_id - source = "./modules/node-pools" - depends_on = [module.reservation] + depends_on = [ + module.reservation + ] - cluster_name = module.gke[each.key].cluster_name + cluster_name = module.gke.cluster_name node_pool_name = "reservation" - project_id = each.value + project_id = local.project.project_id region = var.subnet_01_region - reservation_name = module.reservation[each.key].reservation_name + reservation_name = module.reservation.reservation_name resource_type = "reservation" taints = var.reserved_taints } module "node_pool-ondemand" { - for_each = local.parsed_project_id - source = "./modules/node-pools" - depends_on = [module.gke] + depends_on = [ + module.gke + ] - cluster_name = module.gke[each.key].cluster_name + cluster_name = module.gke.cluster_name node_pool_name = "ondemand" - project_id = each.value + project_id = local.project.project_id region = var.subnet_01_region resource_type = "ondemand" taints = var.ondemand_taints } module "node_pool-spot" { - for_each = local.parsed_project_id - source = "./modules/node-pools" - depends_on = [module.gke] + depends_on = [ + module.gke + ] - cluster_name = module.gke[each.key].cluster_name + cluster_name = module.gke.cluster_name node_pool_name = "spot" - project_id = each.value + project_id = local.project.project_id region = var.subnet_01_region resource_type = "spot" taints = var.spot_taints } -module "cloud-nat" { - for_each = local.parsed_project_id - - source = "./modules/cloud-nat" - - depends_on = [ - module.create-vpc, - 
google_project_service.project_services-com - ] - - create_router = true - name = format("%s-%s", "nat-for-acm", each.key) - network = module.create-vpc[each.key].vpc - project_id = each.value - region = split("/", module.create-vpc[each.key].subnet-1)[3] - router = format("%s-%s", "router-for-acm", each.key) -} - resource "google_gke_hub_membership" "membership" { - provider = google-beta - - for_each = local.parsed_gke_info - depends_on = [ google_gke_hub_feature.configmanagement_acm_feature, google_project_service.project_services-gkeh, google_project_service.project_services-gkecon ] - membership_id = each.value["cluster_name"] - project = each.value["gke_project_id"] + membership_id = module.gke.cluster_name + project = local.project.project_id endpoint { gke_cluster { - resource_link = format("%s/%s", "//container.googleapis.com", each.value["cluster_id"]) + resource_link = "//container.googleapis.com/${module.gke.cluster_id}" } } - - lifecycle { - ignore_changes = [ - labels - ] - } -} - -resource "github_repository" "acm_repo" { - allow_merge_commit = true - allow_rebase_merge = true - allow_squash_merge = true - auto_init = true - delete_branch_on_merge = false - description = "Repo for Config Sync" - has_issues = false - has_projects = false - has_wiki = false - name = var.configsync_repo_name - visibility = "private" - vulnerability_alerts = true -} - -resource "github_branch" "branch" { - for_each = local.parsed_gke_info - - depends_on = [github_repository.acm_repo] - - branch = each.key - repository = split("/", github_repository.acm_repo.full_name)[1] -} - -resource "github_branch_default" "default_branch" { - depends_on = [github_branch.branch] - - branch = var.default_env - repository = split("/", github_repository.acm_repo.full_name)[1] -} - -resource "github_branch_protection_v3" "branch_protection" { - for_each = length(keys(local.parsed_project_id)) > 1 ? 
local.parsed_gke_info_without_default_env : {} - - depends_on = [github_branch.branch] - - repository = split("/", github_repository.acm_repo.full_name)[1] - branch = each.key - - required_pull_request_reviews { - require_code_owner_reviews = true - required_approving_review_count = 1 - } - - restrictions { - } } resource "google_gke_hub_feature_membership" "feature_member" { - provider = google-beta - - for_each = local.parsed_gke_info - depends_on = [ google_project_service.project_services-gkecon, google_project_service.project_services-gkeh, @@ -400,8 +260,8 @@ resource "google_gke_hub_feature_membership" "feature_member" { feature = "configmanagement" location = "global" - membership = google_gke_hub_membership.membership[each.key].membership_id - project = each.value["gke_project_id"] + membership = google_gke_hub_membership.membership.membership_id + project = local.project.project_id configmanagement { version = var.config_management_version @@ -412,8 +272,8 @@ resource "google_gke_hub_feature_membership" "feature_member" { git { policy_dir = "manifests/clusters" secret_type = "token" - sync_branch = each.value["env"] - sync_repo = "https://github.com/${github_repository.acm_repo.full_name}.git" + sync_branch = github_branch.environment.branch + sync_repo = github_repository.acm_repo.http_clone_url } } @@ -426,13 +286,65 @@ resource "google_gke_hub_feature_membership" "feature_member" { } } -resource "null_resource" "create_cluster_yamls" { - for_each = local.parsed_gke_info +# +# Git Repository +########################################################################## +# data "github_organization" "default" { +# name = var.github_org +# } + +resource "github_repository" "acm_repo" { + # depends_on = [ + # data.github_organization.default + # ] + + allow_merge_commit = true + allow_rebase_merge = true + allow_squash_merge = true + auto_init = true + delete_branch_on_merge = false + description = "Repo for Config Sync" + has_issues = false + has_projects = 
false + has_wiki = false + name = var.configsync_repo_name + visibility = "private" + vulnerability_alerts = true +} + +resource "github_branch" "environment" { + branch = var.environment_name + repository = github_repository.acm_repo.name +} + +resource "github_branch_default" "environment" { + branch = github_branch.environment.branch + repository = github_repository.acm_repo.name +} - depends_on = [google_gke_hub_feature_membership.feature_member] +resource "github_branch_protection_v3" "environment" { + repository = github_repository.acm_repo.name + branch = github_branch.environment.branch + + required_pull_request_reviews { + require_code_owner_reviews = true + required_approving_review_count = 1 + } + + restrictions { + } +} + +# +# Scripts +########################################################################## +resource "null_resource" "create_cluster_yamls" { + depends_on = [ + google_gke_hub_feature_membership.feature_member + ] provisioner "local-exec" { - command = "${path.module}/scripts/create_cluster_yamls.sh ${var.github_org} ${github_repository.acm_repo.full_name} ${var.github_user} ${var.github_email} ${each.value["env"]} ${each.value["cluster_name"]} ${index(keys(local.parsed_gke_info), each.key)}" + command = "${path.module}/scripts/create_cluster_yamls.sh ${var.github_org} ${github_repository.acm_repo.full_name} ${var.github_user} ${var.github_email} ${var.environment_name} ${module.gke.cluster_name}" environment = { GIT_TOKEN = var.github_token } @@ -445,8 +357,6 @@ resource "null_resource" "create_cluster_yamls" { } resource "null_resource" "create_git_cred_cms" { - for_each = var.secret_for_rootsync == 1 ? 
local.gke_project_map : {} - depends_on = [ google_gke_hub_feature_membership.feature_member, module.gke, @@ -457,7 +367,7 @@ resource "null_resource" "create_git_cred_cms" { ] provisioner "local-exec" { - command = "${path.module}/scripts/create_git_cred.sh ${each.key} ${each.value} ${var.github_user} config-management-system ${index(keys(local.gke_project_map), each.key)}" + command = "${path.module}/scripts/create_git_cred.sh ${module.gke.cluster_name} ${local.project.project_id} ${var.github_user} config-management-system" environment = { GIT_TOKEN = var.github_token } @@ -470,8 +380,6 @@ resource "null_resource" "create_git_cred_cms" { } resource "null_resource" "install_kuberay_operator" { - count = var.install_kuberay - depends_on = [ google_gke_hub_feature_membership.feature_member, null_resource.create_git_cred_cms @@ -490,30 +398,34 @@ resource "null_resource" "install_kuberay_operator" { } } +locals { + namespace_default_kubernetes_service_account = "default" +} + resource "google_service_account" "namespace_default" { - account_id = "wi-${var.namespace}-default" - display_name = "${var.namespace} Default Workload Identity Service Account" - project = local.parsed_project_id[var.default_env] + account_id = "wi-${var.namespace}-${local.namespace_default_kubernetes_service_account}" + display_name = "${var.namespace}/${local.namespace_default_kubernetes_service_account} workload identity service account" + project = local.project.project_id } -resource "google_service_account_iam_member" "wi_cymbal_bank_backend_workload_identity_user" { - depends_on = [module.gke] +resource "google_service_account_iam_member" "namespace_default_iam_workload_identity_user" { + depends_on = [ + module.gke + ] - member = "serviceAccount:${local.parsed_project_id[var.default_env]}.svc.id.goog[${var.namespace}/${var.namespace}-default]" + member = "serviceAccount:${local.project.project_id}.svc.id.goog[${var.namespace}/${local.namespace_default_kubernetes_service_account}]" 
role = "roles/iam.workloadIdentityUser" service_account_id = google_service_account.namespace_default.id } resource "null_resource" "create_namespace" { - count = var.create_namespace - depends_on = [ google_gke_hub_feature_membership.feature_member, null_resource.install_kuberay_operator ] provisioner "local-exec" { - command = "${path.module}/scripts/create_namespace.sh ${github_repository.acm_repo.full_name} ${var.github_email} ${var.github_org} ${var.github_user} ${var.namespace} ${var.default_env}" + command = "${path.module}/scripts/create_namespace.sh ${github_repository.acm_repo.full_name} ${var.github_email} ${var.github_org} ${var.github_user} ${var.namespace} ${var.environment_name}" environment = { GIT_TOKEN = var.github_token } @@ -526,15 +438,13 @@ resource "null_resource" "create_namespace" { } resource "null_resource" "create_git_cred_ns" { - count = var.create_namespace - depends_on = [ google_gke_hub_feature_membership.feature_member, null_resource.create_namespace ] provisioner "local-exec" { - command = "${path.module}/scripts/create_git_cred.sh ${local.parsed_gke_info[var.default_env].cluster_name} ${local.parsed_gke_info[var.default_env].gke_project_id} ${var.github_user} ${var.namespace}" + command = "${path.module}/scripts/create_git_cred.sh ${module.gke.cluster_name} ${module.gke.gke_project_id} ${var.github_user} ${var.namespace}" environment = { GIT_TOKEN = var.github_token } @@ -546,30 +456,63 @@ resource "null_resource" "create_git_cred_ns" { } } -resource "null_resource" "install_ray_cluster" { - count = var.install_ray_in_ns +locals { + ray_head_kubernetes_service_account = "ray-head" + ray_worker_kubernetes_service_account = "ray-worker" +} + +resource "google_service_account" "namespace_ray_head" { + account_id = "wi-${var.namespace}-${local.ray_head_kubernetes_service_account}" + display_name = "${var.namespace}/${local.ray_head_kubernetes_service_account} workload identity service account" + project = local.project.project_id +} + 
+resource "google_service_account_iam_member" "namespace_ray_head_iam_workload_identity_user" { + depends_on = [ + module.gke + ] + + member = "serviceAccount:${local.project.project_id}.svc.id.goog[${var.namespace}/${local.ray_head_kubernetes_service_account}]" + role = "roles/iam.workloadIdentityUser" + service_account_id = google_service_account.namespace_ray_head.id +} + +resource "google_service_account" "namespace_ray_worker" { + account_id = "wi-${var.namespace}-${local.ray_worker_kubernetes_service_account}" + display_name = "${var.namespace}/${local.ray_worker_kubernetes_service_account} workload identity service account" + project = local.project.project_id +} +resource "google_service_account_iam_member" "namespace_ray_worker_iam_workload_identity_user" { + depends_on = [ + module.gke + ] + + member = "serviceAccount:${local.project.project_id}.svc.id.goog[${var.namespace}/${local.ray_worker_kubernetes_service_account}]" + role = "roles/iam.workloadIdentityUser" + service_account_id = google_service_account.namespace_ray_worker.id +} + +resource "null_resource" "install_ray_cluster" { depends_on = [ google_gke_hub_feature_membership.feature_member, null_resource.create_git_cred_ns ] provisioner "local-exec" { - command = "${path.module}/scripts/install_ray_cluster.sh ${github_repository.acm_repo.full_name} ${var.github_email} ${var.github_org} ${var.github_user} ${var.namespace} ${google_service_account.namespace_default.email}" + command = "${path.module}/scripts/install_ray_cluster.sh ${github_repository.acm_repo.full_name} ${var.github_email} ${var.github_org} ${var.github_user} ${var.namespace} ${google_service_account.namespace_ray_head.email} ${local.ray_head_kubernetes_service_account} ${google_service_account.namespace_ray_worker.email} ${local.ray_worker_kubernetes_service_account}" environment = { GIT_TOKEN = var.github_token } } triggers = { - md5_files = md5(join("", [for f in 
fileset("${path.module}/templates/acm-template//templates/_namespace_template/app", "**") : md5("${path.module}/templates/acm-template//templates/_namespace_template/app/${f}")])) + md5_files = md5(join("", [for f in fileset("${path.module}/templates/acm-template/templates/_namespace_template/app", "**") : md5("${path.module}/templates/acm-template/templates/_namespace_template/app/${f}")])) md5_script = filemd5("${path.module}/scripts/install_ray_cluster.sh") } } resource "null_resource" "manage_ray_ns" { - count = var.install_ray_in_ns - depends_on = [ google_gke_hub_feature_membership.feature_member, null_resource.create_git_cred_ns, diff --git a/ml-platform/terraform/mlp.auto.tfvars b/ml-platform/terraform/mlp.auto.tfvars index f6671a20a..a20c54ee0 100644 --- a/ml-platform/terraform/mlp.auto.tfvars +++ b/ml-platform/terraform/mlp.auto.tfvars @@ -1,5 +1,5 @@ -default_env = "dev" -github_org = "YOUR_GITHUB_ORG" -github_email = "YOUR_GITHUB_EMAIL" -github_user = "YOUR_GITHUB_USER" -project_id = { "dev" : "YOUR_PROJECT_ID" } +environment_name = "dev" +environment_project_id = "YOUR_PROJECT_ID" +github_email = "YOUR_GITHUB_EMAIL" +github_org = "YOUR_GITHUB_ORG" +github_user = "YOUR_GITHUB_USER" diff --git a/ml-platform/terraform/modules/cloud-nat/versions.tf b/ml-platform/terraform/modules/cloud-nat/versions.tf index bb043b786..b19ea5231 100644 --- a/ml-platform/terraform/modules/cloud-nat/versions.tf +++ b/ml-platform/terraform/modules/cloud-nat/versions.tf @@ -28,7 +28,7 @@ terraform { } random = { source = "hashicorp/random" - version = "2.2" + version = "3.6.0" } } } diff --git a/ml-platform/terraform/modules/projects/outputs.tf b/ml-platform/terraform/modules/projects/outputs.tf index 431fe53dd..a1cc68f91 100644 --- a/ml-platform/terraform/modules/projects/outputs.tf +++ b/ml-platform/terraform/modules/projects/outputs.tf @@ -12,6 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-output "project_ids" { - value = "${google_project.project_under_folder}" == {} ? "${google_project.project_under_org}" : "${google_project.project_under_folder}" +output "project_id" { + value = local.project_id } diff --git a/ml-platform/terraform/modules/projects/projects.tf b/ml-platform/terraform/modules/projects/projects.tf index 39250a5f4..ea3fd30d8 100644 --- a/ml-platform/terraform/modules/projects/projects.tf +++ b/ml-platform/terraform/modules/projects/projects.tf @@ -12,101 +12,33 @@ # See the License for the specific language governing permissions and # limitations under the License. -resource "random_id" "random_project_id_suffix" { - byte_length = 2 +locals { + create_project = var.project_id == "" ? 1 : 0 + project_id = var.project_id == "" ? google_project.environment[0].project_id : var.project_id + project_id_prefix = "${var.project_name}-${var.env}" + project_id_suffix_length = 29 - length(local.project_id_prefix) } -resource "google_project" "project_under_folder" { - for_each = var.folder_id != null ? var.env : toset([]) - - billing_account = var.billing_account - folder_id = var.folder_id - name = format("%s-%s", var.project_name, each.value) - project_id = format("%s-%s-%s", var.project_name, random_id.random_project_id_suffix.hex, each.value) +resource "random_string" "project_id_suffix" { + length = local.project_id_suffix_length + lower = true + numeric = true + special = false + upper = false } -resource "google_project" "project_under_org" { - for_each = var.folder_id == null ? var.env : toset([]) +resource "google_project" "environment" { + count = local.create_project billing_account = var.billing_account - name = format("%s-%s", var.project_name, each.value) - org_id = var.org_id - project_id = format("%s-%s-%s", var.project_name, random_id.random_project_id_suffix.hex, each.value) -} - -resource "google_project_service" "project_services" { - for_each = var.folder_id == null ? 
google_project.project_under_org : google_project.project_under_folder - - depends_on = [google_project.project_under_folder, google_project.project_under_org] - - disable_dependent_services = true - disable_on_destroy = true - project = each.value.id - service = "cloudresourcemanager.googleapis.com" -} - -resource "google_project_service" "project_services-1" { - for_each = var.folder_id == null ? google_project.project_under_org : google_project.project_under_folder - - depends_on = [google_project.project_under_folder, google_project.project_under_org] - - disable_dependent_services = true - disable_on_destroy = true - project = each.value.id - service = "iam.googleapis.com" -} - -resource "google_project_service" "project_services-2" { - for_each = var.folder_id == null ? google_project.project_under_org : google_project.project_under_folder - - depends_on = [google_project.project_under_folder, google_project.project_under_org] - - disable_dependent_services = true - disable_on_destroy = true - project = each.value.id - service = "container.googleapis.com" + folder_id = var.folder_id == "" ? null : var.folder_id + name = local.project_id_prefix + org_id = var.org_id == "" ? null : var.org_id + project_id = "${local.project_id_prefix}-${random_string.project_id_suffix.result}" } -resource "google_project_service" "project_services-3" { - for_each = var.folder_id == null ? google_project.project_under_org : google_project.project_under_folder - - depends_on = [google_project.project_under_folder, google_project.project_under_org] - - disable_dependent_services = true - disable_on_destroy = true - project = each.value.id - service = "compute.googleapis.com" -} - -resource "google_project_service" "project_services-4" { - for_each = var.folder_id == null ? 
google_project.project_under_org : google_project.project_under_folder - - depends_on = [google_project.project_under_folder, google_project.project_under_org] - - disable_dependent_services = true - disable_on_destroy = true - project = each.value.id - service = "anthos.googleapis.com" -} - -resource "google_project_service" "project_services-5" { - for_each = var.folder_id == null ? google_project.project_under_org : google_project.project_under_folder - - depends_on = [google_project.project_under_folder, google_project.project_under_org] - - disable_dependent_services = true - disable_on_destroy = true - project = each.value.id - service = "anthosconfigmanagement.googleapis.com" -} - -resource "google_project_service" "project_services-6" { - for_each = var.folder_id == null ? google_project.project_under_org : google_project.project_under_folder - - depends_on = [google_project.project_under_folder, google_project.project_under_org] +data "google_project" "environment" { + depends_on = [google_project.environment] - disable_dependent_services = true - disable_on_destroy = true - project = each.value.id - service = "gkehub.googleapis.com" + project_id = local.project_id } diff --git a/ml-platform/terraform/modules/projects/variables.tf b/ml-platform/terraform/modules/projects/variables.tf index 378b5610b..69dfef8bf 100644 --- a/ml-platform/terraform/modules/projects/variables.tf +++ b/ml-platform/terraform/modules/projects/variables.tf @@ -19,13 +19,13 @@ variable "billing_account" { } variable "env" { - default = ["dev"] - description = "List of environments" - type = set(string) + default = "dev" + description = "Name of the environments" + type = string } variable "folder_id" { - default = null + default = "" description = "Folder id where the GCP projects will be created" type = string } @@ -36,6 +36,12 @@ variable "org_id" { description = "The GCP orig id" } +variable "project_id" { + default = "" + description = "Google Cloud project ID" + type = string 
+} + variable "project_name" { default = "" description = "GCP project name" diff --git a/ml-platform/terraform/modules/projects/versions.tf b/ml-platform/terraform/modules/projects/versions.tf index 466fd04d7..3609fa1df 100644 --- a/ml-platform/terraform/modules/projects/versions.tf +++ b/ml-platform/terraform/modules/projects/versions.tf @@ -18,5 +18,9 @@ terraform { source = "hashicorp/google" version = "5.19.0" } + random = { + source = "hashicorp/random" + version = "3.6.0" + } } } diff --git a/ml-platform/terraform/scripts/create_cluster_yamls.sh b/ml-platform/terraform/scripts/create_cluster_yamls.sh index 8b46a24b9..36cece324 100755 --- a/ml-platform/terraform/scripts/create_cluster_yamls.sh +++ b/ml-platform/terraform/scripts/create_cluster_yamls.sh @@ -20,11 +20,7 @@ github_user=${3} github_email=${4} cluster_env=${5} cluster_name=${6} -index=${7} -sleep_time=20 -sleep_index=$((${index} + 1)) -sleep_total=$((${sleep_time} * ${sleep_index})) -sleep $sleep_total + random=$( echo $RANDOM | md5sum | head -c 20 echo @@ -42,6 +38,7 @@ if [ ! -d "${download_acm_repo_name}/manifests" ] && [ ! 
-d "${download_acm_repo cp -r templates/acm-template/* ${download_acm_repo_name} flag=1 fi + cd ${download_acm_repo_name}/manifests/clusters if [ "${flag}" -eq 0 ]; then echo "not copying files" diff --git a/ml-platform/terraform/scripts/create_git_cred.sh b/ml-platform/terraform/scripts/create_git_cred.sh index d48907d45..3c9711558 100755 --- a/ml-platform/terraform/scripts/create_git_cred.sh +++ b/ml-platform/terraform/scripts/create_git_cred.sh @@ -18,22 +18,15 @@ gke_cluster=${1} project_id=${2} git_user=${3} namespace=${4} -index=${5} -sleep_time=60 -sleep_index=$((${index} + 1)) -sleep_total=$((${sleep_time} * ${sleep_index})) -sleep $sleep_total gcloud container fleet memberships get-credentials ${gke_cluster} --project ${project_id} -ns_exists=$(kubectl get ns ${namespace} -o name | awk -F '/' '{print $2}') -while [ "${ns_exists}" != "${namespace}" ]; do - sleep 10 - ns_exists=$(kubectl get ns ${namespace} -o name | awk -F '/' '{print $2}') +echo "Waiting for namespace '${namespace}' to be created..." +while ! 
kubectl get ns ${namespace} >/dev/null 2>&1; do + sleep 2 done -secret_exists=$(kubectl get secret git-creds -n ${namespace} -o name) -if [[ "${secret_exists}" == "secret/git-creds" ]]; then +if kubectl get secret git-creds -n ${namespace} >/dev/null 2>&1; then kubectl create secret generic git-creds --namespace="${namespace}" --save-config --dry-run=client --from-literal=username="${git_user}" --from-literal=token="${GIT_TOKEN}" -o yaml | kubectl apply -f - else kubectl create secret generic git-creds --namespace="${namespace}" --save-config --from-literal=username="${git_user}" --from-literal=token="${GIT_TOKEN}" diff --git a/ml-platform/terraform/scripts/install_ray_cluster.sh b/ml-platform/terraform/scripts/install_ray_cluster.sh index 01ef1231c..3e95bf170 100755 --- a/ml-platform/terraform/scripts/install_ray_cluster.sh +++ b/ml-platform/terraform/scripts/install_ray_cluster.sh @@ -19,8 +19,10 @@ github_email=${2} github_org=${3} github_user=${4} namespace=${5} -google_service_account=${6} -kubernetes_service_account="${namespace}-default" +google_service_account_head=${6} +kubernetes_service_account_head=${7} +google_service_account_worker=${8} +kubernetes_service_account_worker=${9} random=$( echo $RANDOM | md5sum | head -c 20 @@ -43,8 +45,10 @@ fi cp -r ../../templates/_namespace_template/app/* ${namespace}/ sed -i "s?NAMESPACE?${namespace}?g" ${namespace}/* -sed -i "s?GOOGLE_SERVICE_ACCOUNT?$google_service_account?g" ${namespace}/* -sed -i "s?KUBERNETES_SERVICE_ACCOUNT?$kubernetes_service_account?g" ${namespace}/* +sed -i "s?GOOGLE_SERVICE_ACCOUNT_RAY_HEAD?$google_service_account_head?g" ${namespace}/* +sed -i "s?KUBERNETES_SERVICE_ACCOUNT_RAY_HEAD?$kubernetes_service_account_head?g" ${namespace}/* +sed -i "s?GOOGLE_SERVICE_ACCOUNT_RAY_WORKER?$google_service_account_worker?g" ${namespace}/* +sed -i "s?KUBERNETES_SERVICE_ACCOUNT_RAY_WORKER?$kubernetes_service_account_worker?g" ${namespace}/* git add . 
git config --global user.name ${github_user} diff --git a/ml-platform/terraform/templates/acm-template/templates/_namespace_template/app/kustomization.yaml b/ml-platform/terraform/templates/acm-template/templates/_namespace_template/app/kustomization.yaml index 5a137938a..5d213bc4b 100644 --- a/ml-platform/terraform/templates/acm-template/templates/_namespace_template/app/kustomization.yaml +++ b/ml-platform/terraform/templates/acm-template/templates/_namespace_template/app/kustomization.yaml @@ -18,7 +18,8 @@ namespace: NAMESPACE resources: - fluentd_config.yaml -- serviceaccount.yaml +- serviceaccount_ray_head.yaml +- serviceaccount_ray_worker.yaml helmCharts: - name: ray-cluster diff --git a/ml-platform/terraform/templates/acm-template/templates/_namespace_template/app/serviceaccount.yaml b/ml-platform/terraform/templates/acm-template/templates/_namespace_template/app/serviceaccount_ray_head.yaml similarity index 85% rename from ml-platform/terraform/templates/acm-template/templates/_namespace_template/app/serviceaccount.yaml rename to ml-platform/terraform/templates/acm-template/templates/_namespace_template/app/serviceaccount_ray_head.yaml index 245824cc0..b88329a3c 100644 --- a/ml-platform/terraform/templates/acm-template/templates/_namespace_template/app/serviceaccount.yaml +++ b/ml-platform/terraform/templates/acm-template/templates/_namespace_template/app/serviceaccount_ray_head.yaml @@ -15,7 +15,7 @@ apiVersion: v1 kind: ServiceAccount metadata: - name: KUBERNETES_SERVICE_ACCOUNT + name: KUBERNETES_SERVICE_ACCOUNT_RAY_HEAD namespace: NAMESPACE annotations: - iam.gke.io/gcp-service-account: GOOGLE_SERVICE_ACCOUNT + iam.gke.io/gcp-service-account: GOOGLE_SERVICE_ACCOUNT_RAY_HEAD diff --git a/ml-platform/terraform/templates/acm-template/templates/_namespace_template/app/serviceaccount_ray_worker.yaml b/ml-platform/terraform/templates/acm-template/templates/_namespace_template/app/serviceaccount_ray_worker.yaml new file mode 100644 index 000000000..eefd56a56 
--- /dev/null +++ b/ml-platform/terraform/templates/acm-template/templates/_namespace_template/app/serviceaccount_ray_worker.yaml @@ -0,0 +1,21 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +apiVersion: v1 +kind: ServiceAccount +metadata: + name: KUBERNETES_SERVICE_ACCOUNT_RAY_WORKER + namespace: NAMESPACE + annotations: + iam.gke.io/gcp-service-account: GOOGLE_SERVICE_ACCOUNT_RAY_WORKER diff --git a/ml-platform/terraform/templates/acm-template/templates/_namespace_template/app/values.yaml b/ml-platform/terraform/templates/acm-template/templates/_namespace_template/app/values.yaml index fef2b1b17..48e801b4d 100644 --- a/ml-platform/terraform/templates/acm-template/templates/_namespace_template/app/values.yaml +++ b/ml-platform/terraform/templates/acm-template/templates/_namespace_template/app/values.yaml @@ -50,6 +50,7 @@ head: labels: cloud.google.com/gke-ray-node-type: head created-by: ray-on-gke + serviceAccountName: "KUBERNETES_SERVICE_ACCOUNT_RAY_HEAD" rayStartParams: dashboard-host: '0.0.0.0' block: 'true' @@ -139,6 +140,7 @@ worker: labels: cloud.google.com/gke-ray-node-type: worker created-by: ray-on-gke + serviceAccountName: "KUBERNETES_SERVICE_ACCOUNT_RAY_WORKER" rayStartParams: block: 'true' resources: '"{\"accelerator_type_l4\": 2}"' diff --git a/ml-platform/terraform/variables.tf b/ml-platform/terraform/variables.tf index af2b8a131..7c456a2f2 100644 --- a/ml-platform/terraform/variables.tf +++ 
b/ml-platform/terraform/variables.tf @@ -11,7 +11,6 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. - variable "billing_account" { default = null description = "GCP billing account" @@ -48,12 +47,17 @@ variable "create_projects" { type = number } -variable "default_env" { +variable "environment_name" { default = "dev" description = "Lowest environments" type = string } +variable "environment_project_id" { + description = "The GCP project where the resources will be created" + type = string +} + variable "env" { default = ["dev"] description = "List of environments" @@ -130,13 +134,8 @@ variable "org_id" { type = string } -variable "project_id" { - description = "The GCP project where the resources will be created. It is a map with environments as keys and project_ids s values" - type = map(any) -} - variable "project_name" { - default = null + default = "mlp" description = "GCP project name" type = string } From d48b5be0e6783e17ec0fd94e515555bf7299741e Mon Sep 17 00:00:00 2001 From: arueth Date: Fri, 22 Mar 2024 22:47:10 +0000 Subject: [PATCH 28/39] Added remove_default_node_pool --- ml-platform/terraform/modules/cluster/gke.tf | 21 ++++++++++---------- 1 file changed, 10 insertions(+), 11 deletions(-) diff --git a/ml-platform/terraform/modules/cluster/gke.tf b/ml-platform/terraform/modules/cluster/gke.tf index 03ebb693b..79e84215f 100644 --- a/ml-platform/terraform/modules/cluster/gke.tf +++ b/ml-platform/terraform/modules/cluster/gke.tf @@ -21,15 +21,16 @@ data "google_project" "project" { resource "google_container_cluster" "mlp" { provider = google-beta - deletion_protection = false - enable_shielded_nodes = true - initial_node_count = 2 - location = var.region - name = var.cluster_name - network = var.network - node_locations = ["${var.region}-a", "${var.region}-b", "${var.region}-c"] - project = var.project_id - subnetwork = 
var.subnet + deletion_protection = false + enable_shielded_nodes = true + initial_node_count = 1 + location = var.region + name = var.cluster_name + network = var.network + node_locations = ["${var.region}-a", "${var.region}-b", "${var.region}-c"] + project = var.project_id + remove_default_node_pool = true + subnetwork = var.subnet addons_config { gcp_filestore_csi_driver_config { @@ -175,8 +176,6 @@ resource "google_container_cluster" "mlp" { gcfs_config { enabled = true } - - } } From 9d96a924fa9ed9a3abb0ad57833d65bc6e29607c Mon Sep 17 00:00:00 2001 From: kenthua Date: Tue, 26 Mar 2024 22:32:58 +0000 Subject: [PATCH 29/39] update to push work to workers --- .../acm-template/templates/_namespace_template/app/values.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/ml-platform/terraform/templates/acm-template/templates/_namespace_template/app/values.yaml b/ml-platform/terraform/templates/acm-template/templates/_namespace_template/app/values.yaml index 48e801b4d..88922de0f 100644 --- a/ml-platform/terraform/templates/acm-template/templates/_namespace_template/app/values.yaml +++ b/ml-platform/terraform/templates/acm-template/templates/_namespace_template/app/values.yaml @@ -54,6 +54,7 @@ head: rayStartParams: dashboard-host: '0.0.0.0' block: 'true' + num-cpus: '0' # containerEnv specifies environment variables for the Ray container, # Follows standard K8s container env schema. 
image: From a9f352fc965d3dbeb5431405145282eedd7153dd Mon Sep 17 00:00:00 2001 From: kenthua Date: Tue, 26 Mar 2024 23:57:35 +0000 Subject: [PATCH 30/39] adding autoscaling config --- .../_namespace_template/app/values.yaml | 21 +++++++++++-------- 1 file changed, 12 insertions(+), 9 deletions(-) diff --git a/ml-platform/terraform/templates/acm-template/templates/_namespace_template/app/values.yaml b/ml-platform/terraform/templates/acm-template/templates/_namespace_template/app/values.yaml index 88922de0f..f0eeba860 100644 --- a/ml-platform/terraform/templates/acm-template/templates/_namespace_template/app/values.yaml +++ b/ml-platform/terraform/templates/acm-template/templates/_namespace_template/app/values.yaml @@ -26,13 +26,21 @@ imagePullSecrets: [] head: groupName: headgroup + rayVersion: 2.7.1 # If enableInTreeAutoscaling is true, the autoscaler sidecar will be added to the Ray head pod. # Ray autoscaler integration is supported only for Ray versions >= 1.11.0 # Ray autoscaler integration is Beta with KubeRay >= 0.3.0 and Ray >= 2.0.0. - # enableInTreeAutoscaling: true + enableInTreeAutoscaling: true # autoscalerOptions is an OPTIONAL field specifying configuration overrides for the Ray autoscaler. # The example configuration shown below below represents the DEFAULT values. - # autoscalerOptions: + autoscalerOptions: + resources: + limits: + cpu: "500m" + memory: "512Mi" + requests: + cpu: "500m" + memory: "512Mi" # upscalingMode: Default # idleTimeoutSeconds: 60 # securityContext: {} @@ -40,13 +48,6 @@ head: # envFrom: [] # resources specifies optional resource request and limit overrides for the autoscaler container. # For large Ray clusters, we recommend monitoring container resource usage to determine if overriding the defaults is required. 
- # resources: - # limits: - # cpu: "500m" - # memory: "512Mi" - # requests: - # cpu: "500m" - # memory: "512Mi" labels: cloud.google.com/gke-ray-node-type: head created-by: ray-on-gke @@ -136,6 +137,8 @@ worker: # uncomment the line below # disabled: true groupName: workergroup + minReplicas: 1 + maxReplicas: 3 replicas: 1 type: worker labels: From dd7f9105a903be4fd35b260c0819e3e5ebeb56af Mon Sep 17 00:00:00 2001 From: Kent Hua <8052337+kenthua@users.noreply.github.com> Date: Tue, 26 Mar 2024 16:14:28 -0700 Subject: [PATCH 31/39] Update values.yaml --- .../_namespace_template/app/values.yaml | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/ml-platform/terraform/templates/acm-template/templates/_namespace_template/app/values.yaml b/ml-platform/terraform/templates/acm-template/templates/_namespace_template/app/values.yaml index f0eeba860..e8cf46a4b 100644 --- a/ml-platform/terraform/templates/acm-template/templates/_namespace_template/app/values.yaml +++ b/ml-platform/terraform/templates/acm-template/templates/_namespace_template/app/values.yaml @@ -27,20 +27,14 @@ imagePullSecrets: [] head: groupName: headgroup rayVersion: 2.7.1 + enableInTreeAutoscaling: true # If enableInTreeAutoscaling is true, the autoscaler sidecar will be added to the Ray head pod. # Ray autoscaler integration is supported only for Ray versions >= 1.11.0 # Ray autoscaler integration is Beta with KubeRay >= 0.3.0 and Ray >= 2.0.0. - enableInTreeAutoscaling: true + # enableInTreeAutoscaling: true # autoscalerOptions is an OPTIONAL field specifying configuration overrides for the Ray autoscaler. # The example configuration shown below below represents the DEFAULT values. 
autoscalerOptions: - resources: - limits: - cpu: "500m" - memory: "512Mi" - requests: - cpu: "500m" - memory: "512Mi" # upscalingMode: Default # idleTimeoutSeconds: 60 # securityContext: {} @@ -48,6 +42,13 @@ head: # envFrom: [] # resources specifies optional resource request and limit overrides for the autoscaler container. # For large Ray clusters, we recommend monitoring container resource usage to determine if overriding the defaults is required. + resources: + limits: + cpu: "500m" + memory: "512Mi" + requests: + cpu: "500m" + memory: "512Mi" labels: cloud.google.com/gke-ray-node-type: head created-by: ray-on-gke @@ -315,4 +316,4 @@ additionalWorkerGroups: name: fluentbit-config service: - type: ClusterIP \ No newline at end of file + type: ClusterIP From 8b59056c5bcdda9a25be2d5d1c6d469bf5e35ba6 Mon Sep 17 00:00:00 2001 From: Aaron Rueth Date: Thu, 28 Mar 2024 08:17:35 -0700 Subject: [PATCH 32/39] Project cleanup (#469) * Removed projects module and cleaned up unused variabled * Added option to have a Terraform managed project --- ml-platform/README.md | 145 ++++++++++++++---- .../outputs.tf => initialize/backend.tf} | 6 +- .../terraform/initialize/backend.tf.bucket | 20 +++ .../initialize/initialize.auto.tfvars | 7 + ml-platform/terraform/initialize/main.tf | 131 ++++++++++++++++ ml-platform/terraform/initialize/variables.tf | 78 ++++++++++ .../projects => initialize}/versions.tf | 6 +- ml-platform/terraform/main.tf | 73 ++++----- .../terraform/modules/projects/projects.tf | 44 ------ .../terraform/modules/projects/variables.tf | 49 ------ ml-platform/terraform/variables.tf | 50 +----- 11 files changed, 389 insertions(+), 220 deletions(-) rename ml-platform/terraform/{modules/projects/outputs.tf => initialize/backend.tf} (89%) create mode 100644 ml-platform/terraform/initialize/backend.tf.bucket create mode 100644 ml-platform/terraform/initialize/initialize.auto.tfvars create mode 100644 ml-platform/terraform/initialize/main.tf create mode 100644 
ml-platform/terraform/initialize/variables.tf rename ml-platform/terraform/{modules/projects => initialize}/versions.tf (90%) delete mode 100644 ml-platform/terraform/modules/projects/projects.tf delete mode 100644 ml-platform/terraform/modules/projects/variables.tf diff --git a/ml-platform/README.md b/ml-platform/README.md index 672ed7df6..d6a1058fb 100644 --- a/ml-platform/README.md +++ b/ml-platform/README.md @@ -57,18 +57,46 @@ This is the quick-start deployment guide. It can be used to set up an environmen ### Requirements -- New Google Cloud Project, preferably with no APIs enabled +In this guide you can choose to bring your own project (BYOP) or have Terraform create a new project for you. The requirements differ based on the option that you choose. + +#### Bring your own project (BYOP) + +- Project ID of a new Google Cloud Project, preferably with no APIs enabled - `roles/owner` IAM permissions on the project - GitHub Personal Access Token, steps to create the token are provided below -### Configuration +#### Terraform managed project + +- Billing account ID +- Organization or folder ID +- `roles/billing.user` IAM permissions on the billing account specified +- `roles/resourcemanager.projectCreator` IAM permissions on the organization or folder specified +- GitHub Personal Access Token, steps to create the token are provided below + +### Pull the source code + +- Clone the repository and change directory to the guide directory + + ``` + git clone https://github.com/GoogleCloudPlatform/ai-on-gke + cd ai-on-gke/ml-platform + ``` + +- Set environment variables + + ``` + export MLP_BASE_DIR=$(pwd) && \ + echo "export MLP_BASE_DIR=${MLP_BASE_DIR}" >> ${HOME}/.bashrc + ``` + +### GitHub Configuration - Create a [Personal Access Token][personal-access-token] in [GitHub][github]: Note: It is recommended to use a [machine user account][machine-user-account] for this but you can use a personal user account just to try this reference architecture. 
**Fine-grained personal access token** - + - Go to https://github.com/settings/tokens and login using your credentials - Click "Generate new token" >> "Generate new token (Beta)". - Enter a Token name. @@ -106,17 +134,6 @@ This is the quick-start deployment guide. It can be used to set up an environmen nano ${HOME}/secrets/mlp-github-token ``` -- Set the project environment variables in Cloud Shell - - Replace the following values - - - `` is the ID of your existing Google Cloud project - - ``` - export MLP_PROJECT_ID="" - export MLP_STATE_BUCKET="${MLP_PROJECT_ID}-tf-state" - ``` - - Set the GitHub environment variables in Cloud Shell Replace the following values: @@ -131,6 +148,31 @@ This is the quick-start deployment guide. It can be used to set up an environmen export MLP_GITHUB_EMAIL="" ``` +- Set the configuration variables + + ``` + sed -i "s/YOUR_GITHUB_EMAIL/${MLP_GITHUB_EMAIL}/g" ${MLP_BASE_DIR}/terraform/mlp.auto.tfvars + sed -i "s/YOUR_GITHUB_ORG/${MLP_GITHUB_ORG}/g" ${MLP_BASE_DIR}/terraform/mlp.auto.tfvars + sed -i "s/YOUR_GITHUB_USER/${MLP_GITHUB_USER}/g" ${MLP_BASE_DIR}/terraform/mlp.auto.tfvars + ``` + +### Project Configuration + +You only need to complete the section for the option that you have selected. + +#### Bring your own project (BYOP) + +- Set the project environment variables in Cloud Shell + + Replace the following values + + - `` is the ID of your existing Google Cloud project + + ``` + export MLP_PROJECT_ID="" + export MLP_STATE_BUCKET="${MLP_PROJECT_ID}-tf-state" + ``` + - Set the default `gcloud` project ``` @@ -149,33 +191,56 @@ This is the quick-start deployment guide. 
It can be used to set up an environmen gcloud storage buckets create gs://${MLP_STATE_BUCKET} --project ${MLP_PROJECT_ID} ``` -### Run Terraform - -- Clone the repository and change directory to the guide directory +- Set the configuration variables ``` - git clone https://github.com/GoogleCloudPlatform/ai-on-gke - cd ai-on-gke/ml-platform + sed -i "s/YOUR_STATE_BUCKET/${MLP_STATE_BUCKET}/g" ${MLP_BASE_DIR}/terraform/backend.tf + sed -i "s/YOUR_PROJECT_ID/${MLP_PROJECT_ID}/g" ${MLP_BASE_DIR}/terraform/mlp.auto.tfvars ``` -- Set environment variables +#### Terraform managed project + +- Set the configuration variables ``` - export MLP_BASE_DIR=$(pwd) && \ - echo "export MLP_BASE_DIR=${MLP_BASE_DIR}" >> ${HOME}/.bashrc + nano ${MLP_BASE_DIR}/terraform/initialize/initialize.auto.tfvars ``` -- Set the configuration variables + ``` + project = { + billing_account_id = "XXXXXX-XXXXXX-XXXXXX" + folder_id = "############" + name = "mlp" + org_id = "############" + } + ``` + + > `project.billing_account_id` the billing account ID + > + > Enter either `project.folder_id` **OR** `project.org_id` + > `project.folder_id` the folder ID + > `project.org_id` the organization ID + +- Authorize `gcloud` ``` - sed -i "s/YOUR_STATE_BUCKET/${MLP_STATE_BUCKET}/g" ${MLP_BASE_DIR}/terraform/backend.tf + gcloud auth login --activate --no-launch-browser --quiet --update-adc + ``` - sed -i "s/YOUR_GITHUB_EMAIL/${MLP_GITHUB_EMAIL}/g" ${MLP_BASE_DIR}/terraform/mlp.auto.tfvars - sed -i "s/YOUR_GITHUB_ORG/${MLP_GITHUB_ORG}/g" ${MLP_BASE_DIR}/terraform/mlp.auto.tfvars - sed -i "s/YOUR_GITHUB_USER/${MLP_GITHUB_USER}/g" ${MLP_BASE_DIR}/terraform/mlp.auto.tfvars - sed -i "s/YOUR_PROJECT_ID/${MLP_PROJECT_ID}/g" ${MLP_BASE_DIR}/terraform/mlp.auto.tfvars +- Create a new project + + ``` + cd ${MLP_BASE_DIR}/terraform/initialize + terraform init && \ + terraform plan -input=false -out=tfplan && \ + terraform apply -input=false tfplan && \ + rm tfplan && \ + terraform init -force-copy -migrate-state && \ 
+ rm -rf state ``` +### Run Terraform + - Create the resources ``` @@ -277,15 +342,37 @@ Open Cloud Shell to execute the following commands: ``` cd ${MLP_BASE_DIR}/terraform && \ terraform init && \ - terraform destroy -auto-approve -var github_token="$(tr --delete '\n' < ${HOME}/secrets/mlp-github-token)" + terraform destroy -auto-approve -var github_token="$(tr --delete '\n' < ${HOME}/secrets/mlp-github-token)" && \ + rm -rf .terraform .terraform.lock.hcl ``` +#### Project + +You only need to complete the section for the option that you have selected. + +##### Bring your own project (BYOP) + - Delete the project ``` gcloud projects delete ${MLP_PROJECT_ID} ``` +##### Terraform managed project + +- Destroy the project + + ``` + cd ${MLP_BASE_DIR}/terraform/initialize && \ + TERRAFORM_BUCKET_NAME=$(grep bucket backend.tf | awk -F"=" '{print $2}' | xargs) && \ + cp backend.tf.local backend.tf && \ + terraform init -force-copy -lock=false -migrate-state && \ + gsutil -m rm -rf gs://${TERRAFORM_BUCKET_NAME}/* && \ + terraform init && \ + terraform destroy -auto-approve && \ + rm -rf .terraform .terraform.lock.hcl + ``` + [gitops]: https://about.gitlab.com/topics/gitops/ [repo-sync]: https://cloud.google.com/anthos-config-management/docs/reference/rootsync-reposync-fields [root-sync]: https://cloud.google.com/anthos-config-management/docs/reference/rootsync-reposync-fields diff --git a/ml-platform/terraform/modules/projects/outputs.tf b/ml-platform/terraform/initialize/backend.tf similarity index 89% rename from ml-platform/terraform/modules/projects/outputs.tf rename to ml-platform/terraform/initialize/backend.tf index a1cc68f91..8d5d67421 100644 --- a/ml-platform/terraform/modules/projects/outputs.tf +++ b/ml-platform/terraform/initialize/backend.tf @@ -12,6 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-output "project_id" { - value = local.project_id +terraform { + backend "local" { + path = "state/default.tfstate" + } } diff --git a/ml-platform/terraform/initialize/backend.tf.bucket b/ml-platform/terraform/initialize/backend.tf.bucket new file mode 100644 index 000000000..991e86976 --- /dev/null +++ b/ml-platform/terraform/initialize/backend.tf.bucket @@ -0,0 +1,20 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +terraform { + backend "gcs" { + prefix = "terraform/initialize" + bucket = "" + } +} diff --git a/ml-platform/terraform/initialize/initialize.auto.tfvars b/ml-platform/terraform/initialize/initialize.auto.tfvars new file mode 100644 index 000000000..8ef26e4b0 --- /dev/null +++ b/ml-platform/terraform/initialize/initialize.auto.tfvars @@ -0,0 +1,7 @@ +environment_name = "dev" +project = { + billing_account_id = "" + folder_id = "" + name = "mlp" + org_id = "" +} diff --git a/ml-platform/terraform/initialize/main.tf b/ml-platform/terraform/initialize/main.tf new file mode 100644 index 000000000..2232ad86a --- /dev/null +++ b/ml-platform/terraform/initialize/main.tf @@ -0,0 +1,131 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +locals { + backend_file = "../backend.tf" + project_id_prefix = "${var.project.name}-${var.environment_name}" + project_id_suffix_length = 29 - length(local.project_id_prefix) + tfvars_file = "../mlp.auto.tfvars" +} + +resource "random_string" "project_id_suffix" { + length = local.project_id_suffix_length + lower = true + numeric = true + special = false + upper = false +} + +resource "google_project" "environment" { + billing_account = var.project.billing_account_id + folder_id = var.project.folder_id == "" ? null : var.project.folder_id + name = local.project_id_prefix + org_id = var.project.org_id == "" ? 
null : var.project.org_id + project_id = "${local.project_id_prefix}-${random_string.project_id_suffix.result}" +} + + +resource "google_storage_bucket" "mlp" { + force_destroy = false + location = var.storage_bucket_location + name = "${google_project.environment.project_id}-mlp" + project = google_project.environment.project_id + uniform_bucket_level_access = true + + versioning { + enabled = true + } +} + +resource "null_resource" "write_environment_name" { + triggers = { + md5 = var.environment_name + tfvars_file = local.tfvars_file + } + + provisioner "local-exec" { + command = < Date: Fri, 29 Mar 2024 11:12:48 -0700 Subject: [PATCH 33/39] Code Sample for Ray Dataprocessing on GKE (#507) * adding ray data processing src * adding job details * externalize var * update versions * added more preprocessing steps and * Adding image for DataProcessing ReadMe * Create README.md * Update README.md * Update README.md * Rename DataPreprocessing.png to ray-dataprocessing-workflow.png * Update README.md * adding final changes * adding final changes * removing socket timeout since it didnt help * Create CONVERSION.md * Update README.md with links --------- Co-authored-by: kenthua --- .../images/ray-dataprocessing-workflow.png | Bin 0 -> 92010 bytes .../examples/ray-dataprocessing/CONVERSION.md | 27 +++ .../examples/ray-dataprocessing/README.md | 107 ++++++++++++ .../examples/ray-dataprocessing/job.yaml | 23 +++ .../ray-dataprocessing/src/Dockerfile | 14 ++ .../ray-dataprocessing/src/preprocessing.py | 158 ++++++++++++++++++ .../ray-dataprocessing/src/requirements.txt | 8 + 7 files changed, 337 insertions(+) create mode 100644 ml-platform/docs/images/ray-dataprocessing-workflow.png create mode 100644 ml-platform/examples/ray-dataprocessing/CONVERSION.md create mode 100644 ml-platform/examples/ray-dataprocessing/README.md create mode 100644 ml-platform/examples/ray-dataprocessing/job.yaml create mode 100644 ml-platform/examples/ray-dataprocessing/src/Dockerfile create mode 
100644 ml-platform/examples/ray-dataprocessing/src/preprocessing.py create mode 100644 ml-platform/examples/ray-dataprocessing/src/requirements.txt diff --git a/ml-platform/docs/images/ray-dataprocessing-workflow.png b/ml-platform/docs/images/ray-dataprocessing-workflow.png new file mode 100644 index 0000000000000000000000000000000000000000..3d99daae1e15f42be12ee32a8d7b5b4c94ebc04b GIT binary patch literal 92010 zcmeFZXH-*7_dblGf_ej@0!kM|r1uU1r3na#^o}4UROy|lC}TJfc%e7x%GaY+uvI6m-o|?wURTOoSE6P_nvECvu8rY11(jG3-lLANJuEu z?kPPaAvtG2LPAPHeh#=}Fm3XQgycdgSW)qTnxZ1>0}nSlu(K@*$-M|@5}BSZ_VQC= zpquo0z8hD$-b>t(K9%!g==AM-WDGaXUs8YdjMYlDoyJT@NvS-mi0wk@*|-|E$d}4G zTw`aAuO@g`uyS+bPKTE_;hOOSX`V}cNuQ5IwuzV%eBarpS=}^Ix>m|050$UKxn!xW zqUZGosXWOgh11+0PTltK$66aadBUn-J9%Ob@;UABakehGdlyF}8tTkn9PuYnXuI9D ziS#eHoq1=MMw|CD3CTq4;ck-sE6)1swy9@aE}=KhYbDRWKOd9!_>bh*=Py3WzEAp8 zVabEzJW1U3{LJ{z8VcKSat~@RZ@y$p=(ydKo0qq#1B$3TPak^NedQp<9rF3@_qufV z%5*c==da;=!dzzr9yLn7Ogn}!cM%i6T*Z1(uwJhu532nUmA-WUQTq{COPi7+IpR_8(Pi+fV#u@ZanHQcSZ7=T7BW#5 z$8MSye{G3#O2#)2^6Zt^TM;?+F$T+TXt^s__@n6T{dycDbf4R9KEG<2A}=SRS1V2$ z%xXV&u6IB%VwFX<|0%_Y!iB=iA4r}p3kZ-1pE16Eh2arAqKf0=i}pXxU2J)D`kZQe zReFDUMP&u4)qJa#93>9>7Jn%VmAZ)zA^8B62Z2IDI_VrpKBQ2P#98$1zZN*nT6W<> z^j(Jvxd=*+&dBn1ttg5V5`%-Pbm%!o}6O-z_1_6dg)wN+vN|W5v(-pK~hHN z$DRc}Ag_JS^p0_jl&nqX$?5ddyWeE%PnU*HedAvyiM>O$Ojgz=xKG~6O8!yt>Z=O{ ziqdg69#c*z-0!B?3OoJ&(rx<4mzP-c@pKt^*mWGynsGFm#Ckvs7 zWhIGMTzaC6h z=dLl^UAGgn6Uk%rys8@x?ws!0u?`bLRa`B;HO9F58f}$be*P=^YmhV7p5Pwu9(^t8 zRP6DqPjkzXHtFmGRCLiAA4@*Cbgs9J&hsx!x(C-&%QBb6NOU#LhkiE;ItOAwT!b(} z&O@kBu|%af_T@{?cji$Pa`MZXC$HyepLD_rUu0^#r~I?jHRfkseO}!sc9ARhR%mrS)#B)(Py0 z;cV9P_op=;Mcj?KTksM5Qs#nUq>?hbaxI&odyVzWd8rQm4r-3VH#HCA9u|(+<=UuA zK+h#?_NesC_5`72(V;ztP(nVrer8YJR{m;1c>Z|)u_$AIa_4)F;R|3+2bbzcG`2J{ zP;w|Q^aAuEx184tS0&7l?y~ZZ?vB14SOll}= z1Xlj=cH4ct>$0-DqAey!Z%2PoG|_2ov_4NrG)lbJme$Rntys5;8T>KTF7-w#k0_O6 ztz(P>*y+GAqQ(boK1x55=M?C`2~Kjb{KEX@{YpKS+*{p?AKPr5FKG1os>=236rtjx zf>k)rsNB1$JefTEV!G0#BEurQUS`hVm)LQEVYd+$@gbpP`|^=@li%WR499_!9qN(g 
z9Z$Y=f4x28kgd}){Inufs7tb|esVwJd`J7t;}o^r&!oEPTu9LguUM?P$k)DE-r?ep zYcW#J8&>?HD4SUjgUl4T*1-q8#(%N$Y%d!!Ycgy6jq7!X6enZ11G|{E^kk{2rRu%| zS6erzwRiVQMNMT5!n@Qvdsl`*gaM}cvY*s6vLtq$e(Ky*ozIfb>82K+nLRy%{~p77 z+8RPiQu-dFBjY(H3Pu|yUl|`EaG%wZ-4X?7@mn~O9=@%WnZcAUlCCX9#9Gh2S@Y<# zwJO~-YnxW|B$(}*HI$N>1x{act#BpG1WbF+G;BaO2IY3-_L;-wEGq9Cs}(O4OTY|Z zmj|n0+0OQR+_WXR?{%O}}Sksq1E(_UzNN@KgqMysr+{8}YInO1dA zsXxMm##_dO-Q?QLrO?=jS5k~3l|zAZF)xp!*67dzD`w8kl1`ib9P}Cw6U~eY%M+S* zym-aMh4+ls3|%kPy^*=x6*0vm(Py*mvt_X@x6g5O=ULG^qwm7AVy%)PtZibC0zX=1 z1zLmWZMIy$0*N44xyg#po^w;$84IGz)Jc4at z2D6jRtrV>ch1=+DjstTBbMXzrsqN73;kBBzX+AE879Rxa^&Rw$O!mD++_p)b4x^XVUPU=B?qGse4v@MU9cx&@41Wv6pyb0-_5tz;|Q(cBtT z^OSbXLGHKbAM|SXWfoF8G4jhEt-o5nH#i)Z-LD^`uiqsaOQZj3bY!J%Au0wXH|L%8 zzB6k{fv=_Z`%OtMxq}tV*Gw7hv+0h~{j%nso!|R5s=id6^?qRPGxH?aI#^rLOOYT& z^wf;zkG6+Y9HG1YXSQkR^!Tru-!$)Qk__R(jC9*>qTlnJH#|&D6qjqI_ujp;{d}p` z>D0KZ8NMN;=H;jdHf%L|TOz{^*3{9g51E?tF)s>Uov>Vo$W1K`8TfW>YJD^@bS*l|!Bh|8~Vzj`NS zsj3HcA5ygLzT7ahIy!YIyK}GrvOuk(kPzZzCjR+W)=us~Z{ex5XWOBmP+v>Fpi!d9 zQGoMpB2})u4?@V_b4fc^-ZVTrjDS*~jX_KuSsWk7Q~9u@sytVP1>ZlG6gT;>K2>WJ5GFnx4=QYu3`Rb)C3VsLa>!nAoU>%NvWgpT4`$)kXS!gPbJ6fh6B|_~6mp z7g;&3p)V32ZayQn>x$rWBuO5Grbbk}3cmjIZ9aX8vS%QPpwQ)){v=<=4DygwxW#nP z4auD-Tg3Pox#?CvzWwm~{of&2LMH9~$2FwHYU1hC?sMz0q-Am{&^*R6GlP? 
z{B;?4`@K8;@2Afhyd(X0`A1_UBzJTb)zpBiu8oJSt*fVln-|a7G7;d$`6u^`JxNGt zIexrPsXe^;9cX_XtY_q9q%^RvfaY8w5lDI_i|{A<-;zWUFq`kuBP zif%4IpI)G!_WD=lU%&iUMOlF#L;pn;|J3t8?gEMiU62*{(`cXz62ulFU`Ix(o==^Um>)nIM` zGLJUOpn4XmEtg?8kZL^PwcRhFJ=5e{7Gz=WwltWT*%`xRI&plC!8z`5Z{74@I#J>x z9sm77d16q&Mgs`mfk4MWu&v%nc8U_Zz(-88g)9OnPlMsKHQ8Q)Dp4lGnx|* z#I^WJuQ#EL=cHSIMJ5F&K&ml4UiLrH>N_{JBzjSI1ZrrZFEzzedK4U02<3$Nu`z{w zrG|RX_xocq(rQtU2zI@&ki!(O?zV8fL?*v)r?9K2{Q&g#N)VE{t!Hq6XiQzG=LrMG zoKURz=a`@VZHM)9VGKw*#P^P#iI4*xORaF05F@l_^8r?SP#cpTtW{-;3&1yx&>{9; z+Vz(1AJbsJ=q8=xqApabAECC<4+UiP=UA`9g@8Avb9Esu&7V8L%W*e4umTq8BQ5w@ z(}o0J%qsDt@VMo{2*a&u!ZY0iCOrr1^g)5YD2L>kBYc+8u#s&pC z8p-)rzIk0;AdB=FdZ*%e*At{BRD=mXQqNes26*C5Kj1)bvI?^qD=dmxY%;vZ)FAas zD_cc9tLajnBMY(e@tx6uT)S8#XY_TETgur7wXXqhqQeljcmy!uUh#XYC3UXDoOb#F z#b#c^9oQy#aGhjyTPPWP$`v8zjhk(m0P8apS9ght%al|7LK2R&&yyRRz+K@{e>!@; z>N8x1yIteA=NGw?Li)uZ_ngYAd^Qi}qaaCIgaL2|VK`eAzK=U0-~>P{#Nh>GMHjPf zvt_OIV)-XZ4aZdEJ9pz&};J|{)Q-XO6nZFkI4fC+Ns4lZlibF7sG zG4AYzBBKOajdoj9?Q|UaA;4u zgEM4Sdr-5B+0!gwF^S1kd$ZNF&J|l&RK~lZQw0tB z)16oq-e~|@YW%YPMEIxbCi%q;Jft%f-2 z5QrK~b89tjT%M1L#0#A@x_{On1_ui<;kX^~^jo_&XWsKLTzM&YQHPs3&foNdOIA{^!nR2NaU5$Eb{o&~;tek3#gxt~hPf%Q9 za20@r@E;Ss!cVzPoo`(7(Z?6ncngBwM_Bf(r{3W-24pJrvI*BJ!vvXD<;e`(TT5|8 zEYx1bn8&ck%5uvcc4v2VrTC1> z-+kTqP{)$oda|n12T1fh-NE}yHJy|3UIk@m{LdE77OZdtPYu*WM&e=Yk@e+x47w?-Dmfk^KTVjWB@Pt$!;vkTBU-|P0 zhYP#oxxk>sgLio>OV(Dy7)rGj!znanaMhlvGQJiSvL6%KVy#%qVn>jIHOtphXL5(ozzi;IF; zxPHIcxqK^6qJ=AXAe{qE zLKWg?urgKWW@~(~gpnsI+s-7P*nv&@t$RA@98JBrWo}AwX4aPY=q)=#kxom2Hx_57>aCYQ1(U$1}u3%hAEXq0tz9TzH13$RnC~jDtjh zgwYW}tEQ;?KVm;XF}gxj7#! 
z`dt2aMHF}0fqqQ>g$OJ5&b_JixFXXfT3niVlK`#g0ek8f)$U6RLyA;t zmNHnU4nkgA4-k`5VhJa+HBe!Bs4a&!Ir64S^tvQ{}c{F-M+B~{RO(gWt8LOX0~=uCayVqIruUmVAa&8xY&2h7^U5`QR{^EL`q~5rZLEVi5WBA z$OpGRrd-nug~o=gWn+_VKo5B>_hJErcekKB+LOR~mj%vLrqO0PQ5Z}v&_j+Es*pc; zi#v0+O0>ETpI19I?tW}+aI=w5f6aIf0QIq0PRvox{?UMMi%9S{Qk*tnqsnqq>g1Kv zlt(Y$UkTs4RbVOExY6Qn_Y)B(Z$9}5*8&J#YxU=#2KODs2)i9>t%PjwHy!?{alYEG zY{z>aFLFoKqFf2;@`IW=C}_DuD24bvMAPCtt!{kBi6e!9^%P)8fVA)e>?Il*(s43y@OgqO&ooL$ zyc#ke!8tM)E~b@4JJ&1J#r#%F4*kVCLu}kk(Z=JyZj*6LZ3eq|8sx z)xY?IGc=T`_ZHMrN?H-81y30I_mI^HRir=+=r9!fHuLbY?Qpy*Gn6@aad_VI1wZ1X zzwBWwvm7_7D-E6*ARW#XZjczf;hCu^r2?!**h75x^~-;T>3nt8!yqL@tZkRCwmjc@&A3HBEh=Pc zz5lQ#l2~w$}_opNp3$~OF|{WFdAz^sZw zvBH0uReYxdTV2XT-RIkTZc!^uZ(^pRU|i6b2bw#y6Sf$0vw#Ge`y6 zw>tbr7Q-)3Tc#NHhgRKmfJPMeJ3F3(Ki3`#PZg3bUn=%0n3{Y1Gbj|NelTo3f_ec~ z(f8Q`f|}6r8V2)^wc8w3@XZ+r@ffNMXh!q!Kp@28z^pX|h?0>fe7>k)--i6rae?3i z4Ji_-<_X_*>f(0FK@cI`>qI-rqD#mh*w*4tvF2&Dgj#XbT0)&(uha3FBv#$#QeB@>v_0{mV67ji`^h> zW~WHht}ofTi4+xLRfXPs-|`$PdGaWnDG?U9*D!xg2>rDK(R5pRSN*jkmqT zVzZU8R*ddc)D~y`RvwVysa9 zaL^Z3bLG!D^&bTnz_Qg5y*)qc{GvhNSp~U^y@f zQ{Vq6hh5tw4OBG-e7_QZc^OvddmqAb71X7(>lAynpGJy6{ z@t5rx-}T?BJMGLF9+&w_;onL5U=YxDKr5j%<1an`ViAY8fS8(Q2Dg4E<^P=G->Q4} zKd1ONk@)|~DN4@qAoKd~QBY8T`^CcR!1lj)Q}Uy-3-^=1s{^YmS9(zg1$uF>dn>h) zv1)JO0P5%EwFVP%gtApn$}E~z*?#Bi+?%pWkB041k3z8;MO;zXd}BppO!d+MoHw|Ipux77#al>!#2C7LjWJ z9_eH5jy{{mfqQRcJ8D*#x|@cFj=cAzCdA0}w$ zpDgP?l=&qCFxQ4_!^PiXp(72@2e`2Squ&zSK)l=vsY z{?8?Ty2LlgEu)6D%H9zvOuDYG#L(|T$TP#H5B8w#6t54b$eOwN7l3L~=mg!7&zzY@ zg$llJ{))ptf&Ih7XMU}a5j9Re^P!ihpu_GSX@V>|U zmk9PtNHjMARyYxLs=&uSrRLAk{3SR#UIKc2I%bguEM+uCLVnvLInZNg!)W&Zc__lm z7>Mzlv-H5nyo~5CqWr4Z{5wDlCN&9kVKV{?s1g3$F#3;SFtG!vnQIt>Sm3eWpX3;_ z1*lDipA+!$3ODp?H@`IZ?i?T%g~NJHgjH_K&GqoVDB$Oay*yr$D1EAfXI zh1~asbzj)weAM;rM(LypFQt@I>R3-Fc7V6~USglYmvZDmf+Equk1S3~-Q^$#0^jo} z2s!3;%Z6u+&<05;F%G$%wda3L@BMVsOUpNZ%(AwwM@^39BZ&8=(tb@R*jG$s@JFNp zIbpB{XJ9G1W9=ZL3UfyMSsPB^TX2F8IzZHtQq!eowI*fSx!k!1S$q$KWZ*;uAquDY(cHhr3vWewhU zQA|C{W7+T|sBtsXv$0zFgzfl5F 
zOVd1Y)c)@*Dt!-_Tz7^_nDThe@(9(8FF8r%;94NjS@51p3Ma zxu{ao*BFcRbf+IdsM048qre{n5F4tAf zc3WVNn<&+r+$PFK1#SHenV>v~78}RLy^|Cy^!ml90(6fz+3tGcY|X-lop`x$+9+6{ z{~hUt6*X*WMOCl<%7XXs^PR4;R`C*{dVBYl!TZUXl=2H5hxI;&YX6Puz*&+XmdNu0 z5}B28C1Wo4P_m9n8@VI<-)kRp!FL1pCmOJ|;r6)*qP><-v@Y^OXh=&_F5eh7V{gVs zA>de=0{mqNR^ST*@!o}k584j&@mkt+$Nnaf5U^XZ*|*Wl#Y)HRak4Yx7N&S@mai^P z=;I0BMVY8O`HYfBo*&V-_|5y$((qEZdJoD4rN)Gx$Z%$ChMP_7d_2ms*AC(|GG@DJ z1U}T=g#9-VE5L!&pY$dzAieF|;TGg)93xEp+Gx{=1_$#C?ZIQglfCve#@BtfPeY+% zS+NE0#>}(r55H|FJ`}aW#b;JodZJ!?4QC)dGoeQ>X7bg)jb18lZ($b8v6uFMKLZmP z^C{M*k`vl-c1zRB*aJh_9QU^v40nPu_+cx4dwV@LGH;Hw5iYjvT*ln)H8q!72K43ydk&3M zjSOO7jD_j@y%AEcjCqZ?+RpIgzW8;U;NOCp-~C5Sd0!O@O-{wm$WZhkB`LV?#=WsN zD3#prUq_C(ZSR;58|Efb6E~osB z1`3?{jg0T!Fe*5@)%-X7V~7Tb-Wh&$(lSO;+}h}!6LT`dLdxg!4M`Zd_EFH?fE)O3 zL21=@JY+npua{F|W}NO#iWsIU-M-xNfd6Fs%g(CVoT$Iu)LrO93EVBb88!uGx+L*W zzvqH*Tl`I)#_Rk)m{Zum9|mDNtFTtTzE)s#0ZvxBL1^`o;)E^_0`Tq4Dgx-Vrn@_01>!0rGat|==U4UGl!HHEB+yk-c< z6Sdix-|5*|!8dWYsb{W<-VEHuA=&$3P34js(>^`j)&X@bs3G~_!>8||f*Z4(c66hf z#Lq~G^1|UBW6|Z}ojwHi&IG~|Cz;)GAc5pVp_DsEXnJ#mm1cirkekx8!S-m zFAeXZoW%BjXq`dwr{Oi57P;VIOCij|Yt8ntu3#PhSIKV!Tb0VU2JM$}G){zD{ETK3 zKXr!qpt7_n$d0$1_)@yBEpT zIL0`gE$R9~g_^d1z@Y5DWn3wN&}U85dGv7gnoz!~bO62$TRv^dar)VJ>Gd8%4>zcB zf9O*as`5S`7Yc)q0ogSHbS;)yr*D}Bh}Ay*jDNRTw;fa(d2l>~M1G~i-NoE8nkfLU8@ATslI{WNLPOcgG2nq?a0C26KYIj z{~-mooatQNT9Q{mZB(8a(e|znTBw&dy@}!GSMA&>Ro--fHRRGH3pGD6#d$ ziu|{sPMu}~_72L*$t*T z{7@F8>nUjFA^xbkVRLhD1R3$fYM@yeJYW6tU~aqG?2!oj_|4)V=eM77Z1(6=0bu6= z;FujJQ`3lR*rv-puS=IBKP|ydl&H#oV>~=-qh8!^iIiy(&@QU=1`+Q_U0aD~sg{t# zkvn8vm`jP+9{GW!X|;Hbv@#^Ta>{aASgZpgGTI(bDl0Y5a9^V3)Pd|gvr@O1f9J=Yamb*W{NchP+db30tL*O%SLQzH;uy8illeO> zMVxfUmWg!j?R(VoYjx{S84W1;d?aj_K6;JSy!E^-;xsXXInLXGep1bma2evH_}n(P z6~ZH+ZJEs9gDf^rQ>=&r$w^0Fzhwk|20r!QhIyt1EZV$I2l#gbZ~?0Bdbcxceh$Nx z?a^5N_()RTA;#i*hEUeW`EQJSjEKQFQC4l6nl;5ubcU{;=F!2hF*ZTNl8gJciKzgr zjAe7|nKB7AaRs7wDizmk3^iFGO9)&s)W99)@7Q_(nVYVX#FZv)-eIEey2ht^3GP8*PRD zAVG02Q{K<0qKMo;3lZ?Jk;`yfl+K+hAdRcus_kL_zNWV)O;m_M?}EK>O>&paf-G^cN-%P3K5 
zpR&-2r&dzVRWs!_%(DO7I<=#1DPygKegd}?<1&vga$_quX37R^muD%1|EdGtHq+FN zk<6^GO}1EnFI!@&;2aOfP6}xw#T^|6W#x=ijD@Wy>lOwXEMJ(TD@SRHm6XTu^5u4u zHw;G;*ay6izP!W^Hkj}Z)C9IRn6`>%I#E`rpdBms4l-%7*gaBFs`#V~a@nhA` zh@a2TxL(|jD3+JS_c%y{+)wJg2CrVw$PyYXWK81J{*)0VHwtumC1WNdFKL=VHDu960)mzRPK-J#W>tQ znUcxNz36dP`HY0n{FKC!;~VXDc~KW(J8USPT;>)g@;Sg2v?OE&*)_WfF_EDX#VF1n zYm=GlF}{eD15RDnErH*` zGOOP$Z&$d$_ONm5WI4l%fw}nu#y-^?MiFEiUF&j^>zQyEx9S-w-7WV|E1lCUOC6uL zqe$i`8e+QweVEh7n1}b{Kx45WVLFslNxpbQWTvAEBFq)bB=zp_aqVj%9{i<)J({;3 z0;wsZ&K5{%o{I)-j|q=UzxrchLL)!$Z8*KKgw(qQ9L`wUUB%a9pM@{l?QN#XOQOQX zJS^V|D>Qu{L`H(!=4U&MnA+6OI0@NKRRmHP3wIc`TKcP3I&EYcAGt2zjbbFr=12n; z3()6i9*uo7yP*ejDrPt0%5DC6facG{%Q7RtX)yJ4s_Ve)sUj0Q$FupNjxqB9y}yI$ zT1sFs+;0S^$lAUqA|;OJ!qQc$cF2>RRIcaY9!dglk}Frn>ohfqhqz{18fqtWVd#Qq z`K9QOX<Ep7(>qxM%6<#BHw_D%fc%g=U$>h*|dmkiE0viXiih#OJHZV zP<5cDLQw_T5T}XO;FrA#DN!Ttm}m{+=l*sZ(M8s!p>?vnD;~fB#cNPD?Zsf?N-bEK z^(s1JNtmJy3RPpY~VZFuTs=0TFn3_M1Q$5=u?|>!iN;@xaFPkY-%(b+hnT*S&A7>Wly25=`A%oIY{}OY z7Bdl->6^uVKJ!Ojvzoe*t-RYww5$nfbY&H|Wa(lsR3k{orTp@Zk6j)tsmmChn50_* zw+B|o7(JA3>&KA4Rl$%Qt}e|o>bP8vp6R82Kj`Kxm$vR14n_%qMS{hnWQuU+-k_SbU~fF$`+Yzjo-n z5wX4JeC>|@eGXq+*a&Z)cvOY-b7(#&0=wJ;Q_(TrK}T7*Qa|SlTpFG}VF1z~JY|Qp zHy#H8XJQRL|6K-xi3fl@Bb=S;Dlko6nL*vX-a=|vIDJM@d6>x9&hXxTt!gM$sK=-8 zREuOCWuK9cN}QEmemFq4Au=$JPM=-ZUW_Z0%eQ_} zEU#=5q|LOqZg%G(%rHp*hjl*VgWVR~ZC0wUxw~1ne2)<1IIIvYX^rq^ zGm2nkD|G6_gxus|5E7ewFO@De_p~z-I?;gZs@rx$A8MsuD7FM;W-`iDCP(sfM_W5q z#-@w#%bkpDkyho6UyI%tJn8Nh71eH_K}2zO^=q(z0?mfITU$b*%i5kQI|dW=+QT9o z+-W@p-&0}8tyxvKG=aqeLeB%0@=95*-oK0a|IPxd+n?hZW+Yr`NVPeR*GMasCu|*g zE#fUru!yYKs0eJCbzw=e30Rq(Ie_JR=PBh^SAy?bD_O?cgb0@=2Vm-z{xhx8>LW)Q>~b)%u|^9V@zt%r7$Y(8?Ku%1vz$uRREK9vpoQ4JPQYY1Cs|_CoVk?=SSq3j~iD z$LG*Aq8O)3jjN(Gr{R7A=IZgofdSnBb8lSVv_wvz(9HPiwzQj3nf9ch;Yz}idph~0 zJWL^soSHen$D$D*F@cRgk`@WS%)>xUxwaj4f-MSo~qFB_SjeKI3S@QB`?d zjV93O7b`u$x8Tc+cN!vsWBT#O!SVqG6JK0;qz%rPVOZpaB_iF@pYz`>&^`_ds!c&{ z(D1Nt1s~{8&drCkCrf(Yk(MOJ6naN-!!Lpw3?j#`J%>JTWBHrhqO>NEr~vP9dE1o> 
zOB==?p=82nMz7e4UOraQIvRbEC4Vol;*ZxoiHvufeUsXt=cWg>(vv{jt4YYHw@dKV z)}vH)F=g3e$am}nc0oQZc}XH=#8DaJ(FY4fs_rp9G)5!*rAw1bc>*$mc=3i4HoYfM z+>b=d;rv+vIJr^b^dd?w%0fL^Yn+#LHOoC#KY1eg%X&xoamLw|IKO+4-QeN;V4X>B!#wB1)LAa>7pVqh zFIRu$o#kOEZ<&J}U8x;j>h2tLTk6ZrHiA(*HL}qYdC?Ypq}22QQlt1QX!t$R&|lV0?eGC@YLhb2OOQF(YyqN{ZyCZb`Tx!74a&-ZXF zo_$2RVO;YykoEJ~7#DfJ;u-y9D~nMfvF){0uA?;1-^6btJ|K}b`q5-YOGd;9o01u= znh9NQ(a{pRYJZHM7Cm`kdH;4^WZ?$4L9>DAeWD+wWa{44hTZom*g!~fI^4}9z4Txt zxkt}JhuQ=G6-b*Rg8VyVT(;Jmd>E5|EWMx{0{2ZG>=<1qe4GxNe%~-m7-8o1l@%5V zUZ_+9_gu(ZDO|ga@b{(E2gL>2{9!u6DAy{pnY7@U+cmutee^D9oDVQT>+~5Aw^1iCquhDZ0YH18Gk^4h&Dbs}_P$a`P|r+Q{=?CJ1g z%-ykr(CedN2I!EU@qP~n&-rJH$gu|cTPiyrBU)u+L3KwKlIYr+=F;0n6W}TkmPpA2 z8@o1rt=J3}xSF0A*jjBHbW2+iU&_P)e;-x8&XA;e-xoA?(x!gDgZB7o4|qEZ(PIyh ztKfsDhU^W;`>WT1lqI=0e0qEnNI8};@10@AYI14(0J@J0a zC!gfCw33c&1JVkwn>4M=HmLdOdABFnP~S>eC}vbJ^LMk!e7lNh^clW#v(>5CDCKR8 zyxWmFp(ue6S@>qbLqRVld0;Jl)w82Vqruu1^&X7pOA2G5rawl#6o7x$Hz*DgG@N9F z5~}~$7&m%P-L?N!y9?bD>)&8vZ3Lrt0+SvBJdcEnX{kS3K-_#QxB@!lVBf>FJhH3v3ri zdpSry4(Di7r-W;LZt>rsmx0gl7_h}fvbttaN7L{Kot^3$C~LLm5asWq;bB0>2^aRB z;fY32LH@?cmw5q_YX2Ft;Nw$15*HeKh4R{)y+OGLhkF+EK}sUqZe}OyjhIa-vDNk+ zZ=mxeX_bbFc6U&a`>$thQhAT z87XPcan=5)9{-qr-KQBTMUP<2sB6WH)iWdC*$Xd&_7~j!FH}NtG6Bg!C*R&CQ>^5{n5ae-LnVNyo4)E>TaXb(G z*CZ$#mAnkx!I}jsz3!XSDPo~hUio?10$YtoGxL0+=P_VD#HD-fOe{=!6=0Z~>?VCe zI|BuU47b`v{f4vQ%G-x#P|s16U0!~^$?P&}h>xl9)6<>L#|%aAyv0EG!_l`Rn#5#i z7JPupBNn=R@e2D%2-AKEUEyBjgRM7LR52@zat4VQH>tl#bHo6<;a)j=!d)wCRvcQK zspAk4qSIl%Nrd1{71{w$$OORp|$;7%+sOVw6rsVV6jBUbHU&c&AMRE!UPpJ8My@Fwxg6k zK9C(X_NJ5YxEH%KrCp+r#a)4F;f~wtW#wkFvU;#zfQ-yRQzX|cKO(U60?z4$ElbGM zH^x%YJ!dd|@r#3rgAli>kj#(rrMO^uKcVPg_YkM6e2Kue2FjYdqH;x}L;WJ##eeTC z{1U^x@&PncY)E4%<~hUTu0n7@BFL$;R8^7G(ZhpHVD-iu#3F(t4@*{8SEtjidZwI% zQNVFybcMPExQIvaDcsnL)^*kbvBU!UPK%=tqt9N}+; zo(ph>s8PPmqQAxbx+`?YGQ_$hi~jvIenDVB(;frv-WY&_sOPK2&+TAg3@9p%K-Zd` zW}6nrP^beVaX9!KJhav+=qbxor+1g=(|7tUJ(KXXVv@%o3{-Cs1rZfM&95O{JC$gY)0O7BGc)+I{_1zx$ z?d95PFYa0@DeRf@6WUXQFCnnhke#m1Kt!_*v{$}yFwX1CKxm{3-Qy|-=JXW!>A>85 
z>F2M@n1y61%(w7!ALX?*5Ch{9{+(N8E)&?+)N?!)$bC7c8svvBZfeOs!KK{N*f&W@ z1R1VDkq`_jWbhT^gS`?xt@8%zOuzna^WWQB(y9OnliKLeX;LqLL%b6#?Sq?^VVBtA zXE{pbrOsEP>F)5=639zRO<_qF=pIa>Q}&1>(owAiE`}pE%LuBAw{T5p*e%>31uAJq zTTlN;G+fjWCBfbg9*Jc?5rgPawkz&>w1;lUu%u-uMt=66EPZ+47ZpV|WZR*9)O(Cy zlND_79lrtPs$kRF*WM_)-2a88?^w%PiF~twG;Aphrz3X7a6ho+j_7Vv z*b$HDfm%hf`}Pu;lNF)tv(kc0RhSW&Ao`?;!ho$lp!ZfVa@-TDL%c;JfYy6FoDbSC zHr}PS)2XO(-;3_tHZ4XzxUcN}8Q2YTB7#JeNy?iq<;&LDS}s_q-litLG?XfZSgBB| zV14v6)Iz<3fv{HLndM)@R?Sdu|+qHWrD zp4cy9Yc_huuPsC~h~~I4*QlUDni(~>gV*OHeq^`vBtx9C?LjjVu3zKe)~4C}EuGTH zZMW@)YIlckt0ubzA3m>iy&VS2`qN7E-o|=gg9vy5xcsrxp2w&1dN(qcae4g0_0)5C%N1Uu0>QuhZv9^{ z{!aHte&OS+-%#o0;`Rxp{O<+F41tjex|CwM5$dnto&6e{-%cW?$!t5<{QMC0%l*RK z+C|mJoxvk%;%lZUK1I{wFh%W*hTF#{Ff4ZDNKpPToma-^YrUHgld-6`fa)1i8_VRh z(jKI#91MOxH4R?30wutdG9!GKp0MTVJJKgA<+h`JHMKF*lDQL~aLb6T;pypCte(C5 z>*Pq@LU&l-qrP-mcHJ|yLQJDD)7@U(hr4bD1((N}^)KqE4U1kiu~!<*;yTE)q31^C zceJvTyy{5-HnSy^S+8KEM8jnP6j9h9Lkbvxe`Kw#KS>T!Azbcl5JwOXJ1+Hd+D~az zwUKJ~bA5bnM0?6mPge# zKS5VuFf^U+$I9JraP}{_T0D#d^W9=axwW}!$H$J5paVqSR9A5Gt#G#=D-X&T1IA*d zOP(I3yb}b9lhyg$ERGCZ{hp|7 z5VqAh$W3HW;|m@y)zU3rlj{NjAYc=9v5YtW%#E3CP&oPlM`3K%>AZ9g{#s*kqX^r~ zZrz0zp711ez^3o}{3ksScpk53DfL*p8^`NxRSo0~BXHr|-);KE&;hZ$*&qGNoXaTU zZ6sMgaB$$ka9;?JFt|kZ(a=^&@-o#VF^(m{abqLDHzl&9m0JStqTokg-*guH@PBMsi3cNPVlgKRPV$Mg%Xt#Mi4tXcXrj4SZ zT1boWTV|o!4qUPyKV~o?g7WqbwKn>NZ%9bGNweMb>+Pgy;I=bD-w8(IY*0vh0bb@D zJPvs@=ZD;#4>o-ct$)$~W1px2HB{|gTKdY#S(?^%A6(_Zp>82r4;iGz3l(}X_tJ>T z_aj|r-jGm94s2a&0927X6nOs<8c+1H1$>6kLt2XRb z>3ic2&2+^x6gsRm)3bU8woS-!CFWD7TeEj_Z75;MNvR;x%! 
z%))42UsST%awxrfF#l2jqhNPut@IkjL2Ij=VI(60KV-M6A~wQD)!&0j3H+**pA3nT zahc^@uNi%swZ(Ne$K$Yk>^by(!^+Kp-ROhEUe9RoG+XiO3nf^o;@4;CPYbqo;QJ*T6xeQK<7bNql42*|c6ihDvFBj5b2EJB9E z#-;qF3%j|J#v~o5i*K+wg3anxVRHVF z@$jqggUxrc|Bt=Dj;gX-`#@ntkWxYR5}*j9TL(F z($d}C=f-&Ue&6TW-*^5v=Z|yF9)q!lLl*Zv=T*O}=DdhMW9(G2m>EkIJ-!S>O<-Tj zO-i#B`5Ikl{>@9t-jOB~#g*k$HBd6jDYh}&8NEDJ$?~`s`5t|3QE!*uQ4E znsJxa5N?(w$2fRjyBJv&{-ThqtFuyXR=BpHGHDJzl2ZiwOy7}4ryf-xeZw|@4>-kn9DXNFzKUGZ=zuqa!F zX3sgr12g9p2C?czGM7w3#12`M!j^7RAAiXwr+t;`*y-@>SjH4%H;!cJls1&8Blue8 z+kDd^V^6 zcdGZo$EyL)vFVM1=XBbi`q^_SQxyP+a={Z2JMs(kpu}PWQh{sN`}(}6!?U%b=pqED z$=L$|#?`6*QX${jT3`*;)vla7PEMjOCW41K-@10+FVSSCuJ(vlPq-jFd=|2ycJwOH zHYhiN%GNTGs)L@Icx3L-=`~-N&D4q8hdo*kP(nlIbGg7MPY@@6fCt63E%jji)Iy7s z`b|DK!ISDXiKB5Z)D=HU=GF0d33=vH&<~@{V}kqHMoc8Dw^uf_pB6Z{mw&)sena0} zy_AvcC_>prD$$rV=V2?6^|@}RMt1>=sg+dp!C;Q*v4**%IfHs3R~6P z)tOlF$|ATt+5(Qpv)D*bs44?DJM@PvaNPV=t@^ird|m{aiis?>IrUXBAhcBRbHW%A zMmIzlj{eI*A+xVER6T4+^u)kr#C!&NgF+M+No5pkMlKd=QvX+MNcatw}|Ylr_el61l6V>bcXWfp@<< zvIK|uFYkx*w7fHNYamTITmutLgz=S#q7%8``3LYmTb*HH)v{YHTd+;k0gqU69b^<* zwO-WQo;~@{H{Rtgc0s?6$84zGkU7owW_rnwCev^;0Be;5huE6IbV--NJI@Mb2sxK> zMVl4dW;Y2L3*w^XTw7IP(`sodqKefKstq=MwEj`lV6}vaCNtz&jAt67L9mrkw8b<9 zeO=+B&CNc!E!+U~<6|YI>ldfi)jPY)s%1I5{}6NnG0SJPyLR<`O5g*lkqDhgrgttp ztAXpshxnGkJB#x3Geg#Dtvwe7eK; zfA3Gb`=6Rj0QazWgMp8I)%}52qodY9>zS+hXkly#U1o3)O=4ef6czi;>lK?Unkgr{ z-J`D<`D)VGkmQyR7=6-njSh;h5Y?)CK!3OQcmHE2^RpEU@2-iyaN$cD&Gk;T!uj|d zPpX>Ehy7~DZ;ZCs7n>xBQI~-%mhjk*R^*(=_gG9Mun#!=B&t<=7!NLDW?Acp* z^{)I|H2x=;187!607!TqmZL!}E(|7}ZR*KK#!h&BhXzfbbl(PA3Z4u_ngIwI@X^Kn?ZlF(9efftRn<{OpEg z2U;RqE&YYW$S;tkWI1$=cFKpSj=OkNbBWOK=z+H2SfE7^NC*IBR1@}6;{QOIk~_j% z-hJ#(tq&;Vlo;tm$a;?DEwrDd$}?<)TNUVTJE6hLN{=Nr1N!QW z-7Ktb_eu|kSWo-Kfre%(W#f)ZUDbc2^cph2OfhQ4RiM7H+*ButCl+3E5|C_sIDW*? z66N7ulSRy_rWW}rDqAtI{p7Z9*pp{}=!(VBLssMAKXof3V>*rg?6i(X0c3mb%Rdyb z?%2%#2%tK_z@#a5v!tuUB*s6}6LSq8SX~E4rOT6M$visTe@KtsSw$~~y9#Bi203(H z*vSr5uiCWVWJv?EUbbwkhpTs@a4kS%%Dmro2F_Mr{-`%k3k{waN9chdJ? 
zM*d&UNT6l4?qF|ueSMQ}D$wvho(=jjN?$bFV3iT4Ac&#!yT(oi7mrcw3-0f!RlbkG7lAU5g#WpNT$I03)p2t;| zfX&S6KO7}@7^^N{uL7Fg0L zpklO;+_N~S-+#~?Ou#Ay>9)-w{0BMb|15d;g^aG0pwWLuSU})^J^}u}Pxvn#ep%lC z*U%y3>kV#CA}8|72`Wwe>;09j|cCK(=6$KJs<}hMYTKVq{>Qin9TVa zy3T)x8%1tj+`Dzm%eT)oZ%`ZlepbQgIVlvtwKD}+0ir?CK?k39io=rv7{l933cw3c zfxq0Xw0sOx7BQgs--^o5{8PmzF!b#`fFl>OT9*K_^G^Y`!~C#>?SFzJunnLThYabO zYccV}fx#T~k?x#~Q2;V?b6|f1aU;sRMX_88!1^$N4uTxhIbbVB2>^DQYmfcSe~A1^ zfO8G#b|LU(*)2HW%h|xuRYle)oWHp@y}+PQW z-$3gH+|W%1rSU||yH+VGnG7VlgJ~7ufKhH{3EAIv823&W!0RofVlF;p2qde(9k8#T z&4DdeIRE{miadbb2%u$4z4As>2!y-%{vE4L0B>qk8S?M9>i#sqEWrAJ6vH(sWEH8o zYh$F`5@1HzU0(|uqv_wQ_%G9YMGresZSFFma)1NAtV|EMI`LgWf7{{PpTsLAY2COY zg{mdhoB-#1y($0?adO4L{BJC$x#LS@qu=NaEmUp075z@IbAd=eMM>@B|AvYI={w>X zpZGwNNPt*DX}keo#sC|z#i=^j-%b-#15fdeI;bp~)IkL0C4IjK$gX@jz;P4Wzqgje z1M5Qo^?G3D9s1}R*l$d-1I(BNgvK}_ru=U#&$#2u!7XqFzAjGNZ2LT)FF(BGE0*U0 z*>=W5ZSdav|Knj=Hp4s(sD;gL!2 z0Qp}ZzHxBPsLer!T0>#Yl8qkj89H+f$xru55is{En^HxBGh99pG@Ly~05v$tusr@7 zBDcuj;dfi9#rmplhU4yRGUv}fO?h|2sC3=GHKFs3bzFNhn32C|A$VMv9P4j7`VjFB z(ZQS19_SEY76jv?XKY73K}gTvy=z1uU^NjgS}0T%HT~P)!FfGI!-QeN-c%iO)PK}% z%fVm9;%{PvM*w*^E0tmYZJn6k>hAU_x#IeC^%3?&x{%F0L!e{$Uh+>)1Ana`P zO!k*0cw>Ce8^;@`pNj*q1S4SNdkA3Mf#$AunFI=2+Q~(#Lymtr?R;p{Zo?m#+|K0r zJP{{De;O11iYKJd=g@uFA~wuPwATldzv=Zo5Hv?UBlih4v}rNRaLamgEdNtg6^HTn zBE5Fn9}Isz&`TPHPNzW-m=XcP<|9bSajAOxTYKy9em!&?>t7EYgYM%ec0B*H8(uto z@R_-@=^i%%h}ScWh2-u}NiR|FOoP+dB?<(bwjzK``lA@GIjG|Y82yi3*sBKqqhE)# zSL`mT_;v!=4Fpi~H4yqsWXcY4ACd>~`UhM%ez%awnaWd-Cr^AvSP{WOmj^uf%_w{K zsoy^wN&d8kc@2~ZAhJN(&u7ktb*)6Q60~GCE8$npw&D%o`8efP zs(`ICkU%K0+{2?NHwDggeY5l@G~vuvAMPWRl)^PjI(S^VIsPF~)7?^SnRk9!PSV7m z5fFGY40yvniUiX}9(jrLY3DmvWs8qYBIHX8f>>Ur^LAEe&g24wxFN~gPXjKH<_rNW}3l4fQS)n^OAqvG*<53~DlA}_+|&bKS)3{H1u6pQr{ z0|Ns&RaKPaD1nzUd3EIf!3};kfZv_(8IuY_?)V>`@q1AN2p}~oX!XJZa&@Dgc8=oY zrS39tMK#mj2LUF-J}@vDMNa&dUbO;NwmTrZQ=jEbsyk8PUdv+U=~4jKT5*nU=`2ewMO2K7Ni8eRJ^*z`)5kDq z)pqtJ)%HzVTGk%pK4mfbEG!}-VO2C#Vt}?@C>UTh0n^Q}TOTUXy-_tZhpaMa85}I- 
z7)J*N1cW=zdPo7&8^)SeQ)J`mY*+i3SL?MmXks)KY4@HYt!ls1U7k)*B>#iD;ZXqO zl^jeh*ZvgaH%Py9?}D>g5~y4pWYcA6c4`i#dQ~=eWy&Nh_0`5py1CV@mjYwo!ny1> z#I=s)A}+>rqw?PD)G><4MX3N+pr0=-hDpbU<9J7zPgO{K6=jd#bsrSL>r|5CeNG{% zNf#VL_Ynn~GJcanDq30Md^|$RkIrRtshTo94XrqDqr1g_ktQWpML5*f7q zEN2`#h&CR>`1HBVrZXNaQCd8lQYOALjPK^nWO`OT0jEu@u$Y+t(cGx%nE382V|V!J zEDn!I^lXV>5tG9f$9hIX%5rEBkbrKCocYEM>!1-lFcClf;Vnq>3`P;;{da~7JMhrA|0+K}sx<)b>qXJ5{E1Yt#x*89O)Iau>8zN#YJ6Q@on4 z`xw)f4O8h7O2TqN6#zg;D0|77bwaCa1l(Ug#>U2Ws<7XiaReD`l&zOI?LX#p*osxo zR~ONiKVIMck`kko@lI&gwS(5;q-G|?lUE9tUY*W#ydWwlidN$K!hX4rdndfapb(j6 za}=dHNOcH|^v4o^BLE~$ZIS1I{%3Ber0|zJTA!lZrdE_H7PD@!RtlT%~47^If3ym~;bT z=1nmsV6|y|S-s!B)V8Li?!1y#iUHxV>-#Kc+iIKTrNzL-`}0BQaOxE%ue%&J zj>^DO>v2u)T$tI=oFGoi>Fh5d#5_!U6WN2Wmqt}D5&QCe0Pi?9Tz_+HZs-x@NLiPC z#!W+)pzN825uF_Ge$_~}!b~nh5eE7fcsw~GWf)Z8ZU4n|qHzo)f3gdh6QhL3yjoZ{ z)Dhxc%sbLN*b`T})Plws8P8g%I2=k}#v5Un>?}E1=?W#b7#A3Wo%34pobk)eoV0Dl zy^UwvykMW}i4h}l6_ia@7RvNyBbsf^rs=nPX1m-)k>GjGxtF7@vpS1QuTp!9tXT?3 z8SnMd;^E^0Oc>PCi|3Dezk5dnC^WZZq`*%=BX#@jjVB?1`he+TF`|+UU|>|R5Bu@8 z<^Egs!ZHgQb-H$z$2 zGH?+ebw&II?nViyol^Kq7U!xKQgRikm77GJ^FkJfpB1NeuQ#Qu7V?@r5l^*G&~#9H zYCM(s0aaXGe74TH*PiEK@sk`?CrqvM{<7Wm+g3?4iS@DaU=*czR!gk28VUWI;jrPiANSkQ0v(-P<$6Vri83`@I!&XV zqvIdCB4MG%1(W%6#>&c(c1CluCZoADjTRGRyY=kCC%b&Q-U$<>ZxP^MY5C(lYYxKe z0^k2jJ-F`FV^!-w=9hY8{ZbE}pXy;_o&h$Q%nx~_rdBi`8#e^XkI|@;_tmslOJ7oM zJzo3#d67IUE+^;6&vEW4U1eTQBhzD8 znT2gioW8XGlft@bvB?T^^W-m$!)P6dDO!~GFKs7&xGUg;3Zwe8|;$N}nkz_Dd z3nQZK`qIgJR0JuNQr8q-xOA0(3aT_464^VzlpN5( zVRM-zrcMOem9I*CM?8RWVHOCs*9u!Pw(31T0A>XOQ(8rqYH>QB2WvlDsedjcQ{0zy zE0^qa^=cD0j>S0$0Po&JBqUICnx|rzrh27C()3Jvbv4y77mHVr78r2juwrum1cF|4 z)ci#eh&r@7D0iG7)j;{>QO=`45)nZesAA&D--f?pa5I`+HXxLfx_4!N`r(S2cdU|| zJNVN1Fp+qY5F9}bVoM2MG!B8Dug<-o374aHDr`w6GZ_hvQSwUK!~!)p`g&d!i;Xhw zr(jS<%z55w5wJFNY7PgtY#J9QJ^*nc=c<>#C**M%2~mz?v$y%id{jfHlBZ=bhEx0u z2E}o^y~cQp(7kFR98w@=Ng){-UTympu_Y9+ZQGOfk>_Z*;tV*MFs)k3my>O|I${ORFEA zjfzK;$W)WC!<71iqjBjqy1SzCUT|@#kL6#^mQl!mET$bCkI%*G 
z%fwF;X-*UNLrfditf=bkjbv;02{lQfoJax6UVqXLZ^-X?N0}X!oag@Z($o}xWfDSV ziWcagf%4U21U`~x^^C{d?h@P(EM~;-%WdT8(yw?-C^BjgSk(=_o;Xb$%X!CJ&@R`o z>Qn4kMQElA#$``)f-xX{#{?h}4-SaSsdJlK=ls|A-+I}p{HbZ?)O!M=fhZ(LIVZIz zo^uh_9E2`Mt0GIhub*FBqTq*8dA%yDwEPu=kDBp#3rn+}v*C0WQetwvVK)xt+-_IG z0|qnE5OIOY8qH?S3hGYO`HZc1c~;KMnk7HvnRI&Hw|-o=BDu#5%wvHaIde`Jw(fng z8J1yxj{pkB?PEuqL7KBY+tYJC$T93mJ!~FVA}LgT9;7Xlp?(~1v-d-Qx$Eu9YDxwJ zB?iy*>*Xd6? z{>=g~8bCaA?CgzW*c_C=NGbWjH&BIKpMjH=R@9$LDzv$)@%F~F4Mo~_HsAv$ErRBq?ynIx{iRxQAWRkcWF{1yE~13qPENnteO`=8N|%Dn_j2NzBE-n!(JdE@ZZ2!_QDnC7ox(WH1VDI6?!F(ZS>V>#`>{q|~gF^)v!!vnebc2L`@5`^03p13s)< z?v9o$))(n5z8NeU{NQz>(cJaSXEp*T{r5*YR}3=`9;IJHX7c@%bj`LYH4R5zF|>QwS@jkQ(?e)~CzhQGp=8q#J-Cq$5gP8~V~_u5HQ zHg~=GKqLuxMWu>dzWu6}%<@v_r_A-cP^*TPKSv`g^aUkFkhbGeFyl(LaZkkRE7z@J zv2@dzKI2g6AP!VXW_^)XRh!0qvW&?zkPcckaOJJ7GrsBR_67b)JxAS5+jW#mF47o_ z@x+35_3@a-(v(5B$~Wf85R1W61OSZgz1PkXuFP?(wkgqXb7XRB{l>3lhz7Asa-okh z{#prRr zuj45b&u;m)XV%TM=;1TA7R_-!z+V8S!$q&0mc2CXS=%f(9goOW$&FRulmae>u_`PC zfYfC>ddz%>c^BGvYoS#uE_3W&X*NOjx+7F|S3eG=nAgcOBu8#;cT}`!A&RB{mxBp- znOEeWZ~1(+Rrk;K7oVCOWh;;`JzynBeDvGnX_5N{1KJWr3v)m+yT7BC@yJ%>1%~y$ zOKZ@mL`)85n$)-jE!=klElJx#A$T5@NbPR&d@RsKCNR9R)BTjsD6>^E?{WR!#o8db za9C(@g?7CTq_K&_FzLYw;EKx6x1(^{EE$Oxr3!sMF?kdz5{!$~rH;gDyApYI&d6e# zl#U0>zKxB@O5S@Fi8XJNV`B&K-vr0#BRYjX?T zDSVAj=?^b=8~qVL7sgVQ-iH) z9FsxrVmlrPB7JRnN)fud_~17V?1}!QT|#nZGEhwK2XZg_FVu?HeP1BvZ(28atc>uzucFk zap!;cgRdi5jKUx1uJzlU_i5GS*o#GE)eVi?QKx1NZyarT8u+VA^E$N#-DonJ&0V)8 zGkM;?42L`krf8yX51V>zM;V zcugX@M-6q#*A1d|8Goh=_pR^6LT|Gb*j>=jAxG83Jj~{kc4~D_f#(TsE-s8bk24%o zVtUI3{ZKpON9Wd4z0Ajyf7Er!8>VI<-dnyvQLxS@o_RW#8^Er?yVAqJ}B?NC0%+$#$RRbcF!j=h(3c*D&yw+U^Q`Ei9)`;?T@B= zIr3lX;E%RVr-4{)kMrf+P<>;Z{Vjq& zspVfiAbUytKBfiirubaOXLjYHX*ai&X1@UzGx@jsBJGXy3axdQCtMfTyYj~|fB4!o z=L^SzCF2U%=6s2q@~}LS;wCIny{44~r1&VLN}T>p3cH+B1*hhgQAZr&k$9Bj9FczJ zeb==wNa==QcrtoK;Cl8q*$;`YLOI9D<&)hO_L)_mP^MtQd^>|bKm7~f@llU<`I=zG zMwP}v5%zUL9i8jA~ZAJVW?;QXaOW}|dp+#F4NV5!;P1L7~9jwu_Rip#4zmQCK 
z#n0HT0Eb3VI8*vYSu!e!b#?yAN0%aT4&7YnPX@WKu<89HzOE!k*5a9f0xhoY*I_*R zwP3{Gp-7YKFX(b1`b|F_=uzk+>W{S zYP90p>a5Ebn}y7#g=fn^JMY=D%%JQa$MeS(atI-(`o+m+%Ffrq+8y=tJ)c+W(P@8t z{6wAKs|SGhfBr9zEvs z^VF8o-g}zpn0w;yblb>{h<*M%@l074$QIbx`)m%GU%$I|y*%q0pQRD{C!t~G@jHXz z1?0!6Mr|IaC9jmLu`Y@!dEq#8i;W4#u50t*9xWG?15B(+VKT zt{7z$v}f?h$jzw0V75P-B<1mP*yirKI!Eh+(lIe9j@=ZArlzI!LEmHi?q?ze{}zeB z|MOXSNY|5KAS=Hdl^Y}aUP;Lq4SrPy$Q0KN9_s#iaT7bdQkHDYt!u;K)2XR4W9+9# z?VE;IiGRJ;?e}Xx067A3=%7+e6g}4SQ}!NEloU`iIYA0Q@+$}G3D8Dj0{=e*^z~~B z#HLEWi#GsM$uhMek4EjKuW{C~DiN{4{Up#B1bXXK`=!ftCoyT)q>34~T83ocp1?|iqOAG_7>5zzxgz^rd&6-ifRSmoLO^8&aZ|eZVM?l=r(rjaO zmv^`nLZB!a3v|zb>hYV!n10ot1)w{FO^B*#Yc4cVZmRlCD@7)SRwWODQr(X7Td;f! z`5iDIHzaUyx3`SLLFIogkJ(B>a#9u8wU|EMB5()A~4oD@1V1jVB5WBPV z+X}MROr49_`A6^+=%|O%1E0{4)%!>LMknv}#=T6aO)&4Aov5!p?>~G|>)PEjJvUSD z*mvE}Nwt+u9j7)C8RzV~u%MsgQ9t?Q;c_5V^5+f`&%}-6n&6EjXH~zNy3;U;GRMIe zz<@P|K{EZv%hB)&H< zfq~@jTFr}SQ@9AZk>A1n{y9r(vh8ma->o|=$e(dApBe&U1k=k9c2YR_Uq541P5oYH z&f$#xtLNu8z9`r*&!SX+*OQ&B>1nW}j7;iCq|JToFc!aTh4KX3cs&5q?J{Tcc2^KYjCr>SZ{~8VfCI{Do3Wi*r z(EAlBzS7HZOu&~6Vr2AivTgCuNf7Js%L*)26mx1%9E_}2>8kx2iL4q`ZF)#k`Pob- zt>`fcwkT_^Ki2rYFdRd>Vp#6ucSVXS{ov4Ne-;3TKr5#EU4-1IH=fmI)emh&0YDmB zRjd>Edzf%Pe~^*yA%if16}0*MT!GTCQ7*2A_SN2s?=?-GY{)`@b&9&g{78-ji-<^c zVFHf|F04NbOhpXx4@*vOJ$oscgDY#CijE8t&2`clMd63$Y|OzC^8JfT${|gjx7>co zac6|b+f)9$SGtmI4((%qG3g!^oE~CJ&|HHrCWMv}_M+XdGm$ciQ$WmCE+P*M(zM*X zP!J>xBuyJe%~JbrpB=!q#ZZPuo(YDOgr%e}ndmY3;`>H*CE{#hcOj6%d=DntioO1b z$h&vv2uw^&*{gB&>uq?+X;KVN^*rupWBnlzKj`Q7%RGSj5JJ$L*dqNmhAWp=SXy-X zE*z{mI5a{ZL_~cR1O_F#z}kWdCpbf|o_FEfs)6*Dp?wm$G+-4cS7gxJWULV!)Vr6r zsEyb@+-Pezu%z7%9}@3qECXs_kgMs&hW{K7|{wjX&Jftf|bZF7Gu*tx=^RCi8mx5LxbhfJfZBp$Ic zhQ~YRGDYb1E+-R#M_a9dHlN=-$ZJ{nbTr4wS!QhcF+bzG(c@WiEM?UfzLolVdIqyK zZu6sp$~@ZCOsV|P;)OWD6nF_hDM{BgYRz#WXE%E7Jgt*rL9=(#@s8&n+hduax99vNoS$GOE4_havoDVpOV)Es#y^TD?ivrNWQyzH?dh4X&4t!Q0{%p)2n+Uh( zwu)gjqUQ&0vvckz%WgtYtfNS>2YPLXswgdzIUT4}Wq34DYD(0WpjFF971=?6XehUp z?##Rb=^;L-bgV1FL4L1OPpSNA>YG&E9HO@hJ)>ipH2G$nIDQ0 
zIGnvqs%vH193%;tB05?Ar*W8&qhx9W9bhxvPs@GYf#C61tI1x^+Dyy!ybg@#Cyhiw z*iip6=?BP`$mOooqpRmIgqWfT&_Tgfbw)U9aCZY4CZr4cg9!O0pf6{{Rb+5JQ#eoQ zknUk38^Cvc^DRm}r2}08`_=BtCPL7}YU1d}d3qSV=%G(6HG%;DMVeH89Cehz7Y7+M z>8WG#Lnq2F>M@qw<}Q(6q z%oisw5oPoE{)5WnJ`@)Oc+$@)@0ggL!`2M)w?N1s7tS;XOe+z%MX>6d9*xk;KxaAy zY-C@if8UFHju)RV&UW_Tomi&;7I>d|kJJSHfg@%TogH9+npl2vNS{D43y0>8M@nEQ zGW;Y8AZJ9c<~>qVUa7-*$&+GcoBEmHwV(ozY9wdf^DjyzVkH2!u*VG0+v5STaBuSu z^wdv&1>SJTv7$ePN?K^6jO zFd=1WqW1yfc~HXXwe>goWOL$A3)qzmr~pzv{ELOg;z^M7yPeDK>pn*|qIzeY?9l3a z8R*e&bPqw$i~n;0POjD+VwK&7jx>?91UZZt0btwI52)DLQzuMTBp>boS=<^_cv4-0 zdj@X=p|2k03$ESbCPmPH@QZ87Ib_YQxa~}!LgrI|= z1djl#coqjRhwnZ{>WZ!v(qN~cDxgf#u#Qe@XjrI9I;-owO$|Gz(%5 z$O2-y9f0zm6zso3`G2QS!W&qnfF+*b;^NDv8l%b&dvWTP0YWby`jh#QLa#UA4RjEL zi&{?ox3DSf$ZlU`P`p=?ezJp`>G51XZVm;(Bysih{v%zVsuToF-uMSzq-6hQ0{HO$ z(F7p2{NS}q5BGp3N~mJwZaXSEBT|$Z^ICLi|n4upQyY< z22~?c8BckUP1;t)q6W(<3Yu}0XFrp$S0tk*ya<|QL^T-~CB<9>nO5CEF!A{G(*S1-$79R^iGOJ$SL0lX#V<4=E=9Z-{+Cec5Hwf;#6H6eRW4!@QR|U8qAcKuEcU zcB^=Znm!_aEHoX8MNCJC9#$Z)SKWqu_Bzq9Sm?yn(TZ=M+lB=hUf|!>KIzHv%aUw_ zn~-vH#)T4cK344T^+Dl`#X$u+_`6~(v;mKnx62N{4e)l@cG*P6A*q{`XmwthOiv&J15`eGiaK;6{X>^q6xb`hn5&>g%S~=^7 zaS~Umms@#CNp88~Jr#IDUl2tNSS4Yu0-RYZl{?RdCG{Dam921{P?1@rfA z2+PN;?4dGp2Ro@op{!LwoW3gkLL>G$yp#%sJM`eyYwZKJi~9mD3Dmi0k{mB!bDtl zpCcpR#P}yAx%-Ba@X6Y5jA6O8Dho%VV(BuDM}bro3bhRtm6etKPTJa6ewqBkSgt&Y z301Er5RO(=*_*pCNSfO>M)Cw@)mzfnN&6`Xtccp`Bi&aUfse;fuXf-Iw`dkdJ^usB zP2;_hA&P9-VZ{mJ{=S3n#xqx>cfYNpL#aDDOTo+xRyrJDARyheWvz{wHZW#Sbowbp zb#?H{9`Rr@gG(Dk6EewKvQwqE-wuD_DEA5{dhBy zDp`4-2gf;jU*A7@`t*W$0|}il-Sg($pCF4pv;6}>P9c`|eR2Md@|J|J3V#=sVrBq? 
z5do*E_=iNXcuWsV)wpH%7fU?CRK&bc`A|&gbd<^d$D?bh?P>E=gPD)mA}j~DV@e)g zLNItCQ}AJxhx)^$VNO4WC-740CAS~GcQZ6bMAK_9xQ**zK& z5h0kwogx4<2WFVIYi>L0`G`bm8y-isyJyJ~t#3XZ?mossyAt~4;~zkP$juk$M6#1t zThLX+=Hhyy&5pJ$W=5bCNzzT4V>FvsmPU|t0tlkfgA+kSLut$9p{LzX?4NK|=NE_H zLbhgTE1&tX(!M@UL63 zhc+dXwyI^_{sWh@FGyyW2hSP|KM?S|M@2&uL65g`IqYRO8gRZ(!^fARUVh@%*tpx^ zYBZf+`F%7HqXGM+ZQ`n{rC{j;fc^5AknahW&`SK8I>)*>IdGT~-}RR51yI{J+8i_2 zPgR|$;$vlv5)EqzDrnqMmz0(+JKpZmb~l=9a0h~8SzFJ`whHioWyAT3M$PK%r2#vo zjEmoUEhn!jhU=?C#-owp{(+qtgYna)r580SnmU$sU#`A?H#dM9)7~Q*N44b2{uu(b zCF};vO=aJC)hRFa4FSgu4altMH-FOq$PlMzwYn5FfHY?~02A@2R`pezpJ*;X!@e5ELAo)@fPyw$fErwqQ?r{l#0u7+y}GW9M{j z%j4Q?swerHe8%B2$>II_k7l1W`u!i$j%uZEAJv~+yqV}(j?t1_P3CTT{iz-4T>;z5 z;$^=lrINICNS=(e4lhtyBe3nYFVXf0s5V(n0s1h@Q7adBG$BKGcw*-x%0!($9j%7 zewdAndBP0ckL_`}6%5a2qhAQ)6i~@*Dakn1?lI*>Wk|&g-p2q7yQNsGeox!Uz|;B6 zWp6UY#;mPMHi1o5)*9M%>vVAn`*(PVRHX`lZu+ z^&#j(OIKcDgEZcw+JU06o9(i7ok1at>y#O%nJUEKl`x)*0CNR{_U{!*`Ckxk!1rTuX);?6XMnqkaQPUivw;lqFj(j&71el^;X9U@q2UT7T9oNVtz_4N%#a4 zjkw2~k-}HiseafGcX9N;wTw;3%vxpOCji6nXvDp%S<$k_-?_#^3uIn2e=7WIE z*)r33e!1yzhAF%HC}&gcq$7fQ4@q(Pa&4}I$1XSLhNB&l_1aGJZAYh*K2l-xZ_{CU+hgQ%^zk-Nxiq=YAcHnvpPS|`@}9*m+yZ%`fapR9duec#PtdHFd@*m5 z)<~bwSO~D@qA_AW_Z@lSjp~|YrH6qhrss!<&macP#Q?y3-81n59-SmkU1muD z^Y}mqhG_v)WHX^nLGE{9cR@WDgj%HxHfnQhlwpQeLZvrOE zO-(l&_G?U@KthSQMZb!Lr1vINelstuTCj;f>rtw>4r!4Dny+&V`-v;;+u6Uier;*? 
z>0@SSam7ZCebvYO<=JyBztBqwYGjZxncc)Y^cx(AQ|dtYb0PDES8~aE81uw4wMsw^ z)%Yvlvj9KT+0d`W4c95`>xs5~(R2NP-5Pmim0kMGqAHZjeoH)o%`&rc_96`Y_Js0k zd6gu`c$o4a7R20+zD`oQ;xlZWzn9tlD~hce-@ zh^G%Ksr3B5nrMwwzJHjnQW_!ntr;d5PXpFpDFh2noh~a#+i{djgk{}mDXneEb}G|1 zEEKDr=}l#7aUKm1Nndy<;4b5E1Rj$0ed6=OLDw3n?Vn%*OKne@Cnt3u-W7q8Nox z05MYQ9M*HUC!CGx-Bvn@cz!~j6z%V)D3TqiU@zmzo=?9C)})uzoo7qJ=^KrpeLOXe z^Ss3*5KM=KlOpv4HYv%tNT`m7(dau?phdOGaeMXrXylVzB-;76SLY-yYkD;Rvg82G z=MLMPwFS6n#b=Y77N3D$oG$arqjedj%sRsu&u(g1ndMya+nZg}T9x_vB#NCT7$6{Yy~Z;(e*6D}J++S^Eh2=Ghy5^tag2N+iNKFhsKF6b zf<--=Rof zQV9yQHK1tGfC;LK6k0KEblmn0&d;M_be?>HFhuNrxNQ&f3=c&eOgnsb5d#$SNA+h2!ZxGrx zai4#YEv>NOJ>coSz`HIsYIqVd;({Lh5}-M3U(4p{ zZFf&kI-a&u;Ej5<;!3fElvLWVUsi9Gb*22JL&iDEtm|&$(z@o`H(Oa9DNc%Gu5HUP zo$`i^jZ6|fqB>$=jjJ#bv{q;LaKHm+sB~6#x;I*=qtl>z+i+=t>wf#y?A-cu>>#h( zS(}NsLN##JeI3%I%))&F{uvBljW08Bg4qS2=IosqZ8Y5!f3UaxpAji9{>-j|xh zK%!({oC5z{K}51Le1F6#s!1?u9h76DzEqJOf=st?GyYlB_y%|RdcjO1sheZGDJ@Jk zT&AdM|9E_OyA{K7(!oJ1>E%4i#n~sh0FeZSf&32lG^H)C0b6Ftr2Y0hyuE` zn0(RRyxKhtEjP&+JTg|wQAm5Q?HMlzGJp=B!H5j0jB)F)?;kCF5APT>na(A! 
zVM~T=e%Dx``gq^3TL>dGMz3Rpi?~q+?@QeZ+83Any_bg1bVI%&L4iidk0i@%)p51* zx*8Q`8TVGBU9R_!&-`}{qr0P2#vt$UK`wmlZx!A)xA+ZZa@egE5tg2A4f?!)RLdXo zK}oq>e{09%Va-{XB9_Nu`-h{x+S`of?ii)^hKIRvkWcX}#)4)V5Ak(L(BnXc7-?a= zH_*^>)qHjO>PJ}50~xE|aPaIe4VSo!6^hLoW)`tJfp7hb3Qfif==&$CQ$5ffp9Z8Z zs?YaY|H=&RuUsww!yAE2UK>@&_oqP4#2d(9Lif_4=~8d;rwe?erWkYN$onMHx)20A zc_G4iY(!a%9-REo99JT-xe51+1BSC+Kh~PA%shINBb{BGSf^iOI)posTRL^4 zCnLV_6i9TMReRO0-msCI=Qm#rwWu)Pq&%O@FvZ|;iPZ8z`~Qfuo7>b>g3>l=r^p9RIr9 zF;SXr&>j6hb&se8)is9@OR#I~S|pMpqI(sk*e8ZFn)-RuI36*reVfkl*~^9gf*|&m zpvf5f8!#4Eh@s^!76t*c-@pPL?8RIPR5ynntU6>WCLiL&zrG6`@;V)=yiK4#XXa@uY>23^08l*u=x`tLdbSP;lX^`$lkPx3W`2GF&cOQE{$KJo^ z)$`*2S`Y8>UhBTrTIY41pEV+!r@95W zzTaIMP3)I4ay+c1_BFq#dL)U!5sfqb;wy#*)S`Hewo}Et>W_0nNIZXqdXMfQ}akq@;%)xDiB)(Z>N}&ZI}d{UfhV zoVz{AS>SR%svWcBhxI)4%08iJ2%mpL#u*c|-`c0T`KEiE;W{dauw}{H64mvZ;p*~0 zmO^mMXi?@-qx>SK(67Ux96}MZ_ED#!QKn1|JOh3O(Vp~+k=sJg-->aP{VtcJ!3n)X z6>5To*_$ko)hA70D&usrFM&ua_tos`cMu`G$6u(evDN$&<;)?Ib%*wf zlg(Bjt)KdKnx~+aSJbW-2h~P;8#9N02}~+W!Q|fLd{U68a+znP1xRz#q9|8ntgakw zO)7*?HHh^GPDRC8zOQ$!P-Z0cyPo)!KNteF{N9msGjO| z(hSQliwEI@nX;xQ50mm|rU%Hd#az`aqDZVgp!|V*)*1rQL~@#5?wXj#uW!WvQt%*i z=SuzJp61Ua6+pf5mWD|tpmrlL!j(<);59E<%xI~BS%OTcb0N~8dB(ibR=+xG=4JZ^ zYiMz@+>~FFg)^7t(0iR^GUZWRoxh{X;jt-*3 zGr^GimB*B^nn$cNyr0OveXO+c=q6g(sl{I@ntLR*i3=uyDJAhyM3VRxPEyN2GS@0F z1xTw&h()anJfjoUZdX)`lk-29GVv2q@#v(DT7Vu^X~cMq83c-T)|RP!HkKY>km$2_ z_OA~A?3r4J9~X27PXK4fVac}NtiL~e4Os1J zCj4+6!g)t`at87;SO25+pmIyq{4gaLkJ9-&4l2p1_p9d}Zl(L34it!W8IKJih+pr^ z_~?{0U&vb>?yi`uoU;*|@3V=e6?e38UGVhtF5Vf@jig~}@hu-{$>H8h568b^&ZK&g zlg$_k`#syGj?-}ZhSENo0gv*%=)nYq_$wI)6`vzdz3xFKf5oh5nc(uhE=0H}UV1x- zGnKt?Y>UgXc~cC|gqE=VJ?Es;rIjjz=jU|1VvXgzSC+c8eov(;D`XwjuG}9qPl({% z=9s0#)iFyHTIg{*;&&p8&M-8Ws`<+eAZl*1Mg*`6wifxu)WD3xV(P!99P<-5f zOw^=S!^+n6lm?Pvdn<%R%cIG`L-ELGP908lk*lF{#2ZYM#!+I;V$_F$gTq~qM>zasL8f;MCn{!>Now6mOiIj=Lh`~ydFUtN%fmzotfqJ8>|wW zraM)`!^2L2%Ph#)NlCV65NUiE@a0ME&&5qBjRjq8ft zntxaAmgivLsL{7UvN7BeSsIzlgN+ys`6Z`{1wlGHH#4;H4W*;H@-@tYXumq^)qFp8 
zB92~x!=$kY8z53az=m%57NHQbH}jkms;$=T6eln3HY;)-?#m;Nqf#Ejg?OZN=nxa*do zzvvd$i}LJHHOi*@obmL;_-jVXnv+RW$*ka!g@Io&5cK3l_c|xJSOTY)zM23!LRAw* z#{}@klNfm}Eoo}ci$=0N*{ezE_|nA_Ek|Kd+|UwNhN^AlAgCp&74+gL|K3WB3j&|& zeR$=rvI~@G&9biOztJp{*^z99&H=eT%oahxWyaGJj9s!nc*tpTYyB+-O0Jz$!H%#{ zkQ~p~g0)8dZX55W_~g9|Y>q$=U(}(Qy^62j&TK)W zch~G?7i{c#;Mr${r4Bw3oYdIUm8fw~-iqUHmP6)I{aM=C>ZH9gEy>+c5u6+C^j{Kj z$uCqY@)RS}rq@gd@H@_#8^Y_It$3}+C07Dd`>?%N!${!FHD$W}qdd7E`Ct?DAN!dEYwLH}gN&<}|@jJe? z%k_rQfd-t3a=4djYE;dGpB|Pdxku45sRTT9VWV=}Gaf||DNCbzsO z7u-^V%LR6X*iNyw#*P&IuE4p~-QhcGbR3lOa`w^n?gVx+<90p>8?xc#oBt{xXGFO= ztm+;`@W*$?AaoBPImW$dqR_B2ARi^?s;1r-{n-3=LBzO&{~27reYh;WCuldpHty=kR#;x%uoEPHt2gwID4; zLb;hefzUIWw;2s%;&Yh+#3ysZ%>_fbLjYR9CQ#^oryRw+*&?@ zi|%Oa!4ct7{Mgf^ODA8~!Z@71871=i1==&W~h-aL#cmj)C59lPrzNfIB4UpwL4L zre2UlAytg{q-hjz>p=UP4JSbwq;2pFQH`5jW_l>=+*6JkrSg-sU@KMiN)^=OiX;?F zC+%nDH`etuz%%gZ{6e@tlUZV59baC~kyVDmKM*Xv67m}@wjP++zpiAWJ|?zDkfyd0 zEuivsizxOUHuWq;QqvuUyba!PC)a2qfQ89nK^~0A8W+?!G~+;oosjvCkQKR715wNCu);`V6!|^G-lt*cE zM)S5Zd>OQ1!SrAJ%D_>iIdOr)TT4Dfu#+=L{dyq_JO|zKI8e)xkI}dEaXo&RR$UZU zW|~smE+lUU8go&K6}VfZ#{4^KC!fforS8bAOZjPP5*436uhnf26UplL@M%8=t)i*C zKw7UCg7~*7;|&GFKB-CgsZmY5z*&)jFT8D`aZsc@qHxj?oRabK5^fY;QwHWgHJMCrz!7Rv zu2JcQ=3-HKtn`l`KV_Jk)_puF$MHZx15s@bHy@fYrMC&}=BjHJgI-L32)-Hoh@0vn zM2rrkQLV|p#hvA2uWT+|v1KTD{aE*_Q2pGh72Q6{mKi}qpp)t8p`QQVuwV86o&0F0 z@hpO3<6HdMt^}EcaO!|g1=ky67lHO=b;~ZZG-uL^L_0R0$knNwWTZ0)h97=Qwjc9E z*4%x4BznM>w1DVH6x9ems9FqZKvY`zs-d+U8)+;^irE_kE64GScl8$`k0PJa3*VC^ zZLifig+&E0*V`auXD9XFk!5A544+?mmkqM!|E57!Yyi{9pEpHWeM#b5jfncOCTVfC z@kg&Q;VHzRxXK&%J{*B7c?5-*l5*Gt@scxOrBjkpo~%Y?RXx0tLghC;*v2o@YXFzK z6KJD|{OPP$`xY)Q<1qP6g+FcTmaY&qtaAKMuNOv^mu_@0E|(rk1Qk@n>QG-s-K3y2 zEi6-PXYQ6ZxuV{nv`aFfyNwPUmarPFgH@g1H|i z%fBBNr3-_x#C$0=tW~Ni{~3wkzc)Tziwr8$WDLljX*|QaCLmLladF4_Jl{Oys1WTZ z^y)`elp-0BRAjZ$b46dPx_iefq7%K(^?9$T?gU>8GQ){ImSdeoATE9J}%l02k zi0lIJnc(iJ3#3%;ep9y9I8i@1Yg4_{+i1f^br!DiG$|O0E$cgm8!{eS&6cFz=4+vG zH>5)<-oAQf*)GWWBo#mU4O+-$Kx!|Nl$BK+NOFyk%(b)t=w2Ve_<#Xn8BsJnIRMSG@lI* 
z(sq(_Vb~-aF}*`T3~eGN)~(6kEkD2b^zt=I#H)e;4uY zErr&C#54uj7$@lp8-Ao$1E0T1JZ-v%>8qZoRjb=w!3(!=X;_bI&-c4m$AfcbSQ=im ztT<5^ae!t0D)QG_>eC2Pyl@en6g5%?0UjQ$<=nY4Q^Vn3Au$uqX$l;oAJNBR(f~k*nHE=){C*z547Bkf^{GGd=WOrv|T?seyju8c>uk@PNuOE`)s_ zZr&N4!#yBG8(5@%t-)?nrTQypbI)8gcLQ<#nDZVK#&y?=>gEP=rtzt!3*WV3ZWQ)$ z;8`0YWZHVf6r~Ev1_3VQfrUPLnqyyd^)%8C7d00Tr$#VH{@pIkN<5`u%i(9k= zI23H>4zGHQ8~YZGtbprc*#DE{Nt)PW!6fsD#Bu(k_6^(iI76$SmOfXWkwC{V7v7rK z+;-*)k&2lm;X;~lkm`eOtnxxDvSzvW-9VQP@B+kjYW_6+yy8FWcaK)SqFm(J?#Tz| z6%IY}{;%R2$jYbFHlG}5bsU+IyD}Yf_vE`4lY15i6n;l~rH_Q-(TK7Hyq^rRj68VU zN$St>F<2O=O3Yc%ACDUMuU+$Vc#myD?05FVQb|osH&l8*uAgw061b| zui)TQ??0#8opQdVPDwbQtN2)M<(B=d?{O+HX{526;PjKl<&TfJ#%-8NS$zFYUv=wE zyT`da9GrN?ypO&cb;ZFMGPqZorMVK&GLf@4i5Iw1B>j3(nG;<-EZ{0mN*T(m^s`J? zy9E~)7q=xma(g;hro{exxef8(GoCc>4Ke;DF4=UUi#KVH%dTFd%nRx z-S^_gFHqY)h8jf1f#3L5=+~_-QO|~J-mpjtal^2-(15YS0xI2or>vF0Jgy@^UFDQ> zor#olrg`m5lQg^18xxb^A>_^4M|N`l6DAfeHO`aVXs9G>CqwqU>3p6?sNLN0>EVdk zVq?GtsHe>W5uPVau1Dj)7CjO>NPTuRz>nHQ^oiE8S3R(6_NBsIoi*{w>nG~3^B^MU z^jBZhcOfy-HmWmrMVl^){&h`(nY+{Q+a!`O$wyFGnSH9XPvA7+u(-k8{guY3t>s=3 zIHfR$CiiUt8H+*qpl2{pZ|2Aqw2~N9m&HtT+t?ZU_7OANZ??|Sz71OVEj=yJW&v&v zk+tJU{})ZCg}lq>e_Uv_=?>*XnPZ8Nr4g*qGT;aEG!C&8`!F z69Aisf{t;lh;iJ|zpJIDZic}ctP=yY7?y3ut}me30vDFyd8jZdmkTm|8kq~k%2 z+7wfw1_Q9MC(i+-%GRw0NPFO}Uw_eqejSZRQX^gqT>0kfhF;v(BS-6TEiw)iygGe^ zH$kzJYNX-x1(439#|$~E63|2n zLqzSo3>N7$XnOD5Im==8l9H0D?X6W=VnTwy71S6nv`OX-5s8nDCE+URiw>RkMDE_C zdbgJp+^RkD)Y$DRpZsa}4Iy%e#H zjpG~1INtnDm@~d-N1eN|U3TF2_Ur+%E)&=Zg!$D(uSzaq^UlbCbf20oR=*?7f-t0K z4rp9^-d_xWDr4EPQuFwajdQ`h<*@&<&*$<%CsUdX3fKu|$-my?c`bFtImt85KI9Kh zdIDu{!i0p9bH(r5iN zWHa2Yzb&_9L+ci}Gni9vYY^c*`(DboR18|98gcE{KBTlu&)|HVqAv%m1*4xUdrF$; zpAo4@sY$b5dk#$W0YGtb`-s|MJtu#26!z^=A3H{NJ7 zGc&6`P6_Ck8DesI|2=^l`s_bTDaX3qt<2ddjZXOY5+f{G_GQ|+q<7~6(jyRUahy$w zN(YF~)y^xbLf0a3QQ@Z0Z0xA_^L7VKz3-}H+l3??7L&XT$6l`-^f$+LZtEd65n?dA zW?QOhuZ`ZN#2Vp*8H$gEem}gRJASPlpO<1HB}mc)8ixzt`u>^Ed9cg*++-B5Y`_yz zVA=i5+E_p{gt3-^acrhV?!#|IEb4wpl+bmm;8uBrMhJoS-C+X!7XdF&@Ybqr&Rp|k 
zS*KH(HW%YO*!nkyE=K`n1O|^=1Ol%QzYQN}WXHI8CD6}IZY;K_l*ar(TVq2M3(`Qn zQaUaEN1X4d~GR~SJ zFnm$7z!A+aMN8&=sMCG+3l3Q|%v6jMQc`5I6e2U9m)^N2`W9Hj*h&IOjXPtK1@XHk znnx$@d5G=CY`14gY$>Cf@sWan$YmhOAy(`+8V_6s4U@7=a02?IChL96{IkN3;w zqf^Sk*GXsnh#R3mP-|R1f7SboHv7Q;{U(aOD$W1G(=lsYS|yqPBiDTLF!;ijZWy*p4m<@ z0sbXri`b@n47QL27wryfVMx>g>@Iqubo;!+!}fX3=_2b$8Zo7m$N*wkg+?0<=Qehr zouH4P*799=mZ~M3GuGA=>VAuhvd@Ufy(!&^g4#@#Y&)TA9CcP={4q1-;XHE3r+JDA zrED`pVK&Z0y-?3YZ;&9ciTbBsSarPT}ATVp|}nfuPcc1zpg z=r`%pg>;OGXbr{$lMw;&`~cF0J1goBHUy9f_&3yu`LBBKWdx9+O;H(gt>;+-juLw1 zIg&w@a&C|2V~P+U?;W*F`{EIw-Lwg#7*y2?!*oi98($~AitGMI^;&BWpBDNWkK5yb zUr{*w{^ewOTs_pu$?0j(*c;rtU3F8o-CIui24uSWRW^?R6UeE$)|NW(1#oYsei0Z` z7ZtHKH836!pfU{;6Y!wwf`9IzFspPx&AEuU2&Q^O$8gF~TUv)}uPybaMU-HBS5S>L zH%~3{m~FQw@ia%#2t~M-b>MhUF=9!D?>N6?L&pQCBLD4H!i^c(aJpW^txe|r;gEcf zTEKpW&D3BXg!)G0?wG~-#KAafSxO?&+xmtal85xwkpws9v*&o9hyYvd>(O{ zZBBAOk9PbaZFqkQ0A+ktG7H%H9~Qu1PEJYn%y7n0Tah?Obsd)D9=+?m?0k7dK;a;@ zVCa#U>yscQXpu3t?DM5w#2Pl_;qnjZ$uaw#6U zSA)=EeWe{@*XHFruGVh!-u`9eK{`0J*E+7P?Dq_*b^aWOZWSf{(nRkPcnac6l|fwv zGIfHfBBW7$LLel&Dm*h^qZf1BQ`N57WD%2vh1JMJfib_7j9k7$&Ldf(?R8&HGIm=_ zpL7?5B~@igeUrQgrF6pchABh-Q<$T1J{Q-Shs%^&1Xh0}u_rU)i^1|$q6e)rW6vD- zDxP(c&-ftTl!Z(Uc|^sqKKT6OnsX=(cAYsP#(Befc&Gg!FQI(mjcEJklZpG!_IUCk zB~&w{Pb{*TlaCEYHIew76!Z(=o}7Tpy&d(-IU-RW@l6YR+ZVVc3^p=M`<8% zEQlG(WI%at+wla%Q4ziZoI3WLy`c$*FZ3}U-DiU*ldA*27iYeWy!0yI^ zBq`t&NEK78Gq=VnpN(cN)VA^Zj}X4&I=@J6FA^eUHH^+x-noK%Dwd=-CQ| z$MV7p=Ue0VX&ITAIvp@YA)3?aLjAWprkB4ygR)T(<&Wz5BFMkO@7!GghP4#0T>bK$ zrlUvbEU^#MnxDV9iJl5fqcMUgrgbF`OVZmpYT zL0}i_T!E7!CTBZHt}9;KDOLp2B8@b%Tgeyot5wPJnlBa~hCoE4-W%w`!aUYvR(JA1 zQLUxr3p={B>}PlX5lxlA`=7KG|0h#S@AoWfxZ;X^1^ay>J3+DrII@{DfzRe! 
zgZIbAiF05zJQab#tW~NVlyCbS9th^#S>*a%ZqxDD1Y$l5pp@qc@d6Qrla=yX-16w*DnB>vK;`11 z0R#U`nwUAT{fV7QwWiaGdXyniIxJ-&!x^Wqbj5(FYjy4}_Q!W{hci|r(o$;a`1J>$ zUtC?{9JyGbM8wpp#X}3A`|w;HwB4)}*>% zWYp@-W;ylrBSVu|FI^XD{?2+k^uJ01DL=P2C63>Z`!J-F+XKk1Kfb z3xHBpsIp>Ye9C{KP`x;u4U)&RCc=2XbtrTU1Z&T?+s$!k?SLS3164>~~vzaGMsMU^KNFE!It_b6I8^Zu`AI)7IGAJ70gu$}oQaf#P?Gq#M|to?S<-li`3X4>rIZi@BSdHl_=)lM9CI z8?CRY$Rq@-J5$|f9^Am!$=lQM{J>^V6;>BQ`PR%6fbg<)!}npG-Q6)5J@q?HkVpmF z{#qTuj2*zVl0Q$Ln^%GrC{h>tHLTNLfy?2Ks^^c~eqEmll6jf>tkT{pY2|@#Tm8q_ zX4={m;libOMqa9S+d4w>Oft*dyG3RoOFhHFMxl5RKz-gB_zvQjClN_1(PCvIo0=fR;)p-3|;ah~&e07xQv%e~3Op?5SiG^&;w z!dm{X z8RVShpy^2_C}k)vSOx5M6kg`#6a~Cd$Zd+?;!Hti-7P zsYe;PiO&E8p2OfOxWN;jqyRn)Pd1t2`#iU|yD%Y??u%dXLwT$sFlSw*7v=V}g{zVG z4@y!7xTEGxmB5FCdXBj9u2hzy+~0k9?6#1uYYwCQVA%{`f&VCpHOqcQ+2<7)$j}pg zwj{t*`73I9&I$+x*qzx!)I(VX7C#2P6OH!)Q9=n_VXi43Uk4kB<8H0F#z-01+9oH)!If{wa_gdi;rj?gBd8PXZF z#(1B8A&5kUbgj+XQb+vy{7>fdepGxA^!WT4v+Zoc3dwWa zC=v6iATJly6_PH(;7y5NRubXV(8zCgVp6v%aoYBucJ*@Rt~FC<;y3jZpj?u!7P2mO zo0bm6toC)^RezUn|N6^Xg_cLx?e(Zk{_;j?(DnL6Nq}kf*URTcBRl{AqH&gYcsm3w zUFy1;c|N%Mk7#CWj0t~9aU6;K*4;SOZyy-W+9+{WXT{q1Ig?p3t{RQ;#h0w(z}rGr zaD4YAE^`eLf58?09%n^Xh=>qF1tYs^SJISy=ofZP*lbw(-V-k>aN;KXztK-kj@KJPo!@mwJ7(^DCUK%FwZmw2JY#3iq}>$QqjGm%rnC z-qqxw$f*O1g_j4!^g@6M?)g|5eX()l#C@a#HQ;;ZfIZ*1Dh$q<3=Wpv>Q7_tt3;1f zpew(&$!_7_c3;~em=FcopCui6Hq5FpiAq#{HPU{G?s5)TbC`%J}|IelDeE*=YD)ynNan%T~lOox1sD_W0Vr$sYe1EQ&YS zSkP)aBQWRV4h823;B5L~v=YW|aLmkNZo~q`4V?~UC!yFWmBsP1e@xMzJ_~k4cSYRT znwgqlz*ef;j|_*VoJd*g48auCDd`1`{>OhkXpJ$>+_aXKd{Kptab+t{^)zC{rq9e z_mVJhAxHZH2_cO87r7+`woUip4b>nUUs+?MmzM0~}Rko~-;kGKNFv_nFb zEw_GMSR|E2C!0uPIT8*x>0ua634{yts=e>6NBpc{2%X?BmN(L8?ou}!@)MH`4WMwQ zD^gGEQkEkE;HQk!;&#WEFDx&=pZTOXP6L)RRss}u$4NgS+=VRYt+42UNa=1i!Fq6t zApZnjIRh#rR-;woD8y!xC0hO{EGz!Uy$70$Qr}$(pf<3*`!CSOw;N?=QUf3UuN}0|tY-b2Ib0mQdd->Ij%^U)ms0=@SId5g~VtrQTZ;{l#rC8P~laf$Dz?&0Gv{m?$fH>t~Rx zc2=p(!p#t7&G<&cYfJIe+uLde!%Xv;+-FKSVuvq9w$9I5Fz5c8q4+RU;dffn0~+w* zJQZGCGJK+q?4Toe@?!y4)o{*{I8(Y4O9zmsFkqke#u>f`l#B;j9m? 
z0XyCbyi@!>O^DEgw1Bv5>!`yYm9|!c!(xZiC>d32Iy4*yhwPPP9Q|JIaL>tV`vSb^V#~z@c*>+-A9j=wlpU2|= z1Uf=#qvlJw5n^g7p4$a=)>s-j&Xv#Z)akODaV>p{GF^*uH^l602U736SUZx5=9LE3m=6$^X~Q&A-N!ap7$Y%*4}ZsMEVkNQXx_Qj_wJw^4#|QT7LdLBV&F zjBB_A&e(>@NoI+lC7=<8O8eXO<+-9Z0@@OXt><_%6{z@1DIaMqFJkt``%wIGdyS(MfHPR5`=-XAfxIp5SYmp z{I45l`OVva`XmKP51WT3+aMFzrh-+x!zpB~0N?HFpsPYE(e7e75izN%gob7-K9vy=5j-0lc9}U#T*zHDnU#8X2rvgtD_Fu?0bT@59 z3lt*o#SOG}1ZarOZ996tx&~NDeKIlr$lf8o}oRdXBiy z<<$dS2p7tT5HeoH(|W5;{Rgr&Ei6$cZRKSwjY3&(`JHOyWHf*~RTs&*UsRS4M)?Y+ z{r~EdAti){gsLgcnXZ@KdbT%_#W<)kP70>=(5WNuehTm1L7upZ$k4(%&szWX?_#7jOfkbuO zEn)|jh-MHt<5LN0N_$E_aoohM?%r}&wB#fch!cWF^wWL6hpyb>ZuE=pqmB~53-O5f zmAx0cfSrrosS@2P#(u=jnU{e{_eKUI?|6m=2yuO&T)t)eAMu_oR zdXkZn#24~{PUexO)c{LcyUF-lecmu0DEycP6@uo(y?o2`SN$okD0RVvuo*8dre`-w z8_wMdNMUJeO$@#m&e)S_(k6_~g*p{{L74{ML9|3r6Pn&;cD9gOk~Q|^B(-DK?D@GI zn4YqNAeK6?Bq7)(N57Vf_2*?(;CMP;3_M%dMhKC^))<-lPU0-OIgg4lt*0MOa7d8a z_ojLN-sH%Byl2hUWyYj&yHe;j1wv=GCd#E~>liH6{;<8;N zGxulNyRtp~&4-S{MDOxsN3eT}_4A8Y`V!&pB*^!!CmeHVV200Y3A32tXB2+)obj-^` zdzY%Eh6bUWJk)O4S&#i_C)PJd2ByJI8XWkV&7rI&D#}P!h#{k*QUNg#1=qe8)kaG1 z#HbMB9aETLS}IETJ%GSvtR8WL`pCsG{>r->zwV+Aa_p?LZL0OzuE&K$wSTi9cnEMx z@)AhR4sicgfpMc(uA9PTJfUs)nqxkj(iGl1Z2Lqn1}xtc%+fE$uk=tN_IbNbbI;Qk zT-C53AEQyNviS~}5IoU6!b`YBM&Wvg5C++zRi;4hX8`tPJ7~ z$Vbs|mrQ)$6-qIqQyK2x@yxt}jd}eQI9xVJ9vbK-gL~#O9>o}wkJi%f>yr}`FF0%c z*(Xrk4uw3Lze~mxU`4~Z5~LounYzMq2{yIqCYU2Zcu;e2czmMkL&9NoOu7-VMX@=r@p~4d8Kzx* z4?q&bQ@h6ZC}eJY`iJaT>G~c#zHY_ww|XHB`m4>54M)U zGmDXDt>b(Jp}VH=awc^I?b|I8LrEN6%OQaSyrCJH6TAAs3^_ip^HUe4VmrGas_Jl4 zCO}PsUY>e1-;9I&4m;EOSov#@pB_yl`XKXcZn(79c)69z$16{aW^TYgwoLeXbWJW6 zBJCcm>|cvIQ6-WMDyfskw4#eDZR(91>#8;k!8Jo+MWRih`eQ*s81-gco=^a3 zqzSc+dK)I1sr6d@-)+zTFty-qubA5yjJR9Ai+g1T`RTutGwBnO3=FsxR3A+z?OU$y zxb#VXxXJ2Oy~;-RQIml=p1OWUu|7s4 zu_#mi^KzGTQy_4}%XWfB|NNLRat7k_^I&Fu)g61hBlOwW&yRMK=H6xRcJphUp4isS zPwDet8`x5XVPVh_XkS7Ab2dK!XKa4)!oq?rK25)HQG&kVwhkbQ5ka*BVd`@YSAK6k zh^22Fc3OZ)+}2KG-fRO00@(e6m4MzprLbK)IDQvu<5a<15ph-aD5eOJ4bIS)<#<5V9scd(>}Waex{3@TLeJlpw< 
zk*DyS3C?8rlzV%CKT1AL3#cC9d5J6=cS}^^ru1MwBQVRNggkZ$>XzR7H3IvVk(^qO+qoul$Kks%( zD^FJ0J`(#7puR&41si;NwSUkTC;XoR5bV86rtDRQD{v;s@2}tM8KFXiUGSRs`vkn# z>g56-F!w*)Qnh(<11`7vAU1_UuKISV!nL|wy4X-*M~@pX379|0FSoTQs5?1X2hm!6 z$Fu+R;zIjvx&p2LIiX&snH=Gq-bly3>fq5Y(BZ)jNmdKJJI5{q9RUf}4%i;ok=6yG!oo0g^{@3;4cI?oS44hC7wg%@_YE_F_q}%njHc1VQp542m=su;GE*diz&#snbndbM7#(Pr(SkoN= zfEd#nL+Seqzn{3ytsTcAzJQ?l4J{S; z_uu`1_i|sX09odwA4*o$z^XDhw8aYWdqA=(TmZg^hpL}aVhQ9m2nl9L$s2Amzmx=x zxW|PRC?$}LRC&DitvGp_e%{YC@EV$#Y4|q^acF6-XDC2?cW(5Wg}eiQh?D|-7rwsN zq~2FwPlj0D@CESxuYc7XMO|6gBNdgn;AEjG-2{%*ZdOOIMJswj1f{~eN-IT|4=B%; z{-><0$lEj^X{Id^`{p*yQ}6D7`ol~6g`*}@E-cm-t*p7>AX(-7Ar_tnSFf1nOI`Kf`8{R4jJ3~7-qFPVKz2_msf7v5z}R219 zulD{?%zbl8o!wM43R;_hivQNt7(j2g#2;iqGBU*hKElkjxP8bk<86sKzxg!$Z-AT~ zOZ0c%bmrZkw}xJKSf-~c_N%7kr{bc4+nBf*z+(44B31ZC+h;)Y?zejNd5gC1*$n`R z(f?~rZ73^?PD*0B9!Ed?Q8}T4RQ%7Iprtd_CD?rlc+^L`Fqr3As(X$@Pt}_{H;##U zSTgYlFn#~driN%;`weCaMVB;J7A5%CuUr=;%-5^D4_u8CLl*IRfK4NjhjFvoUoeXl zyujywh@g!ta%cqfp_BX;`o)1qJ*}vyU}0iv^UH`#2cTI(I=5RdS*8D8rmey2wgIf} zhiF#!mFpO2pt}G2mhVUSmLD*HX2bqh1FC@q%_sja8S?)Jf_(*p29&*|oqajSHxO&@8>tQAl zn5J}prm2@&3VK$qE`?1E0O`yK^~VPw$b z@hdm0A5dsufpY%#uYqS){pW!%`I1;obxeE!%e;@i2HopH8ZUpjbZrT`|^%5~6Hk^@&2xi)FCUD|sSf*$K-pegXV-;vr(;6l)B z@IgYYO*WtUVt@mq+vKCLgQFb~R=)Ob1eM=Jqmg&CB z74QC!+oa~)>D@Plq82`|Y*tqo!rT{N32WTM? 
z8PwQoEk|i9G*$8w8-}j2CsP1v<&#tO{kVmR^hnNF3B6tFGaVAHRU#G%MnS4k21{U zaz&tjU1G>b={S`%(OT(AvxV;r-?xQ36ka)|T^t0X$@yvepIrNh>5<3eSKUV!FW<;6 zKqdo^$NT?g(?dezWll1V1^h-DF6G59j;({FoH`H?lEZb zzxZs6Ew2sACc)FcFzdmWD{o_ph6rX21Gq5(!oAE6gelSZe5~uzN&i^iKm2a`+nouZ z+ykubJYm!W%1__PM4OhD^=tQf(Mx`5`yFYc)x(ld{n% z5@HfySiQZjIR5)A*%fbv(Yu0avagdTUCqhO(sCxr_h@=L=~v%b)2q0M=H{+z{-2dP z3T$6oxr*U><2uAOc%tG3@pjciH{@G@Ce9JrY2dM~U5dAw_+UjnshWrFw-}``nEfb5 zSjVS(>1f06<9Psb?1Q9d^aGgHyOILZt^>$tB`q7W`(OF5c6i!mQI*rpGT$5YcyZ+= zJx1dxlBF5~BShXVh)Au~qxp#g6<#RP#<-5jdyfugC13QzJk_DvZnS{UvL`WW=4|FT zR}ixaK>2S990V|n(l;(p?{FsW|9&fwos9hc7H}M|eJu60 z_1bn$7un=;e@FatKcU$$CF^`QP~vgFM?23u10l}FiVcCr&*BCdx*cgtZZ{z47tZwm z`ogP<-{Nr8&`~@0a9IJnkyV0`+at$J8VQeS*HNQ2fZ|aTnDH>10Bq7B_ojo?4}HQL zA8hMCVux(`Zap_qwoLQ1nDE*te$HNj@|5`b8GlgOct3i6_PFV|k(6Vy_tu$fbDh-_ zgHC+7QQ`Halei~2%HP>-EMDXCD+0&m^V9MFhrRcXYI@uLMYk1fARtYe4Jn}riuB@E zKtfYW2tf!%q=P_^4gxCD6$rg3RUq`AEo%g#P zDmk6Evz7P^*aKs|tR#K~!J za3n0NVi{M^Cb4sCuR!C;o)C#|-#YkPB@$)_-LAm}Z)&GC&4-p6IalC>&7F5*-)S+Z zn1Pd|=Py0bkjhaB|G<4l=$qvk1Fh3OAyr1=vzjKAYrPP+uf8m0uawYGR@XTL7WvW5 z2Ek5V_pPRaFHfS+xt7cqk7M&TJLX<7Sl$O#Y%7l+1;bOX-b{5}!PsO^DYH9!rzdSH zV+el&7os5fg4?RARrqjRV$-f!@< zutb-G2UfPfJ=-;GKRUc*FkN#lD3<|3<$t^$hdCG=-7g}wRT7;qxH3c^Q1F-Cqjp_Y z53OFw;@EN0^B*9W;A!&RVJBjq7QeJkKBHOkRL^NfH9l&xH8$F9$q#uUQ%#2RR#>c< zqfLwVA~;PQu}%aSAGvM?x4iq63sN;x!Z)u z(Y+HT;p#MCSw8=&HOtX?q{2~})>PT4GX!yQly%nKDW^Tj9+9HbVHt~(2kfPyd zgJX{5&fk&tpcV2xYS5xMd94pwy)b)TL;2O6x!DryaZ_qRbDfGRu&by=j-ic5E>+QsWKP>Yowl(S z4LoMl8WWjtW|UgdT8tJOZLLt6U@$GT5QM)y;xJzD?G-~WR~{FnLBQXdUIV4@r8)Pp zSoo$w(Dp+LULQ<^(VI z_@GIQTC#?ggG4#)m8*Fd0)08+lhkaOGAC%W@^+jTjAxJ-s@ch7-?Jq8Y~pQmR3bOp zYxnsckaQohd*^NUHL}g+$&z+^53i^CQ-dYf=<+bciFikMaeRK8MEQrs((LUNdvhSt zK_6_zAEZrW)pBm#%VD2tRCcpr4eC;9yj-K}C`_6*!Gh&v+5heh0h&&0*7)z#O)^Zc z2Q{?q)N|)p4mPp+r4h(|zqRE}p89ce$=#>;>iyYj&aDZH?@NrUY7*Z)$kmqLefJt{ zV$+uvE=4QyWp~Vz)4Z{oc#{@fZuyjV^xBP*7ifYqODChy z+3tI@cl|OaLPnqqQSTfoP=ly0MaUEo59i>=l(=>hXe&PidqwMS4S^mp$%7S1i2G(O 
zi|^EOix!H^>w&JgIXwukubrr}RT&SWMc5w1A?@P;FuU`&^RRUSTDFXG@+S8HAQVX0)ZY9e`@VMEfI<*-y3Y+ z&NZ(pF@Jh6E?mdA2)4}=hd(CMdL_=Bt;IZU_!B|s9sG12u;TKr2Nn_p)fak4gRbOJyEE`tk!6h z1XyZmwJ+55bG9f5Pu!2+8Jg-m*l|C|p0*V_Srl}KwR&fGG373~_^nvmR^fWI+13OE z|2$#37wnX@sIVoQzgmzgo>}6_dy-qndA`sOGL}&OLfLgjl_UN+b@|)7t@;(=)AGV6 z_gz}dbX5H}D2r`{-aG+VplWa4Ds}qVw|;{lMe9Och1`IWnYqpJAm#Yo2{zTl!FX^U zL9QE_gNnImnV8=cg~s{G5X=Sb-|BW*_NHYy;%1fIx4Xtlr$Tb{4_4%ibCu?-=0@|n z?(Y-=bdln`SSeU^)k%&2{`PKD#4-#f}9j}sDeS+C>{ zdt-eb{HG#lopS(J#$uOBqul3|w|={#za?^n|PJ6O!kj_$8pAml$nhyC)De$$0wt$;>PH%R~fCmlp=ifBIm*J zwz26)^Y!W}v@puQYO@7qm(X)xEOpc6p!#5^dQ36qE%ks7n^uOsp9&QgrBWf#28%Me|$5ugWs3hc^8>20m03i<|fobA1g4$E_yqdKnqVSPTBWrEg!D_0cwpiG);F_)Y%{n$;$E_EJPj!Y~ zL*9g2WK!(JmgjRK{OSg`(?Edb33yD%GmfPa&~F6b># zk0Ba+{a(o!FT$le8diYZdTiN0V_pAHNDI@jWSbjSxC4xEC9yGU{Ub!p`~N8UOyHc#OqLVhO;!{716r zMJ0oy_#S8LU8|WGDMvTzk(B)l@S=KX!D;`t%=e=txlr1GCmHm$}c^}(Lk0o znKbmsV{L@A3v?T6Hru~g+G_$JJyqixj~XfI;11=6EuZ)u?Vvq|Ea+@Ka>u&)0=Ft| zJZ*W!EH(C7`n}oE_o_GJy*fUzO599TAKl$r@upGTYWJ2ph#_#JDlz5m|1 z-)1Wz9$J<2L1@H}MBjbPVGj>bUk*MY26VQgzPnsTX%lN7!$MPZGP_Lcy*e`s%NL>M;?1~O! 
zdi-+bHp3cU+}+|J@unY!lI0#7i-V*WzzLZA&Qs?vwZDE&0|{Wy+~v=D{K&EPS?GC< zRy@l$KN|9u(C~7EoFs5h+onlPfgRPrjEbVx@%TFGT0_6LB~hDJ?mL4yEfZ2##PazS|&&!~~ZEm|LOEw{zE z^kEU)u`7B+?a|xfxxA(k7L)3Pdbt7_dJ+6Wsc)F**5kaA;9M83EFu5>c!SX~+(#Q6 zn`u4TTW}bqiW|P3p9M^OtN_+H^B{Z7kL#H&hl63sgVbr4KW9`5s z=xJOxu{|A2TV-<>#%0Bqe3TWIG|}bn7%Rp^?a4z%mcxp<${qONsDe2^w-v=o3L+%5 z{}XNdp$)@>mhOy22nS~CGX_WCXzaISw-TgnN|o~Cy;9?xOgr>N1GUFXtfns*;&a~9 z#a{~yAE&!P(k&2lB+<^MKqio;gHHxp&>v`kHRh%R0lKnc?n0(Pf*q=$&S%Xmpr#+7 zL&sgFz>d?dUpNpSC$j8i&2u|vwygt$%O68pUyg~gBO2@aZ(3VYW_~wF7i@rDHRRfH zDRZsXh^NLs)99grlpUSb4_R9Nh|KAn5278sD)s#1@=;GvMiAk{p0a0Ci{@RC+#i?h zq^;hpAM@VZ^r-IOP?AVEwPzTv4YNg0mvJJ~G*IWxvzc^I!((1Z3u@d=bx}uj>{jjO zaxpV-z)Jz=>c^kp?D<~RQcq>qAxumEf@Dg-1>IYDdp1v--{X0mMcvN&-{EZ1H8kG1 z)x;yZ>UMJno&AKXjPFeRCSPt0E@Xmk_g3*m&L>FFeDPBL?6RiP7OpB(>k9P_JzF=} zgWA=)Q@rz(S&UfKW<4+S+-WYi~p=U>y z?|Pw0!^cT7hc0GMyQTE%+}pUh;TU6TqW$BuMuhjK2#anhrVkl_& zGZfT7nT>AD=VF=bO&#YdLbb?J#j=KT^l@$3C3H_f-pS&E`O;P+MAnp<>L+zAa7QL8 z;Y=n#kd8^-BvaV+@kf_t)Gc#WLK(CTrAsLFu>_s}SG9(_^N-9FED_jQH^;BMaDq_bx$~C*J@;HUndhMv&C^fAG!1B; zYrlJx3h}P%&lVPYb`PctuWT3(3q9Zp>+K!JGyZ<^@%6C z#FGmlU)RrE>X&{&GOu3!iM6Dshf@vo>L71+mfwEPRiShN=?53aD?@9b{ttu8ojw#! 
z=Qx5?sc=2xEX;;h(92avR<1)n$PC^W43DlVyZf?VFJM+@B3pQV#NRk+sS8py!B7(y8 zWHfu*s}0QpNo7X%7VODgjr-?85*y#dS0ACl7pUhiN{KDEmTXY*Ib%8|B$Ogl-*6JCA=8^R%C816MqJ_%%%ucm9dhJ!!z9q;SkcR?b92g^NbS8S#Ni)9D-Bf-?e z{En~c_eHG6_lpkax;@m($>Ezt)jHb$;Q`^?Au*EXh${<`%`b=_>^p(=gjG&#V$Ru9tzXK zoRxr?`J-L(G`YgH`p@!cjJ$SxlPG~U@bi&UePswDAvx;c4Zn_^H13@fmm;@OPV-PlniQT=tg&5A=PP zWc9^9GS-EX3#Qr@EJcb=M56)7vi-98!`Lt z1!4y%FcWUU5Rf@MW3_L|8Tz`mbXT{k%;~8v-i;K73z_lbDKn0chfhFGpUGdY6Dhu* zrVt~P8;sSaFEY1mQE7CwqHPM3d>(d0wMxXg|6@|F{#l)i?<3m{=?rS^aUsnesD%FS zA+w1mnpQHt-(FL?gfx8QQdIY~_HODjNimT-^~xV#xn69EsWX~w=Wx5=vX+6=4qfp> z?j|^V*wy2g$gu76ze~&Ow&`~#Dp5>z@%WgtW=^N3J6M<3EK*jV9f#yrK9L}?l@HZ_ zw>9KjPKnLFn_4o>#$&_j^?JxcgVkOR);NqM)5cJ~m&3$sbzE4n^?N%87qqt)FfWZ@ zStO9Vd_9IHZN94Zd3c{p>-zfTlkY}}>A@Besg;3}V&+7cPt!SemiV2e;q!yM@ohQh zrbJt^;>)YK1AW4oDkNIX^hLo6G$c|Lv9VrU?b`n&n!}*qSG=U_Pmk93rv~X!8x_@& zqsL+5N3=ccrUd$=@7c#N3`5Co`d(k(f=lA55)%AUr;UV>isAadth|acuWh%wx7x<3`ZG`-`!MzOx-_zFe)=?5MMBK9ph)qU9N%k~)ln90@(DmqilNv~A&^R&cwfJL?V!g_nPth7y)-gFk`m+faT$Cm?W~qFx33odvezg@S-^=TNSN-`vf>wM`RwHom)SJ zM4ig-kYS)U#BO+}2_6kx5m{W=Nm^+LuzWw~Yid|_)Xzd-MVPKB{8&HM+S;X|A$Oq7 zeSamvZ*$6yKAxDlGA{ zR<5&JU4WETL)p<3Eme%gTnnr*3&^D$GzF|+n_$&aK)t;R<1BW6$}iM+>ijS*b_pfi z01agt4#9^_)jQUJy&zTmle^66A1Y<1=(xf{2U}{O1E_d|Xs`|VBRgo>6i&6%SVSkz zt<4{7qk9!pO$xgT?D`rhwBr)Ce6v1^u&{+`sF~z;vS?(O34FDCKmwa5W|>pf8~Wfx zG#uK0_`RhA#IYg-`pA`9!@qu3b>ELCDZwn=F#EJlf$_r*&ubfwk|pMk(!#I|X3oX>W%CR_WfwQo0Np_r`^*C>99~NbzzoGxA}C6?W|UTd2TR?R@MZ| z1(NW9ENvU)78>lx>Da%#fI?orMn%mS;*uWAV$$)bVew#Z15H0}9W>J&+gr50y$40A;tkS;3{{UT0*@I2#8Tsk^igME2Bo{hSI1UhL8+AYFt?9-$(shdX8H z@kVw97yN0sJ8(~@yFp@Leihj-fN3`7uo-cjqF#|P%~i3?j$D)6UkaklgyHsr!gjV2 z(x6q(&EbA%cG1a2Kn2xHOe`7TTGXRB44x)44^8(NNtbGK!_Ml&j~X53jW#*K>rKaM zqCjm!>Ye>jK(}EfSYD^PeA0d-*AvS<08Ko)uNra z<(bM3)CnDfHcWLGXIu8Noa>Ty7t*=dbV0Eo$FKM0lK4ZiH)6r}Tx9S(m;BlejK$$r zWnkw5-$e}ITq{eKOGcKN7b`%`sFeY5suk9}#qD!bx{NnvA=R6VjC6E#)_t}c$oeU> z0tKJ2OIH$(GL%?tzsq6Ip;h4PLo)(8y)UMfj147p7&Hp?2DPoR z+k(;$LvHh1W@FTz@mgHDFr@Y7*rp(217i$M65R1!j4&=#QECZ$nAO)(eOPxXPcMRZ 
zP)B5aEZdSO<_5LK)c^q9MR{LREi4EPp7v0=ME*2BA-ZU@+#dO1V!t9YB1ia^gGXa-XL=^Jh7;LqBLQMOo=N-bV9N5EtG znrcpG6f)5aJ*9!b+~Ry9d~}`~q{KRHh~zDav}AqeBS)J~ug>Hr3}xv`=j55kn(yo# zW}eRwCn$kEnH;Hsc0K0K{Q9Dt^)`e8&f(GKx?o%XJfXv)rkwx+v^-m!!&tkfrTzUw zkdGqWMAnB*X?UVrMIf;s?}Kfhg)cue2ii89NQBQgu1z!Vp{{I0Xf?p!bGJc(Ypy zy`7$i^)*%r@N6a^Nq{KAi+>wsv-9DSdM-ut!&z#4!A-2{4qwaKl;INiA8qhj+6hHm z;I+8@PhXbO4Ludg76~4@>gnP2KxvlkMlhJ0w~q$|M>phlxX|}>xz7AGP@PK~8viMC zlX-dm(hb~?1lI^za#+67G)ktTlly2ra-V-bdT}SoH4J*?M%am;Ym^!PsYaRjJL!2k z!=re)6y4-}-bapZtUrh%|7%}`tgc0_w}7-rLY|d^uIW$fp}?Sa_tgJWQSjmA2aM9L9W~H= zcL=6GYA}0(mcu~<@o|ymqD03Ag%KuMxAU~N4LU*{T=&fccE-j?^2{( zTI+XsKPjpj9mlb7ySCajXrvu)2$tdv6mM!{Ka97F$wY&8RC*&>%LXntgW3Z?ut1G#4^virTZytqc) zTpqbI(AP;J%b4pcZl)YA<$j70s9FgAH(RH(V=vxywgokxBydKc;IM6*xWTlhBj<5a z6!fd-8J_#uE_jHj`mPQBG>j9C!+f#r`-j?csR z&iN2s{BxA!z8DVa6;Pw;3xWsR_msr_t4xk{-%P-|Q z>zhf{O@?yA2?%ul<)Z>X`J_(64Ksz2-#q0{?*WvllJli~RRn=-H}wV2)p3JKN5?C~ zum5?SX_6zvkuzI+kfV}JgF$8DzMS`n0;?jYpKkJ#wHqNHBPcdO2{#HGynQc<-w;~% zZCf%4$Uy@lrzU;=z7ANm`~G;KcmMK0HDHGVz3U!)SHI_lS6=}|f%zdTambASpu3q{!BX)0%6WiY(?YbiyrcA}+oRPvo7%$^vUUP_Ew$8iX^8gs^Z z<3s?!e1TxhAX(*thXj>FZ+=|tLfOuOo-@cSt0bAM>%RT3U5XO845X8ROjN&Ae88eu zyNi_;Su*5u^QV~uBkp)OcUy?Av5R2BVsw-1Pd^qoYC(A(qjvcO#3KEH@8x!*qga-M zDpbu&+Mb~d7VY)Df?6`sZkJ|-Lo52z42&O10nw64MG~|oks;9zW?y=@^dPq<^j7N{exNS8dGoLPk zBE6YOwd;oVH-6@SJ{a{G;zUBSE?c;$fv;b{6y&OK_)482T6~ml@=Kfn^-))@8wL&j z1$`qsJ;Svc)!S%72%Ag7f(^~z&qOq@w}C`lG%}uMU+_u+9 zJ|{TzF3pZh;UABs{v*;Xo&cVV0RGKsboaO^(IB`y(NR6M3P%%FfZS+65^NK_nzFv< z#p2*Dvi!L=8iD=v4|{k3a3CqHY+VPKd77+Qj!I*BHKNbd?;OQ+40uIT2%PYUNtNHj z^7QVcV0daRnPeV85k&kff@t`o{#7T~Kc_L2XsKD1c>J3JX#w44pEpb@n?r#?CBgQ*J<;FXOwxE3@ zlDE4hK)~w(D#a?1cJ%ExDT++t*Of!F)C_ke z5x~}ZG@v>B2vf}7HLy+t*@LXh_kr0gMOmh`u(2h{{ci&l;aWzqg|aTq|8&v^TEzRN zY2OsNp#e-|eoB|MY!C6Kd^bv%xA;M1<+YZ$>RZyERm`G<+zJaiM2?w^+ci)hX*xda z)uYd!O%8mihZaPAZ*&(Ag>mA?#jGk3CY#UirG1J=pUE@G!!G<-_Wp=6L?_mErUlc{ zy>}7Ag;0^(m|Cb>h|a$zzS-WX{|GI^i47H-#5K&4B?a^{-=D<@ZRDg5s9JCD%QM 
zQH7~EXB{HJ(AAk7B-j;s+XCUVH;W7(ahRYmndwgRYBP;@jYls4a1f(bYUQVaIdKVdmaRxB`%$U=lZu^)2 ze!M~SI79f$=?SJ9sCsd{Cz>w8ZQD!U_FY~QU6MrEQLRXm-xrw8hdB&s^^TdWo!XP! z$7$OS#|Db-R}$>2%bi{-yOz%vDI&L)zeq#E;7=D_<02OTgevHPX(b~MW~LyIilfO- z!)DCRz|v(r(c5XttMkRiYssXgF(cW{ta|xvkQ*0}iIvs|TIDrfCuogsI}MmNaeIru-C2x} z43wD6Bv>45^%flLAKc!I&^3Ic-#rXs7m~jQ_aUR~S%j1-Sbb7T{t+%cQh+mp-*PtN zxVYeHGiz7iKHndYcqS79cW6^19U0Naxp{fRZUP9}ghE_skL@gajiht3>ttu01c7;O zQp-{f$YK2Q?y_2xd^<__bf#5V%d%8DQQ7RJw#V?(+H(Y~JC9L92u9Q93K}~F%I<9BU|gMUGe)aH@Ns4a5wU5M8V?vQr9pCT5?BM0Yv+hkWMm%!%;w9I_m-{;}c=c@P`#v|E7idVf5TuXq zMVeizf4)%}L)5FK9=IKl_-ViPAV_}qIIuLe2mIzTC<5_UvU;-uDUnWAg+SkIi{q1mulPu+6BhIeb z(Wm#EK%>CPsrrDWW5-PThM&nVIVgCdJ@rZtz=A;uFui*1*`o$fsT}?k9VO*gKUuuQ zA^a?Zu@Wub@&rWQwivlFb%$V8YV^=DGtWlbO_2`HohfPe2od2GS?Z_|hBld}G{`Z} zTi9HkXu+n+Q8|ZaxqS6vfdWlTA;f%IBLm-BWejnfP}kw#h!gZkO4E~G;2XH`MNB&bd+`C4MhNw)5 z(;x&7pN8p6ZI>^@Ig6(js%a#85pv;||>B+&j&X zJ5e?K$`bf))PPAU?FPB)uvr&YV!dPRS$+*JE!B-D)@H_}zKo=D_2X(}TTEy}{qx(7 zWfQ@RNgIqCBQ;L}xXR($hzAyS1+JN88UaiZfcl2+CPL$%kpP4QfR_)`s`MnHDQ4;> zyXZ|xs&ITl?@V&Z1>r4gE)q_29Yv>wVDWdRNIFwW?HV^d@(N}~Nlh&)tsa(HJ!8%` zv^IFJ7+NaBY-SjJ4x8H2%W&ViI$X6}3&W+Ecg?=G(AjD8n|Bv*ttg(-%}@Ba5gq5n z=qJr3)pSO9BgsAMB?s_I!1f$`!}vPp`_}pnF56QTHRQ~nJ?9T5#rWhRSokh@qH8$6 zo%G-XYZZM6Zw8z6-;rM`p7d~q%%_!cI_X6 zN^FK`wy|Pf!%RkUKp*|*_qffmXN509X?W%8OV|%g86P&geSe=Utg9I_$@BS$Jeh^5 zBE>re+_7G(iUp|w&TwJj@Pq7O&UG`Bk+=PQmdf+Y2KYty181k2BfLCu4Ip_AlUS`G z=h_I>pF;!PUuyj>gAF_7uxBPa+>vmHgm(D#IVJ+UvE8yu|@+E|U|e zZTfiL$uqF9agHb8eR(|518cR9jv7bu&52^Pd zO6waDjyan?!rkl!eps8YHj8ttgg8xkUu%skx*VybYAKoY->^~$Hv1j*EUKnvx02u# z$&w^eX(Bf2jahXtraICxOwPko#^fwi9Oz?Fcj3L#an7#gel9Ig#-(f;c0<2cz~q1J zfJUu-556UlDjYq*UoJlE(!@v+?%6Sn~xS?$)2k4wPgY~xnuR`L1OLM7&o&qpONN(!! 
z$>;E^O)d?3qSIgM;+I6e(Rk#t1HG(7`8MO70~V z4=p2sutThWH)~MgzO%XLd-me9Q2i}gn+;uii5#I}NqZBDEQOYDP1j2+&aJGo1-iQo zb_DdtY{~mwn)Gm#?+m%k6pwuxd0rajy8T^tpDosT!Lj}s=pY(jowstoYEg&}t6eu( zpA#lb8;nG0`83N<#qRVJfPNO*AYKMcU9*?Yv!@Gjv#)L$;zC!47au)BZK=~do44yr z)~K9$bLF$}*XOiJZShN`WABaTN|mEPDV$FUx0tisbSECT+e z;oVeM{sw06f)3aAEOl9s36(L_-~@!>K9*%yuKBblkqY%Pho>l96cj3wE4UbmEIgP$-G!D}wT|hVhn28oB8nFEa`JR*KA}FIL540=tYqm6 z5|I_qwamKqOJ=F0i!^4l$&n$#A8N~wap z%>cjE9oCkJ@+4BVNy%-8$VxJpzWrj^>;h_NX;Bfh3G@i8?=NX~B;bc^tD=eqEeG|V zWa_dqZ+<0_jH%iWu#%`6<1Z#k!cvNvZCAezvFvTK5h4usZNIxYUUl0I;=$?X?7Xjm zin)PxEj2`0xbP9Ge-!k}36k*V`#E}|^=^6D1FUhn2B(XD>h&=KggXDm@Sc#LElT+h zi;^Z<6m1Q3>k2l4Z5Gd`oY0iL)^|}5Q!j^Vva6M}{SbT%Ey@B{ISWH(Y3v7NeblMg zNm^YULa>%?I63IC#_sNU`-O3|^;_z9D8l`#DOC$zN5*Sa4M`fwftun5l49s=?Dl22 z&t4@Spjd)yoSF4rK%=mlHz`Ij;g&TZa!#wrV*|Wemet20cQ?$weR@e~V6fsWoA<4m6C89|8QRmF?-jb+ zmtnWFzhgu*>@#1`rDOuSD$@!-f2}Y!wCPfG=WRfIF-c{k_39#*DDh+f{hiK$jwDDw zrQ&|1um*A;^{X||y)yk%M-u^2z#oMa-%JD_jhwv>&To{$_jmZU)@FXf;MCQeJ8Ak8O=uG zOI(0GFK2@j$CK{OY!)x{#d4EgIuH-lkPm8bbQ%3|WFCgVZkwK$$p8Hezj@pS8NzXh z1x5+N1IVP4$0nE=U^9Ij%I-AddtnXJH)C<3fIpZWMMgh&khNoN9fj}*4Odm_*ICrk4; z{gF*l!-*HQsKM1Oafmxl$q~LNYl@S@<2yfLm!GeVIt%SB6-n2gOav{WKQ4T}2G=`i zULKqbC#KDbvR~H+E$PKY&6Iy;{coK9!_ifNqdOIxfG7{3Tsjb}h|IdxE=h^tR*;R} z+WCgn%BXXxfwD))co9?a3h4UV-#nh3su=qko*JSglB-?M;r;hO09c>*BYFyY{=rfS zuw{nDg=5tndzCV1MOn1nW4HIbyG9of6^|+@FkG_wGV!PHuiBh3Q9u21VC0@8K`p20(Q3 z1@`wEa0uf)a@l@lCNRKAbSJXHHQh6<3C~kA7D@{sAx>H*?$w9~EQ5US;;oaN$rzdL=23+bAn zfRt2sd@w+uuPvN_IO3WBz=s3%L@6?PS{El~e9FzO=WK6`}$l!NE|n zu$tx@o`XRH>Cbs{W=zM1tg63k7Hq6jb;T)1y#YW;gNq;G1vI@@pqZbLAHRnZ`Fi5O zYI-ik(j7D)D;dSDz0{8d$BM9OZx8$V0sNanH&~XVn~Vcx8?!H_!|U9WSUO-Q!~xmxn! 
zFwma2+=uG1a@QJ)&uh3yAa~z%jS|Wg=~XCQ4A`$EvWDMe86%&x#f3-!;a2i752hCOr=q_z^C8P()mS1pFVBVWj{`r;PCVFYNYv)COr> zqAj1by<*+#Jy&NLvqDv0T-LI@@g{D+&WAuhsCa+Sg5T{%L}TQ=sA63c-?b;rE$?xk zpuUb*756AH_w13Q<*UJZP=~01zyn6?z;tAEB1G%Bva*i_uRHy@ZqV1 zuGCbv#mx!Ex(e|<&@QJ;Ociq7636q-4&Jdp4G8AKE2OZB%fAGyBS(KJE^c$feJXt& z9^)o@I=4$tG(DL;mt8T={JaF5B|NqMR#Q&LsoPzS%|vj=CZKErF_VKc}#B%t*Kx3(BE`u-D?O?~OD?`WizV!~svb03j=MU6}m_ZK@AjMX6H)NA!!7hpL zFXJR9JG08?L1iEU&=L+&Wq@h50ev;;w`s2b9qoTiFWA8xtm?p1y99yPe{(wv!}UoZ zG8AIPcq~fRr3S=ExbA8ti4~OuYvp5ZW2BqB9EZ^Et&d{#-)+APg87Cq8fpgY&WG!= zuG@7f3UWVlYz(gF@I*)SYTL)I*7L?hJ_AQ&kGbvkHR|Gl{8ygWc?;4BI(XI0tuiu; zpw9T;rdWzcqV_kjlF8F5n3&^`#4=Mk1?JIC zkZf18W6%NV=$xOJo^D}5@5K!2Z@31>vtg&kSr(TeIK-Q~p`teLY}}dU4QmqRHG~3j zzC}zmII`#fW#u7pAGmX8_7f^3||^bPJP))1u|Y%#Yld`r=?0KA*pfOr6?j= z9u%2QCP@J#NYX9KM}=$ctl=);h>VgAvR8l;t_e>f5L3)Z-DE<#!HHiMtLIXe57)o> zOn_8W>_y-{fhCiZJf)VE_SPW%3m&piH{-Q_!AxcQF;9=VU&1qJ`XpD%&XXz2ngy*G zW<<#?ZWHlxIjWN?15Xl{Ofmq0)f0Zo5-~VF*^9Uh7oJhHV&oz=tZ+66YUb$b4cIhs zcM2nSpu|l({IH_*n2P;#-l*c5|_!%kru+%U8~YWYJ94OpatO@i&RVr zkO#r*vc14rDJwe*u|(d*9Mqv54K+(!nfX+nwDXmksI7@VbWB%QZj1vbqUCENu3?MSD-E zI-ShUC5b(fS55`ZRC7bB&qf4V;w2?5@>^~b!mBq;a;uexp6w)vW+C(x==}=-r6*4& z#!hS_RW!>geX`xflC5~G6pw2M2@YI`gU)F17I>E4x&dN#gU2b zeja<4QG=6}*v>_@3Qy4(x@;#=b=8nmSClXv3ii%#CK zrEy#(h#~E4#w@T6P;=w0oR;QgblHt_UXUZg9|YLPgl}JeJSt~X6dx<82$cUI!7?$O0-RPWn?nsuTtzkUolG0;hM0CePcau@GjbtTN@ zce!q~urvFC!hzQX#ZeGe8jnL1HJBhj(a%4kCRi(onj9-)rMxU0%z(aRR5?lctj8gD z*|{sLAx7$&UjsePNwxfLV^)D4l+al2&nh!be-YxuuVk*-8|SFZF4>HL`b-51*BDRF zcz<73a$inlGU9-Xg?X1jveycfg-ja!cUJO3x{wDpcbEDX-Q6vmdNRm4EUUpL1@hRc zy=vdVMTe>KF8gX{6KiMV`!;_v6cN>P?z36tmSk2>c#B)}no6tn!SdE43oKh z=67!IjDc1?V{|`@{Rdp)DW8jpTWj9yoJG1DbGp9{ntCJ2ukwN=UUbjD;BjN{n2BPn zh(IOi9F*_VcGfQYp5Uj!3HLdwEV;Kr7@R`B0W~U}P!@wsMyKi7&Xj2LV5Q9opZbdT zdJ25C?&r>gW=~akGg~^u=9T~n3|Ccp9I@@@=LNg{+~M3XtYP|z#yG4DJoDiTj+;~5 z9EhCe$m(0%F6Itpo%e}VMCBH3SBp0!$)o0#*7fr<`Ap>pvQ2NJEw`Nki=iYcUP|}0 zYPfA3oZhBvKPoN`n6!0ivxsNm)#+h?&!}4VM60hceRZVzDRUyOr4X>$4h05d&SdJ~ 
z3JfG2BvH#J9)&U>0fzuW{CekwW5bd~B^PtL&%k=&u7!uvf|S`n=`439^w~`wk~Vw*|_%sxR`Ld$!?8 zr2v|Q4pACnTi7Og*~fj^o{kUu*=*?fb(O@tyZ0t=moLDn)wp=Wn>HrDi$Yy(~!3;ElDaGI|(Q z{7z+P^acrMhkpG%-CF>zr)P(OB)12#1O4@~AwMp=O(X?_zlb!eujx6Iu^5E$LU9wQh5eEgDKE7rpKJ4IFw-Ik^Yb}{u&{PJG=qmJ~~O8 zm}v$R>-_Ba^Tvu`5bUTh1u%EaPsOkwBk%*v@esk>e<$Mqnu*}?LTA+so<9PVv6JGf zPg}M!a0)fJ^kt}X+^<$!erQwA`qC(xs4!|KSKz|mUMTxdPXK=_yZXJh$*nu~+!_Jo zFs|F5`hV7s98yG!KPVz~6=1#}EM5bf`>_il=m`161Br%Bf$7H3#YS+z2`|$50l(Ky zVSd7xW%`_C(;wG^no)3+?SZFB5d>ja~SFrFl7Lxi67lASI+$E zx`&d#;}8Le4BL~_Fp>A^L?LRyb+BtKC}YnLkS;q_Bp01YBcvJ>O*Y2>isbU z$U0CB^hT@L*8VfUMv;%~6vU-hMozsj`j?RVXB2z4aTsS7&EAXu0(JlTLgGClYN2OU z@3H*3knmG%^%yyt5}{{Tz=ltf@+7^p zr@a9^1S=0!)aU;p5B(`gn65m8ojV9)@}GYD^Q--LQ2xyv{dZ9Q|34@=PDYG>u!yPP z&qc|fAdb}NDf6<$T$3)>Knv6Ah0>W&R2w=qDad8wDxsET|EcQiy6HE|yqLX4<^lY9b5wS1reHlOyXHs8Io=v8nUF zQp}dXLrufGVT>%+D_$Gf9PPkm9_$yQJNBH!h0U?X~ zU7;BObLH32|Lkn~E5rhn3oaFC+mZ>AuDpo<6d|`^QaPQq;W9a$cU_|Y(=c!YG7ceS z*2f7lGda$GmYV+_V3aJl6cgCs2f*RSd_-{h_rv`CSF4!-8+-ExfFGN;761KM9Y+AN zzTkOpmilqXe6SkXzrWJ?=|FHP(N=+CkO2Rp?*IFK=clzmwoQ(_2!1SC{0l$&$7~f` z0*LiUrgjD?Y=AY5@!vi>P>Tfe^F9rs|Ty+cl7#qCxAD-R# z&i^xDU@=AQ4g>B%@D8%X7E-_@vA`N=)H{pocYtY2t$QXeRr$cgY+I8FB#vy)#O3xT zU|_LsDO(F12%5g62q}ONE(bRA%YkX@ZsEF}z_jHnW{fMz-v^E=SN*sIBs%gh<8l=% zFtE5Y-bMjOg7iF}V$X*RjqiYIYuDqLV&DM4ES)r5t$}@@JosTLur4PeBYH`$*|ORO zmyL`}cLjf(aJQRe_vayhd3v(;&Z(Fsn5sWo0d~(Y3VOMb34NMWuz$D1FPzQZ}7ZINVYgV9c9zrew8`*$q zlh-M-2h~c1ZeYd>WQY3%gTV_-X;Lkl(C76LRyb?`UPXkQgi$P#0DgNHlpnSR8KXJ=-HCmU|YqZD}c!&#HkvzowDaxL&Ed&Mmh+juqp gj1Z + PROCESSING_BUCKET= + DOCKER_IMAGE_URL=us-docker.pkg.dev/${PROJECT_ID}/dataprocessing/dp:v0.0.1 +``` + + +2. Create a Cloud Storage bucket to store raw data + +``` + gcloud storage buckets create gs://${PROCESSING_BUCKET} --project ${PROJECT_ID} +``` + + +3. Download the raw data csv file from above and store into the bucket created in the previous step. 
+ The kaggle cli can be installed using the following [instructions](https://github.com/Kaggle/kaggle-api#installation) + To use the cli you must create an API token (Kaggle > User Profile > API > Create New Token); the downloaded file should be stored in ~/.kaggle/kaggle.json. + Alternatively, it can be [downloaded](https://www.kaggle.com/datasets/atharvjairath/flipkart-ecommerce-dataset) from the kaggle website + +``` + kaggle datasets download --unzip atharvjairath/flipkart-ecommerce-dataset + gcloud storage cp flipkart_com-ecommerce_sample.csv \ + gs://${PROCESSING_BUCKET}/flipkart_raw_dataset/flipkart_com-ecommerce_sample.csv +``` + +4. Provide respective GCS bucket access rights to GKE Kubernetes Service Accounts. + Ray head with access to read the raw source data in the storage bucket + Ray worker(s) with the access to write data to the storage bucket. + +``` + gcloud projects add-iam-policy-binding ${PROJECT_ID} \ + --member "serviceAccount:wi-ml-team-ray-head@${PROJECT_ID}.iam.gserviceaccount.com" \ + --role roles/storage.objectViewer + + gcloud projects add-iam-policy-binding ${PROJECT_ID} \ + --member "serviceAccount:wi-ml-team-ray-worker@${PROJECT_ID}.iam.gserviceaccount.com" \ + --role roles/storage.objectAdmin +``` + +5. Create Artifact Registry repository for your docker image +``` + gcloud artifacts repositories create dataprocessing \ + --repository-format=docker \ + --location=us \ + --project=${PROJECT_ID} \ + --async +``` + +6. Build container image using Cloud Build and push the image to Artifact Registry +``` + gcloud builds submit . \ + --tag ${DOCKER_IMAGE_URL} +``` + +7. Update respective variables in the Job submission manifest to reflect your configuration. + a. Image is the docker image that was built in the previous step + b. Processing bucket is the location of the GCS bucket where the source data and results will be stored + c.
Ray Cluster Host - if used in this example, it should not need to be changed, but if your Ray cluster service is named differently or in a different namespace, update accordingly. + +``` +sed -i "s|#IMAGE|${DOCKER_IMAGE_URL}|" job.yaml +sed -i "s|#PROCESSING_BUCKET|${PROCESSING_BUCKET}|" job.yaml +``` + +8. Create the Job in the “ml-team” namespace using kubectl command + +``` +kubectl apply -f job.yaml -n ml-team +``` + +9. Monitor the execution in Ray Dashboard + a. Jobs -> Running Job ID + i) See the Tasks/actors overview for Running jobs + ii) See the Task Table for a detailed view of task and assigned node(s) + b. Cluster -> Node List + i) See the Ray actors running on the worker process + +10. Once the Job is completed, both the prepared dataset as a CSV and the images are stored in Google Cloud Storage. +``` + gcloud storage ls \ + gs://${PROCESSING_BUCKET}/flipkart_preprocessed_dataset/flipkart.csv + + gcloud storage ls \ + gs://${PROCESSING_BUCKET}/flipkart_images +``` diff --git a/ml-platform/examples/ray-dataprocessing/job.yaml b/ml-platform/examples/ray-dataprocessing/job.yaml new file mode 100644 index 000000000..89f14b570 --- /dev/null +++ b/ml-platform/examples/ray-dataprocessing/job.yaml @@ -0,0 +1,23 @@ +apiVersion: batch/v1 +kind: Job +metadata: + name: job + namespace: ml-team +spec: + template: + metadata: + labels: + app: job + spec: + containers: + - name: job + image: us-west2-docker.pkg.dev/cloud-sa-ml/data-processing-repo/image:latest + env: + - name: "PROCESSING_BUCKET" + value: ai-infra-ml-data-processing + - name: "RAY_CLUSTER_HOST" + value: "ray-cluster-kuberay-head-svc.ml-team:10001" + restartPolicy: Never + serviceAccountName: ray-worker +######################Ray code sample################################# + diff --git a/ml-platform/examples/ray-dataprocessing/src/Dockerfile b/ml-platform/examples/ray-dataprocessing/src/Dockerfile new file mode 100644 index 000000000..98aed63de --- /dev/null +++
b/ml-platform/examples/ray-dataprocessing/src/Dockerfile @@ -0,0 +1,14 @@ +FROM python:3.10-slim-bullseye as build-stage + +ENV PATH=/venv/bin:${PATH} +ENV PYTHONDONTWRITEBYTECODE=1 +ENV PYTHONUNBUFFERED=1 + +COPY requirements.txt /venv/requirements.txt +RUN pip install --no-cache-dir -r /venv/requirements.txt + +COPY preprocessing.py /app/preprocessing.py + +WORKDIR /app + +CMD python preprocessing.py diff --git a/ml-platform/examples/ray-dataprocessing/src/preprocessing.py b/ml-platform/examples/ray-dataprocessing/src/preprocessing.py new file mode 100644 index 000000000..7aef363d0 --- /dev/null +++ b/ml-platform/examples/ray-dataprocessing/src/preprocessing.py @@ -0,0 +1,158 @@ +import os +import ray +import pandas as pd +from typing import List +import urllib.request, urllib.error +import time +from google.cloud import storage +import spacy +import jsonpickle +import re + +IMAGE_BUCKET = os.environ['PROCESSING_BUCKET'] +RAY_CLUSTER_HOST = os.environ['RAY_CLUSTER_HOST'] +GCS_IMAGE_FOLDER = 'flipkart_images' + +@ray.remote(num_cpus=1) +def get_clean_df(df): + + def extract_url(image_list: str) -> List[str]: + image_list = image_list.replace('[', '') + image_list = image_list.replace(']', '') + image_list = image_list.replace('"', '') + image_urls = image_list.split(',') + return image_urls + + def download_image(image_url, image_file_name, destination_blob_name): + storage_client = storage.Client() + image_found_flag = False + try: + urllib.request.urlretrieve(image_url, image_file_name) + bucket = storage_client.bucket(IMAGE_BUCKET) + blob = bucket.blob(destination_blob_name) + blob.upload_from_filename(image_file_name) + print( + f"File {image_file_name} uploaded to {destination_blob_name}." 
+ ) + image_found_flag = True + except urllib.error.HTTPError: + print("HTTPError exception") + except urllib.error.URLError: + print("URLError exception") + except: + print("Unknown exception") + return image_found_flag + + def prep_product_desc(df): + # Cleaning the description text + spacy.cli.download("en_core_web_sm") + model = spacy.load("en_core_web_sm") + + def parse_nlp_description(description) -> str: + if not pd.isna(description): + doc = model(description.lower()) + lemmas = [] + for token in doc: + if token.lemma_ not in lemmas and not token.is_stop and token.is_alpha: + lemmas.append(token.lemma_) + return ' '.join(lemmas) + + df['description'] = df['description'].apply(parse_nlp_description) + return df + + # Extract product attributes as key-value pair + def parse_attributes(specification: str): + spec_match_one = re.compile("(.*?)\\[(.*)\\](.*)") + spec_match_two = re.compile("(.*?)=>\"(.*?)\"(.*?)=>\"(.*?)\"(.*)") + if pd.isna(specification): + return None + m = spec_match_one.match(specification) + out = {} + if m is not None and m.group(2) is not None: + phrase = '' + for c in m.group(2): + if c == '}': + m2 = spec_match_two.match(phrase) + if m2 and m2.group(2) is not None and m2.group(4) is not None: + out[m2.group(2)] = m2.group(4) + phrase = '' + else: + phrase += c + json_string = jsonpickle.encode(out) + return json_string + + def get_product_image(df): + products_with_no_image_count = 0 + products_with_no_image = [] + gcs_image_url = [] + image_found_flag = False + for id, image_list in zip(df['uniq_id'], df['image']): + + if pd.isnull(image_list): # No image url + # print("WARNING: No image url: product ", id) + products_with_no_image_count += 1 + products_with_no_image.append(id) + gcs_image_url.append(None) + continue + image_urls = extract_url(image_list) + for index in range(len(image_urls)): + image_url = image_urls[index] + image_file_name = '{}_{}.jpg'.format(id, index) + destination_blob_name = GCS_IMAGE_FOLDER + '/' + 
image_file_name + image_found_flag = download_image(image_url, image_file_name, destination_blob_name) + if image_found_flag: + gcs_image_url.append('gs://' + IMAGE_BUCKET + '/' + destination_blob_name) + break + if not image_found_flag: + # print("WARNING: No image: product ", id) + products_with_no_image_count += 1 + products_with_no_image.append(id) + gcs_image_url.append(None) + + # appending gcs image uri into dataframe + gcs_image_loc = pd.DataFrame(gcs_image_url, index=df.index) + gcs_image_loc.columns = ["image_uri"] + df_with_gcs_image_uri = pd.concat([df, gcs_image_loc], axis=1) + return df_with_gcs_image_uri + + df_with_gcs_image_uri = get_product_image(df) + df_with_desc = prep_product_desc(df_with_gcs_image_uri) + df_with_desc['attributes'] = df_with_desc['product_specifications'].apply(parse_attributes) + + return df_with_desc + + +def split_dataframe(df, chunk_size=199): + chunks = list() + num_chunks = len(df) // chunk_size + 1 + for i in range(num_chunks): + chunks.append(df[i * chunk_size:(i + 1) * chunk_size]) + return chunks + + +# This function invokes ray task +def run_remote(): + df = pd.read_csv('gs://'+IMAGE_BUCKET+'/flipkart_raw_dataset/flipkart_com-ecommerce_sample.csv') + df = df[['uniq_id','product_name','description','brand','image','product_specifications']] + runtime_env = {"pip": ["google-cloud-storage==2.16.0", "spacy==3.7.4", "jsonpickle==3.0.3"]} + ray.init("ray://"+RAY_CLUSTER_HOST, runtime_env=runtime_env) + print("STARTED") + start_time = time.time() + res = split_dataframe(df) + results = ray.get([get_clean_df.remote(res[i]) for i in range(len(res))]) + print("FINISHED IN ") + duration = time.time() - start_time + print(duration) + ray.shutdown() + result_df = pd.concat(results, axis=0, ignore_index=True) + result_df.to_csv('gs://'+IMAGE_BUCKET+'/flipkart_preprocessed_dataset/flipkart.csv', index=False) + return result_df + + +def main(): + clean_df = run_remote() + + +if __name__ == "__main__": + """ This is executed when 
run from the command line """ + main() diff --git a/ml-platform/examples/ray-dataprocessing/src/requirements.txt b/ml-platform/examples/ray-dataprocessing/src/requirements.txt new file mode 100644 index 000000000..f2abde391 --- /dev/null +++ b/ml-platform/examples/ray-dataprocessing/src/requirements.txt @@ -0,0 +1,8 @@ +ray==2.7.1 +ray[client]==2.7.1 +spacy==3.7.4 +google-cloud-storage==2.16.0 +pandas==2.2.1 +gcsfs==2024.3.1 +fsspec==2024.3.1 +jsonpickle==3.0.3 From 495155475cb37b4b7b7ba8f55008ac30f18075e2 Mon Sep 17 00:00:00 2001 From: arueth Date: Fri, 29 Mar 2024 19:06:14 +0000 Subject: [PATCH 34/39] Restructured folder --- .../ml-platform}/README.md | 2 + .../ml-platform}/docs/images/configsync.png | Bin .../images/ray-dataprocessing-workflow.png | Bin .../examples/platform/sandbox/README.md | 334 ++++++++++++++++++ .../examples/platform/sandbox}/backend.tf | 0 .../examples/platform/sandbox}/main.tf | 0 .../platform/sandbox}/mlp.auto.tfvars | 0 .../examples/platform/sandbox}/outputs.tf | 0 .../sandbox}/scripts/create_cluster_yamls.sh | 0 .../sandbox}/scripts/create_git_cred.sh | 0 .../sandbox}/scripts/create_namespace.sh | 0 .../scripts/install_kuberay_operator.sh | 0 .../sandbox}/scripts/install_ray_cluster.sh | 0 .../sandbox}/scripts/manage_ray_ns.sh | 0 .../acm-template/manifests/apps/.gitkeep | 0 .../acm-template/manifests/clusters/.gitkeep | 0 .../templates/_cluster_template/cluster.yaml | 0 .../_cluster_template/config-selector.yaml | 0 .../kuberay/kustomization.yaml | 0 .../kuberay/rayclusters.yaml | 0 .../_cluster_template/kuberay/rayjobs.yaml | 0 .../kuberay/rayservices.yaml | 0 .../_cluster_template/kuberay/rbac.yaml | 0 .../_cluster_template/kuberay/values.yaml | 0 .../_cluster_template/kustomization.yaml | 0 .../templates/_cluster_template/selector.yaml | 0 .../_cluster_template/team/kustomization.yaml | 0 .../_cluster_template/team/namespace.yaml | 0 .../team/network-policy.yaml | 0 .../_cluster_template/team/rbac.yaml | 0 
.../_cluster_template/team/reposync.yaml | 0 .../app/fluentd_config.yaml | 0 .../app/kustomization.yaml | 0 .../app/serviceaccount_ray_head.yaml | 0 .../app/serviceaccount_ray_worker.yaml | 0 .../_namespace_template/app/values.yaml | 0 .../examples/platform/sandbox}/variables.tf | 0 .../examples/platform/sandbox}/versions.tf | 0 .../ray/dataprocessing}/CONVERSION.md | 0 .../use-case/ray/dataprocessing}/README.md | 0 .../use-case/ray/dataprocessing}/job.yaml | 0 .../ray/dataprocessing}/src/Dockerfile | 0 .../ray/dataprocessing}/src/preprocessing.py | 0 .../ray/dataprocessing}/src/requirements.txt | 0 .../terraform/features}/initialize/backend.tf | 0 .../features}/initialize/backend.tf.bucket | 0 .../initialize/initialize.auto.tfvars | 0 .../terraform/features}/initialize/main.tf | 0 .../features/initialize/state/default.tfstate | 21 ++ .../initialize/state/default.tfstate.backup | 197 +++++++++++ .../features}/initialize/variables.tf | 0 .../features}/initialize/versions.tf | 0 .../terraform/modules/cloud-nat/README.md | 0 .../terraform/modules/cloud-nat/main.tf | 0 .../terraform/modules/cloud-nat/outputs.tf | 0 .../terraform/modules/cloud-nat/variables.tf | 0 .../terraform/modules/cloud-nat/versions.tf | 0 .../terraform/modules/cluster/gke.tf | 0 .../terraform/modules/cluster/outputs.tf | 0 .../terraform/modules/cluster/variables.tf | 0 .../terraform/modules/cluster/versions.tf | 0 .../terraform/modules/network/README.md | 0 .../terraform/modules/network/outputs.tf | 0 .../terraform/modules/network/variables.tf | 0 .../terraform/modules/network/versions.tf | 0 .../terraform/modules/network/vpc.tf | 0 .../terraform/modules/node-pools/nodepools.tf | 0 .../terraform/modules/node-pools/variables.tf | 0 .../terraform/modules/node-pools/versions.tf | 0 .../modules/vm-reservations/outputs.tf | 0 .../modules/vm-reservations/reservations.tf | 0 .../modules/vm-reservations/variables.tf | 0 .../modules/vm-reservations/versions.tf | 0 ml-platform/terraform/README.md | 112 
------ 74 files changed, 554 insertions(+), 112 deletions(-) rename {ml-platform => best-practices/ml-platform}/README.md (99%) rename {ml-platform => best-practices/ml-platform}/docs/images/configsync.png (100%) rename {ml-platform => best-practices/ml-platform}/docs/images/ray-dataprocessing-workflow.png (100%) create mode 100644 best-practices/ml-platform/examples/platform/sandbox/README.md rename {ml-platform/terraform => best-practices/ml-platform/examples/platform/sandbox}/backend.tf (100%) rename {ml-platform/terraform => best-practices/ml-platform/examples/platform/sandbox}/main.tf (100%) rename {ml-platform/terraform => best-practices/ml-platform/examples/platform/sandbox}/mlp.auto.tfvars (100%) rename {ml-platform/terraform => best-practices/ml-platform/examples/platform/sandbox}/outputs.tf (100%) rename {ml-platform/terraform => best-practices/ml-platform/examples/platform/sandbox}/scripts/create_cluster_yamls.sh (100%) rename {ml-platform/terraform => best-practices/ml-platform/examples/platform/sandbox}/scripts/create_git_cred.sh (100%) rename {ml-platform/terraform => best-practices/ml-platform/examples/platform/sandbox}/scripts/create_namespace.sh (100%) rename {ml-platform/terraform => best-practices/ml-platform/examples/platform/sandbox}/scripts/install_kuberay_operator.sh (100%) rename {ml-platform/terraform => best-practices/ml-platform/examples/platform/sandbox}/scripts/install_ray_cluster.sh (100%) rename {ml-platform/terraform => best-practices/ml-platform/examples/platform/sandbox}/scripts/manage_ray_ns.sh (100%) rename {ml-platform/terraform => best-practices/ml-platform/examples/platform/sandbox}/templates/acm-template/manifests/apps/.gitkeep (100%) rename {ml-platform/terraform => best-practices/ml-platform/examples/platform/sandbox}/templates/acm-template/manifests/clusters/.gitkeep (100%) rename {ml-platform/terraform => best-practices/ml-platform/examples/platform/sandbox}/templates/acm-template/templates/_cluster_template/cluster.yaml 
(100%) rename {ml-platform/terraform => best-practices/ml-platform/examples/platform/sandbox}/templates/acm-template/templates/_cluster_template/config-selector.yaml (100%) rename {ml-platform/terraform => best-practices/ml-platform/examples/platform/sandbox}/templates/acm-template/templates/_cluster_template/kuberay/kustomization.yaml (100%) rename {ml-platform/terraform => best-practices/ml-platform/examples/platform/sandbox}/templates/acm-template/templates/_cluster_template/kuberay/rayclusters.yaml (100%) rename {ml-platform/terraform => best-practices/ml-platform/examples/platform/sandbox}/templates/acm-template/templates/_cluster_template/kuberay/rayjobs.yaml (100%) rename {ml-platform/terraform => best-practices/ml-platform/examples/platform/sandbox}/templates/acm-template/templates/_cluster_template/kuberay/rayservices.yaml (100%) rename {ml-platform/terraform => best-practices/ml-platform/examples/platform/sandbox}/templates/acm-template/templates/_cluster_template/kuberay/rbac.yaml (100%) rename {ml-platform/terraform => best-practices/ml-platform/examples/platform/sandbox}/templates/acm-template/templates/_cluster_template/kuberay/values.yaml (100%) rename {ml-platform/terraform => best-practices/ml-platform/examples/platform/sandbox}/templates/acm-template/templates/_cluster_template/kustomization.yaml (100%) rename {ml-platform/terraform => best-practices/ml-platform/examples/platform/sandbox}/templates/acm-template/templates/_cluster_template/selector.yaml (100%) rename {ml-platform/terraform => best-practices/ml-platform/examples/platform/sandbox}/templates/acm-template/templates/_cluster_template/team/kustomization.yaml (100%) rename {ml-platform/terraform => best-practices/ml-platform/examples/platform/sandbox}/templates/acm-template/templates/_cluster_template/team/namespace.yaml (100%) rename {ml-platform/terraform => best-practices/ml-platform/examples/platform/sandbox}/templates/acm-template/templates/_cluster_template/team/network-policy.yaml 
(100%) rename {ml-platform/terraform => best-practices/ml-platform/examples/platform/sandbox}/templates/acm-template/templates/_cluster_template/team/rbac.yaml (100%) rename {ml-platform/terraform => best-practices/ml-platform/examples/platform/sandbox}/templates/acm-template/templates/_cluster_template/team/reposync.yaml (100%) rename {ml-platform/terraform => best-practices/ml-platform/examples/platform/sandbox}/templates/acm-template/templates/_namespace_template/app/fluentd_config.yaml (100%) rename {ml-platform/terraform => best-practices/ml-platform/examples/platform/sandbox}/templates/acm-template/templates/_namespace_template/app/kustomization.yaml (100%) rename {ml-platform/terraform => best-practices/ml-platform/examples/platform/sandbox}/templates/acm-template/templates/_namespace_template/app/serviceaccount_ray_head.yaml (100%) rename {ml-platform/terraform => best-practices/ml-platform/examples/platform/sandbox}/templates/acm-template/templates/_namespace_template/app/serviceaccount_ray_worker.yaml (100%) rename {ml-platform/terraform => best-practices/ml-platform/examples/platform/sandbox}/templates/acm-template/templates/_namespace_template/app/values.yaml (100%) rename {ml-platform/terraform => best-practices/ml-platform/examples/platform/sandbox}/variables.tf (100%) rename {ml-platform/terraform => best-practices/ml-platform/examples/platform/sandbox}/versions.tf (100%) rename {ml-platform/examples/ray-dataprocessing => best-practices/ml-platform/examples/use-case/ray/dataprocessing}/CONVERSION.md (100%) rename {ml-platform/examples/ray-dataprocessing => best-practices/ml-platform/examples/use-case/ray/dataprocessing}/README.md (100%) rename {ml-platform/examples/ray-dataprocessing => best-practices/ml-platform/examples/use-case/ray/dataprocessing}/job.yaml (100%) rename {ml-platform/examples/ray-dataprocessing => best-practices/ml-platform/examples/use-case/ray/dataprocessing}/src/Dockerfile (100%) rename {ml-platform/examples/ray-dataprocessing 
=> best-practices/ml-platform/examples/use-case/ray/dataprocessing}/src/preprocessing.py (100%) rename {ml-platform/examples/ray-dataprocessing => best-practices/ml-platform/examples/use-case/ray/dataprocessing}/src/requirements.txt (100%) rename {ml-platform/terraform => best-practices/ml-platform/terraform/features}/initialize/backend.tf (100%) rename {ml-platform/terraform => best-practices/ml-platform/terraform/features}/initialize/backend.tf.bucket (100%) rename {ml-platform/terraform => best-practices/ml-platform/terraform/features}/initialize/initialize.auto.tfvars (100%) rename {ml-platform/terraform => best-practices/ml-platform/terraform/features}/initialize/main.tf (100%) create mode 100644 best-practices/ml-platform/terraform/features/initialize/state/default.tfstate create mode 100644 best-practices/ml-platform/terraform/features/initialize/state/default.tfstate.backup rename {ml-platform/terraform => best-practices/ml-platform/terraform/features}/initialize/variables.tf (100%) rename {ml-platform/terraform => best-practices/ml-platform/terraform/features}/initialize/versions.tf (100%) rename {ml-platform => best-practices/ml-platform}/terraform/modules/cloud-nat/README.md (100%) rename {ml-platform => best-practices/ml-platform}/terraform/modules/cloud-nat/main.tf (100%) rename {ml-platform => best-practices/ml-platform}/terraform/modules/cloud-nat/outputs.tf (100%) rename {ml-platform => best-practices/ml-platform}/terraform/modules/cloud-nat/variables.tf (100%) rename {ml-platform => best-practices/ml-platform}/terraform/modules/cloud-nat/versions.tf (100%) rename {ml-platform => best-practices/ml-platform}/terraform/modules/cluster/gke.tf (100%) rename {ml-platform => best-practices/ml-platform}/terraform/modules/cluster/outputs.tf (100%) rename {ml-platform => best-practices/ml-platform}/terraform/modules/cluster/variables.tf (100%) rename {ml-platform => best-practices/ml-platform}/terraform/modules/cluster/versions.tf (100%) rename {ml-platform 
=> best-practices/ml-platform}/terraform/modules/network/README.md (100%) rename {ml-platform => best-practices/ml-platform}/terraform/modules/network/outputs.tf (100%) rename {ml-platform => best-practices/ml-platform}/terraform/modules/network/variables.tf (100%) rename {ml-platform => best-practices/ml-platform}/terraform/modules/network/versions.tf (100%) rename {ml-platform => best-practices/ml-platform}/terraform/modules/network/vpc.tf (100%) rename {ml-platform => best-practices/ml-platform}/terraform/modules/node-pools/nodepools.tf (100%) rename {ml-platform => best-practices/ml-platform}/terraform/modules/node-pools/variables.tf (100%) rename {ml-platform => best-practices/ml-platform}/terraform/modules/node-pools/versions.tf (100%) rename {ml-platform => best-practices/ml-platform}/terraform/modules/vm-reservations/outputs.tf (100%) rename {ml-platform => best-practices/ml-platform}/terraform/modules/vm-reservations/reservations.tf (100%) rename {ml-platform => best-practices/ml-platform}/terraform/modules/vm-reservations/variables.tf (100%) rename {ml-platform => best-practices/ml-platform}/terraform/modules/vm-reservations/versions.tf (100%) delete mode 100644 ml-platform/terraform/README.md diff --git a/ml-platform/README.md b/best-practices/ml-platform/README.md similarity index 99% rename from ml-platform/README.md rename to best-practices/ml-platform/README.md index d6a1058fb..b5bc89c9c 100644 --- a/ml-platform/README.md +++ b/best-practices/ml-platform/README.md @@ -53,6 +53,8 @@ This reference architecture demonstrates how to build a GKE platform that facili ## Deploy a single environment reference architecture +README + This is the quick-start deployment guide. It can be used to set up an environment to familiarize yourself with the architecture and get an understanding of the concepts. 
### Requirements diff --git a/ml-platform/docs/images/configsync.png b/best-practices/ml-platform/docs/images/configsync.png similarity index 100% rename from ml-platform/docs/images/configsync.png rename to best-practices/ml-platform/docs/images/configsync.png diff --git a/ml-platform/docs/images/ray-dataprocessing-workflow.png b/best-practices/ml-platform/docs/images/ray-dataprocessing-workflow.png similarity index 100% rename from ml-platform/docs/images/ray-dataprocessing-workflow.png rename to best-practices/ml-platform/docs/images/ray-dataprocessing-workflow.png diff --git a/best-practices/ml-platform/examples/platform/sandbox/README.md b/best-practices/ml-platform/examples/platform/sandbox/README.md new file mode 100644 index 000000000..e4cdb3e6e --- /dev/null +++ b/best-practices/ml-platform/examples/platform/sandbox/README.md @@ -0,0 +1,334 @@ +# Machine learning platform (MLP) on GKE reference architecture: Sandbox + +This quick-start deployment guide can be used to set up an environment to familiarize yourself with the architecture and get an understanding of the concepts. + +### Requirements + +In this guide you can choose to bring your project (BYOP) or have Terraform create a new project for you. The requirements are different based on the option that you choose.
+ +#### Bring your own project (BYOP) + +- Project ID of a new Google Cloud Project, preferably with no APIs enabled +- `roles/owner` IAM permissions on the project +- GitHub Personal Access Token, steps to create the token are provided below + +#### Terraform managed project + +- Billing account ID +- Organization or folder ID +- `roles/billing.user` IAM permissions on the billing account specified +- `roles/resourcemanager.projectCreator` IAM permissions on the organization or folder specified +- GitHub Personal Access Token, steps to create the token are provided below + +### Pull the source code + +- Clone the repository and change directory to the guide directory + + ``` + git clone https://github.com/GoogleCloudPlatform/ai-on-gke + cd ai-on-gke/ml-platform + ``` + +- Set environment variables + + ``` + export MLP_BASE_DIR=$(pwd) && \ + echo "export MLP_BASE_DIR=${MLP_BASE_DIR}" >> ${HOME}/.bashrc + ``` + +### GitHub Configuration + +- Create a [Personal Access Token][personal-access-token] in [GitHub][github]: + + Note: It is recommended to use a [machine user account][machine-user-account] for this but you can use a personal user account just to try this reference architecture. + + **Fine-grained personal access token** + + - Go to https://github.com/settings/tokens and login using your credentials + - Click "Generate new token" >> "Generate new token (Beta)". + - Enter a Token name. + - Select the expiration. + - Select the Resource owner. + - Select All repositories + - Set the following Permissions: + - Repository permissions + - Administration: Read and write + - Content: Read and write + - Click "Generate token" + + **Personal access tokens (classic)** + + - Go to https://github.com/settings/tokens and login using your credentials + - Click "Generate new token" >> "Generate new token (classic)". + - You will be directed to a screen to create the new token. Provide the note and expiration.
+ - Select the following two scopes: + - [x] repo - Full control of private repositories + - [x] delete_repo - Delete repositories + - Click "Generate token" + +- Store the token in a secure file. + + ``` + # Create a secure directory + mkdir -p ${HOME}/secrets/ + chmod go-rwx ${HOME}/secrets + + # Create a secure file + touch ${HOME}/secrets/mlp-github-token + chmod go-rwx ${HOME}/secrets/mlp-github-token + + # Put the token in the secure file using your preferred editor + nano ${HOME}/secrets/mlp-github-token + ``` + +- Set the GitHub environment variables in Cloud Shell + + Replace the following values: + + - `<GITHUB_ORGANIZATION>` is the GitHub organization or user namespace to use for the repositories + - `<GITHUB_USER>` is the GitHub account to use for authentication + - `<GITHUB_EMAIL>` is the email address to use for commits + + ``` + export MLP_GITHUB_ORG="<GITHUB_ORGANIZATION>" + export MLP_GITHUB_USER="<GITHUB_USER>" + export MLP_GITHUB_EMAIL="<GITHUB_EMAIL>" + ``` + +- Set the configuration variables + + ``` + sed -i "s/YOUR_GITHUB_EMAIL/${MLP_GITHUB_EMAIL}/g" ${MLP_BASE_DIR}/terraform/mlp.auto.tfvars + sed -i "s/YOUR_GITHUB_ORG/${MLP_GITHUB_ORG}/g" ${MLP_BASE_DIR}/terraform/mlp.auto.tfvars + sed -i "s/YOUR_GITHUB_USER/${MLP_GITHUB_USER}/g" ${MLP_BASE_DIR}/terraform/mlp.auto.tfvars + ``` + +### Project Configuration + +You only need to complete the section for the option that you have selected.
+ +#### Bring your own project (BYOP) + +- Set the project environment variables in Cloud Shell + + Replace the following values + + - `<PROJECT_ID>` is the ID of your existing Google Cloud project + + ``` + export MLP_PROJECT_ID="<PROJECT_ID>" + export MLP_STATE_BUCKET="${MLP_PROJECT_ID}-tf-state" + ``` + +- Set the default `gcloud` project + + ``` + gcloud config set project ${MLP_PROJECT_ID} + ``` + +- Authorize `gcloud` + + ``` + gcloud auth login --activate --no-launch-browser --quiet --update-adc + ``` + +- Create a Cloud Storage bucket to store the Terraform state + + ``` + gcloud storage buckets create gs://${MLP_STATE_BUCKET} --project ${MLP_PROJECT_ID} + ``` + +- Set the configuration variables + + ``` + sed -i "s/YOUR_STATE_BUCKET/${MLP_STATE_BUCKET}/g" ${MLP_BASE_DIR}/terraform/backend.tf + sed -i "s/YOUR_PROJECT_ID/${MLP_PROJECT_ID}/g" ${MLP_BASE_DIR}/terraform/mlp.auto.tfvars + ``` + +#### Terraform managed project + +- Set the configuration variables + + ``` + nano ${MLP_BASE_DIR}/terraform/initialize/initialize.auto.tfvars + ``` + + ``` + project = { + billing_account_id = "XXXXXX-XXXXXX-XXXXXX" + folder_id = "############" + name = "mlp" + org_id = "############" + } + ``` + + > `project.billing_account_id` the billing account ID + > + > Enter either `project.folder_id` **OR** `project.org_id` + > `project.folder_id` the folder ID + > `project.org_id` the organization ID + +- Authorize `gcloud` + + ``` + gcloud auth login --activate --no-launch-browser --quiet --update-adc + ``` + +- Create a new project + + ``` + cd ${MLP_BASE_DIR}/terraform/initialize + terraform init && \ + terraform plan -input=false -out=tfplan && \ + terraform apply -input=false tfplan && \ + rm tfplan && \ + terraform init -force-copy -migrate-state && \ + rm -rf state + ``` + +### Run Terraform + +- Create the resources + + ``` + cd ${MLP_BASE_DIR}/terraform && \ + terraform init && \ + terraform plan -input=false -var github_token="$(tr --delete '\n' < ${HOME}/secrets/mlp-github-token)"
-out=tfplan && \ + terraform apply -input=false tfplan + rm tfplan + ``` + +### Review the resources + +#### GKE clusters and ConfigSync + +- Go to Google Cloud Console, click on the navigation menu and click on Kubernetes Engine > Clusters. You should see one cluster. + +- Go to Google Cloud Console, click on the navigation menu and click on Kubernetes Engine > Config. If you haven't enabled GKE Enterprise in the project earlier, click the `LEARN AND ENABLE` button and then `ENABLE GKE ENTERPRISE`. You should see a RootSync and RepoSync object. + ![configsync](docs/images/configsync.png) + +#### Software installed via RepoSync and RootSync + +Open Cloud Shell to execute the following commands: + +- Store your GKE cluster name in an environment variable: + + `export GKE_CLUSTER=<GKE_CLUSTER_NAME>` + +- Get cluster credentials: + + ``` + gcloud container fleet memberships get-credentials ${GKE_CLUSTER} + ``` + +- Fetch KubeRay operator CRDs + + ``` + kubectl get crd | grep ray + ``` + + The output will be similar to the following: + + ``` + rayclusters.ray.io 2024-02-12T21:19:06Z + rayjobs.ray.io 2024-02-12T21:19:09Z + rayservices.ray.io 2024-02-12T21:19:12Z + ``` + +- Fetch KubeRay operator pod + + ``` + kubectl get pods + ``` + + The output will be similar to the following: + + ``` + NAME READY STATUS RESTARTS AGE + kuberay-operator-56b8d98766-2nvht 1/1 Running 0 6m26s + ``` + +- Check that the namespace `ml-team` was created: + + ``` + kubectl get ns | grep ml-team + ``` + +- Check the RepoSync object created in the `ml-team` namespace: + ``` + kubectl get reposync -n ml-team + ``` +- Check the `raycluster` in the `ml-team` namespace + + ``` + kubectl get raycluster -n ml-team + ``` + + The output will be similar to the following: + + ``` + NAME DESIRED WORKERS AVAILABLE WORKERS STATUS AGE + ray-cluster-kuberay 1 1 ready 29m + ``` + +- Check the head and worker pods of kuberay in the `ml-team` namespace + ``` + kubectl get pods -n ml-team + ``` + The output will be similar to the following: + ``` + NAME READY STATUS
RESTARTS AGE + ray-cluster-kuberay-head-sp6dg 2/2 Running 0 3m21s + ray-cluster-kuberay-worker-workergroup-rzpjw 2/2 Running 0 3m21s + ``` + +### Cleanup + +- Destroy the resources + + ``` + cd ${MLP_BASE_DIR}/terraform && \ + terraform init && \ + terraform destroy -auto-approve -var github_token="$(tr --delete '\n' < ${HOME}/secrets/mlp-github-token)" && \ + rm -rf .terraform .terraform.lock.hcl + ``` + +#### Project + +You only need to complete the section for the option that you have selected. + +##### Bring your own project (BYOP) + +- Delete the project + + ``` + gcloud projects delete ${MLP_PROJECT_ID} + ``` + +##### Terraform managed project + +- Destroy the project + + ``` + cd ${MLP_BASE_DIR}/terraform/initialize && \ + TERRAFORM_BUCKET_NAME=$(grep bucket backend.tf | awk -F"=" '{print $2}' | xargs) && \ + cp backend.tf.local backend.tf && \ + terraform init -force-copy -lock=false -migrate-state && \ + gsutil -m rm -rf gs://${TERRAFORM_BUCKET_NAME}/* && \ + terraform init && \ + terraform destroy -auto-approve && \ + rm -rf .terraform .terraform.lock.hcl + ``` + +[gitops]: https://about.gitlab.com/topics/gitops/ +[repo-sync]: https://cloud.google.com/anthos-config-management/docs/reference/rootsync-reposync-fields +[root-sync]: https://cloud.google.com/anthos-config-management/docs/reference/rootsync-reposync-fields +[config-sync]: https://cloud.google.com/anthos-config-management/docs/config-sync-overview +[cloud-deploy]: https://cloud.google.com/deploy?hl=en +[terraform]: https://www.terraform.io/ +[gke]: https://cloud.google.com/kubernetes-engine?hl=en +[git]: https://git-scm.com/ +[github]: https://github.com/ +[gcp-project]: https://cloud.google.com/resource-manager/docs/creating-managing-projects +[personal-access-token]: https://docs.github.com/en/authentication/keeping-your-account-and-data-secure/managing-your-personal-access-tokens +[machine-user-account]: https://docs.github.com/en/get-started/learning-about-github/types-of-github-accounts diff
--git a/ml-platform/terraform/backend.tf b/best-practices/ml-platform/examples/platform/sandbox/backend.tf similarity index 100% rename from ml-platform/terraform/backend.tf rename to best-practices/ml-platform/examples/platform/sandbox/backend.tf diff --git a/ml-platform/terraform/main.tf b/best-practices/ml-platform/examples/platform/sandbox/main.tf similarity index 100% rename from ml-platform/terraform/main.tf rename to best-practices/ml-platform/examples/platform/sandbox/main.tf diff --git a/ml-platform/terraform/mlp.auto.tfvars b/best-practices/ml-platform/examples/platform/sandbox/mlp.auto.tfvars similarity index 100% rename from ml-platform/terraform/mlp.auto.tfvars rename to best-practices/ml-platform/examples/platform/sandbox/mlp.auto.tfvars diff --git a/ml-platform/terraform/outputs.tf b/best-practices/ml-platform/examples/platform/sandbox/outputs.tf similarity index 100% rename from ml-platform/terraform/outputs.tf rename to best-practices/ml-platform/examples/platform/sandbox/outputs.tf diff --git a/ml-platform/terraform/scripts/create_cluster_yamls.sh b/best-practices/ml-platform/examples/platform/sandbox/scripts/create_cluster_yamls.sh similarity index 100% rename from ml-platform/terraform/scripts/create_cluster_yamls.sh rename to best-practices/ml-platform/examples/platform/sandbox/scripts/create_cluster_yamls.sh diff --git a/ml-platform/terraform/scripts/create_git_cred.sh b/best-practices/ml-platform/examples/platform/sandbox/scripts/create_git_cred.sh similarity index 100% rename from ml-platform/terraform/scripts/create_git_cred.sh rename to best-practices/ml-platform/examples/platform/sandbox/scripts/create_git_cred.sh diff --git a/ml-platform/terraform/scripts/create_namespace.sh b/best-practices/ml-platform/examples/platform/sandbox/scripts/create_namespace.sh similarity index 100% rename from ml-platform/terraform/scripts/create_namespace.sh rename to best-practices/ml-platform/examples/platform/sandbox/scripts/create_namespace.sh diff 
--git a/ml-platform/terraform/scripts/install_kuberay_operator.sh b/best-practices/ml-platform/examples/platform/sandbox/scripts/install_kuberay_operator.sh similarity index 100% rename from ml-platform/terraform/scripts/install_kuberay_operator.sh rename to best-practices/ml-platform/examples/platform/sandbox/scripts/install_kuberay_operator.sh diff --git a/ml-platform/terraform/scripts/install_ray_cluster.sh b/best-practices/ml-platform/examples/platform/sandbox/scripts/install_ray_cluster.sh similarity index 100% rename from ml-platform/terraform/scripts/install_ray_cluster.sh rename to best-practices/ml-platform/examples/platform/sandbox/scripts/install_ray_cluster.sh diff --git a/ml-platform/terraform/scripts/manage_ray_ns.sh b/best-practices/ml-platform/examples/platform/sandbox/scripts/manage_ray_ns.sh similarity index 100% rename from ml-platform/terraform/scripts/manage_ray_ns.sh rename to best-practices/ml-platform/examples/platform/sandbox/scripts/manage_ray_ns.sh diff --git a/ml-platform/terraform/templates/acm-template/manifests/apps/.gitkeep b/best-practices/ml-platform/examples/platform/sandbox/templates/acm-template/manifests/apps/.gitkeep similarity index 100% rename from ml-platform/terraform/templates/acm-template/manifests/apps/.gitkeep rename to best-practices/ml-platform/examples/platform/sandbox/templates/acm-template/manifests/apps/.gitkeep diff --git a/ml-platform/terraform/templates/acm-template/manifests/clusters/.gitkeep b/best-practices/ml-platform/examples/platform/sandbox/templates/acm-template/manifests/clusters/.gitkeep similarity index 100% rename from ml-platform/terraform/templates/acm-template/manifests/clusters/.gitkeep rename to best-practices/ml-platform/examples/platform/sandbox/templates/acm-template/manifests/clusters/.gitkeep diff --git a/ml-platform/terraform/templates/acm-template/templates/_cluster_template/cluster.yaml 
b/best-practices/ml-platform/examples/platform/sandbox/templates/acm-template/templates/_cluster_template/cluster.yaml similarity index 100% rename from ml-platform/terraform/templates/acm-template/templates/_cluster_template/cluster.yaml rename to best-practices/ml-platform/examples/platform/sandbox/templates/acm-template/templates/_cluster_template/cluster.yaml diff --git a/ml-platform/terraform/templates/acm-template/templates/_cluster_template/config-selector.yaml b/best-practices/ml-platform/examples/platform/sandbox/templates/acm-template/templates/_cluster_template/config-selector.yaml similarity index 100% rename from ml-platform/terraform/templates/acm-template/templates/_cluster_template/config-selector.yaml rename to best-practices/ml-platform/examples/platform/sandbox/templates/acm-template/templates/_cluster_template/config-selector.yaml diff --git a/ml-platform/terraform/templates/acm-template/templates/_cluster_template/kuberay/kustomization.yaml b/best-practices/ml-platform/examples/platform/sandbox/templates/acm-template/templates/_cluster_template/kuberay/kustomization.yaml similarity index 100% rename from ml-platform/terraform/templates/acm-template/templates/_cluster_template/kuberay/kustomization.yaml rename to best-practices/ml-platform/examples/platform/sandbox/templates/acm-template/templates/_cluster_template/kuberay/kustomization.yaml diff --git a/ml-platform/terraform/templates/acm-template/templates/_cluster_template/kuberay/rayclusters.yaml b/best-practices/ml-platform/examples/platform/sandbox/templates/acm-template/templates/_cluster_template/kuberay/rayclusters.yaml similarity index 100% rename from ml-platform/terraform/templates/acm-template/templates/_cluster_template/kuberay/rayclusters.yaml rename to best-practices/ml-platform/examples/platform/sandbox/templates/acm-template/templates/_cluster_template/kuberay/rayclusters.yaml diff --git 
a/ml-platform/terraform/templates/acm-template/templates/_cluster_template/kuberay/rayjobs.yaml b/best-practices/ml-platform/examples/platform/sandbox/templates/acm-template/templates/_cluster_template/kuberay/rayjobs.yaml similarity index 100% rename from ml-platform/terraform/templates/acm-template/templates/_cluster_template/kuberay/rayjobs.yaml rename to best-practices/ml-platform/examples/platform/sandbox/templates/acm-template/templates/_cluster_template/kuberay/rayjobs.yaml diff --git a/ml-platform/terraform/templates/acm-template/templates/_cluster_template/kuberay/rayservices.yaml b/best-practices/ml-platform/examples/platform/sandbox/templates/acm-template/templates/_cluster_template/kuberay/rayservices.yaml similarity index 100% rename from ml-platform/terraform/templates/acm-template/templates/_cluster_template/kuberay/rayservices.yaml rename to best-practices/ml-platform/examples/platform/sandbox/templates/acm-template/templates/_cluster_template/kuberay/rayservices.yaml diff --git a/ml-platform/terraform/templates/acm-template/templates/_cluster_template/kuberay/rbac.yaml b/best-practices/ml-platform/examples/platform/sandbox/templates/acm-template/templates/_cluster_template/kuberay/rbac.yaml similarity index 100% rename from ml-platform/terraform/templates/acm-template/templates/_cluster_template/kuberay/rbac.yaml rename to best-practices/ml-platform/examples/platform/sandbox/templates/acm-template/templates/_cluster_template/kuberay/rbac.yaml diff --git a/ml-platform/terraform/templates/acm-template/templates/_cluster_template/kuberay/values.yaml b/best-practices/ml-platform/examples/platform/sandbox/templates/acm-template/templates/_cluster_template/kuberay/values.yaml similarity index 100% rename from ml-platform/terraform/templates/acm-template/templates/_cluster_template/kuberay/values.yaml rename to best-practices/ml-platform/examples/platform/sandbox/templates/acm-template/templates/_cluster_template/kuberay/values.yaml diff --git 
a/ml-platform/terraform/templates/acm-template/templates/_cluster_template/kustomization.yaml b/best-practices/ml-platform/examples/platform/sandbox/templates/acm-template/templates/_cluster_template/kustomization.yaml similarity index 100% rename from ml-platform/terraform/templates/acm-template/templates/_cluster_template/kustomization.yaml rename to best-practices/ml-platform/examples/platform/sandbox/templates/acm-template/templates/_cluster_template/kustomization.yaml diff --git a/ml-platform/terraform/templates/acm-template/templates/_cluster_template/selector.yaml b/best-practices/ml-platform/examples/platform/sandbox/templates/acm-template/templates/_cluster_template/selector.yaml similarity index 100% rename from ml-platform/terraform/templates/acm-template/templates/_cluster_template/selector.yaml rename to best-practices/ml-platform/examples/platform/sandbox/templates/acm-template/templates/_cluster_template/selector.yaml diff --git a/ml-platform/terraform/templates/acm-template/templates/_cluster_template/team/kustomization.yaml b/best-practices/ml-platform/examples/platform/sandbox/templates/acm-template/templates/_cluster_template/team/kustomization.yaml similarity index 100% rename from ml-platform/terraform/templates/acm-template/templates/_cluster_template/team/kustomization.yaml rename to best-practices/ml-platform/examples/platform/sandbox/templates/acm-template/templates/_cluster_template/team/kustomization.yaml diff --git a/ml-platform/terraform/templates/acm-template/templates/_cluster_template/team/namespace.yaml b/best-practices/ml-platform/examples/platform/sandbox/templates/acm-template/templates/_cluster_template/team/namespace.yaml similarity index 100% rename from ml-platform/terraform/templates/acm-template/templates/_cluster_template/team/namespace.yaml rename to best-practices/ml-platform/examples/platform/sandbox/templates/acm-template/templates/_cluster_template/team/namespace.yaml diff --git 
a/ml-platform/terraform/templates/acm-template/templates/_cluster_template/team/network-policy.yaml b/best-practices/ml-platform/examples/platform/sandbox/templates/acm-template/templates/_cluster_template/team/network-policy.yaml similarity index 100% rename from ml-platform/terraform/templates/acm-template/templates/_cluster_template/team/network-policy.yaml rename to best-practices/ml-platform/examples/platform/sandbox/templates/acm-template/templates/_cluster_template/team/network-policy.yaml diff --git a/ml-platform/terraform/templates/acm-template/templates/_cluster_template/team/rbac.yaml b/best-practices/ml-platform/examples/platform/sandbox/templates/acm-template/templates/_cluster_template/team/rbac.yaml similarity index 100% rename from ml-platform/terraform/templates/acm-template/templates/_cluster_template/team/rbac.yaml rename to best-practices/ml-platform/examples/platform/sandbox/templates/acm-template/templates/_cluster_template/team/rbac.yaml diff --git a/ml-platform/terraform/templates/acm-template/templates/_cluster_template/team/reposync.yaml b/best-practices/ml-platform/examples/platform/sandbox/templates/acm-template/templates/_cluster_template/team/reposync.yaml similarity index 100% rename from ml-platform/terraform/templates/acm-template/templates/_cluster_template/team/reposync.yaml rename to best-practices/ml-platform/examples/platform/sandbox/templates/acm-template/templates/_cluster_template/team/reposync.yaml diff --git a/ml-platform/terraform/templates/acm-template/templates/_namespace_template/app/fluentd_config.yaml b/best-practices/ml-platform/examples/platform/sandbox/templates/acm-template/templates/_namespace_template/app/fluentd_config.yaml similarity index 100% rename from ml-platform/terraform/templates/acm-template/templates/_namespace_template/app/fluentd_config.yaml rename to best-practices/ml-platform/examples/platform/sandbox/templates/acm-template/templates/_namespace_template/app/fluentd_config.yaml diff --git 
a/ml-platform/terraform/templates/acm-template/templates/_namespace_template/app/kustomization.yaml b/best-practices/ml-platform/examples/platform/sandbox/templates/acm-template/templates/_namespace_template/app/kustomization.yaml similarity index 100% rename from ml-platform/terraform/templates/acm-template/templates/_namespace_template/app/kustomization.yaml rename to best-practices/ml-platform/examples/platform/sandbox/templates/acm-template/templates/_namespace_template/app/kustomization.yaml diff --git a/ml-platform/terraform/templates/acm-template/templates/_namespace_template/app/serviceaccount_ray_head.yaml b/best-practices/ml-platform/examples/platform/sandbox/templates/acm-template/templates/_namespace_template/app/serviceaccount_ray_head.yaml similarity index 100% rename from ml-platform/terraform/templates/acm-template/templates/_namespace_template/app/serviceaccount_ray_head.yaml rename to best-practices/ml-platform/examples/platform/sandbox/templates/acm-template/templates/_namespace_template/app/serviceaccount_ray_head.yaml diff --git a/ml-platform/terraform/templates/acm-template/templates/_namespace_template/app/serviceaccount_ray_worker.yaml b/best-practices/ml-platform/examples/platform/sandbox/templates/acm-template/templates/_namespace_template/app/serviceaccount_ray_worker.yaml similarity index 100% rename from ml-platform/terraform/templates/acm-template/templates/_namespace_template/app/serviceaccount_ray_worker.yaml rename to best-practices/ml-platform/examples/platform/sandbox/templates/acm-template/templates/_namespace_template/app/serviceaccount_ray_worker.yaml diff --git a/ml-platform/terraform/templates/acm-template/templates/_namespace_template/app/values.yaml b/best-practices/ml-platform/examples/platform/sandbox/templates/acm-template/templates/_namespace_template/app/values.yaml similarity index 100% rename from ml-platform/terraform/templates/acm-template/templates/_namespace_template/app/values.yaml rename to 
best-practices/ml-platform/examples/platform/sandbox/templates/acm-template/templates/_namespace_template/app/values.yaml diff --git a/ml-platform/terraform/variables.tf b/best-practices/ml-platform/examples/platform/sandbox/variables.tf similarity index 100% rename from ml-platform/terraform/variables.tf rename to best-practices/ml-platform/examples/platform/sandbox/variables.tf diff --git a/ml-platform/terraform/versions.tf b/best-practices/ml-platform/examples/platform/sandbox/versions.tf similarity index 100% rename from ml-platform/terraform/versions.tf rename to best-practices/ml-platform/examples/platform/sandbox/versions.tf diff --git a/ml-platform/examples/ray-dataprocessing/CONVERSION.md b/best-practices/ml-platform/examples/use-case/ray/dataprocessing/CONVERSION.md similarity index 100% rename from ml-platform/examples/ray-dataprocessing/CONVERSION.md rename to best-practices/ml-platform/examples/use-case/ray/dataprocessing/CONVERSION.md diff --git a/ml-platform/examples/ray-dataprocessing/README.md b/best-practices/ml-platform/examples/use-case/ray/dataprocessing/README.md similarity index 100% rename from ml-platform/examples/ray-dataprocessing/README.md rename to best-practices/ml-platform/examples/use-case/ray/dataprocessing/README.md diff --git a/ml-platform/examples/ray-dataprocessing/job.yaml b/best-practices/ml-platform/examples/use-case/ray/dataprocessing/job.yaml similarity index 100% rename from ml-platform/examples/ray-dataprocessing/job.yaml rename to best-practices/ml-platform/examples/use-case/ray/dataprocessing/job.yaml diff --git a/ml-platform/examples/ray-dataprocessing/src/Dockerfile b/best-practices/ml-platform/examples/use-case/ray/dataprocessing/src/Dockerfile similarity index 100% rename from ml-platform/examples/ray-dataprocessing/src/Dockerfile rename to best-practices/ml-platform/examples/use-case/ray/dataprocessing/src/Dockerfile diff --git a/ml-platform/examples/ray-dataprocessing/src/preprocessing.py 
b/best-practices/ml-platform/examples/use-case/ray/dataprocessing/src/preprocessing.py similarity index 100% rename from ml-platform/examples/ray-dataprocessing/src/preprocessing.py rename to best-practices/ml-platform/examples/use-case/ray/dataprocessing/src/preprocessing.py diff --git a/ml-platform/examples/ray-dataprocessing/src/requirements.txt b/best-practices/ml-platform/examples/use-case/ray/dataprocessing/src/requirements.txt similarity index 100% rename from ml-platform/examples/ray-dataprocessing/src/requirements.txt rename to best-practices/ml-platform/examples/use-case/ray/dataprocessing/src/requirements.txt diff --git a/ml-platform/terraform/initialize/backend.tf b/best-practices/ml-platform/terraform/features/initialize/backend.tf similarity index 100% rename from ml-platform/terraform/initialize/backend.tf rename to best-practices/ml-platform/terraform/features/initialize/backend.tf diff --git a/ml-platform/terraform/initialize/backend.tf.bucket b/best-practices/ml-platform/terraform/features/initialize/backend.tf.bucket similarity index 100% rename from ml-platform/terraform/initialize/backend.tf.bucket rename to best-practices/ml-platform/terraform/features/initialize/backend.tf.bucket diff --git a/ml-platform/terraform/initialize/initialize.auto.tfvars b/best-practices/ml-platform/terraform/features/initialize/initialize.auto.tfvars similarity index 100% rename from ml-platform/terraform/initialize/initialize.auto.tfvars rename to best-practices/ml-platform/terraform/features/initialize/initialize.auto.tfvars diff --git a/ml-platform/terraform/initialize/main.tf b/best-practices/ml-platform/terraform/features/initialize/main.tf similarity index 100% rename from ml-platform/terraform/initialize/main.tf rename to best-practices/ml-platform/terraform/features/initialize/main.tf diff --git a/best-practices/ml-platform/terraform/features/initialize/state/default.tfstate b/best-practices/ml-platform/terraform/features/initialize/state/default.tfstate 
new file mode 100644 index 000000000..60e4e4772 --- /dev/null +++ b/best-practices/ml-platform/terraform/features/initialize/state/default.tfstate @@ -0,0 +1,21 @@ +{ + "version": 4, + "terraform_version": "1.7.1", + "serial": 46, + "lineage": "048c221a-5b39-cc81-5abf-befd45bd76c5", + "outputs": {}, + "resources": [], + "check_results": [ + { + "object_kind": "var", + "config_addr": "var.project", + "status": "unknown", + "objects": [ + { + "object_addr": "var.project", + "status": "unknown" + } + ] + } + ] +} diff --git a/best-practices/ml-platform/terraform/features/initialize/state/default.tfstate.backup b/best-practices/ml-platform/terraform/features/initialize/state/default.tfstate.backup new file mode 100644 index 000000000..c7ebde5f7 --- /dev/null +++ b/best-practices/ml-platform/terraform/features/initialize/state/default.tfstate.backup @@ -0,0 +1,197 @@ +{ + "version": 4, + "terraform_version": "1.7.1", + "serial": 39, + "lineage": "048c221a-5b39-cc81-5abf-befd45bd76c5", + "outputs": {}, + "resources": [ + { + "mode": "managed", + "type": "google_project", + "name": "environment", + "provider": "provider[\"registry.terraform.io/hashicorp/google\"]", + "instances": [ + { + "schema_version": 1, + "attributes": { + "auto_create_network": true, + "billing_account": "01EF67-93C321-CB8BDE", + "effective_labels": {}, + "folder_id": "", + "id": "projects/mlp-dev-n5w1z3m2u4tmsjksn8q1hg", + "labels": null, + "name": "mlp-dev", + "number": "882788124079", + "org_id": "886449255148", + "project_id": "mlp-dev-n5w1z3m2u4tmsjksn8q1hg", + "skip_delete": null, + "terraform_labels": {}, + "timeouts": null + }, + "sensitive_attributes": [], + "private": "eyJlMmJmYjczMC1lY2FhLTExZTYtOGY4OC0zNDM2M2JjN2M0YzAiOnsiY3JlYXRlIjo2MDAwMDAwMDAwMDAsImRlbGV0ZSI6NjAwMDAwMDAwMDAwLCJyZWFkIjo2MDAwMDAwMDAwMDAsInVwZGF0ZSI6NjAwMDAwMDAwMDAwfSwic2NoZW1hX3ZlcnNpb24iOiIxIn0=", + "dependencies": [ + "random_string.project_id_suffix" + ] + } + ] + }, + { + "mode": "managed", + "type": 
"google_storage_bucket", + "name": "mlp", + "provider": "provider[\"registry.terraform.io/hashicorp/google\"]", + "instances": [ + { + "schema_version": 1, + "attributes": { + "autoclass": [], + "cors": [], + "custom_placement_config": [], + "default_event_based_hold": false, + "effective_labels": {}, + "enable_object_retention": false, + "encryption": [], + "force_destroy": false, + "id": "mlp-dev-n5w1z3m2u4tmsjksn8q1hg-mlp", + "labels": null, + "lifecycle_rule": [], + "location": "US-CENTRAL1", + "logging": [], + "name": "mlp-dev-n5w1z3m2u4tmsjksn8q1hg-mlp", + "project": "mlp-dev-n5w1z3m2u4tmsjksn8q1hg", + "public_access_prevention": "inherited", + "requester_pays": false, + "retention_policy": [], + "rpo": null, + "self_link": "https://www.googleapis.com/storage/v1/b/mlp-dev-n5w1z3m2u4tmsjksn8q1hg-mlp", + "storage_class": "STANDARD", + "terraform_labels": {}, + "timeouts": null, + "uniform_bucket_level_access": true, + "url": "gs://mlp-dev-n5w1z3m2u4tmsjksn8q1hg-mlp", + "versioning": [ + { + "enabled": true + } + ], + "website": [] + }, + "sensitive_attributes": [], + "private": "eyJlMmJmYjczMC1lY2FhLTExZTYtOGY4OC0zNDM2M2JjN2M0YzAiOnsiY3JlYXRlIjo2MDAwMDAwMDAwMDAsInJlYWQiOjI0MDAwMDAwMDAwMCwidXBkYXRlIjoyNDAwMDAwMDAwMDB9LCJzY2hlbWFfdmVyc2lvbiI6IjEifQ==", + "dependencies": [ + "google_project.environment", + "random_string.project_id_suffix" + ] + } + ] + }, + { + "mode": "managed", + "type": "null_resource", + "name": "write_environment_name", + "provider": "provider[\"registry.terraform.io/hashicorp/null\"]", + "instances": [ + { + "schema_version": 0, + "attributes": { + "id": "7387891238569079315", + "triggers": { + "md5": "dev", + "tfvars_file": "../mlp.auto.tfvars" + } + }, + "sensitive_attributes": [] + } + ] + }, + { + "mode": "managed", + "type": "null_resource", + "name": "write_project_id", + "provider": "provider[\"registry.terraform.io/hashicorp/null\"]", + "instances": [ + { + "schema_version": 0, + "attributes": { + "id": "1485193428210134212", + 
"triggers": { + "md5": "mlp-dev-n5w1z3m2u4tmsjksn8q1hg", + "tfvars_file": "../mlp.auto.tfvars" + } + }, + "sensitive_attributes": [], + "dependencies": [ + "google_project.environment", + "random_string.project_id_suffix" + ] + } + ] + }, + { + "mode": "managed", + "type": "null_resource", + "name": "write_storage_bucket", + "provider": "provider[\"registry.terraform.io/hashicorp/null\"]", + "instances": [ + { + "schema_version": 0, + "attributes": { + "id": "7155781797202565811", + "triggers": { + "backend_file": "../backend.tf", + "md5": "mlp-dev-n5w1z3m2u4tmsjksn8q1hg-mlp" + } + }, + "sensitive_attributes": [], + "dependencies": [ + "google_project.environment", + "google_storage_bucket.mlp", + "random_string.project_id_suffix" + ] + } + ] + }, + { + "mode": "managed", + "type": "random_string", + "name": "project_id_suffix", + "provider": "provider[\"registry.terraform.io/hashicorp/random\"]", + "instances": [ + { + "schema_version": 2, + "attributes": { + "id": "n5w1z3m2u4tmsjksn8q1hg", + "keepers": null, + "length": 22, + "lower": true, + "min_lower": 0, + "min_numeric": 0, + "min_special": 0, + "min_upper": 0, + "number": true, + "numeric": true, + "override_special": null, + "result": "n5w1z3m2u4tmsjksn8q1hg", + "special": false, + "upper": false + }, + "sensitive_attributes": [] + } + ] + } + ], + "check_results": [ + { + "object_kind": "var", + "config_addr": "var.project", + "status": "pass", + "objects": [ + { + "object_addr": "var.project", + "status": "pass" + } + ] + } + ] +} diff --git a/ml-platform/terraform/initialize/variables.tf b/best-practices/ml-platform/terraform/features/initialize/variables.tf similarity index 100% rename from ml-platform/terraform/initialize/variables.tf rename to best-practices/ml-platform/terraform/features/initialize/variables.tf diff --git a/ml-platform/terraform/initialize/versions.tf b/best-practices/ml-platform/terraform/features/initialize/versions.tf similarity index 100% rename from 
ml-platform/terraform/initialize/versions.tf rename to best-practices/ml-platform/terraform/features/initialize/versions.tf diff --git a/ml-platform/terraform/modules/cloud-nat/README.md b/best-practices/ml-platform/terraform/modules/cloud-nat/README.md similarity index 100% rename from ml-platform/terraform/modules/cloud-nat/README.md rename to best-practices/ml-platform/terraform/modules/cloud-nat/README.md diff --git a/ml-platform/terraform/modules/cloud-nat/main.tf b/best-practices/ml-platform/terraform/modules/cloud-nat/main.tf similarity index 100% rename from ml-platform/terraform/modules/cloud-nat/main.tf rename to best-practices/ml-platform/terraform/modules/cloud-nat/main.tf diff --git a/ml-platform/terraform/modules/cloud-nat/outputs.tf b/best-practices/ml-platform/terraform/modules/cloud-nat/outputs.tf similarity index 100% rename from ml-platform/terraform/modules/cloud-nat/outputs.tf rename to best-practices/ml-platform/terraform/modules/cloud-nat/outputs.tf diff --git a/ml-platform/terraform/modules/cloud-nat/variables.tf b/best-practices/ml-platform/terraform/modules/cloud-nat/variables.tf similarity index 100% rename from ml-platform/terraform/modules/cloud-nat/variables.tf rename to best-practices/ml-platform/terraform/modules/cloud-nat/variables.tf diff --git a/ml-platform/terraform/modules/cloud-nat/versions.tf b/best-practices/ml-platform/terraform/modules/cloud-nat/versions.tf similarity index 100% rename from ml-platform/terraform/modules/cloud-nat/versions.tf rename to best-practices/ml-platform/terraform/modules/cloud-nat/versions.tf diff --git a/ml-platform/terraform/modules/cluster/gke.tf b/best-practices/ml-platform/terraform/modules/cluster/gke.tf similarity index 100% rename from ml-platform/terraform/modules/cluster/gke.tf rename to best-practices/ml-platform/terraform/modules/cluster/gke.tf diff --git a/ml-platform/terraform/modules/cluster/outputs.tf b/best-practices/ml-platform/terraform/modules/cluster/outputs.tf similarity index 
100% rename from ml-platform/terraform/modules/cluster/outputs.tf rename to best-practices/ml-platform/terraform/modules/cluster/outputs.tf diff --git a/ml-platform/terraform/modules/cluster/variables.tf b/best-practices/ml-platform/terraform/modules/cluster/variables.tf similarity index 100% rename from ml-platform/terraform/modules/cluster/variables.tf rename to best-practices/ml-platform/terraform/modules/cluster/variables.tf diff --git a/ml-platform/terraform/modules/cluster/versions.tf b/best-practices/ml-platform/terraform/modules/cluster/versions.tf similarity index 100% rename from ml-platform/terraform/modules/cluster/versions.tf rename to best-practices/ml-platform/terraform/modules/cluster/versions.tf diff --git a/ml-platform/terraform/modules/network/README.md b/best-practices/ml-platform/terraform/modules/network/README.md similarity index 100% rename from ml-platform/terraform/modules/network/README.md rename to best-practices/ml-platform/terraform/modules/network/README.md diff --git a/ml-platform/terraform/modules/network/outputs.tf b/best-practices/ml-platform/terraform/modules/network/outputs.tf similarity index 100% rename from ml-platform/terraform/modules/network/outputs.tf rename to best-practices/ml-platform/terraform/modules/network/outputs.tf diff --git a/ml-platform/terraform/modules/network/variables.tf b/best-practices/ml-platform/terraform/modules/network/variables.tf similarity index 100% rename from ml-platform/terraform/modules/network/variables.tf rename to best-practices/ml-platform/terraform/modules/network/variables.tf diff --git a/ml-platform/terraform/modules/network/versions.tf b/best-practices/ml-platform/terraform/modules/network/versions.tf similarity index 100% rename from ml-platform/terraform/modules/network/versions.tf rename to best-practices/ml-platform/terraform/modules/network/versions.tf diff --git a/ml-platform/terraform/modules/network/vpc.tf b/best-practices/ml-platform/terraform/modules/network/vpc.tf 
similarity index 100% rename from ml-platform/terraform/modules/network/vpc.tf rename to best-practices/ml-platform/terraform/modules/network/vpc.tf diff --git a/ml-platform/terraform/modules/node-pools/nodepools.tf b/best-practices/ml-platform/terraform/modules/node-pools/nodepools.tf similarity index 100% rename from ml-platform/terraform/modules/node-pools/nodepools.tf rename to best-practices/ml-platform/terraform/modules/node-pools/nodepools.tf diff --git a/ml-platform/terraform/modules/node-pools/variables.tf b/best-practices/ml-platform/terraform/modules/node-pools/variables.tf similarity index 100% rename from ml-platform/terraform/modules/node-pools/variables.tf rename to best-practices/ml-platform/terraform/modules/node-pools/variables.tf diff --git a/ml-platform/terraform/modules/node-pools/versions.tf b/best-practices/ml-platform/terraform/modules/node-pools/versions.tf similarity index 100% rename from ml-platform/terraform/modules/node-pools/versions.tf rename to best-practices/ml-platform/terraform/modules/node-pools/versions.tf diff --git a/ml-platform/terraform/modules/vm-reservations/outputs.tf b/best-practices/ml-platform/terraform/modules/vm-reservations/outputs.tf similarity index 100% rename from ml-platform/terraform/modules/vm-reservations/outputs.tf rename to best-practices/ml-platform/terraform/modules/vm-reservations/outputs.tf diff --git a/ml-platform/terraform/modules/vm-reservations/reservations.tf b/best-practices/ml-platform/terraform/modules/vm-reservations/reservations.tf similarity index 100% rename from ml-platform/terraform/modules/vm-reservations/reservations.tf rename to best-practices/ml-platform/terraform/modules/vm-reservations/reservations.tf diff --git a/ml-platform/terraform/modules/vm-reservations/variables.tf b/best-practices/ml-platform/terraform/modules/vm-reservations/variables.tf similarity index 100% rename from ml-platform/terraform/modules/vm-reservations/variables.tf rename to 
best-practices/ml-platform/terraform/modules/vm-reservations/variables.tf diff --git a/ml-platform/terraform/modules/vm-reservations/versions.tf b/best-practices/ml-platform/terraform/modules/vm-reservations/versions.tf similarity index 100% rename from ml-platform/terraform/modules/vm-reservations/versions.tf rename to best-practices/ml-platform/terraform/modules/vm-reservations/versions.tf diff --git a/ml-platform/terraform/README.md b/ml-platform/terraform/README.md deleted file mode 100644 index 1c765d049..000000000 --- a/ml-platform/terraform/README.md +++ /dev/null @@ -1,112 +0,0 @@ -## Requirements - -| Name | Version | -|------|---------| -| [github](#requirement\_github) | 6.0.1 | -| [google](#requirement\_google) | 5.19.0 | -| [google-beta](#requirement\_google-beta) | 5.19.0 | -| [null](#requirement\_null) | 3.2.2 | - -## Providers - -| Name | Version | -|------|---------| -| [github](#provider\_github) | 6.0.1 | -| [google](#provider\_google) | 5.19.0 | -| [google-beta](#provider\_google-beta) | 5.19.0 | -| [null](#provider\_null) | 3.2.2 | - -## Modules - -| Name | Source | Version | -|------|--------|---------| -| [cloud-nat](#module\_cloud-nat) | ./modules/cloud-nat | n/a | -| [create-vpc](#module\_create-vpc) | ./modules/network | n/a | -| [gcp-project](#module\_gcp-project) | ./modules/projects | n/a | -| [gke](#module\_gke) | ./modules/cluster | n/a | -| [node\_pool-ondemand](#module\_node\_pool-ondemand) | ./modules/node-pools | n/a | -| [node\_pool-reserved](#module\_node\_pool-reserved) | ./modules/node-pools | n/a | -| [node\_pool-spot](#module\_node\_pool-spot) | ./modules/node-pools | n/a | -| [reservation](#module\_reservation) | ./modules/vm-reservations | n/a | - -## Resources - -| Name | Type | -|------|------| -| [github_branch.branch](https://registry.terraform.io/providers/integrations/github/6.0.1/docs/resources/branch) | resource | -| 
[github_branch_default.default_branch](https://registry.terraform.io/providers/integrations/github/6.0.1/docs/resources/branch_default) | resource | -| [github_branch_protection_v3.branch_protection](https://registry.terraform.io/providers/integrations/github/6.0.1/docs/resources/branch_protection_v3) | resource | -| [github_repository.acm_repo](https://registry.terraform.io/providers/integrations/github/6.0.1/docs/resources/repository) | resource | -| [google-beta_google_gke_hub_feature.configmanagement_acm_feature](https://registry.terraform.io/providers/hashicorp/google-beta/5.19.0/docs/resources/google_gke_hub_feature) | resource | -| [google-beta_google_gke_hub_feature_membership.feature_member](https://registry.terraform.io/providers/hashicorp/google-beta/5.19.0/docs/resources/google_gke_hub_feature_membership) | resource | -| [google-beta_google_gke_hub_membership.membership](https://registry.terraform.io/providers/hashicorp/google-beta/5.19.0/docs/resources/google_gke_hub_membership) | resource | -| [google_project_service.project_services-an](https://registry.terraform.io/providers/hashicorp/google/5.19.0/docs/resources/project_service) | resource | -| [google_project_service.project_services-anc](https://registry.terraform.io/providers/hashicorp/google/5.19.0/docs/resources/project_service) | resource | -| [google_project_service.project_services-com](https://registry.terraform.io/providers/hashicorp/google/5.19.0/docs/resources/project_service) | resource | -| [google_project_service.project_services-con](https://registry.terraform.io/providers/hashicorp/google/5.19.0/docs/resources/project_service) | resource | -| [google_project_service.project_services-cr](https://registry.terraform.io/providers/hashicorp/google/5.19.0/docs/resources/project_service) | resource | -| [google_project_service.project_services-gate](https://registry.terraform.io/providers/hashicorp/google/5.19.0/docs/resources/project_service) | resource | -| 
[google_project_service.project_services-gkecon](https://registry.terraform.io/providers/hashicorp/google/5.19.0/docs/resources/project_service) | resource | -| [google_project_service.project_services-gkeh](https://registry.terraform.io/providers/hashicorp/google/5.19.0/docs/resources/project_service) | resource | -| [google_project_service.project_services-iam](https://registry.terraform.io/providers/hashicorp/google/5.19.0/docs/resources/project_service) | resource | -| [null_resource.create_git_cred_cms](https://registry.terraform.io/providers/hashicorp/null/3.2.2/docs/resources/resource) | resource | -| [null_resource.create_git_cred_ns](https://registry.terraform.io/providers/hashicorp/null/3.2.2/docs/resources/resource) | resource | -| [null_resource.create_namespace](https://registry.terraform.io/providers/hashicorp/null/3.2.2/docs/resources/resource) | resource | -| [null_resource.install_kuberay_operator](https://registry.terraform.io/providers/hashicorp/null/3.2.2/docs/resources/resource) | resource | -| [null_resource.install_ray_cluster](https://registry.terraform.io/providers/hashicorp/null/3.2.2/docs/resources/resource) | resource | -| [null_resource.manage_ray_ns](https://registry.terraform.io/providers/hashicorp/null/3.2.2/docs/resources/resource) | resource | - -## Inputs - -| Name | Description | Type | Default | Required | -|------|-------------|------|---------|:--------:| -| [billing\_account](#input\_billing\_account) | GCP billing account | `string` | `null` | no | -| [cluster\_name](#input\_cluster\_name) | Name of the GKE cluster | `string` | `"gke-ml"` | no | -| [configsync\_repo\_name](#input\_configsync\_repo\_name) | Name of the GitHub repo that will be synced to the cluster with Config sync. | `string` | `"config-sync-repo"` | no | -| [create\_namespace](#input\_create\_namespace) | Setup a namespace to demo. 
| `number` | `1` | no | -| [create\_projects](#input\_create\_projects) | Flag to create GCP projects | `number` | `0` | no | -| [env](#input\_env) | List of environments | `set(string)` |
[
"dev"
]
| no | -| [folder\_id](#input\_folder\_id) | Folder Id where the GCP projects will be created | `string` | `null` | no | -| [github\_email](#input\_github\_email) | GitHub user email. | `string` | n/a | yes | -| [github\_org](#input\_github\_org) | GitHub org. | `string` | n/a | yes | -| [github\_token](#input\_github\_token) | GitHub token. It is a token with write permissions as it will create a repo in the GitHub org. | `string` | n/a | yes | -| [github\_user](#input\_github\_user) | GitHub user name. | `string` | n/a | yes | -| [install\_kuberay](#input\_install\_kuberay) | Flag to install kuberay operator. | `number` | `1` | no | -| [install\_ray\_in\_ns](#input\_install\_ray\_in\_ns) | Flag to install ray cluster in the namespace created with the demo. | `number` | `1` | no | -| [namespace](#input\_namespace) | Name of the namespace to demo. | `string` | `"ml-team"` | no | -| [network\_name](#input\_network\_name) | VPC network where GKE cluster will be created | `string` | `"ml-vpc"` | no | -| [ondemand\_taints](#input\_ondemand\_taints) | Taints to be applied to the on-demand node pool. |
list(object({
key = string
value = any
effect = string
}))
|
[
{
"effect": "NO_SCHEDULE",
"key": "ondemand",
"value": true
}
]
| no | -| [org\_id](#input\_org\_id) | The GCP orig id | `string` | `null` | no | -| [project\_id](#input\_project\_id) | The GCP project where the resources will be created. It is a map with environments as keys and project\_ids s values | `map` | n/a | yes | -| [project\_name](#input\_project\_name) | GCP project name | `string` | `null` | no | -| [reserved\_taints](#input\_reserved\_taints) | Taints to be applied to the reserved node pool. |
list(object({
key = string
value = any
effect = string
}))
|
[
{
"effect": "NO_SCHEDULE",
"key": "reserved",
"value": true
}
]
| no | -| [routing\_mode](#input\_routing\_mode) | VPC routing mode. | `string` | `"GLOBAL"` | no | -| [secret\_for\_rootsync](#input\_secret\_for\_rootsync) | Create git-cred in config-management-system namespace. | `number` | `1` | no | -| [spot\_taints](#input\_spot\_taints) | Taints to be applied to the spot node pool. |
list(object({
key = string
value = any
effect = string
}))
|
[
{
"effect": "NO_SCHEDULE",
"key": "spot",
"value": true
}
]
| no | -| [subnet\_01\_description](#input\_subnet\_01\_description) | Description of the first subnet. | `string` | `"subnet 01"` | no | -| [subnet\_01\_ip](#input\_subnet\_01\_ip) | CIDR of the first subnet. | `string` | `"10.40.0.0/22"` | no | -| [subnet\_01\_name](#input\_subnet\_01\_name) | Name of the first subnet in the VPC network. | `string` | `"ml-vpc-subnet-01"` | no | -| [subnet\_01\_region](#input\_subnet\_01\_region) | Region of the first subnet. | `string` | `"us-central1"` | no | -| [subnet\_02\_description](#input\_subnet\_02\_description) | Description of the second subnet. | `string` | `"subnet 02"` | no | -| [subnet\_02\_ip](#input\_subnet\_02\_ip) | CIDR of the second subnet. | `string` | `"10.12.0.0/22"` | no | -| [subnet\_02\_name](#input\_subnet\_02\_name) | Name of the second subnet in the VPC network. | `string` | `"gke-vpc-subnet-02"` | no | -| [subnet\_02\_region](#input\_subnet\_02\_region) | Region of the second subnet. | `string` | `"us-west2"` | no | - -## Outputs - -| Name | Description | -|------|-------------| -| [project\_ids](#output\_project\_ids) | n/a | - -[gitops]: https://about.gitlab.com/topics/gitops/ -[repo-sync]: https://cloud.google.com/anthos-config-management/docs/reference/rootsync-reposync-fields -[root-sync]: https://cloud.google.com/anthos-config-management/docs/reference/rootsync-reposync-fields -[config-sync]: https://cloud.google.com/anthos-config-management/docs/config-sync-overview -[cloud-deploy]: https://cloud.google.com/deploy?hl=en -[terraform]: https://www.terraform.io/ -[gke]: https://cloud.google.com/kubernetes-engine?hl=en -[git]: https://git-scm.com/ -[github]: https://github.com/ -[gcp-project]: https://cloud.google.com/resource-manager/docs/creating-managing-projects -[personal-access-token]: https://docs.github.com/en/authentication/keeping-your-account-and-data-secure/managing-your-personal-access-tokens -[machine-user-account]: 
https://docs.github.com/en/get-started/learning-about-github/types-of-github-accounts \ No newline at end of file From 3f0df9a1173b76a5ffc99ff6a2e589db0b0f19ed Mon Sep 17 00:00:00 2001 From: arueth Date: Fri, 29 Mar 2024 19:38:11 +0000 Subject: [PATCH 35/39] Cleanup from folder restructuring --- best-practices/ml-platform/README.md | 324 +----------------- .../examples/platform/sandbox/README.md | 32 +- .../examples/platform/sandbox/main.tf | 14 +- .../use-case/ray/dataprocessing/README.md | 32 +- 4 files changed, 48 insertions(+), 354 deletions(-) diff --git a/best-practices/ml-platform/README.md b/best-practices/ml-platform/README.md index b5bc89c9c..9261ba049 100644 --- a/best-practices/ml-platform/README.md +++ b/best-practices/ml-platform/README.md @@ -51,329 +51,13 @@ This reference architecture demonstrates how to build a GKE platform that facili - [Google Configuration Management repo-sync][repo-sync] - [GitHub][github] -## Deploy a single environment reference architecture +## Deploy the platform -README +[Sandbox Reference Architecture Guide](examples/platform/sandbox/README.md): Set up an environment to familiarize yourself with the architecture and get an understanding of the concepts. -This is the quick-start deployment guide. It can be used to set up an environment to familiarize yourself with the architecture and get an understanding of the concepts. +## Use cases -### Requirements - -In this guide you can choose to bring your project (BYOP) or have Terraform create a new project for you. The requirements are difference based on the option that you choose. 
- -#### Bring your own project (BYOP) - -- Project ID of a new Google Cloud Project, preferably with no APIs enabled -- `roles/owner` IAM permissions on the project -- GitHub Personal Access Token, steps to create the token are provided below - -#### Terraform managed project - -- Billing account ID -- Organization or folder ID -- `roles/billing.user` IAM permissions on the billing account specified -- `roles/resourcemanager.projectCreator` IAM permissions on the organization or folder specified -- GitHub Personal Access Token, steps to create the token are provided below - -### Pull the source code - -- Clone the repository and change directory to the guide directory - - ``` - git clone https://github.com/GoogleCloudPlatform/ai-on-gke - cd ai-on-gke/ml-platform - ``` - -- Set environment variables - - ``` - export MLP_BASE_DIR=$(pwd) && \ - echo "export MLP_BASE_DIR=${MLP_BASE_DIR}" >> ${HOME}/.bashrc - ``` - -### GitHub Configuration - -- Create a [Personal Access Token][personal-access-token] in [GitHub][github]: - - Note: It is recommended to use a [machine user account][machine-user-account] for this but you can use a personal user account just to try this reference architecture. - - **Fine-grained personal access token** - - - Go to https://github.com/settings/tokens and login using your credentials - - Click "Generate new token" >> "Generate new token (Beta)". - - Enter a Token name. - - Select the expiration. - - Select the Resource owner. - - Select All repositories - - Set the following Permissions: - - Repository permissions - - Administration: Read and write - - Content: Read and write - - Click "Generate token" - - **Personal access tokens (classic)** - - - Go to https://github.com/settings/tokens and login using your credentials - - Click "Generate new token" >> "Generate new token (classic)". - - You will be directed to a screen to created the new token. Provide the note and expiration. 
- - Choose the following two access: - - [x] repo - Full control of private repositories - - [x] delete_repo - Delete repositories - - Click "Generate token" - -- Store the token in a secure file. - - ``` - # Create a secure directory - mkdir -p ${HOME}/secrets/ - chmod go-rwx ${HOME}/secrets - - # Create a secure file - touch ${HOME}/secrets/mlp-github-token - chmod go-rwx ${HOME}/secrets/mlp-github-token - - # Put the token in the secure file using your preferred editor - nano ${HOME}/secrets/mlp-github-token - ``` - -- Set the GitHub environment variables in Cloud Shell - - Replace the following values: - - - `` is the GitHub organization or user namespace to use for the repositories - - `` is the GitHub account to use for authentication - - `` is the email address to use for commit - - ``` - export MLP_GITHUB_ORG="" - export MLP_GITHUB_USER="" - export MLP_GITHUB_EMAIL="" - ``` - -- Set the configuration variables - - ``` - sed -i "s/YOUR_GITHUB_EMAIL/${MLP_GITHUB_EMAIL}/g" ${MLP_BASE_DIR}/terraform/mlp.auto.tfvars - sed -i "s/YOUR_GITHUB_ORG/${MLP_GITHUB_ORG}/g" ${MLP_BASE_DIR}/terraform/mlp.auto.tfvars - sed -i "s/YOUR_GITHUB_USER/${MLP_GITHUB_USER}/g" ${MLP_BASE_DIR}/terraform/mlp.auto.tfvars - ``` - -### Project Configuration - -You only need to complete the section for the option that you have selected. 
- -#### Bring your own project (BYOP) - -- Set the project environment variables in Cloud Shell - - Replace the following values - - - `` is the ID of your existing Google Cloud project - - ``` - export MLP_PROJECT_ID="" - export MLP_STATE_BUCKET="${MLP_PROJECT_ID}-tf-state" - ``` - -- Set the default `gcloud` project - - ``` - gcloud config set project ${MLP_PROJECT_ID} - ``` - -- Authorize `gcloud` - - ``` - gcloud auth login --activate --no-launch-browser --quiet --update-adc - ``` - -- Create a Cloud Storage bucket to store the Terraform state - - ``` - gcloud storage buckets create gs://${MLP_STATE_BUCKET} --project ${MLP_PROJECT_ID} - ``` - -- Set the configuration variables - - ``` - sed -i "s/YOUR_STATE_BUCKET/${MLP_STATE_BUCKET}/g" ${MLP_BASE_DIR}/terraform/backend.tf - sed -i "s/YOUR_PROJECT_ID/${MLP_PROJECT_ID}/g" ${MLP_BASE_DIR}/terraform/mlp.auto.tfvars - ``` - -#### Terraform managed project - -- Set the configuration variables - - ``` - nano ${MLP_BASE_DIR}/terraform/initialize/initialize.auto.tfvars - ``` - - ``` - project = { - billing_account_id = "XXXXXX-XXXXXX-XXXXXX" - folder_id = "############" - name = "mlp" - org_id = "############" - } - ``` - - > `project.billing_account_id` the billing account ID - > - > Enter either `project.folder_id` **OR** `project.org_id` - > `project.folder_id` the folder ID - > `project.org_id` the organization ID - -- Authorize `gcloud` - - ``` - gcloud auth login --activate --no-launch-browser --quiet --update-adc - ``` - -- Create a new project - - ``` - cd ${MLP_BASE_DIR}/terraform/initialize - terraform init && \ - terraform plan -input=false -out=tfplan && \ - terraform apply -input=false tfplan && \ - rm tfplan && \ - terraform init -force-copy -migrate-state && \ - rm -rf state - ``` - -### Run Terraform - -- Create the resources - - ``` - cd ${MLP_BASE_DIR}/terraform && \ - terraform init && \ - terraform plan -input=false -var github_token="$(tr --delete '\n' < ${HOME}/secrets/mlp-github-token)" 
-out=tfplan && \ - terraform apply -input=false tfplan - rm tfplan - ``` - -### Review the resources - -#### GKE clusters and ConfigSync - -- Go to Google Cloud Console, click on the navigation menu and click on Kubernetes Engine > Clusters. You should see one cluster. - -- Go to Google Cloud Console, click on the navigation menu and click on Kubernetes Engine > Config. If you haven't enabled GKE Enterprise in the project earlier, Click `LEARN AND ENABLE` button and then `ENABLE GKE ENTERPRISE`. You should see a RootSync and RepoSync object. - ![configsync](docs/images/configsync.png) - -#### Software installed via RepoSync and RootSync - -Open Cloud Shell to execute the following commands: - -- Store your GKE cluster name in env variable: - - `export GKE_CLUSTER=` - -- Get cluster credentials: - - ``` - gcloud container fleet memberships get-credentials ${GKE_CLUSTER} - ``` - -- Fetch KubeRay operator CRDs - - ``` - kubectl get crd | grep ray - ``` - - The output will be similar to the following: - - ``` - rayclusters.ray.io 2024-02-12T21:19:06Z - rayjobs.ray.io 2024-02-12T21:19:09Z - rayservices.ray.io 2024-02-12T21:19:12Z - ``` - -- Fetch KubeRay operator pod - - ``` - kubectl get pods - ``` - - The output will be similar to the following: - - ``` - NAME READY STATUS RESTARTS AGE - kuberay-operator-56b8d98766-2nvht 1/1 Running 0 6m26s - ``` - -- Check the namespace `ml-team` created: - - ``` - kubectl get ns | grep ml-team - ``` - -- Check the RepoSync object created `ml-team` namespace: - ``` - kubectl get reposync -n ml-team - ``` -- Check the `raycluster` in `ml-team` namespace - - ``` - kubectl get raycluster -n ml-team - ``` - - The output will be similar to the following: - - ``` - NAME DESIRED WORKERS AVAILABLE WORKERS STATUS AGE - ray-cluster-kuberay 1 1 ready 29m - ``` - -- Check the head and worker pods of kuberay in `ml-team` namespace - ``` - kubectl get pods -n ml-team - ``` - The output will be similar to the following: - ``` - NAME READY STATUS 
RESTARTS AGE - ray-cluster-kuberay-head-sp6dg 2/2 Running 0 3m21s - ray-cluster-kuberay-worker-workergroup-rzpjw 2/2 Running 0 3m21s - ``` - -### Cleanup - -- Destroy the resources - - ``` - cd ${MLP_BASE_DIR}/terraform && \ - terraform init && \ - terraform destroy -auto-approve -var github_token="$(tr --delete '\n' < ${HOME}/secrets/mlp-github-token)" && \ - rm -rf .terraform .terraform.lock.hcl - ``` - -#### Project - -You only need to complete the section for the option that you have selected. - -##### Bring your own project (BYOP) - -- Delete the project - - ``` - gcloud projects delete ${MLP_PROJECT_ID} - ``` - -#### Terraform managed project - -- Destroy the project - - ``` - cd ${MLP_BASE_DIR}/terraform/initialize && \ - TERRAFORM_BUCKET_NAME=$(grep bucket backend.tf | awk -F"=" '{print $2}' | xargs) && \ - cp backend.tf.local backend.tf && \ - terraform init -force-copy -lock=false -migrate-state && \ - gsutil -m rm -rf gs://${TERRAFORM_BUCKET_NAME}/* && \ - terraform init && \ - terraform destroy -auto-approve && \ - rm -rf .terraform .terraform.lock.hcl - ``` +- [Distributed Data Processing with Ray](examples/use-case/ray/dataprocessing/README.md): Run a distributed data processing job using Ray. [gitops]: https://about.gitlab.com/topics/gitops/ [repo-sync]: https://cloud.google.com/anthos-config-management/docs/reference/rootsync-reposync-fields diff --git a/best-practices/ml-platform/examples/platform/sandbox/README.md b/best-practices/ml-platform/examples/platform/sandbox/README.md index e4cdb3e6e..0ce83cbc7 100644 --- a/best-practices/ml-platform/examples/platform/sandbox/README.md +++ b/best-practices/ml-platform/examples/platform/sandbox/README.md @@ -2,6 +2,8 @@ This quick-start deployment guide can be used to set up an environment to familiarize yourself with the architecture and get an understanding of the concepts. +**NOTE: This environment is not intended to be a long lived environment. 
It is intended for temporary demonstration and learning purposes.** + ### Requirements In this guide you can choose to bring your project (BYOP) or have Terraform create a new project for you. The requirements are difference based on the option that you choose. @@ -25,8 +27,8 @@ In this guide you can choose to bring your project (BYOP) or have Terraform crea - Clone the repository and change directory to the guide directory ``` - git clone https://github.com/GoogleCloudPlatform/ai-on-gke - cd ai-on-gke/ml-platform + git clone https://github.com/GoogleCloudPlatform/ai-on-gke && \ + cd ai-on-gke/best-practices/ml-platform ``` - Set environment variables @@ -34,6 +36,10 @@ In this guide you can choose to bring your project (BYOP) or have Terraform crea ``` export MLP_BASE_DIR=$(pwd) && \ echo "export MLP_BASE_DIR=${MLP_BASE_DIR}" >> ${HOME}/.bashrc + + cd examples/platform/sandbox && \ + export MLP_TYPE_BASE_DIR=$(pwd) && \ + echo "export MLP_TYPE_BASE_DIR=${MLP_TYPE_BASE_DIR}" >> ${HOME}/.bashrc ``` ### GitHub Configuration @@ -98,9 +104,9 @@ In this guide you can choose to bring your project (BYOP) or have Terraform crea - Set the configuration variables ``` - sed -i "s/YOUR_GITHUB_EMAIL/${MLP_GITHUB_EMAIL}/g" ${MLP_BASE_DIR}/terraform/mlp.auto.tfvars - sed -i "s/YOUR_GITHUB_ORG/${MLP_GITHUB_ORG}/g" ${MLP_BASE_DIR}/terraform/mlp.auto.tfvars - sed -i "s/YOUR_GITHUB_USER/${MLP_GITHUB_USER}/g" ${MLP_BASE_DIR}/terraform/mlp.auto.tfvars + sed -i "s/YOUR_GITHUB_EMAIL/${MLP_GITHUB_EMAIL}/g" ${MLP_TYPE_BASE_DIR}/mlp.auto.tfvars + sed -i "s/YOUR_GITHUB_ORG/${MLP_GITHUB_ORG}/g" ${MLP_TYPE_BASE_DIR}/mlp.auto.tfvars + sed -i "s/YOUR_GITHUB_USER/${MLP_GITHUB_USER}/g" ${MLP_TYPE_BASE_DIR}/mlp.auto.tfvars ``` ### Project Configuration @@ -141,8 +147,8 @@ You only need to complete the section for the option that you have selected. 
- Set the configuration variables ``` - sed -i "s/YOUR_STATE_BUCKET/${MLP_STATE_BUCKET}/g" ${MLP_BASE_DIR}/terraform/backend.tf - sed -i "s/YOUR_PROJECT_ID/${MLP_PROJECT_ID}/g" ${MLP_BASE_DIR}/terraform/mlp.auto.tfvars + sed -i "s/YOUR_STATE_BUCKET/${MLP_STATE_BUCKET}/g" ${MLP_TYPE_BASE_DIR}/backend.tf + sed -i "s/YOUR_PROJECT_ID/${MLP_PROJECT_ID}/g" ${MLP_TYPE_BASE_DIR}/mlp.auto.tfvars ``` #### Terraform managed project @@ -150,7 +156,7 @@ You only need to complete the section for the option that you have selected. - Set the configuration variables ``` - nano ${MLP_BASE_DIR}/terraform/initialize/initialize.auto.tfvars + nano ${MLP_BASE_DIR}/terraform/features/initialize/initialize.auto.tfvars ``` ``` @@ -177,7 +183,7 @@ You only need to complete the section for the option that you have selected. - Create a new project ``` - cd ${MLP_BASE_DIR}/terraform/initialize + cd ${MLP_BASE_DIR}/terraform/features/initialize terraform init && \ terraform plan -input=false -out=tfplan && \ terraform apply -input=false tfplan && \ @@ -191,7 +197,7 @@ You only need to complete the section for the option that you have selected. - Create the resources ``` - cd ${MLP_BASE_DIR}/terraform && \ + cd ${MLP_TYPE_BASE_DIR} && \ terraform init && \ terraform plan -input=false -var github_token="$(tr --delete '\n' < ${HOME}/secrets/mlp-github-token)" -out=tfplan && \ terraform apply -input=false tfplan @@ -205,7 +211,7 @@ You only need to complete the section for the option that you have selected. - Go to Google Cloud Console, click on the navigation menu and click on Kubernetes Engine > Clusters. You should see one cluster. - Go to Google Cloud Console, click on the navigation menu and click on Kubernetes Engine > Config. If you haven't enabled GKE Enterprise in the project earlier, Click `LEARN AND ENABLE` button and then `ENABLE GKE ENTERPRISE`. You should see a RootSync and RepoSync object. 
- ![configsync](docs/images/configsync.png) + ![configsync](/best-practices/ml-platform/docs/images/configsync.png) #### Software installed via RepoSync and RootSync @@ -287,7 +293,7 @@ Open Cloud Shell to execute the following commands: - Destroy the resources ``` - cd ${MLP_BASE_DIR}/terraform && \ + cd ${MLP_TYPE_BASE_DIR} && \ terraform init && \ terraform destroy -auto-approve -var github_token="$(tr --delete '\n' < ${HOME}/secrets/mlp-github-token)" && \ rm -rf .terraform .terraform.lock.hcl @@ -310,7 +316,7 @@ You only need to complete the section for the option that you have selected. - Destroy the project ``` - cd ${MLP_BASE_DIR}/terraform/initialize && \ + cd ${MLP_BASE_DIR}/terraform/features/initialize && \ TERRAFORM_BUCKET_NAME=$(grep bucket backend.tf | awk -F"=" '{print $2}' | xargs) && \ cp backend.tf.local backend.tf && \ terraform init -force-copy -lock=false -migrate-state && \ diff --git a/best-practices/ml-platform/examples/platform/sandbox/main.tf b/best-practices/ml-platform/examples/platform/sandbox/main.tf index 05445c183..4d953e912 100644 --- a/best-practices/ml-platform/examples/platform/sandbox/main.tf +++ b/best-practices/ml-platform/examples/platform/sandbox/main.tf @@ -100,7 +100,7 @@ resource "google_project_service" "project_services-gate" { # Networking ########################################################################## module "create-vpc" { - source = "./modules/network" + source = "../../../terraform/modules/network" depends_on = [ google_project_service.project_services-com @@ -118,7 +118,7 @@ module "create-vpc" { } module "cloud-nat" { - source = "./modules/cloud-nat" + source = "../../../terraform/modules/cloud-nat" create_router = true name = format("%s-%s", "nat-for-acm", var.environment_name) @@ -146,7 +146,7 @@ resource "google_gke_hub_feature" "configmanagement_acm_feature" { } module "gke" { - source = "./modules/cluster" + source = "../../../terraform/modules/cluster" depends_on = [ 
google_gke_hub_feature.configmanagement_acm_feature, @@ -165,7 +165,7 @@ module "gke" { } module "reservation" { - source = "./modules/vm-reservations" + source = "../../../terraform/modules/vm-reservations" cluster_name = module.gke.cluster_name project_id = data.google_project.environment.project_id @@ -173,7 +173,7 @@ module "reservation" { } module "node_pool-reserved" { - source = "./modules/node-pools" + source = "../../../terraform/modules/node-pools" depends_on = [ module.reservation @@ -189,7 +189,7 @@ module "node_pool-reserved" { } module "node_pool-ondemand" { - source = "./modules/node-pools" + source = "../../../terraform/modules/node-pools" depends_on = [ module.gke @@ -204,7 +204,7 @@ module "node_pool-ondemand" { } module "node_pool-spot" { - source = "./modules/node-pools" + source = "../../../terraform/modules/node-pools" depends_on = [ module.gke diff --git a/best-practices/ml-platform/examples/use-case/ray/dataprocessing/README.md b/best-practices/ml-platform/examples/use-case/ray/dataprocessing/README.md index d573f1c47..9f0e4f6c9 100644 --- a/best-practices/ml-platform/examples/use-case/ray/dataprocessing/README.md +++ b/best-practices/ml-platform/examples/use-case/ray/dataprocessing/README.md @@ -1,20 +1,23 @@ # Distributed Data Processing with Ray on GKE ## Dataset + [This](https://www.kaggle.com/datasets/PromptCloudHQ/flipkart-products) is a pre-crawled public dataset, taken as a subset of a bigger dataset (more than 5.8 million products) that was created by extracting data from [Flipkart](https://www.flipkart.com/), a leading Indian eCommerce store. ## Architecture - ![DataPreprocessing](/ml-platform/docs/images/ray-dataprocessing-workflow.png) + +![DataPreprocessing](/best-practices/ml-platform/docs/images/ray-dataprocessing-workflow.png) ## Data processing steps -The dataset has product information such as id, name, brand, description, image urls, product specifications. 
+The dataset has product information such as id, name, brand, description, image urls, product specifications. The preprocessing.py file does the following: -* Read the csv from Cloud Storage -* Clean up the product description text -* Extract image urls, validate and download the images into cloud storage -* Cleanup & extract attributes as key-value pairs + +- Read the csv from Cloud Storage +- Clean up the product description text +- Extract image urls, validate and download the images into cloud storage +- Cleanup & extract attributes as key-value pairs ## How to use this repo: @@ -26,14 +29,12 @@ The preprocessing.py file does the following: DOCKER_IMAGE_URL=us-docker.pkg.dev/${PROJECT_ID}/dataprocessing/dp:v0.0.1 ``` - 2. Create a Cloud Storage bucket to store raw data ``` gcloud storage buckets create gs://${PROCESSING_BUCKET} --project ${PROJECT_ID} ``` - 3. Download the raw data csv file from above and store into the bucket created in the previous step. The kaggle cli can be installed using the following [instructions](https://github.com/Kaggle/kaggle-api#installation) To use the cli you must create an API token (Kaggle > User Profile > API > Create New Token), the downloaded file should be stored in HOME/.kaggle/kaggle.json. @@ -60,6 +61,7 @@ The preprocessing.py file does the following: ``` 5. Create Artifact Registry repository for your docker image + ``` gcloud artifacts repositories create dataprocessing \ --repository-format=docker \ @@ -69,6 +71,7 @@ The preprocessing.py file does the following: ``` 6. Build container image using Cloud Build and push the image to Artifact Registry + ``` gcloud builds submit . \ --tag ${DOCKER_IMAGE_URL}:v0.0.1 @@ -91,13 +94,14 @@ kubectl apply -f job.yaml -n ml-team ``` 9. Monitor the execution in Ray Dashboard - a. Jobs -> Running Job ID - i) See the Tasks/actors overview for Running jobs - ii) See the Task Table for a detailed view of task and assigned node(s) - b. 
Cluster -> Node List - i) See the Ray actors running on the worker process + a. Jobs -> Running Job ID + i) See the Tasks/actors overview for Running jobs + ii) See the Task Table for a detailed view of task and assigned node(s) + b. Cluster -> Node List + i) See the Ray actors running on the worker process + +10. Once the Job is completed, both the prepared dataset as a CSV and the images are stored in Google Cloud Storage. -11. Once the Job is completed, both the prepared dataset as a CSV and the images are stored in Google Cloud Storage. ``` gcloud storage ls \ gs://${PROCESSING_BUCKET}/flipkart_preprocessed_dataset/flipkart.csv From cd069f028d96920eec4cb449d070e58265d41121 Mon Sep 17 00:00:00 2001 From: arueth Date: Fri, 29 Mar 2024 21:15:39 +0000 Subject: [PATCH 36/39] Updated the CUJs --- best-practices/ml-platform/README.md | 53 ++++++++++++++-------------- 1 file changed, 27 insertions(+), 26 deletions(-) diff --git a/best-practices/ml-platform/README.md b/best-practices/ml-platform/README.md index 9261ba049..10c43eb52 100644 --- a/best-practices/ml-platform/README.md +++ b/best-practices/ml-platform/README.md @@ -10,35 +10,36 @@ This reference architecture demonstrates how to build a GKE platform that facili - Platform admins will create a namespace per application and provide the application team member full access to it. - The namespace scoped resources will be created by the Application/ML teams either via [Config Sync][config-sync] or through a deployment tool like [Cloud Deploy][cloud-deploy] -## CUJ and Personae addressed in the reference architecture +## Critical User Journeys (CUJs) ### Persona : Platform Admin -**CUJ 1** : Provide templates with built-in standard practices to stamp out GKE platforms to be used by ML Engineers, Data Scientists and Application teams. - -**CUJ 2** : Provide GKE clusters. - -**CUJ 2** : Provide space for the teams on GKE cluster to run their workloads and the permissions following the principle of least privilege. 
- -**CUJ 3** : Provide secure methods to the ML Engineers, Data Scientist, Application teams and the Operators to connect to the private GKE clusters. - -**CUJ 4** : Enforcing security policies on the underlying platform. - -### Persona : ML Engineers - -**CUJ 1** : Use ML tools like `ray` to perform their day to day tasks like data pre-processing, ML training etc. - -**CUJ 2** : Use a development environment like Jupyter Notebook for faster inner loop of ML development. **[TBD]** - -### Persona : Operators - -**CUJ 1**: Act as a bridge between the Platform admins and the ML Engineers by providing and maintaining software needed by the ML engineers so they can focus on their job. - -**CUJ 2**: Deploying the models. **[TBD]** - -**CUJ 3**: Building observability on the models. **[TBD]** - -**CUJ 4**: Operationalizing the models. **[TBD]** +- Offer a platform that incorporates established best practices. +- Grant end users the essential resources, guided by the principle of least privilege, empowering them to manage and maintain their workloads. +- Establish secure channels for end users to interact seamlessly with the platform. +- Empower the enforcement of robust security policies across the platform. + +### Persona : Machine Learning Engineer + +- Deploy the model with ease and make the endpoints available only to the intended audience +- Continuously monitor the model performance and resource utilization +- Troubleshoot any performance or integration issues +- Ability to version, store and access the models and model artifacts: + - To debug & troubleshoot in production and track back to the specific model version & associated training data + - To perform a quick & controlled rollback to a previous, more stable version +- Implement the feedback loop to adapt to changing data & business needs: + - Ability to retrain / fine-tune the model. 
+ - Ability to split the traffic between models (A/B testing) + - Switching between the models without breaking inference system for the end-users +- Ability to scale up/down the infra to accommodate changing needs +- Ability to share the insights and findings with stakeholders to take data-driven decisions + +### Persona : Machine Learning Operator + +- Provide and maintain software required by the end users of the platform. +- Operationalize experimental workload by providing guidance and best practices for running the workload on the platform. +- Deploy the workloads on the platform. +- Assist with enabling observability and monitoring for the workloads to ensure smooth operations. ## Prerequisites From bbff3bbd7a4bbfc4b4493d6a7b9b5c5c05a57ee6 Mon Sep 17 00:00:00 2001 From: arueth Date: Fri, 29 Mar 2024 22:08:35 +0000 Subject: [PATCH 37/39] Leaving default node pool for sanbox to decrease provisioning time --- .../examples/platform/sandbox/main.tf | 3 +++ .../terraform/modules/cluster/gke.tf | 6 ++++-- .../terraform/modules/cluster/variables.tf | 18 ++++++++++++++++++ 3 files changed, 25 insertions(+), 2 deletions(-) diff --git a/best-practices/ml-platform/examples/platform/sandbox/main.tf b/best-practices/ml-platform/examples/platform/sandbox/main.tf index 4d953e912..8da37d4b1 100644 --- a/best-practices/ml-platform/examples/platform/sandbox/main.tf +++ b/best-practices/ml-platform/examples/platform/sandbox/main.tf @@ -156,10 +156,13 @@ module "gke" { cluster_name = format("%s-%s", var.cluster_name, var.environment_name) env = var.environment_name + initial_node_count = 1 + machine_type = "n2-standard-8" master_auth_networks_ipcidr = var.subnet_01_ip network = module.create-vpc.vpc project_id = data.google_project.environment.project_id region = var.subnet_01_region + remove_default_node_pool = false subnet = module.create-vpc.subnet-1 zone = "${var.subnet_01_region}-a" } diff --git a/best-practices/ml-platform/terraform/modules/cluster/gke.tf 
b/best-practices/ml-platform/terraform/modules/cluster/gke.tf index 79e84215f..9c51f4d64 100644 --- a/best-practices/ml-platform/terraform/modules/cluster/gke.tf +++ b/best-practices/ml-platform/terraform/modules/cluster/gke.tf @@ -23,13 +23,13 @@ resource "google_container_cluster" "mlp" { deletion_protection = false enable_shielded_nodes = true - initial_node_count = 1 + initial_node_count = var.initial_node_count location = var.region name = var.cluster_name network = var.network node_locations = ["${var.region}-a", "${var.region}-b", "${var.region}-c"] project = var.project_id - remove_default_node_pool = true + remove_default_node_pool = var.remove_default_node_pool subnetwork = var.subnet addons_config { @@ -165,6 +165,8 @@ resource "google_container_cluster" "mlp" { } node_config { + machine_type = var.machine_type + shielded_instance_config { enable_integrity_monitoring = true enable_secure_boot = true diff --git a/best-practices/ml-platform/terraform/modules/cluster/variables.tf b/best-practices/ml-platform/terraform/modules/cluster/variables.tf index d54153b0f..e2f8c0daa 100644 --- a/best-practices/ml-platform/terraform/modules/cluster/variables.tf +++ b/best-practices/ml-platform/terraform/modules/cluster/variables.tf @@ -23,6 +23,18 @@ variable "env" { type = string } +variable "initial_node_count" { + default = 1 + description = "The number of nodes to create in this cluster's default node pool. In regional or multi-zonal clusters, this is the number of nodes per zone. Must be set if node_pool is not set. If you're using google_container_node_pool objects with no default node pool, you'll need to set this to a value of at least 1, alongside setting remove_default_node_pool to true." + type = number +} + +variable "machine_type" { + default = "e2-medium" + description = "The name of a Google Compute Engine machine type." 
+ type = string +} + variable "master_auth_networks_ipcidr" { description = "master authorized network" type = string @@ -45,6 +57,12 @@ variable "region" { type = string } +variable "remove_default_node_pool" { + default = true + description = "If true, deletes the default node pool upon cluster creation. If you're using google_container_node_pool resources with no default node pool, this should be set to true, alongside setting initial_node_count to at least 1." + type = bool +} + variable "subnet" { description = "subnetwork where the cluster will be created" type = string From 5984eacc3328d74d5a0ae6de398f6c8ee8df47a9 Mon Sep 17 00:00:00 2001 From: arueth Date: Fri, 29 Mar 2024 22:45:40 +0000 Subject: [PATCH 38/39] Cleaned up the data processing README --- .../use-case/ray/dataprocessing/CONVERSION.md | 12 +- .../use-case/ray/dataprocessing/README.md | 172 ++++++++++-------- .../use-case/ray/dataprocessing/job.yaml | 6 +- 3 files changed, 107 insertions(+), 83 deletions(-) diff --git a/best-practices/ml-platform/examples/use-case/ray/dataprocessing/CONVERSION.md b/best-practices/ml-platform/examples/use-case/ray/dataprocessing/CONVERSION.md index 7c9f961ba..7e99345cd 100644 --- a/best-practices/ml-platform/examples/use-case/ray/dataprocessing/CONVERSION.md +++ b/best-practices/ml-platform/examples/use-case/ray/dataprocessing/CONVERSION.md @@ -1,25 +1,29 @@ # Steps to convert the code from Notebook to run with Ray on GKE 1. Decorate the function which needs to be run as remote function in Ray workers - + ``` import ray @ray.remote(num_cpus=1) ``` + 1. Create the run time environment with the libraries needed by remote function + ``` runtime_env = {"pip": ["google-cloud-storage==2.16.0", "spacy==3.7.4", "jsonpickle==3.0.3"]} ``` 1. Initialize the Ray with the Ray cluster created & pass the runtime environment along + ``` ray.init("ray://"+RAY_CLUSTER_HOST, runtime_env=runtime_env)`` ``` 1. 
Get remote object using ray.get() method - ``` - results = ray.get([get_clean_df.remote(res[i]) for i in range(len(res))]) - ``` + + ``` + results = ray.get([get_clean_df.remote(res[i]) for i in range(len(res))]) + ``` 1. After completing the execution, shutdown Ray clusters ``` diff --git a/best-practices/ml-platform/examples/use-case/ray/dataprocessing/README.md b/best-practices/ml-platform/examples/use-case/ray/dataprocessing/README.md index 9f0e4f6c9..8e02fe3e7 100644 --- a/best-practices/ml-platform/examples/use-case/ray/dataprocessing/README.md +++ b/best-practices/ml-platform/examples/use-case/ray/dataprocessing/README.md @@ -21,91 +21,113 @@ The preprocessing.py file does the following: ## How to use this repo: +1. Clone the repository and change directory to the guide directory + + ``` + git clone https://github.com/GoogleCloudPlatform/ai-on-gke && \ + cd ai-on-gke/best-practices/ml-platform/examples/use-case/ray/dataprocessing + ``` + 1. Set environment variables -``` - PROJECT_ID= - PROCESSING_BUCKET= - DOCKER_IMAGE_URL=us-docker.pkg.dev/${PROJECT_ID}/dataprocessing/dp:v0.0.1 -``` + ``` + CLUSTER_NAME= + PROJECT_ID= + PROCESSING_BUCKET= + DOCKER_IMAGE_URL=us-docker.pkg.dev/${PROJECT_ID}/dataprocessing/dp:v0.0.1 + ``` -2. Create a Cloud Storage bucket to store raw data +1. Create a Cloud Storage bucket to store raw data -``` - gcloud storage buckets create gs://${PROCESSING_BUCKET} --project ${PROJECT_ID} -``` + ``` + gcloud storage buckets create gs://${PROCESSING_BUCKET} --project ${PROJECT_ID} + ``` -3. Download the raw data csv file from above and store into the bucket created in the previous step. +1. Download the raw data csv file from above and store into the bucket created in the previous step. 
The kaggle cli can be installed using the following [instructions](https://github.com/Kaggle/kaggle-api#installation) To use the cli you must create an API token (Kaggle > User Profile > API > Create New Token), the downloaded file should be stored in HOME/.kaggle/kaggle.json. Alternatively, it can be [downloaded](https://www.kaggle.com/datasets/atharvjairath/flipkart-ecommerce-dataset) from the kaggle website -``` - kaggle datasets download --unzip atharvjairath/flipkart-ecommerce-dataset - gcloud storage cp flipkart_com-ecommerce_sample.csv \ - gs://${PROCESSING_BUCKET}/flipkart_raw_dataset/flipkart_com-ecommerce_sample.csv -``` + ``` + kaggle datasets download --unzip atharvjairath/flipkart-ecommerce-dataset && \ + gcloud storage cp flipkart_com-ecommerce_sample.csv \ + gs://${PROCESSING_BUCKET}/flipkart_raw_dataset/flipkart_com-ecommerce_sample.csv + ``` -4. Provide respective GCS bucket access rights to GKE Kubernetes Service Accounts. +1. Provide respective GCS bucket access rights to GKE Kubernetes Service Accounts. Ray head with access to read the raw source data in the storage bucket Ray worker(s) with the access to write data to the storage bucket. -``` - gcloud projects add-iam-policy-binding ${PROJECT_ID} \ - --member "serviceAccount:wi-ml-team-ray-head@${PROJECT_ID}.iam.gserviceaccount.com" \ - --role roles/storage.objectViewer - - gcloud projects add-iam-policy-binding ${PROJECT_ID} \ - --member "serviceAccount:wi-ml-team-ray-worker@${PROJECT_ID}.iam.gserviceaccount.com" \ - --role roles/storage.objectAdmin -``` - -5. Create Artifact Registry repository for your docker image - -``` - gcloud artifacts repositories create dataprocessing \ - --repository-format=docker \ - --location=us \ - --project=${PROJECT_ID} \ - --async -``` - -6. Build container image using Cloud Build and push the image to Artifact Registry - -``` - gcloud builds submit . \ - --tag ${DOCKER_IMAGE_URL}:v0.0.1 -``` - -7. 
Update respective variables in the Job submission manifest to reflect your configuration. - a. Image is the docker image that was built in the previous step - b. Processing bucket is the location of the GCS bucket where the source data and results will be stored - c. Ray Cluster Host - if used in this example, it should not need to be changed, but if your Ray cluster service is named differently or in a different namespace, update accordingly. - -``` -sed -i 's|#IMAGE|${DOCKER_IMAGE_URL}:v0.0.1' job.yaml -sed -i 's|#PROCESSING_BUCKET|${PROCESSING_BUCKET}' job.yaml -``` - -8. Create the Job in the “ml-team” namespace using kubectl command - -``` -kubectl apply -f job.yaml -n ml-team -``` - -9. Monitor the execution in Ray Dashboard - a. Jobs -> Running Job ID - i) See the Tasks/actors overview for Running jobs - ii) See the Task Table for a detailed view of task and assigned node(s) - b. Cluster -> Node List - i) See the Ray actors running on the worker process - -10. Once the Job is completed, both the prepared dataset as a CSV and the images are stored in Google Cloud Storage. - -``` - gcloud storage ls \ - gs://${PROCESSING_BUCKET}/flipkart_preprocessed_dataset/flipkart.csv - - gcloud storage ls \ - gs://${PROCESSING_BUCKET}/flipkart_images -``` + ``` + gcloud projects add-iam-policy-binding ${PROJECT_ID} \ + --member "serviceAccount:wi-ml-team-ray-head@${PROJECT_ID}.iam.gserviceaccount.com" \ + --role roles/storage.objectViewer + + gcloud projects add-iam-policy-binding ${PROJECT_ID} \ + --member "serviceAccount:wi-ml-team-ray-worker@${PROJECT_ID}.iam.gserviceaccount.com" \ + --role roles/storage.objectAdmin + ``` + +1. Create Artifact Registry repository for your docker image + + ``` + gcloud artifacts repositories create dataprocessing \ + --repository-format=docker \ + --location=us \ + --project=${PROJECT_ID} \ + --async + ``` + +1. Enable the Cloud Build APIs + + ``` + gcloud services enable cloudbuild.googleapis.com --project ${PROJECT_ID} + ``` + +1. 
Build container image using Cloud Build and push the image to Artifact Registry + + ``` + cd src && \ + gcloud builds submit --tag ${DOCKER_IMAGE_URL} . && \ + cd .. + ``` + +1. Update respective variables in the Job submission manifest to reflect your configuration. + + - Image is the docker image that was built in the previous step + - Processing bucket is the location of the GCS bucket where the source data and results will be stored + - Ray Cluster Host - if used in this example, it should not need to be changed, but if your Ray cluster service is named differently or in a different namespace, update accordingly. + + ``` + sed -i "s|#IMAGE|${DOCKER_IMAGE_URL}|" job.yaml && \ + sed -i "s|#PROCESSING_BUCKET|${PROCESSING_BUCKET}|" job.yaml + ``` + +1. Get credentials for the GKE cluster + + ``` + gcloud container fleet memberships get-credentials ${CLUSTER_NAME} + ``` + +1. Create the Job in the “ml-team” namespace using kubectl command + + ``` + kubectl apply -f job.yaml + ``` + +1. Monitor the execution in Ray Dashboard + + - Jobs -> Running Job ID + - See the Tasks/actors overview for Running jobs + - See the Task Table for a detailed view of task and assigned node(s) + - Cluster -> Node List + - See the Ray actors running on the worker process + +1. Once the Job is completed, both the prepared dataset as a CSV and the images are stored in Google Cloud Storage. 
+ + ``` + gcloud storage ls gs://${PROCESSING_BUCKET}/flipkart_preprocessed_dataset/flipkart.csv + gcloud storage ls gs://${PROCESSING_BUCKET}/flipkart_images + ``` + +> For additional information about converting your code from a notebook to run as a Job on GKE, see the [Conversion Guide](CONVERSION.md) diff --git a/best-practices/ml-platform/examples/use-case/ray/dataprocessing/job.yaml b/best-practices/ml-platform/examples/use-case/ray/dataprocessing/job.yaml index 89f14b570..cc44a972d 100644 --- a/best-practices/ml-platform/examples/use-case/ray/dataprocessing/job.yaml +++ b/best-practices/ml-platform/examples/use-case/ray/dataprocessing/job.yaml @@ -11,13 +11,11 @@ spec: spec: containers: - name: job - image: us-west2-docker.pkg.dev/cloud-sa-ml/data-processing-repo/image:latest + image: #IMAGE env: - name: "PROCESSING_BUCKET" - value: ai-infra-ml-data-processing + value: #PROCESSING_BUCKET - name: "RAY_CLUSTER_HOST" value: "ray-cluster-kuberay-head-svc.ml-team:10001" restartPolicy: Never serviceAccountName: ray-worker -######################Ray code sample################################# - From f421af92f7f834c51cd4d3a0814ba962e44f8bd6 Mon Sep 17 00:00:00 2001 From: arueth Date: Fri, 29 Mar 2024 23:51:44 +0000 Subject: [PATCH 39/39] Cleaned up the initialize feature --- .../terraform/features/initialize/main.tf | 4 ++-- .../terraform/features/initialize/output.tf | 17 +++++++++++++++++ .../terraform/features/initialize/variables.tf | 6 ++++++ 3 files changed, 25 insertions(+), 2 deletions(-) create mode 100644 best-practices/ml-platform/terraform/features/initialize/output.tf diff --git a/best-practices/ml-platform/terraform/features/initialize/main.tf b/best-practices/ml-platform/terraform/features/initialize/main.tf index 2232ad86a..bd9f5e55c 100644 --- a/best-practices/ml-platform/terraform/features/initialize/main.tf +++ b/best-practices/ml-platform/terraform/features/initialize/main.tf @@ -13,10 +13,10 @@ # limitations under the License.
locals { - backend_file = "../backend.tf" + backend_file = "../../../examples/platform/${var.platform_type}/backend.tf" project_id_prefix = "${var.project.name}-${var.environment_name}" project_id_suffix_length = 29 - length(local.project_id_prefix) - tfvars_file = "../mlp.auto.tfvars" + tfvars_file = "../../../examples/platform/${var.platform_type}/mlp.auto.tfvars" } resource "random_string" "project_id_suffix" { diff --git a/best-practices/ml-platform/terraform/features/initialize/output.tf b/best-practices/ml-platform/terraform/features/initialize/output.tf new file mode 100644 index 000000000..ac134489c --- /dev/null +++ b/best-practices/ml-platform/terraform/features/initialize/output.tf @@ -0,0 +1,17 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +output "project_id" { + value = google_project.environment.project_id +} diff --git a/best-practices/ml-platform/terraform/features/initialize/variables.tf b/best-practices/ml-platform/terraform/features/initialize/variables.tf index 0a438c3a6..dd7539346 100644 --- a/best-practices/ml-platform/terraform/features/initialize/variables.tf +++ b/best-practices/ml-platform/terraform/features/initialize/variables.tf @@ -18,6 +18,12 @@ variable "environment_name" { type = string } +variable "platform_type" { + default = "sandbox" + description = "Name of the platform type" + type = string +} + variable "project" { default = { billing_account_id = ""