From 8bcdb85c3f17837714498f4015d8b71891578523 Mon Sep 17 00:00:00 2001 From: Shobhit Gupta <43795024+gushob21@users.noreply.github.com> Date: Mon, 11 Mar 2024 17:57:52 +0000 Subject: [PATCH] Mlops platform udates (#326) * Adding default single env installation and updated documentation --- ml-platform/01_gcp_project/README.md | 108 ----- ml-platform/01_gcp_project/backend.tf | 20 - ml-platform/01_gcp_project/main.tf | 22 - ml-platform/01_gcp_project/providers.tf | 22 - ml-platform/01_gcp_project/variables.tf | 43 -- ml-platform/02_gke/README.md | 141 ------- ml-platform/02_gke/main.tf | 117 ------ .../02_gke/modules/cloud-nat/versions.tf | 30 -- .../02_gke/modules/network/versions.tf | 22 - ml-platform/02_gke/outputs.tf | 17 - ml-platform/03_configsync/README.md | 147 ------- ml-platform/03_configsync/backend.tf | 20 - ml-platform/03_configsync/main.tf | 131 ------ ml-platform/03_configsync/outputs.tf | 21 - ml-platform/03_configsync/variables.tf | 48 --- ml-platform/04_setup_clusters/README.md | 139 ------- ml-platform/05_setup_teams/README.md | 169 -------- ml-platform/06_operating_teams/README.md | 154 ------- ml-platform/README.md | 278 ++++++++++--- ml-platform/{02_gke => }/backend.tf | 3 +- .../create_cluster_yamls.sh | 45 +- ml-platform/create_git_cred.sh | 38 ++ ml-platform/create_namespace.sh | 57 +++ ml-platform/install_kuberay_operator.sh | 47 +++ ml-platform/install_ray_cluster.sh | 48 +++ ml-platform/main.tf | 390 ++++++++++++++++++ ml-platform/manage_ray_ns.sh | 43 ++ ml-platform/mlenv.auto.tfvars | 9 + .../{02_gke => }/modules/cloud-nat/README.md | 2 - .../{02_gke => }/modules/cloud-nat/main.tf | 0 .../{02_gke => }/modules/cloud-nat/outputs.tf | 1 + .../modules/cloud-nat/variables.tf | 0 ml-platform/modules/cloud-nat/versions.tf | 50 +++ .../{02_gke => }/modules/cluster/gke.tf | 19 +- .../{02_gke => }/modules/cluster/outputs.tf | 2 +- .../{02_gke => }/modules/cluster/variables.tf | 2 +- .../cluster}/versions.tf | 23 +- .../{02_gke => 
}/modules/network/README.md | 2 - .../{02_gke => }/modules/network/outputs.tf | 2 +- .../{02_gke => }/modules/network/variables.tf | 11 +- .../network}/versions.tf | 15 +- .../{02_gke => }/modules/network/vpc.tf | 8 + .../modules/node-pools/nodepools.tf | 11 +- .../modules/node-pools/variables.tf | 9 +- .../node-pools/versions.tf} | 23 +- .../modules/projects/outputs.tf | 2 +- .../modules/projects/projects.tf | 2 +- .../modules/projects/variables.tf | 0 .../cluster => modules/projects}/versions.tf | 15 +- .../modules/vm-reservations/outputs.tf | 2 +- .../modules/vm-reservations/reservations.tf | 2 +- .../modules/vm-reservations/variables.tf | 4 - .../modules/vm-reservations/versions.tf | 39 ++ ml-platform/{01_gcp_project => }/outputs.tf | 8 +- .../acm-template/manifests/apps/.gitkeep | 0 .../acm-template/manifests/clusters/.gitkeep | 0 .../templates/_cluster_template/cluster.yaml | 0 .../_cluster_template/config-selector.yaml | 0 .../kuberay/kustomization.yaml | 0 .../kuberay/rayclusters.yaml | 0 .../_cluster_template/kuberay/rayjobs.yaml | 0 .../kuberay/rayservices.yaml | 0 .../_cluster_template/kuberay/rbac.yaml | 0 .../_cluster_template/kuberay/values.yaml | 3 +- .../_cluster_template/kustomization.yaml | 0 .../templates/_cluster_template/selector.yaml | 0 .../_cluster_template/team/kustomization.yaml | 0 .../_cluster_template/team/namespace.yaml | 2 +- .../team/network-policy.yaml | 0 .../_cluster_template/team/rbac.yaml | 0 .../_cluster_template/team/reposync.yaml | 2 +- .../app/fluentd_config.yaml | 0 .../app/kustomization.yaml | 0 .../app/serviceaccount.yaml | 0 .../_namespace_template/app/values.yaml | 0 ml-platform/{02_gke => }/variables.tf | 123 +++++- .../providers.tf => versions.tf} | 21 +- 77 files changed, 1178 insertions(+), 1556 deletions(-) delete mode 100644 ml-platform/01_gcp_project/README.md delete mode 100644 ml-platform/01_gcp_project/backend.tf delete mode 100644 ml-platform/01_gcp_project/main.tf delete mode 100644 
ml-platform/01_gcp_project/providers.tf delete mode 100644 ml-platform/01_gcp_project/variables.tf delete mode 100644 ml-platform/02_gke/README.md delete mode 100644 ml-platform/02_gke/main.tf delete mode 100644 ml-platform/02_gke/modules/cloud-nat/versions.tf delete mode 100644 ml-platform/02_gke/modules/network/versions.tf delete mode 100644 ml-platform/02_gke/outputs.tf delete mode 100644 ml-platform/03_configsync/README.md delete mode 100644 ml-platform/03_configsync/backend.tf delete mode 100644 ml-platform/03_configsync/main.tf delete mode 100644 ml-platform/03_configsync/outputs.tf delete mode 100644 ml-platform/03_configsync/variables.tf delete mode 100644 ml-platform/04_setup_clusters/README.md delete mode 100644 ml-platform/05_setup_teams/README.md delete mode 100644 ml-platform/06_operating_teams/README.md rename ml-platform/{02_gke => }/backend.tf (96%) rename ml-platform/{03_configsync => }/create_cluster_yamls.sh (53%) create mode 100755 ml-platform/create_git_cred.sh create mode 100755 ml-platform/create_namespace.sh create mode 100755 ml-platform/install_kuberay_operator.sh create mode 100755 ml-platform/install_ray_cluster.sh create mode 100644 ml-platform/main.tf create mode 100755 ml-platform/manage_ray_ns.sh create mode 100644 ml-platform/mlenv.auto.tfvars rename ml-platform/{02_gke => }/modules/cloud-nat/README.md (99%) rename ml-platform/{02_gke => }/modules/cloud-nat/main.tf (100%) rename ml-platform/{02_gke => }/modules/cloud-nat/outputs.tf (99%) rename ml-platform/{02_gke => }/modules/cloud-nat/variables.tf (100%) create mode 100644 ml-platform/modules/cloud-nat/versions.tf rename ml-platform/{02_gke => }/modules/cluster/gke.tf (87%) rename ml-platform/{02_gke => }/modules/cluster/outputs.tf (99%) rename ml-platform/{02_gke => }/modules/cluster/variables.tf (99%) rename ml-platform/{02_gke/modules/node-pools => modules/cluster}/versions.tf (71%) rename ml-platform/{02_gke => }/modules/network/README.md (99%) rename ml-platform/{02_gke => 
}/modules/network/outputs.tf (99%) rename ml-platform/{02_gke => }/modules/network/variables.tf (92%) rename ml-platform/{02_gke/modules/vm-reservations => modules/network}/versions.tf (80%) rename ml-platform/{02_gke => }/modules/network/vpc.tf (85%) rename ml-platform/{02_gke => }/modules/node-pools/nodepools.tf (90%) rename ml-platform/{02_gke => }/modules/node-pools/variables.tf (98%) rename ml-platform/{02_gke/providers.tf => modules/node-pools/versions.tf} (71%) rename ml-platform/{01_gcp_project => }/modules/projects/outputs.tf (99%) rename ml-platform/{01_gcp_project => }/modules/projects/projects.tf (99%) rename ml-platform/{01_gcp_project => }/modules/projects/variables.tf (100%) rename ml-platform/{02_gke/modules/cluster => modules/projects}/versions.tf (80%) rename ml-platform/{02_gke => }/modules/vm-reservations/outputs.tf (99%) rename ml-platform/{02_gke => }/modules/vm-reservations/reservations.tf (99%) rename ml-platform/{02_gke => }/modules/vm-reservations/variables.tf (99%) create mode 100644 ml-platform/modules/vm-reservations/versions.tf rename ml-platform/{01_gcp_project => }/outputs.tf (72%) rename ml-platform/{03_configsync => }/templates/acm-template/manifests/apps/.gitkeep (100%) rename ml-platform/{03_configsync => }/templates/acm-template/manifests/clusters/.gitkeep (100%) rename ml-platform/{03_configsync => }/templates/acm-template/templates/_cluster_template/cluster.yaml (100%) rename ml-platform/{03_configsync => }/templates/acm-template/templates/_cluster_template/config-selector.yaml (100%) rename ml-platform/{03_configsync => }/templates/acm-template/templates/_cluster_template/kuberay/kustomization.yaml (100%) rename ml-platform/{03_configsync => }/templates/acm-template/templates/_cluster_template/kuberay/rayclusters.yaml (100%) rename ml-platform/{03_configsync => }/templates/acm-template/templates/_cluster_template/kuberay/rayjobs.yaml (100%) rename ml-platform/{03_configsync => 
}/templates/acm-template/templates/_cluster_template/kuberay/rayservices.yaml (100%) rename ml-platform/{03_configsync => }/templates/acm-template/templates/_cluster_template/kuberay/rbac.yaml (100%) rename ml-platform/{03_configsync => }/templates/acm-template/templates/_cluster_template/kuberay/values.yaml (99%) rename ml-platform/{03_configsync => }/templates/acm-template/templates/_cluster_template/kustomization.yaml (100%) rename ml-platform/{03_configsync => }/templates/acm-template/templates/_cluster_template/selector.yaml (100%) rename ml-platform/{03_configsync => }/templates/acm-template/templates/_cluster_template/team/kustomization.yaml (100%) rename ml-platform/{03_configsync => }/templates/acm-template/templates/_cluster_template/team/namespace.yaml (97%) rename ml-platform/{03_configsync => }/templates/acm-template/templates/_cluster_template/team/network-policy.yaml (100%) rename ml-platform/{03_configsync => }/templates/acm-template/templates/_cluster_template/team/rbac.yaml (100%) rename ml-platform/{03_configsync => }/templates/acm-template/templates/_cluster_template/team/reposync.yaml (99%) rename ml-platform/{03_configsync => }/templates/acm-template/templates/_namespace_template/app/fluentd_config.yaml (100%) rename ml-platform/{03_configsync => }/templates/acm-template/templates/_namespace_template/app/kustomization.yaml (100%) rename ml-platform/{03_configsync => }/templates/acm-template/templates/_namespace_template/app/serviceaccount.yaml (100%) rename ml-platform/{03_configsync => }/templates/acm-template/templates/_namespace_template/app/values.yaml (100%) rename ml-platform/{02_gke => }/variables.tf (56%) rename ml-platform/{03_configsync/providers.tf => versions.tf} (81%) diff --git a/ml-platform/01_gcp_project/README.md b/ml-platform/01_gcp_project/README.md deleted file mode 100644 index 269576a79..000000000 --- a/ml-platform/01_gcp_project/README.md +++ /dev/null @@ -1,108 +0,0 @@ - -## Requirements - -| Name | Version | 
-|------|---------| -| [google](#requirement\_google) | 4.72.1 | - - -## Modules - -| Name | Source | Version | -|------|--------|---------| -| [gcp-project](#module\_gcp-project) | ./modules/projects | n/a | - - -## Inputs - -| Name | Description | Type | Default | Required | -|-----------------------------------------------------------------------------------|--------------------------------------------------|------|------------------------------|:--------:| -| [billing\_account](#input\_billing\_account) | GCP billing account | `string` | n/a | yes | -| [env](#input\_env) | List of environments | `set(string)` |
[
"dev"
]
| no | -| [folder\_id](#input\_folder\_id) | Folder Id where the GCP projects will be created | `string` | `null` | no | -| [org\_id](#input\_org\_id) | The GCP org ID | `string` | n/a | yes | -| [project\_name](#input\_project\_name) | Project name | `string` | `ml-platform` | no | -## Outputs - -| Name | Description | -|------|-------------| -| [project\_ids](#output\_project\_ids) | n/a | - - -## Workflow - -This module accepts a list of environments and creates a GCP project for each environment. - -Typically, you would want to have dev, staging and production environments created in separate projects. To have such isolation, pass `env` input variable as `[ "dev", "staging", "prod" ]`. This will create one project each for the dev, staging and prod environments. You can update the input variable `env` based on how many environments/projects you want to create. - -However, if you want to use a single project for multiple environments, you can create just one project by passing one element to `env` input variable list, e.g. [ "dev" ] or ["my-playground"] etc. - -## Prerequisite -To run this Terraform Module, you need to have the following IAM roles: -- roles/resourcemanager.projectCreator - -## Usage - -- Create a new GCP project that will host the TF state bucket. - - To create a new project, open `cloudshell` and run the following command: - ``` - gcloud projects create - ``` - - Associate billing account to the project - ``` - gcloud beta billing projects link \ - --billing-account - ``` - -- Create a GCS bucket in the project for storing TF state. - - To create a new bucket, run the following command in `cloudshell` - ``` - gcloud storage buckets create gs://-tf-state --location= --project - ``` -- Clone the repo and change dir - ``` - git clone https://github.com/GoogleCloudPlatform/ai-on-gke - cd ml-platform/01_gcp_project - ``` -- In backend.tf replace `YOUR_STATE_BUCKET` with the name of the GCS bucket. 
-- In variables.tf: - - replace `YOUR_GCP_ORG_ID` with your GCP Org ID. - - replace `YOUR_BILLING_ACCOUNT` with your GCP billing account. - - (optional) override the default value of `folder_id` with the numeric ID of the folder this project should be created under. If you leave `folder_id` null, the projects will be created under your org. - - (optional) override the default value of `env`. See [workflow](#workflow) for details. - -- terraform init -- terraform plan -- terraform apply --auto-approve - - -## Clean up - -1. The easiest way to prevent continued billing for the resources that you created for this tutorial is to delete the project you created for the tutorial. Run the following commands from Cloud Shell: - - ```bash - gcloud config unset project && \ - echo y | gcloud projects delete - ``` - -2. If the project needs to be left intact, another option is to destroy the infrastructure created from this module. Note, this does not destroy the Cloud Storage bucket containing the Terraform state and service enablement created out of Terraform. - - ```bash - cd ml-platform/01_gcp_project && \ - terraform destroy --auto-approve - ``` \ No newline at end of file diff --git a/ml-platform/01_gcp_project/backend.tf b/ml-platform/01_gcp_project/backend.tf deleted file mode 100644 index b54d5aca8..000000000 --- a/ml-platform/01_gcp_project/backend.tf +++ /dev/null @@ -1,20 +0,0 @@ -# Copyright 2024 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -terraform { - backend "gcs" { - prefix = "01_gcp_project" - bucket = "YOUR_STATE_BUCKET" - } -} diff --git a/ml-platform/01_gcp_project/main.tf b/ml-platform/01_gcp_project/main.tf deleted file mode 100644 index 1dadd943e..000000000 --- a/ml-platform/01_gcp_project/main.tf +++ /dev/null @@ -1,22 +0,0 @@ -# Copyright 2024 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -module "gcp-project" { - source = "./modules/projects" - org_id = var.org_id - folder_id = var.folder_id - env = var.env - billing_account = var.billing_account - project_name = var.project_name -} diff --git a/ml-platform/01_gcp_project/providers.tf b/ml-platform/01_gcp_project/providers.tf deleted file mode 100644 index 95ff9fe61..000000000 --- a/ml-platform/01_gcp_project/providers.tf +++ /dev/null @@ -1,22 +0,0 @@ -# Copyright 2024 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -terraform { - required_providers { - google = { - source = "hashicorp/google" - version = "4.72.1" - } - } -} diff --git a/ml-platform/01_gcp_project/variables.tf b/ml-platform/01_gcp_project/variables.tf deleted file mode 100644 index bb1adda73..000000000 --- a/ml-platform/01_gcp_project/variables.tf +++ /dev/null @@ -1,43 +0,0 @@ -# Copyright 2024 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -variable "org_id" { - type = string - description = "The GCP orig id" - default = "YOUR_GCP_ORG_ID" -} - -variable "env" { - type = set(string) - description = "List of environments" - default = ["dev"] -} - -variable "folder_id" { - type = string - description = "Folder Id where the GCP projects will be created" - default = null -} - -variable "billing_account" { - type = string - description = "GCP billing account" - default = "YOUR_BILLING_ACCOUNT" -} - -variable "project_name" { - type = string - description = "GCP project name" - default = "ml-platform" -} \ No newline at end of file diff --git a/ml-platform/02_gke/README.md b/ml-platform/02_gke/README.md deleted file mode 100644 index 248d1ee57..000000000 --- a/ml-platform/02_gke/README.md +++ /dev/null @@ -1,141 +0,0 @@ - -## Requirements - -| Name | Version | -|------|---------| -| [google](#requirement\_google) | 4.72.1 | -| [google-beta](#requirement\_google-beta) | 4.72.1 | - -## Modules - -| Name | Source | Version | -|------|--------|---------| -| [cloud-nat](#module\_cloud-nat) | 
./modules/cloud-nat | n/a | -| [create-vpc](#module\_create-vpc) | ./modules/network | n/a | -| [gke](#module\_gke) | ./modules/cluster | n/a | -| [node\_pool-ondemand](#module\_node\_pool-ondemand) | ./modules/node-pools | n/a | -| [node\_pool-reserved](#module\_node\_pool-reserved) | ./modules/node-pools | n/a | -| [node\_pool-spot](#module\_node\_pool-spot) | ./modules/node-pools | n/a | -| [reservation](#module\_reservation) | ./modules/vm-reservations | n/a | - -## Inputs - -| Name | Description | Type | Default | Required | -|------|-------------|------|---------------------------------------------------------------------------------------------------------------------------------------------|:--------:| -| [cluster\_name](#input\_cluster\_name) | Name of the GKE cluster | `string` | `"gke-ml"` | no | -| [lookup\_state\_bucket](#input\_lookup\_state\_bucket) | GCS bucket to look up TF state from previous steps. | `string` | n/a | yes | -| [network\_name](#input\_network\_name) | VPC network where GKE cluster will be created | `string` | `"ml-vpc"` | no | -| [ondemand\_taints](#input\_ondemand\_taints) | Taints to be applied to the on-demand node pool. |
list(object({
key = string
value = any
effect = string
}))
|
[
{
"effect": "NO_SCHEDULE",
"key": "ondemand",
"value": true
}
]
| no | -| [project\_id](#input\_project\_id) | The GCP project where the resources will be created. It is a map with environments as keys and project\_ids as values | `map` | n/a
 An example : 
project_id = {
"dev": "project_id1",
"staging": "project_id2",
"prod": "project_id3"
}
| yes | -| [reserved\_taints](#input\_reserved\_taints) | Taints to be applied to the reserved node pool. |
list(object({
key = string
value = any
effect = string
}))
|
[
{
"effect": "NO_SCHEDULE",
"key": "reserved",
"value": true
}
]
| no | -| [routing\_mode](#input\_routing\_mode) | VPC routing mode. | `string` | `"GLOBAL"` | no | -| [spot\_taints](#input\_spot\_taints) | Taints to be applied to the spot node pool. |
list(object({
key = string
value = any
effect = string
}))
|
[
{
"effect": "NO_SCHEDULE",
"key": "spot",
"value": true
}
]
| no | -| [subnet\_01\_description](#input\_subnet\_01\_description) | Description of the first subnet. | `string` | `"subnet 01"` | no | -| [subnet\_01\_ip](#input\_subnet\_01\_ip) | CIDR of the first subnet. | `string` | `"10.40.0.0/22"` | no | -| [subnet\_01\_name](#input\_subnet\_01\_name) | Name of the first subnet in the VPC network. | `string` | `"ml-vpc-subnet-01"` | no | -| [subnet\_01\_region](#input\_subnet\_01\_region) | Region of the first subnet. | `string` | `"us-central1"` | no | -| [subnet\_02\_description](#input\_subnet\_02\_description) | Description of the second subnet. | `string` | `"subnet 02"` | no | -| [subnet\_02\_ip](#input\_subnet\_02\_ip) | CIDR of the second subnet. | `string` | `"10.12.0.0/22"` | no | -| [subnet\_02\_name](#input\_subnet\_02\_name) | Name of the second subnet in the VPC network. | `string` | `"gke-vpc-subnet-02"` | no | -| [subnet\_02\_region](#input\_subnet\_02\_region) | Region of the second subnet. | `string` | `"us-west2"` | no | - -## Outputs - -| Name | Description | -|------|------------------| -| [gke\_cluster](#output\_gke\_cluster) | GKE cluster info | - -## Prerequisite -To run this Terraform Module, you need to have the following IAM roles on the projects where the GKE clusters will be created: -- roles/Owner - -## Usage -- Skip this step if you have run [01_gcp_project][projects] to create GCP projects. If you are starting from this module, run these steps. - - Create a new GCP project that will host the TF state bucket or use an existing project. - - To create a new project, open `cloudshell` and run the following command: - ``` - gcloud projects create - ``` - - Associate billing account to the project - ``` - gcloud beta billing projects link \ - --billing-account - ``` - - - Create a GCS bucket in the project for storing TF state. 
- - To create a new bucket, run the following command in `cloudshell` - ``` - gcloud storage buckets create gs://-tf-state --location= --project - ``` -- Clone the repo and change dir - ``` - git clone https://github.com/GoogleCloudPlatform/ai-on-gke - cd ml-platform/02_gke - ``` -- In backend.tf replace `YOUR_STATE_BUCKET` with the name of the GCS bucket. -- In variables.tf, provide the values of the following variables: - - `project_id` : If you created the projects using [01_gcp_project][projects] module, no need to provide a value for it as TF will read the project ids from the state file. - If you are providing your existing project ids, provide them in the following format. - - The following is an example of creating three env in the same GCP project : - ``` - { "dev" : "project1", "staging" : "project1", "prod" : "project1" } - ``` - The following is an example of creating three env in three different projects: - ``` - { "dev" : "project1", "staging" : "project2", "prod" : "project3" } - ``` - - - `lookup_state_bucket` : provide the name of the GCS bucket. - - -- If you did not use [01_gcp_projects][projects] module to create GCP projects and are supplying your project ids in variables.tf, enable the following APIs in those projects. - - In `cloudshell`, run: - ``` - gcloud config set project - - gcloud services enable cloudresourcemanager.googleapis.com iam.googleapis.com container.googleapis.com gkehub.googleapis.com anthos.googleapis.com anthosconfigmanagement.googleapis.com compute.googleapis.com - ``` - -- terraform init -- terraform plan -- terraform apply --auto-approve - -When Terraform apply has been completed, you will get the following resources: -- A VPC network per environment with a NAT gateway and Cloud router. -- A private GKE cluster per environment. This cluster will be created in the respective VPC. -- VM reservation for `nvidia-l4` -- Three node pools, spot, reserved and on-demand respectively. - - -## Clean up - -1. 
The easiest way to prevent continued billing for the resources that you created for this tutorial is to delete the project you created for the tutorial. Run the following commands from Cloud Shell: - - ```bash - gcloud config unset project && \ - echo y | gcloud projects delete - ``` - -2. If the project needs to be left intact, another option is to destroy the infrastructure created from this module. Note, this does not destroy the Cloud Storage bucket containing the Terraform state and service enablement created out of Terraform. - - ```bash - cd ml-platform/02_gke && \ - terraform destroy --auto-approve - ``` - -[projects]: ../01_gcp_project/README.md \ No newline at end of file diff --git a/ml-platform/02_gke/main.tf b/ml-platform/02_gke/main.tf deleted file mode 100644 index d8fbc0b21..000000000 --- a/ml-platform/02_gke/main.tf +++ /dev/null @@ -1,117 +0,0 @@ -# Copyright 2024 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -data "terraform_remote_state" "gcp-projects" { - count = length(keys("${var.project_id}")) == 0 ? 1 : 0 - backend = "gcs" - config = { - bucket = var.lookup_state_bucket - prefix = "01_gcp_project" - } -} - -locals { - parsed_project_id = length(keys("${var.project_id}")) == 0 ? 
data.terraform_remote_state.gcp-projects[0].outputs.project_ids : var.project_id -} - -module "create-vpc" { - for_each = local.parsed_project_id - source = "./modules/network" - project_id = each.value - network_name = format("%s-%s", var.network_name, each.key) - routing_mode = var.routing_mode - subnet_01_name = format("%s-%s", var.subnet_01_name, each.key) - subnet_01_ip = var.subnet_01_ip - subnet_01_region = var.subnet_01_region - subnet_02_name = format("%s-%s", var.subnet_02_name, each.key) - subnet_02_ip = var.subnet_02_ip - subnet_02_region = var.subnet_02_region - #default_route_name = format("%s-%s","default-route",each.key) -} - -resource "google_gke_hub_feature" "configmanagement_acm_feature" { - count = length(distinct(values(local.parsed_project_id))) - name = "configmanagement" - project = distinct(values(local.parsed_project_id))[count.index] - location = "global" - provider = google-beta -} - -module "gke" { - for_each = local.parsed_project_id - source = "./modules/cluster" - cluster_name = format("%s-%s", var.cluster_name, each.key) - network = module.create-vpc[each.key].vpc - subnet = module.create-vpc[each.key].subnet-1 - project_id = each.value - region = var.subnet_01_region - zone = "${var.subnet_01_region}-a" - master_auth_networks_ipcidr = var.subnet_01_ip - depends_on = [google_gke_hub_feature.configmanagement_acm_feature] - env = each.key -} -module "reservation" { - for_each = local.parsed_project_id - source = "./modules/vm-reservations" - cluster_name = module.gke[each.key].cluster_name - zone = "${var.subnet_01_region}-a" - project_id = each.value - depends_on = [module.gke] -} -module "node_pool-reserved" { - for_each = local.parsed_project_id - source = "./modules/node-pools" - node_pool_name = "reservation" - project_id = each.value - cluster_name = module.gke[each.key].cluster_name - region = var.subnet_01_region - taints = var.reserved_taints - resource_type = "reservation" - reservation_name = 
module.reservation[each.key].reservation_name -} - -module "node_pool-ondemand" { - for_each = local.parsed_project_id - source = "./modules/node-pools" - node_pool_name = "ondemand" - project_id = each.value - cluster_name = module.gke[each.key].cluster_name - region = var.subnet_01_region - taints = var.ondemand_taints - resource_type = "ondemand" -} - -module "node_pool-spot" { - for_each = local.parsed_project_id - source = "./modules/node-pools" - node_pool_name = "spot" - project_id = each.value - cluster_name = module.gke[each.key].cluster_name - region = var.subnet_01_region - taints = var.spot_taints - resource_type = "spot" - -} - -module "cloud-nat" { - for_each = local.parsed_project_id - source = "./modules/cloud-nat" - project_id = each.value - region = split("/", module.create-vpc[each.key].subnet-1)[3] - name = format("%s-%s", "nat-for-acm", each.key) - network = module.create-vpc[each.key].vpc - create_router = true - router = format("%s-%s", "router-for-acm", each.key) - depends_on = [module.create-vpc] -} diff --git a/ml-platform/02_gke/modules/cloud-nat/versions.tf b/ml-platform/02_gke/modules/cloud-nat/versions.tf deleted file mode 100644 index ee7532c5e..000000000 --- a/ml-platform/02_gke/modules/cloud-nat/versions.tf +++ /dev/null @@ -1,30 +0,0 @@ -# Copyright 2024 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -terraform { - required_providers { - - google = { - source = "hashicorp/google" - #version = ">= 4.51, < 5.0" - version = "4.72.1" - } - - random = { - source = "hashicorp/random" - version = "~> 2.2" - } - } - -} diff --git a/ml-platform/02_gke/modules/network/versions.tf b/ml-platform/02_gke/modules/network/versions.tf deleted file mode 100644 index c5f8c84a4..000000000 --- a/ml-platform/02_gke/modules/network/versions.tf +++ /dev/null @@ -1,22 +0,0 @@ -# Copyright 2024 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -terraform { - required_providers { - google = { - source = "hashicorp/google" - version = ">= 4.28.0" - } - } -} diff --git a/ml-platform/02_gke/outputs.tf b/ml-platform/02_gke/outputs.tf deleted file mode 100644 index 08500e25e..000000000 --- a/ml-platform/02_gke/outputs.tf +++ /dev/null @@ -1,17 +0,0 @@ -# Copyright 2024 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -output "gke_cluster" { - value = module.gke -} diff --git a/ml-platform/03_configsync/README.md b/ml-platform/03_configsync/README.md deleted file mode 100644 index 70da6a370..000000000 --- a/ml-platform/03_configsync/README.md +++ /dev/null @@ -1,147 +0,0 @@ - -## Requirements - -| Name | Version | -|------|--------| -| [github](#requirement\_github) | >= 4.3.0 | -| [google](#requirement\_google) | >= 4.72.1 | -| [google-beta](#requirement\_google-beta) | >= 4.72.1 | - -## Inputs - -| Name | Description | Type | Default | Required | -|----------------------------------------------------------------------------------------------------|-----------------------------------------------------------------------------------------------------------------------------|----------|---------|:--------:| -| [project\_id](#input\_project\_id) | Id of the GCP Project where the resources will be created. It is a map with environments as keys and project ids as values. | `map` | n/a | yes | -| [github\_user](#github\_user) | GitHub user name. | `string` | n/a | yes | -| [github\_email](#input\_github\_email) | GitHub user email. | `string` | n/a | yes | -| [github\_org](#input\_github\_org) | GitHub org. | `string` | n/a | yes | -| [github\_token](#input\_github\_token) | GitHub access token | `string` | n/a | yes | -| [lookup\_state\_bucket](#input\_lookup\_state\_bucket) | Lookup TF State bucket. Used for looking up resources created in steps 01 and 02. | `string` | n/a | yes | -| [configsync\_repo\_name](#input\_configsync\_repo\_name) | Configsync repo name to be created in GitHub. | `string` | n/a | no | - -## Prerequisite -- You have created GKE clusters using [02_gke][cluster] module. -- You have the role `roles/Owner` on the projects where you have created GKE clusters. 
- -## Usage -- Clone the repo and change dir - ``` - git clone https://github.com/GoogleCloudPlatform/ai-on-gke - cd ml-platform/03_configsync - ``` -- In backend.tf replace `YOUR_STATE_BUCKET` with the name of the GCS bucket. -- In variables.tf, provide the values of the following variables: - - `github_user` : GitHub user. We recommend you use a system user account. - - `github_email` : Email of the system user account. - - `github_org` : GitHub org where the config sync repo will be created. - - `lookup_state_bucket` : name of the GCS bucket. - - `configsync_repo_name` : Suitable name for your config sync repo. - -- You also need to provide a personal access token for the GitHub user. Generate a [personal access token][personal-access-token] with access to create and delete repo for the user in GitHub and pass it as env variable: - - export TF_VAR_github_token="``" -- terraform init -- terraform plan -- terraform apply --auto-approve - - -This module performs the following actions: -- Looks up project_id from the state file if not provided. -- Looks up GKE clusters created in step 02. -- Creates a GitHub repository and branches corresponding to each environment and apply branch protection rules on it. This is the configsync repo. -- Creates Config sync on each GKE clusters. -- Hydrates templates into K8s manifests and commit them to the default branch of the GitHub repo to do initial cluster setup. - -## Config sync repo workflow -After this module has been successfully completed, you will get a [root-sync][root-sync] object created on all the GKE clusters. - -Go to Google Cloud Console, click on the navigation menu and click on Kubernetes Engine > Config. Now, click on the `PACKAGES` tab. You will see three [root-sync][root-sync] objects created, one for each cluster. Review the `Source url` against the `dev` cluster. 
It should be something like: -``` -https://github.com//experiment-acm-repo/tree/dev/manifests/clusters - -``` -This means that the `dev` cluster is associated with the `manifests/clusters` folder on the `dev` branch of the configsync repo. So, manifests under `manifests/clusters` folder on the `dev` branch will be synced with the dev cluster. -Similarly, the folder `manifests/clusters` on `staging` branch will be synced with the `staging` cluster and `manifests/clusters` on `prod` branch will be synced with `prod` cluster. - -We will follow GitOps methodology to create resources on the clusters. This means you can only make changes to the default branch while other branches are protected. In order to merge changes to non-default branches, you will need to create a pull request. - -The following documentation will assume that you have three clusters `dev`, `staging` and `prod` and that resulted in three branches on the configsync repo `dev`, `staging` and `prod`. The `dev` branch is the default branch. - -To follow `GitOps` approach, you will make changes and push them to the `dev` branch. Config sync will then sync the `dev` branch with the `dev` cluster. If the changes look good in `dev` environment, -and are ready to be moved to `staging` you create a pull request from `dev` to `staging` branch. Once this pull request is approved and merged, the `staging` branch will be synced with `staging` cluster reflecting the changes in staging environment. -Similarly, when you are ready to promote the changes in production environment, create a pull request from `staging` to `prod` branch and merge it. - -## Managing cluster-level and application-level objects - -It is recommended to have a separation of duties on who should be able to create what objects in a cluster. -The principle to follow should be that the cluster-level objects can only be created by platform admins while the application teams should be able to create their own application level objects.
- -To achieve this separation, we will use [root-sync][root-sync] and [repo-sync][repo-sync]. [root-sync][root-sync] allows to create cluster scoped objects while [repo-sync][repo-sync] allows to create namespace scoped objects. - -### Cluster-level objects -Since the [root-sync][root-sync] object is associated with the folder `manifests/clusters`, the cluster level objects will be created from this folder. This includes creating CRDs, namespaces etc. So, for example, if you want to create a namespace as a platform admin, create a `yaml` file with the required K8s definition and save it under `manifests/clusters`. The namespace will be created on the cluster as soon as the sync happens. - -Note that the owner of the repo should create a CODEOWNERS file to allow access to the platform admins to this folder so that only they can make cluster level objects. The Application teams should not have access to `manifests/clusters`. - -In the section [04_setup_clusters][cluster-setup], you will create cluster scoped objects. - - -### Application-level objects -It is recommended to provide each Application its dedicated namespace. This means, only the application and related resources will be created in that namespace. The owner of the application or the app team will get full access on the namespace so they can manage their application without having to be dependent on the platform admins. - -Since the namespace is a cluster-scoped object, platform admin will need to create the namespace for the application and grant the app team members access on the namespace. Additionally, they will provide a [repo-sync][repo-sync] repo to the app teams so they can use that to manage their application's kubernetes resource. Once, this setup is done, the app team members can manage the application inside the namespace with the manifests in the [repo-sync][repo-sync] repo.
- -In the section [05_setup_teams][team-setup], you will learn how the platform admins will set up an application by providing a namespace to the App team along with a [repo-sync][repo-sync] that the app teams will use to manage their applications. - -In the section [06_operating_teams][operating-teams], you will learn how the app teams can use their [repo-sync][repo-sync] to manage their application. - -## Troubleshooting -If you do not have [GitHub pro membership][github-pro], you can not apply branch protection rules on your repositories in GitHub. This will cause `409 code` error when you run `terraform apply` . You can ignore these errors. The downside is that you will not get branch protection rules on your configsync repository and can accidentally push changes directly to the non-default branches (`staging` and `prod`). In other words, it will break the `GitOps` flow. - -## Contributing - -* [Contributing guidelines][contributing-guidelines] -* [Code of conduct][code-of-conduct] - - - -[contributing-guidelines]: CONTRIBUTING.md -[code-of-conduct]: code-of-conduct.md -[personal-access-token]: https://docs.github.com/en/authentication/keeping-your-account-and-data-secure/managing-your-personal-access-tokens -[repo-sync]: https://cloud.google.com/anthos-config-management/docs/reference/rootsync-reposync-fields -[root-sync]: https://cloud.google.com/anthos-config-management/docs/reference/rootsync-reposync-fields -[cluster-setup]: ../04_setup_clusters/README.md -[team-setup]: ../05_setup_teams/README.md -[operating-teams]: ../06_operating_teams -[cluster]: ../02_gke -[github-pro]: https://docs.github.com/en/get-started/learning-about-github/githubs-plans - -## Clean up - -1. The easiest way to prevent continued billing for the resources that you created for this tutorial is to delete the project you created for the tutorial. Run the following commands from Cloud Shell: - - ```bash - gcloud config unset project && \ - echo y | gcloud projects delete - ``` - -2.
If the project needs to be left intact, another option is to destroy the infrastructure created from this module. Note, this does not destroy the Cloud Storage bucket containing the Terraform state and service enablement created out of Terraform. - - ```bash - cd ml-platform/03_configsync && \ - terraform destroy --auto-approve - ``` - diff --git a/ml-platform/03_configsync/backend.tf b/ml-platform/03_configsync/backend.tf deleted file mode 100644 index b9d73f15f..000000000 --- a/ml-platform/03_configsync/backend.tf +++ /dev/null @@ -1,20 +0,0 @@ -# Copyright 2024 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -terraform { - backend "gcs" { - prefix = "03_config_sync_prerequisite" - bucket = "YOUR_STATE_BUCKET" - } -} diff --git a/ml-platform/03_configsync/main.tf b/ml-platform/03_configsync/main.tf deleted file mode 100644 index 671abee5c..000000000 --- a/ml-platform/03_configsync/main.tf +++ /dev/null @@ -1,131 +0,0 @@ -# Copyright 2024 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License. - -data "terraform_remote_state" "gke-clusters" { - backend = "gcs" - config = { - bucket = var.lookup_state_bucket - prefix = "02_gke" - } -} - -locals { - parsed_gke_info = data.terraform_remote_state.gke-clusters.outputs.gke_cluster - project_id_list = [for k, v in "${data.terraform_remote_state.gke-clusters.outputs.gke_cluster}" : v.gke_project_id] -} - -//resource "google_gke_hub_feature" "configmanagement_acm_feature" { -// count = length(distinct(local.project_id_list)) -// name = "configmanagement" -// project = distinct(local.project_id_list)[count.index] -// location = "global" -// provider = google-beta -//} - -resource "google_gke_hub_membership" "membership" { - provider = google-beta - for_each = local.parsed_gke_info - project = each.value["gke_project_id"] - membership_id = each.value["cluster_name"] - endpoint { - gke_cluster { - resource_link = format("%s/%s", "//container.googleapis.com", each.value["cluster_id"]) - } - } - lifecycle { - ignore_changes = [ - "labels", "description" - ] - } - #depends_on = [ google_gke_hub_feature.configmanagement_acm_feature ] -} - -resource "github_repository" "acm_repo" { - name = var.configsync_repo_name - description = "Repo for Config Sync" - visibility = "private" - has_issues = false - has_projects = false - has_wiki = false - - allow_merge_commit = true - allow_squash_merge = true - allow_rebase_merge = true - delete_branch_on_merge = false - auto_init = true - vulnerability_alerts = true -} -//Create a branch for each env -resource "github_branch" "branch" { - for_each = local.parsed_gke_info - repository = split("/", github_repository.acm_repo.full_name)[1] - branch = each.key - depends_on = [github_repository.acm_repo] -} -//Set default branch as the lowest env -resource "github_branch_default" "default_branch" { - repository = split("/", github_repository.acm_repo.full_name)[1] - branch = 
tostring(keys(local.parsed_gke_info)[0]) - #rename = true - depends_on = [github_branch.branch] -} -#Protect branches other than the default branch -resource "github_branch_protection_v3" "branch_protection" { - for_each = local.parsed_gke_info - repository = split("/", github_repository.acm_repo.full_name)[1] - branch = each.key - required_pull_request_reviews { - required_approving_review_count = 1 - require_code_owner_reviews = true - } - restrictions { - - } - - depends_on = [github_branch.branch] -} - -resource "google_gke_hub_feature_membership" "feature_member" { - provider = google-beta - for_each = local.parsed_gke_info - project = each.value["gke_project_id"] - location = "global" - feature = "configmanagement" - membership = google_gke_hub_membership.membership[each.key].membership_id - configmanagement { - version = "1.17.0" - config_sync { - source_format = "unstructured" - git { - sync_repo = "https://github.com/${github_repository.acm_repo.full_name}.git" - sync_branch = each.value["env"] - policy_dir = "manifests/clusters" - secret_type = "token" - } - } - policy_controller { - enabled = true - template_library_installed = true - referential_rules_enabled = true - } - } - - provisioner "local-exec" { - command = "${path.module}/create_cluster_yamls.sh ${var.github_org} ${github_repository.acm_repo.full_name} ${var.github_user} ${var.github_email} ${each.value["env"]} ${each.value["cluster_name"]} ${index(keys(local.parsed_gke_info), each.key)}" - } - - #depends_on = [ - # google_gke_hub_feature.configmanagement_acm_feature - # ] -} diff --git a/ml-platform/03_configsync/outputs.tf b/ml-platform/03_configsync/outputs.tf deleted file mode 100644 index a19b71988..000000000 --- a/ml-platform/03_configsync/outputs.tf +++ /dev/null @@ -1,21 +0,0 @@ -# Copyright 2024 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -output "membership" { - value = google_gke_hub_membership.membership -} - -output "val" { - value = local.parsed_gke_info -} diff --git a/ml-platform/03_configsync/variables.tf b/ml-platform/03_configsync/variables.tf deleted file mode 100644 index f04844d23..000000000 --- a/ml-platform/03_configsync/variables.tf +++ /dev/null @@ -1,48 +0,0 @@ -# Copyright 2024 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -variable "lookup_state_bucket" { - description = "GCS bucket to look up TF state from previous steps." - type = string - default = "YOUR_STATE_BUCKET" -} - -variable "configsync_repo_name" { - type = string - description = "Name of the GitHub repo that will be synced to the cluster with Config sync." - default = "config-sync-repo" -} - -variable "github_user" { - description = "GitHub user name." - type = string - default = "YOUR_GIT_USER" -} - -variable "github_email" { - description = "GitHub user email." - type = string - default = "YOUR_GIT_USER_EMAIL" -} - -variable "github_org" { - type = string - description = "GitHub org." 
- default = "YOUR_GIT_ORG" -} - -variable "github_token" { - type = string - description = "GitHub token. It is a token with write permissions as it will create a repo in the GitHub org." -} diff --git a/ml-platform/04_setup_clusters/README.md b/ml-platform/04_setup_clusters/README.md deleted file mode 100644 index 8e69190ee..000000000 --- a/ml-platform/04_setup_clusters/README.md +++ /dev/null @@ -1,139 +0,0 @@ - -### This doc is meant for the platform admins or the group that has admin level permissions on the GKE clusters. The steps mentioned in these docs must be executed by them. - -## Prerequisite -- You have successfully run through [03_configsync][configsync] module. - -### Complete config sync setup - -Go to Google Cloud Console, click on the navigation menu and click on Kubernetes Engine > Config. Now, click on the `PACKAGES` -tab. You will notice that the `Sync status` will show as stalled for all [root-sync][root-sync]. -This is because, config sync needs to authenticate with GitHub to be able to read the manifests in the configsync repo. It expects a secret named `git-creds` in `config-management-system` namespace on the cluster. -This secret stores the github user and its [personal access token][personal-access-token]. The [personal access token][personal-access-token] should have the read only access so config sync can read the repo to perform the sync. - -Follow these steps to create a new secret `git-creds` in `config-management-system` namespace: -- For the GitHub user account that you plan to use, generate a [personal access token][personal-access-token] with read access to the configsync repo. It is recommended to use a [machine user account][machine-user-account] for this but you can use a personal user account just to try this reference architecture. -- Get IAM role `roles/gkehubeditor` to be able to use the connect gateway to access the GKE cluster. If you are the owner of the project, this step can be skipped.
-- Open cloudshell and run these commands: - ``` - gcloud config set project - - gcloud container fleet memberships get-credentials - - kubectl create secret generic git-creds --namespace="config-management-system" --from-literal=username= --from-literal=token= - - gcloud container fleet memberships get-credentials - - kubectl create secret generic git-creds --namespace="config-management-system" --from-literal=username= --from-literal=token= - - gcloud container fleet memberships get-credentials - - kubectl create secret generic git-creds --namespace="config-management-system" --from-literal=username= --from-literal=token= - ``` - -After the `git-creds` secret has been created, you will see the `Sync status` for dev cluster will change from `stalled` to `synced` with a green tick mark against it. The `Sync status` for `staging` and `prod` clusters will change from stalled to Error. This is because the `staging` and `prod` branches of the repo have no content yet. - -Create a pull request from `dev` to `staging` and merge it. After the merge, the `Sync status` of the `staging` cluster will change from `Stalled` to `Synced`. Now, create a PR from `staging` to `prod` and merge it. The `Sync status` for `prod` cluster will change from `Stalled` to `Synced`. - -You just followed `GitOps` to promote changes from `dev` to higher environments. - -### Review the config sync repo -Open the configsync repo and go to `manifests/clusters`, you will see there is a cluster selector created for each cluster via yaml files. - -### Install a cluster scoped software -This section describes how platform admins will use the configsync repo to manage cluster scoped software or cluster level objects. These software could be used by multiple teams in their namespaces. An example of such software is [kuberay][kuberay] that can manage ray clusters in multiple namespaces. - - -Let's install [Kuberay][kuberay] as a cluster level software that includes CRDs and deployments.
Kuberay has a component called operator that facilitates `ray` on Kubernetes. We will install Kuberay operator in default namespace. The operator will then orchestrate `ray clusters` created in different namespaces by different teams in the future. -Perform the following steps: -- Clone the configsync repo and change directory. The default branch `dev` is checked out. - ``` - git clone repo - cd repo - ``` - -- From the provided templates under `templates/_cluster_template`, copy kustomization.yaml to `manifests/clusters` which is synced with the GKE clusters. kustomization.yaml will become the entrypoint for the [root-sync][root-sync] in the `manifests/clusters` folder and it syncs all the files defined in kustomization.yaml with the cluster. - ``` - cp templates/_cluster_template/kustomization.yaml manifests/clusters - ``` - -- Copy the directory containing the manifests to install kuberay to the directory that is synced with the GKE clusters. - ``` - cp -r templates/_cluster_template/kuberay manifests/clusters - ``` - Note that the directory `kuberay` is supplied as a template with this reference architecture. You can modify it based on your requirements. - -- Add cluster selector files in kustomization.yaml so config sync syncs these files with the clusters. The selectors are useful when you want to apply changes on one or multiple clusters selectively. - ``` - cat <<EOF >>manifests/clusters/kustomization.yaml - - - ./gke-ml-dev-cluster.yaml - - ./gke-ml-staging-cluster.yaml - - ./gke-ml-prod-cluster.yaml - - ./dev-selector.yaml - - ./staging-selector.yaml - - ./prod-selector.yaml - EOF - ``` - -- Commit the changes and push them to dev branch. - ``` - git add . - git commit -m "Installing Kuberay operator" - git push - ``` - -You just pushed the manifests to install kuberay operator in default namespace to the `dev` branch. Go to Google Cloud Console, click on the navigation menu and click on Kubernetes Engine > Config. Now, click on the `PACKAGES` tab.
Verify that the dev cluster is in `Synced` status. - -Verify in the `dev` cluster that [Kuberay operator][kuberay] has been installed successfully. -Open cloudshell and run these commands: -- gcloud config set project `` -- gcloud container fleet memberships get-credentials `` -- kubectl get crd | grep ray - - This should show result similar to the following: - ``` - rayclusters.ray.io 2024-02-12T21:19:06Z - rayjobs.ray.io 2024-02-12T21:19:09Z - rayservices.ray.io 2024-02-12T21:19:12Z - ``` -- kubectl get pods - - This should show result similar to the following: - ``` - NAME READY STATUS RESTARTS AGE - kuberay-operator-56b8d98766-2nvht 1/1 Running 0 6m26s - ``` -As you can see , we have installed the CRDs and the deployment for the kuberay operator. - -## Contributing - -* [Contributing guidelines][contributing-guidelines] -* [Code of conduct][code-of-conduct] - - - -[contributing-guidelines]: CONTRIBUTING.md -[code-of-conduct]: code-of-conduct.md -[repo-sync]: https://cloud.google.com/anthos-config-management/docs/reference/rootsync-reposync-fields -[root-sync]: https://cloud.google.com/anthos-config-management/docs/reference/rootsync-reposync-fields -[personal-access-token]: https://docs.github.com/en/authentication/keeping-your-account-and-data-secure/managing-your-personal-access-tokens -[machine-user-account]: https://docs.github.com/en/get-started/learning-about-github/types-of-github-accounts -[kuberay]: https://ray-project.github.io/kuberay/ -[configsync]: ../03_configsync - - - - diff --git a/ml-platform/05_setup_teams/README.md b/ml-platform/05_setup_teams/README.md deleted file mode 100644 index 70516f1a6..000000000 --- a/ml-platform/05_setup_teams/README.md +++ /dev/null @@ -1,169 +0,0 @@ - -### This doc is meant for the platform admins or the group that has admin level permissions on the GKE clusters. The steps mentioned in these docs must be executed by them. 
- -## Prerequisite -- You have successfully run through [04_setup_clusters][cluster-setup] module. - -## Setup teams -Typically, each team can own one or more namespaces and the team's users will get access to create, update and delete objects in those namespaces but they will be restricted from creating, updating or deleting cluster level objects or the objects in other namespaces. - -The platform admin will set up the teams(create namespace and permission team's users on it) using the configsync repo(via [root sync][root-sync]) and provide the app teams the means to manage objects in their own namespace without further involvement. - -Setting up a team has the following steps: -- create a new namespace for the team and permission the users on the namespace. - - Note: In this reference architecture, we create the namespace with the same name as the team. In real-world scenario, a team can own multiple namespaces so you might want to create namespaces with the application name that will be deployed in it. -- create a network-policy(optional). App teams can do it later. -- create a [reposync][repo-sync] object on the GKE clusters that will be associated with the repo/dir that is owned by the app teams. The app teams can manage the namespace scoped resources via their repo/dir by adding the kubernetes manifests there. - -### Prepare the changes to create a team - -In order to create a new namespace, perform the following steps: -- Clone the configsync repo and change directory. The default branch `dev` is checked out. - ``` - git clone `` repo - cd repo - ``` -- Copy the team template directory to the directory that is synced with the GKE clusters. The team template directory contains manifests to create namespace,[rbac][rbac],network policy and [reposync][repo-sync] - ``` - cp -r templates/_cluster_template/team manifests/clusters/ - ``` - `` is the name of the team for which the namespace is being created. It can also be the name of the application.
- Note that the team template is provided with this reference architecture. You can modify it based on your requirements. - - - -- Change the placeholders in the files under `manifests/clusters/` - - replace NAMESPACE with the name of the namespace/team in the files under `manifests/clusters/` - ``` - sed -i 's#NAMESPACE##g' manifests/clusters//* - ``` - - replace GIT_REPO with the link to the Git repository that you want to sync with this reposync in `manifests/clusters//reposync.yaml`. - ``` - sed -i 's#GIT_REPO##g' manifests/clusters//reposync.yaml - ``` - - manually replace NUMBER_OF_CHARACTERS_IN_REPOSYNC_NAME in `manifests/clusters//reposync.yaml` - e.g if the reposync name is prod-myteam, replace NUMBER_OF_CHARACTERS_IN_REPOSYNC_NAME with 11. - -- Create a new directory that the reposync object is pointing to. - ``` - mkdir manifests/apps/ - touch manifests/apps//.gitkeep - ``` - -- Add the path to the new team dir in kustomization.yaml to include it in the sync. - ``` - cat <<EOF >>manifests/clusters/kustomization.yaml - - ./ - EOF - ``` - - -### Review the files: -Go to `manifests/clusters/` -- kustomization.yaml specifies which yaml files should be synced with the cluster. -- namespace.yaml defines the code to create a new namespace. -- rbac.yaml creates a role for full access to the namespace and assign the role to the team's users. - - This can be changed to a more restricted role or you can create multiple roles for different users. - - There is also a rolebinding that provides [kuberay operator][kuberay] service account access to this namespace. This is required for [kuberay][kuberay] to be able to manage the ray clusters inside this namespace. -- reposync.yaml creates [reposync][repo-sync] object on the cluster for the given namespace. The [reposync][repo-sync] object will be connected to a repo and will be used by the app team to create, update and delete the namespace scoped objects like rayclusters etc.
- - The app team either can bring their own repo and provide it to the platform admins so they can update reposync.yaml accordingly. - - Alternatively, if your organization wants to follow mono repo structure, platform admin can create a subfolder named `` in this repo for each team under `manifests/apps` and provide the path `manifests/apps/` to the [reposync][repo-sync] object for that namespace. Platform admin can permission only the required team members to be able to edit the files under `manifests/apps/``` folder. - - see the `repo`, `revision` and `dir` tags in `reposync.yaml` that defines what repo and dir will be synced for this [reposync][repo-sync]. - - see [mono repo vs multi repos](#mono-repo-vs-multi-repos) if you want to decide which one to use. - -### Apply the changes: -Commit the changes and push them to dev branch. -``` -git add . -git commit -m "Adding a new team" -git push -``` - -The changes are pushed to `dev` branch so the namespace and related objects will be created in dev GKE cluster. -Now create pull request from `dev` to `staging` branch and merge it. Then create a pull request from `staging` to `prod` branch and merge it. This will create the namespace and related objects in `staging` and `prod` GKE clusters. - -Go to Google Cloud Console, click on the navigation menu and click on Kubernetes Engine > Config. Now, click on the `PACKAGES` tab. You will see a new [repo-sync][repo-sync] object created on each cluster but they will be in `Stalled` state. -This is because config sync needs to authenticate with GitHub to be able to read the manifests in the repo. It expects a secret named `git-creds` in the namespace for configuring [reposync][repo-sync] with the GitHub repo. -This secret stores the github user and its [personal access token][personal-access-token].
- -Follow these steps to create a new secret in dev cluster `git-creds`: -- For the GitHub user account that you plan to use, generate a [personal access token][personal-access-token] with read access to the configsync repo. It is recommended to use a [machine user account][machine-user-account] for this but you can use a personal user account just to try this reference architecture. -- Get IAM role roles/gkehubeditor to be able to use connect gateway to access the GKE cluster. If you are the owner of the project, this step can be skipped. -- Open cloudshell and run these commands: - ``` - gcloud config set project - - gcloud container fleet memberships get-credentials - - kubectl create secret generic git-creds --namespace="" --from-literal=username= --from-literal=token= - - gcloud container fleet memberships get-credentials - - kubectl create secret generic git-creds --namespace="" --from-literal=username= --from-literal=token= - - gcloud container fleet memberships get-credentials - - kubectl create secret generic git-creds --namespace="" --from-literal=username= --from-literal=token= - ``` - - -Go to Google Cloud Console, click on the navigation menu and click on Kubernetes Engine > Config. Now, click on the `PACKAGES` tab. You will see that the new [repo-sync][repo-sync] objects have `Sync status` as `Synced` with a green tick mark against them. This confirms that the [reposync][repo-sync] objects have been successfully created on all the clusters. - -This marks the completion of the team/namespace. - - -The platform admin will provide access to this dir to the members of the team. The team members will create manifests under this folder to manage their namespace scoped objects. - - -Important : -Platform admins should also restrict access to this directory so only the members of the team can update the files under it. -This can be done by creating CODEOWNERS files to allow the required team members to have access to this dir.
This way you will ensure that only the team members can manage Kubernetes objects in this namespace and no other team can do that. If the team members try to create cluster-scoped objects from this dir, it will result in error as this folder is connected to [reposync][repo-sync] objects which doesn't allow cluster level access. - - -### Mono repo vs multi repos -The platform admins and the app teams need to make a decision on what repo structure they will use for config sync. - -Using mono repo means: -- The same repo will be used for cluster level objects(created by platform admins) and namespace level objects(created by app teams). -- The platform admins will be the owner of the repo and maintain CODEOWNERS files to provide granular access to the platform admins and the app teams. -- However, if the app teams want to promote changes from one env to another, they will rely on platform admins or the repo owners to approve the PR. - -Using multiple repos means: -- The [rootsync][root-sync] will be tied to a repo that only platform-admins own and they can create cluster level objects from this repo. -- The [reposync][repo-sync] for individual teams will be created and tied to their own git repos. There is no need for granular permissions by platform admins as the app teams use their own repos to create namespace level objects. -- The app teams can create and merge PRs to their own repo independently to promote changes from one env to another.
- - -## Contributing - -* [Contributing guidelines][contributing-guidelines] -* [Code of conduct][code-of-conduct] - - - -[contributing-guidelines]: CONTRIBUTING.md -[code-of-conduct]: code-of-conduct.md -[repo-sync]: https://cloud.google.com/anthos-config-management/docs/reference/rootsync-reposync-fields -[root-sync]: https://cloud.google.com/anthos-config-management/docs/reference/rootsync-reposync-fields -[personal-access-token]: https://docs.github.com/en/authentication/keeping-your-account-and-data-secure/managing-your-personal-access-tokens -[machine-user-account]: https://docs.github.com/en/get-started/learning-about-github/types-of-github-accounts -[kuberay]: https://ray-project.github.io/kuberay/ -[rbac]: https://kubernetes.io/docs/reference/access-authn-authz/rbac/ -[cluster-setup]: ../04_setup_clusters - - diff --git a/ml-platform/06_operating_teams/README.md b/ml-platform/06_operating_teams/README.md deleted file mode 100644 index 9b5712210..000000000 --- a/ml-platform/06_operating_teams/README.md +++ /dev/null @@ -1,154 +0,0 @@ - -This doc describes how you as an App team member will use the configsync repo to manage your applications scoped to your namespace. -We will demonstrate this with an example of installing `ray` in the namespace. Typically, you can install any software or deploy any application in your namespace in the same fashion. - -## Prerequisite -- You have successfully run through [05_setup_teams][team-setup] module. - - -## Install a software(ray) - -This section is meant for the app teams that have permission only on a given namespace in the GKE clusters. The steps mentioned in this section must be executed by them. - -`Ray` is s an open-source unified compute framework that makes it easy to scale AI and Python workloads — from reinforcement learning to deep learning to tuning, and model serving. -It is very commonly used by Machine Learning teams. In order to run `Ray` on Kubernetes, you need `Kuberay` operator. 
The `kuberay` operators can manage the ray clusters installed in different namespaces. So, if there are multiple teams that need to use `ray` can install it in their own namespace while the kuberay oeprator can manage all of them. -Installing `kuberay` requires cluster level access as it creates the CRDs. We demonstrated installing `kuberay` in [cluster-setup][cluster-setup]. -Here we will show how to install `ray` in a namespace and configure `kuebray` to manage it. - -As an app team member, you will have access to `manifests/apps/``` folder in this repo if you are using a [mono repo][mono-repo] structure. You can perform the following steps to add `ray` manifests to the folder. The [reposync][repo-sync] will sync the manifests to the namespace on the cluster and you will get `ray` installed in your namespace. - -Note: If you are using multi repo structure, you will have access to the entire git repo and you can add the manifests in the similar fashion in the required directory to install `ray`. - -### Create the manifests -- Open `cloudshell` and run the following commands: - ``` - git clone repo && cd repo - - cp -r templates/_namespace_template/app/* manifests/apps// - ``` - -- Replace NAMSESPACE with the name of the namespace in the newly copied files. - ``` - sed -i 's#NAMESPACE##g' manifests/apps//* - ``` - -### Review the manifests -- `kustomization.yaml` specifies which yaml files should be synced with the cluster for this namespace. It references to a helm chart to install `ray` -- `values.yaml` contains the overriding values for the kuberay helm chart. -- `fluentd_config.yaml` specifies Configmap that will be applied to the namespace. -- `serviceaccount.yaml`(optional) sepcifies the kubernetes service account. This service account can be used for [workload identity][workload-identity]. - -Note that these files are provided as a template with the reference architecture for installing ray cluster. You can modify these templates as needed. 
- -### Apply the manifest: -- Go to `cloudshell` where you cloned the repo and copied the new files. - ``` - git add . - - git commit -m "Installing ray in namespace " - - git push - ``` - -The changes are pushed to `dev` branch so `ray` is installed on `dev` GKE cluster. To apply these changes to `staging`, create a pull request from `dev` to `staging` branch and merge it. Similarly, in order to apply the changes to `prod` cluster, create a pull request from `staging` to `prod` branch and merge it. - -Go to Google Cloud Console, click on the navigation menu and click on Kubernetes Engine > Config. Now, click on the `PACKAGES` tab. The [repo-sync][repo-sync] objects should show `Sync status` as `Synced` with green tick against it. - -### Verify the raycluster is in ready state in the namespace. -- Open cloudshell and run these commands: - ``` - gcloud config set project - - gcloud container fleet memberships get-credentials - - kubectl get raycluster -n - ``` -- This should show result similar to the following: - - ``` - NAME DESIRED WORKERS AVAILABLE WORKERS STATUS AGE - ray-cluster-kuberay 4m9s - ``` - -### Update kuberay operator to manage ray in your namespace - -This section is meant for platform admins or the group that has admin level permissions on the GKE clusters. The steps mentioned in these docs must be executed by them. - -[Kuberay][kuberay] operator manages `ray` on Kubernetes. You need to configure kuberay operator so that it manages `ray` in your namespace. `kuberay` was installed via [rootsync][root-sync] from the folder `manifests/clusters` by platform-admin so they should be performing the following step. -- Go to `cloudshell` where you cloned the repo. - -- Open `manifests/clusters/kuberay/values.yaml` -- add the namespace under `watchNamespace` tag. e.g. - ``` - watchNamespace: - - - ``` -- Commit and push the changes - ``` - git add . 
- - git commit -m "Updating kuberay operator to watch the namespace " - - git push - ``` -To apply these changes to `staging`, create a pull request from `dev` to `staging` branch and merge it. Similarly, in order to apply the changes to `prod` cluster, create a pull request from `staging` to `prod` branch and merge it. - -[kuberay][kuberay] operator will start managing the `ray` in your namespace on all the clusters. - -### Verify the ray head and worker has been started in your namespace. -- Open `cloudshell` and run these commands: - ``` - gcloud config set project - - gcloud container fleet memberships get-credentials - ``` -- Run `kubectl get raycluster -n ``` . This should show result similar to the following indicating the raycluster is now ready: - - ``` - NAME DESIRED WORKERS AVAILABLE WORKERS STATUS AGE - ray-cluster-kuberay 1 1 ready 29m - - ``` - -- Run `kubectl get pods -n ``` . This should show result similar to the following: - - ``` - NAME READY STATUS RESTARTS AGE - ray-cluster-kuberay-head-sp6dg 2/2 Running 0 3m21s - ray-cluster-kuberay-worker-workergroup-rzpjw 2/2 Running 0 3m21s - ``` - -## Contributing - -* [Contributing guidelines][contributing-guidelines] -* [Code of conduct][code-of-conduct] - - - -[contributing-guidelines]: CONTRIBUTING.md -[code-of-conduct]: code-of-conduct.md -[repo-sync]: https://cloud.google.com/anthos-config-management/docs/reference/rootsync-reposync-fields -[root-sync]: https://cloud.google.com/anthos-config-management/docs/reference/rootsync-reposync-fields -[personal-access-token]: https://docs.github.com/en/authentication/keeping-your-account-and-data-secure/managing-your-personal-access-tokens -[machine-user-account]: https://docs.github.com/en/get-started/learning-about-github/types-of-github-accounts -[kuberay]: https://ray-project.github.io/kuberay/ -[workload-identity]: https://cloud.google.com/kubernetes-engine/docs/how-to/workload-identity -[cluster-setup]: ../04_setup_clusters/README.md -[mono-repo]: 
../05_setup_teams/README.md#mono-repo-vs-multi-repos -[team-setup]: ../05_setup_teams - - diff --git a/ml-platform/README.md b/ml-platform/README.md index 59e712b18..90c3a165f 100644 --- a/ml-platform/README.md +++ b/ml-platform/README.md @@ -1,45 +1,138 @@ - -# Reference architecture demonstrating how to build your ML platform on GKE. - -## Purpose - -This tutorial demonstrates repeatable patterns to setup a multi environment ML platform on private [Google Kubernetes Engine](https://cloud.google.com/kubernetes-engine/docs/concepts/kubernetes-engine-overview) (GKE) that can be extended for end-to-end MLOps. - -It addresses following personae and provides means to automate and simplify their CUJs. - -### Platform Admin - -**CUJ 1** : Provide templates with built-in standard practices to stamp out GKE platforms to be used by ML Engineers/Data Scientist. - -**CUJ 2** : Provide space for the ML teams on GKE cluster to run their workloads and the permissions following the principle of least privilege. - -**CUJ 3** : Provide secure methods to the ML teams and the Operators to connect to the private GKE clusters. 
+ +## Requirements + +| Name | Version | +|------|---------| +| [github](#requirement\_github) | 6.0.1 | +| [google](#requirement\_google) | 5.19.0 | +| [google-beta](#requirement\_google-beta) | 5.19.0 | +| [null](#requirement\_null) | 3.2.2 | + +## Providers + +| Name | Version | +|------|---------| +| [github](#provider\_github) | 6.0.1 | +| [google](#provider\_google) | 5.19.0 | +| [google-beta](#provider\_google-beta) | 5.19.0 | +| [null](#provider\_null) | 3.2.2 | + +## Modules + +| Name | Source | Version | +|------|--------|---------| +| [cloud-nat](#module\_cloud-nat) | ./modules/cloud-nat | n/a | +| [create-vpc](#module\_create-vpc) | ./modules/network | n/a | +| [gcp-project](#module\_gcp-project) | ./modules/projects | n/a | +| [gke](#module\_gke) | ./modules/cluster | n/a | +| [node\_pool-ondemand](#module\_node\_pool-ondemand) | ./modules/node-pools | n/a | +| [node\_pool-reserved](#module\_node\_pool-reserved) | ./modules/node-pools | n/a | +| [node\_pool-spot](#module\_node\_pool-spot) | ./modules/node-pools | n/a | +| [reservation](#module\_reservation) | ./modules/vm-reservations | n/a | + +## Resources + +| Name | Type | +|------|------| +| [github_branch.branch](https://registry.terraform.io/providers/integrations/github/6.0.1/docs/resources/branch) | resource | +| [github_branch_default.default_branch](https://registry.terraform.io/providers/integrations/github/6.0.1/docs/resources/branch_default) | resource | +| [github_branch_protection_v3.branch_protection](https://registry.terraform.io/providers/integrations/github/6.0.1/docs/resources/branch_protection_v3) | resource | +| [github_repository.acm_repo](https://registry.terraform.io/providers/integrations/github/6.0.1/docs/resources/repository) | resource | +| [google-beta_google_gke_hub_feature.configmanagement_acm_feature](https://registry.terraform.io/providers/hashicorp/google-beta/5.19.0/docs/resources/google_gke_hub_feature) | resource | +| 
[google-beta_google_gke_hub_feature_membership.feature_member](https://registry.terraform.io/providers/hashicorp/google-beta/5.19.0/docs/resources/google_gke_hub_feature_membership) | resource | +| [google-beta_google_gke_hub_membership.membership](https://registry.terraform.io/providers/hashicorp/google-beta/5.19.0/docs/resources/google_gke_hub_membership) | resource | +| [google_project_service.project_services-an](https://registry.terraform.io/providers/hashicorp/google/5.19.0/docs/resources/project_service) | resource | +| [google_project_service.project_services-anc](https://registry.terraform.io/providers/hashicorp/google/5.19.0/docs/resources/project_service) | resource | +| [google_project_service.project_services-com](https://registry.terraform.io/providers/hashicorp/google/5.19.0/docs/resources/project_service) | resource | +| [google_project_service.project_services-con](https://registry.terraform.io/providers/hashicorp/google/5.19.0/docs/resources/project_service) | resource | +| [google_project_service.project_services-cr](https://registry.terraform.io/providers/hashicorp/google/5.19.0/docs/resources/project_service) | resource | +| [google_project_service.project_services-gate](https://registry.terraform.io/providers/hashicorp/google/5.19.0/docs/resources/project_service) | resource | +| [google_project_service.project_services-gkecon](https://registry.terraform.io/providers/hashicorp/google/5.19.0/docs/resources/project_service) | resource | +| [google_project_service.project_services-gkeh](https://registry.terraform.io/providers/hashicorp/google/5.19.0/docs/resources/project_service) | resource | +| [google_project_service.project_services-iam](https://registry.terraform.io/providers/hashicorp/google/5.19.0/docs/resources/project_service) | resource | +| [null_resource.create_git_cred_cms](https://registry.terraform.io/providers/hashicorp/null/3.2.2/docs/resources/resource) | resource | +| 
[null_resource.create_git_cred_ns](https://registry.terraform.io/providers/hashicorp/null/3.2.2/docs/resources/resource) | resource | +| [null_resource.create_namespace](https://registry.terraform.io/providers/hashicorp/null/3.2.2/docs/resources/resource) | resource | +| [null_resource.install_kuberay_operator](https://registry.terraform.io/providers/hashicorp/null/3.2.2/docs/resources/resource) | resource | +| [null_resource.install_ray_cluster](https://registry.terraform.io/providers/hashicorp/null/3.2.2/docs/resources/resource) | resource | +| [null_resource.manage_ray_ns](https://registry.terraform.io/providers/hashicorp/null/3.2.2/docs/resources/resource) | resource | + +## Inputs + +| Name | Description | Type | Default | Required | +|------|-------------|------|---------|:--------:| +| [billing\_account](#input\_billing\_account) | GCP billing account | `string` | `null` | no | +| [cluster\_name](#input\_cluster\_name) | Name of the GKE cluster | `string` | `"gke-ml"` | no | +| [configsync\_repo\_name](#input\_configsync\_repo\_name) | Name of the GitHub repo that will be synced to the cluster with Config sync. | `string` | `"config-sync-repo"` | no | +| [create\_namespace](#input\_create\_namespace) | Setup a namespace to demo. | `number` | `1` | no | +| [create\_projects](#input\_create\_projects) | Flag to create GCP projects | `number` | `0` | no | +| [env](#input\_env) | List of environments | `set(string)` |
[
"dev"
]
| no | +| [folder\_id](#input\_folder\_id) | Folder Id where the GCP projects will be created | `string` | `null` | no | +| [github\_email](#input\_github\_email) | GitHub user email. | `string` | n/a | yes | +| [github\_org](#input\_github\_org) | GitHub org. | `string` | n/a | yes | +| [github\_token](#input\_github\_token) | GitHub token. It is a token with write permissions as it will create a repo in the GitHub org. | `string` | n/a | yes | +| [github\_user](#input\_github\_user) | GitHub user name. | `string` | n/a | yes | +| [install\_kuberay](#input\_install\_kuberay) | Flag to install kuberay operator. | `number` | `1` | no | +| [install\_ray\_in\_ns](#input\_install\_ray\_in\_ns) | Flag to install ray cluster in the namespace created with the demo. | `number` | `1` | no | +| [namespace](#input\_namespace) | Name of the namespace to demo. | `string` | `"ml-team"` | no | +| [network\_name](#input\_network\_name) | VPC network where GKE cluster will be created | `string` | `"ml-vpc"` | no | +| [ondemand\_taints](#input\_ondemand\_taints) | Taints to be applied to the on-demand node pool. |
list(object({
key = string
value = any
effect = string
}))
|
[
{
"effect": "NO_SCHEDULE",
"key": "ondemand",
"value": true
}
]
| no | +| [org\_id](#input\_org\_id) | The GCP orig id | `string` | `null` | no | +| [project\_id](#input\_project\_id) | The GCP project where the resources will be created. It is a map with environments as keys and project\_ids s values | `map` | n/a | yes | +| [project\_name](#input\_project\_name) | GCP project name | `string` | `null` | no | +| [reserved\_taints](#input\_reserved\_taints) | Taints to be applied to the reserved node pool. |
list(object({
key = string
value = any
effect = string
}))
|
[
{
"effect": "NO_SCHEDULE",
"key": "reserved",
"value": true
}
]
| no | +| [routing\_mode](#input\_routing\_mode) | VPC routing mode. | `string` | `"GLOBAL"` | no | +| [secret\_for\_rootsync](#input\_secret\_for\_rootsync) | Create git-cred in config-management-system namespace. | `number` | `1` | no | +| [spot\_taints](#input\_spot\_taints) | Taints to be applied to the spot node pool. |
list(object({
key = string
value = any
effect = string
}))
|
[
{
"effect": "NO_SCHEDULE",
"key": "spot",
"value": true
}
]
| no | +| [subnet\_01\_description](#input\_subnet\_01\_description) | Description of the first subnet. | `string` | `"subnet 01"` | no | +| [subnet\_01\_ip](#input\_subnet\_01\_ip) | CIDR of the first subnet. | `string` | `"10.40.0.0/22"` | no | +| [subnet\_01\_name](#input\_subnet\_01\_name) | Name of the first subnet in the VPC network. | `string` | `"ml-vpc-subnet-01"` | no | +| [subnet\_01\_region](#input\_subnet\_01\_region) | Region of the first subnet. | `string` | `"us-central1"` | no | +| [subnet\_02\_description](#input\_subnet\_02\_description) | Description of the second subnet. | `string` | `"subnet 02"` | no | +| [subnet\_02\_ip](#input\_subnet\_02\_ip) | CIDR of the second subnet. | `string` | `"10.12.0.0/22"` | no | +| [subnet\_02\_name](#input\_subnet\_02\_name) | Name of the second subnet in the VPC network. | `string` | `"gke-vpc-subnet-02"` | no | +| [subnet\_02\_region](#input\_subnet\_02\_region) | Region of the second subnet. | `string` | `"us-west2"` | no | + +## Outputs + +| Name | Description | +|------|-------------| +| [project\_ids](#output\_project\_ids) | n/a | + + +## Platform Principles + +This reference architecture demonstrates how to build a GKE platform that facilitates Machine Learning. The reference architecture is based on the following principles: + + - The platform admin will create the GKE platform using an IaC tool like [Terraform][terraform]. The IaC will come with reusable modules that can be referenced to create more resources as the demand grows. + - The platform will be based on [GitOps][gitops]. + - After the GKE platform has been created, cluster scoped resources on it will be created through [Config Sync][config-sync] by the admins. + - Platform admins will create a namespace per application and provide the application team members full access to it.
+ - The namespace scoped resources will be created by the Application/ML teams either via [Config Sync][config-sync] or through a deployment tool like [Cloud Deploy][cloud-deploy] + +## CUJ and Personae addressed in the reference architecture + +### Persona : Platform Admin + +**CUJ 1** : Provide templates with built-in standard practices to stamp out GKE platforms to be used by ML Engineers, Data Scientists and Application teams. + +**CUJ 2** : Provide GKE clusters. + +**CUJ 2** : Provide space for the teams on GKE cluster to run their workloads and the permissions following the principle of least privilege. + +**CUJ 3** : Provide secure methods to the ML Engineers, Data Scientist, Application teams and the Operators to connect to the private GKE clusters. **CUJ 4** : Enforcing security policies on the underlying platform. -### ML Engineers +### Persona : ML Engineers **CUJ 1** : Use ML tools like `ray` to perform their day to day tasks like data pre-processing, ML training etc. **CUJ 2** : Use a development environment like Jupyter Notebook for faster inner loop of ML development. **[TBD]** -### Operators +### Persona : Operators **CUJ 1**: Act as a bridge between the Platform admins and the ML Engineers by providing and maintaining software needed by the ML engineers so they can focus on their job. @@ -52,29 +145,100 @@ It addresses following personae and provides means to automate and simplify thei ## Prerequistes 1. This tutorial has been tested on [Cloud Shell](https://shell.cloud.google.com) which comes preinstalled with [Google Cloud SDK](https://cloud.google.com/sdk) is required to complete this tutorial. - -## Deploy resources. - -Follow these steps in order to build the platform and use it. - -- Run Terraform in [01_gcp_project folder][projects]. This module creates GCP projects for your ML environments. This is an optional module. If you already have created GCP projects, directly run 02_gke module. - -- Run Terraform in [02_gke folder][gke]. 
This modules creates private GKE clusters for each environment. - -- Run Terraform in [03_configsync folder][configsync]. This modules enables Config management on GKE clusters, creates a repository in GitHub and creates a [root-sync][root-sync] on the clusters connected to the repo. - -- Run steps in [04_setup_clusters][setup-clusters]. This modules walks through how as platform admin you can set up cluster level software to the ML teams. - -- Run steps in [05_setup_teams][setup-teams]. This modules walks through how as platform admin you can set up spaces for ML teams on the cluster and transfer ownership to operators to maintain that space. - -- Run steps in [06_operating_teams][operating-teams]. This module walks through how as an operator you will provide the software required by ML engineers. - - -[projects]: ./01_gcp_project/README.md -[gke]: ./02_gke/README.md -[configsync]: ./03_configsync/README.md -[setup-clusters]: ./04_setup_clusters/README.md -[setup-teams]: ./05_setup_teams/README.md -[operating-teams]: ./06_operating_teams/README.md +2. Familiarity with [Google Kubernetes Engine][gke], [Terraform][terraform], [root-sync][root-sync] , [repo-sync][repo-sync] , [Git][git], [GitHub][github] + +# Workflow + +This reference architecture can be implemented in one of the following ways: + +- Deploy a single env reference architecture. +- Deploy a multi env reference architecture in single [GCP project][gcp-project] +- Deploy a multi env reference architecture with each env in its own [GCP project][gcp-project] + +## Deploy a single env reference architecture +This is the quick-start deployment. It can be used to quickly set up an environment and start playing with it to get an understanding of the flow. Single env reference architecture can be deployed with the provided default values. + +### Configuration +- You can either create a new GCP project or use an existing one. Skip this step if you choose to use an already existing project. 
+ - To create a new project, open `cloudshell` and run the following command: + ``` + gcloud projects create <PROJECT_ID> + ``` + - Associate billing account to the project: + ``` + gcloud beta billing projects link <PROJECT_ID> \ + --billing-account <BILLING_ACCOUNT_ID> + ``` +- Set up PROJECT_ID in environment variable in `cloudshell` : + ``` + echo 'export PROJECT_ID="<PROJECT_ID>"' >> ~/.bashrc && source ~/.bashrc + ``` + + Replace `<PROJECT_ID>` with the id of the project that you created in the previous step or the id of an already existing project that you want to use. + +- Update ~/.bashrc to automatically point to the required project when a new instance of the `cloudshell` is created: + ``` + echo gcloud config set project $PROJECT_ID >> ~/.bashrc && source ~/.bashrc + ``` + +- Create a GCS bucket in the project for storing TF state. + - To create a new bucket, run the following command in `cloudshell` : + ``` + echo "export STATE_BUCKET=\"${PROJECT_ID}-tf-state\"" >> ~/.bashrc && source ~/.bashrc + + gcloud storage buckets create gs://${STATE_BUCKET} + ``` + +- Store github configurations in environment variables: + ``` + echo 'export GITHUB_USER="<GITHUB_USER>"' >> ~/.bashrc + echo 'export GITHUB_ORG="<GITHUB_ORG>"' >> ~/.bashrc + echo 'export GITHUB_EMAIL="<GITHUB_EMAIL>"' >> ~/.bashrc + source ~/.bashrc + ``` + +- Create a [Personal Access Token][personal-access-token] in [GitHub][github]: + + Note: It is recommended to use a [machine user account][machine-user-account] for this but you can use a personal user account just to try this reference architecture. + - Go to https://github.com/settings/tokens and login using your credentials + - Click "Generate new token" >> "Generate new token (classic)". + - You will be directed to a screen to create the new token. Provide the note and expiration. + - Choose the following two scopes: + - [x] repo + - [x] delete_repo + - Click "Generate token" + - Store the token safely. + +### Run Terraform + +- Clone the repo and change dir + ``` + git clone https://github.com/GoogleCloudPlatform/ai-on-gke + + cd ai-on-gke/ml-platform + ``` + +- Perform variable replacement.
+ ``` + sed -i "s/YOUR_STATE_BUCKET/${STATE_BUCKET}/g" backend.tf + + sed -i "s/YOUR_PROJECT_ID/${PROJECT_ID}/g" mlenv.auto.tfvars + ``` + +Typically, you would want to have dev, staging and production environments created in separate projects. To have such isolation, pass `env` input variable as `[ "dev", "staging", "prod" ]`. This will create one project each for the dev, staging and prod environments. You can update the input variable `env` based on how many environments/projects you want to create. + +However, if you want to use a single project for multiple environments, you can create just one project by passing one element to `env` input variable list e.g. `[ "dev" ]` or `["my-playground"]` etc. + + +[gitops]: https://about.gitlab.com/topics/gitops/ [repo-sync]: https://cloud.google.com/anthos-config-management/docs/reference/rootsync-reposync-fields -[root-sync]: https://cloud.google.com/anthos-config-management/docs/reference/rootsync-reposync-fields \ No newline at end of file +[root-sync]: https://cloud.google.com/anthos-config-management/docs/reference/rootsync-reposync-fields +[config-sync]: https://cloud.google.com/anthos-config-management/docs/config-sync-overview +[cloud-deploy]: https://cloud.google.com/deploy?hl=en +[terraform]: https://www.terraform.io/ +[gke]: https://cloud.google.com/kubernetes-engine?hl=en +[git]: https://git-scm.com/ +[github]: https://github.com/ +[gcp-project]: https://cloud.google.com/resource-manager/docs/creating-managing-projects +[personal-access-token]: https://docs.github.com/en/authentication/keeping-your-account-and-data-secure/managing-your-personal-access-tokens +[machine-user-account]: https://docs.github.com/en/get-started/learning-about-github/types-of-github-accounts \ No newline at end of file diff --git a/ml-platform/02_gke/backend.tf b/ml-platform/backend.tf similarity index 96% rename from ml-platform/02_gke/backend.tf rename to ml-platform/backend.tf index 97deced77..a676d7219 100644 --- a/ml-platform/02_gke/backend.tf
+++ b/ml-platform/backend.tf @@ -14,7 +14,8 @@ terraform { backend "gcs" { - prefix = "02_gke" + prefix = "terraform" bucket = "YOUR_STATE_BUCKET" } } + diff --git a/ml-platform/03_configsync/create_cluster_yamls.sh b/ml-platform/create_cluster_yamls.sh similarity index 53% rename from ml-platform/03_configsync/create_cluster_yamls.sh rename to ml-platform/create_cluster_yamls.sh index 3c659a198..627e8fd7a 100755 --- a/ml-platform/03_configsync/create_cluster_yamls.sh +++ b/ml-platform/create_cluster_yamls.sh @@ -28,49 +28,30 @@ sleep $sleep_total random=$(echo $RANDOM | md5sum | head -c 20; echo) log="$(pwd)/log" flag=0 -#github_token=${7} -#echo "${github_token}" >> log -#echo "${TF_VAR_github_token}" >> log -#ls -lrt >> log -#ls -lrt ../ >> log -#TIMESTAMP=$(date "+%Y%m%d%H%M%S") -download_acm_repo_name=$(echo ${acm_repo_name} | awk -F "/" '{print $2}')-${random} + +download_acm_repo_name="/tmp/$(echo ${acm_repo_name} | awk -F "/" '{print $2}')-${random}" git config --global user.name ${github_user} git config --global user.email ${github_emai} git clone https://${github_user}:${TF_VAR_github_token}@github.com/${acm_repo_name} ${download_acm_repo_name} -echo "Download repo is ${download_acm_repo_name}" >> ${log} -echo "ls -lrt before going into download repo is $(ls -lrt)" >> ${log} -cd ${download_acm_repo_name} -echo "ls -lrt in download repo is $(ls -lrt)" >> ${log} -if [ ! -d "manifests" ] && [ ! -d "templates" ]; then - echo "copying files" >> ${log} - cp -r ../templates/acm-template/* . + +if [ ! -d "${download_acm_repo_name}/manifests" ] && [ ! 
-d "${download_acm_repo_name}/templates" ]; then + echo "copying files" + cp -r templates/acm-template/* ${download_acm_repo_name} flag=1 fi -cd manifests/clusters -if [ ${flag} -eq 0 ]; then - echo "not copying files" >> ${log} +cd ${download_acm_repo_name}/manifests/clusters +if [ "${flag}" -eq 0 ]; then + echo "not copying files" fi -echo "In directory $(pwd)" >> ${log} -echo "level0 $(ls -lrt)" >> ${log} -echo "level1 $(ls -lrt ../)" >> ${log} -echo "level2 $(ls -lrt ../../)" >> ${log} -echo "level3 $(ls -lrt ../../../)" >> ${log} -echo "level4 $(ls -lrt ../../../../ )" >> ${log} -echo "env is ${cluster_env}" >> ${log} - -cp ../../templates/_cluster-template/cluster.yaml ./${cluster_name}-cluster.yaml -cp ../../templates/_cluster-template/selector.yaml ./${cluster_env}-selector.yaml -#cp ../../templates/_cluster-template/connect-gateway-rbac.yaml ./${cluster_name}-connect-gateway-rbac.yaml +cp ../../templates/_cluster_template/cluster.yaml ./${cluster_name}-cluster.yaml +cp ../../templates/_cluster_template/selector.yaml ./${cluster_env}-selector.yaml find . -type f -name ${cluster_name}-cluster.yaml -exec sed -i "s/CLUSTER_NAME/${cluster_name}/g" {} + find . -type f -name ${cluster_name}-cluster.yaml -exec sed -i "s/ENV/${cluster_env}/g" {} + find . -type f -name ${cluster_env}-selector.yaml -exec sed -i "s/ENV/${cluster_env}/g" {} + -#find . -type f -name ${cluster_name}-connect-gateway-rbac.yaml -exec sed -i "s/CLUSTER_NAME/${cluster_name}/g" {} + -#find . -type f -name ${cluster_name}-connect-gateway-rbac.yaml -exec sed -i "s/ENV/${cluster_env}/g" {} + -cp ../../templates/_cluster-template/kuberay . +#cp ../../templates/_cluster_template/kuberay . git add ../../. git config --global user.name ${github_user} @@ -78,5 +59,5 @@ git config --global user.email ${github_email} git commit -m "Adding ${cluster_name} cluster to the ${cluster_env} environment." git push origin -cd .. 
+cd - rm -rf ${download_acm_repo_name} diff --git a/ml-platform/create_git_cred.sh b/ml-platform/create_git_cred.sh new file mode 100755 index 000000000..da5a92104 --- /dev/null +++ b/ml-platform/create_git_cred.sh @@ -0,0 +1,38 @@ +#!/bin/bash +# +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +gke_cluster=${1} +project_id=${2} +git_user=${3} +namespace=${4} +index=${5} +sleep_time=60 +sleep_index=$((${index}+1)) +sleep_total=$((${sleep_time}*${sleep_index})) +sleep $sleep_total +gcloud container fleet memberships get-credentials ${gke_cluster} --project ${project_id} +ns_exists=$(kubectl get ns ${namespace} -o name | awk -F '/' '{print $2}') + +while [ "${ns_exists}" != "${namespace}" ] +do +sleep 10 +ns_exists=$(kubectl get ns ${namespace} -o name | awk -F '/' '{print $2}') +done +secret_exists=$(kubectl get secret git-creds -n ${namespace} -o name) +if [[ "${secret_exists}" == "secret/git-creds" ]]; then + exit 0 +else + kubectl create secret generic git-creds --namespace="${namespace}" --from-literal=username="${git_user}" --from-literal=token="${TF_VAR_github_token}" +fi \ No newline at end of file diff --git a/ml-platform/create_namespace.sh b/ml-platform/create_namespace.sh new file mode 100755 index 000000000..7a4e28d06 --- /dev/null +++ b/ml-platform/create_namespace.sh @@ -0,0 +1,57 @@ +#!/bin/bash +# +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file 
except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +configsync_repo_name=${1} +github_email=${2} +github_org=${3} +github_user=${4} +namespace=${5} + +logfile=$(pwd)/log +random=$(echo $RANDOM | md5sum | head -c 20; echo) +download_acm_repo_name="/tmp/$(echo ${configsync_repo_name} | awk -F "/" '{print $2}')-${random}" +git config --global user.name ${github_user} +git config --global user.email ${github_email} +git clone https://${github_user}:${TF_VAR_github_token}@github.com/${configsync_repo_name} ${download_acm_repo_name} +cd ${download_acm_repo_name}/manifests/clusters + +if [ -d "${namespace}" ]; then + exit 0 +fi +chars_in_namespace=$(echo -n ${namespace} | wc -c) +#adding 4 for number of chars in "dev-" +chars_in_reposync_name=$(expr $chars_in_namespace + 4) +mkdir ${namespace} || exit 1 +cp -r ../../templates/_cluster_template/team/* ${namespace} +sed -i "s?NAMESPACE?$namespace?g" ${namespace}/* +sed -ni '/#END OF SINGLE ENV DECLARATION/q;p' ${namespace}/reposync.yaml +sed -i "s?GIT_REPO?https://github.com/$configsync_repo_name?g" ${namespace}/reposync.yaml +sed -i "s??$chars_in_reposync_name?g" ${namespace}/reposync.yaml + +mkdir ../apps/${namespace} +touch ../apps/${namespace}/.gitkeep + +cat <<EOF >>kustomization.yaml +- ./${namespace} +EOF +cd .. +git add . +git config --global user.name ${github_user} +git config --global user.email ${github_email} +git commit -m "Adding manifests to create a new namespace."
+git push origin + +cd - +rm -rf ${download_acm_repo_name} \ No newline at end of file diff --git a/ml-platform/install_kuberay_operator.sh b/ml-platform/install_kuberay_operator.sh new file mode 100755 index 000000000..08bb41410 --- /dev/null +++ b/ml-platform/install_kuberay_operator.sh @@ -0,0 +1,47 @@ +#!/bin/bash +# +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +configsync_repo_name=${1} +github_email=${2} +github_org=${3} +github_user=${4} + +random=$(echo $RANDOM | md5sum | head -c 20; echo) +download_acm_repo_name="/tmp/$(echo ${configsync_repo_name} | awk -F "/" '{print $2}')-${random}" +git config --global user.name ${github_user} +git config --global user.email ${github_email} +git clone https://${github_user}:${TF_VAR_github_token}@github.com/${configsync_repo_name} ${download_acm_repo_name} +cd ${download_acm_repo_name}/manifests/clusters +if [ -f "kustomization.yaml" ]; then + exit 0 +fi +yamlfiles=$(find . -type f -name "*.yaml") +cp ../../templates/_cluster_template/kustomization.yaml . +for yamlfile in `echo ${yamlfiles}` +do +cat <<EOF >>kustomization.yaml + +- ${yamlfile} +EOF +done +cp -r ../../templates/_cluster_template/kuberay . +git add . +git config --global user.name ${github_user} +git config --global user.email ${github_email} +git commit -m "Adding manifests to install kuberay operator." 
+git push origin + +cd - +rm -rf ${download_acm_repo_name} \ No newline at end of file diff --git a/ml-platform/install_ray_cluster.sh b/ml-platform/install_ray_cluster.sh new file mode 100755 index 000000000..d7d62c8ba --- /dev/null +++ b/ml-platform/install_ray_cluster.sh @@ -0,0 +1,48 @@ +#!/bin/bash +# +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +configsync_repo_name=${1} +github_email=${2} +github_org=${3} +github_user=${4} +namespace=${5} + +random=$(echo $RANDOM | md5sum | head -c 20; echo) +download_acm_repo_name="/tmp/$(echo ${configsync_repo_name} | awk -F "/" '{print $2}')-${random}" +git config --global user.name ${github_user} +git config --global user.email ${github_email} +git clone https://${github_user}:${TF_VAR_github_token}@github.com/${configsync_repo_name} ${download_acm_repo_name} +cd ${download_acm_repo_name}/manifests/apps +if [ ! -d "${namespace}" ]; then + echo "${namespace} folder doesnt exist in the configsync repo" + exit 1 +fi + +if [ -f "${namespace}/kustomization.yaml" ]; then + echo "${namespace} is already set up" + exit 0 +fi + +cp -r ../../templates/_namespace_template/app/* ${namespace}/ +sed -i "s?NAMESPACE?${namespace}?g" ${namespace}/* + +git add . +git config --global user.name ${github_user} +git config --global user.email ${github_email} +git commit -m "Installing ray cluster in ${namespace} namespace." 
+git push origin + +cd - +rm -rf ${download_acm_repo_name} \ No newline at end of file diff --git a/ml-platform/main.tf b/ml-platform/main.tf new file mode 100644 index 000000000..3f5b53b42 --- /dev/null +++ b/ml-platform/main.tf @@ -0,0 +1,390 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +#TODO: Add a validation that the value of default_env must be one of the values in env list +module "gcp-project" { + count = var.create_projects + source = "./modules/projects" + org_id = var.org_id + folder_id = var.folder_id + env = var.env + billing_account = var.billing_account + project_name = var.project_name +} + + +locals { + #parsed_project_id = length(keys("${var.project_id}")) == 0 ? data.terraform_remote_state.gcp-projects[0].outputs.project_ids : var.project_id + #var.create_projects == 1 ? {for k, v in "${module.gcp-project.project_ids}" : k => v.project_id} : "" + parsed_project_id = var.create_projects == 0 ? 
var.project_id : { for k, v in "${module.gcp-project.project_ids}" : k => v.project_id } + parsed_gke_info = module.gke + parsed_gke_info_without_default_env = { for k, v in "${local.parsed_gke_info}" : k => v if k != var.default_env } + project_id_list = [for k, v in "${module.gke}" : v.gke_project_id] + gke_project_map = { for k, v in "${module.gke}" : v.cluster_name => v.gke_project_id } +} + +resource "google_project_service" "project_services-cr" { + for_each = local.parsed_project_id + project = each.value + service = "cloudresourcemanager.googleapis.com" + disable_on_destroy = false + disable_dependent_services = false + depends_on = [module.gcp-project] +} + +resource "google_project_service" "project_services-an" { + for_each = local.parsed_project_id + project = each.value + service = "anthos.googleapis.com" + disable_on_destroy = false + disable_dependent_services = false + depends_on = [module.gcp-project, google_project_service.project_services-cr] +} +resource "google_project_service" "project_services-anc" { + for_each = local.parsed_project_id + project = each.value + service = "anthosconfigmanagement.googleapis.com" + disable_on_destroy = false + disable_dependent_services = false + depends_on = [module.gcp-project, google_project_service.project_services-cr] +} +resource "google_project_service" "project_services-con" { + for_each = local.parsed_project_id + project = each.value + service = "container.googleapis.com" + disable_on_destroy = false + disable_dependent_services = false + depends_on = [module.gcp-project, google_project_service.project_services-cr] +} +resource "google_project_service" "project_services-com" { + for_each = local.parsed_project_id + project = each.value + service = "compute.googleapis.com" + disable_on_destroy = false + disable_dependent_services = false + depends_on = [module.gcp-project, google_project_service.project_services-cr] +} +resource "google_project_service" "project_services-gkecon" { + for_each = 
local.parsed_project_id + project = each.value + service = "gkeconnect.googleapis.com" + disable_on_destroy = false + disable_dependent_services = false + depends_on = [module.gcp-project, google_project_service.project_services-cr] +} +resource "google_project_service" "project_services-gkeh" { + for_each = local.parsed_project_id + project = each.value + service = "gkehub.googleapis.com" + disable_on_destroy = false + disable_dependent_services = false + depends_on = [module.gcp-project, google_project_service.project_services-cr] +} +resource "google_project_service" "project_services-iam" { + for_each = local.parsed_project_id + project = each.value + service = "iam.googleapis.com" + disable_on_destroy = false + disable_dependent_services = false + depends_on = [module.gcp-project, google_project_service.project_services-cr] +} + +resource "google_project_service" "project_services-gate" { + for_each = local.parsed_project_id + project = each.value + service = "connectgateway.googleapis.com" + disable_on_destroy = false + disable_dependent_services = false + depends_on = [module.gcp-project, google_project_service.project_services-cr] +} + +module "create-vpc" { + for_each = local.parsed_project_id + source = "./modules/network" + project_id = each.value + network_name = format("%s-%s", var.network_name, each.key) + routing_mode = var.routing_mode + subnet_01_name = format("%s-%s", var.subnet_01_name, each.key) + subnet_01_ip = var.subnet_01_ip + subnet_01_region = var.subnet_01_region + subnet_02_name = format("%s-%s", var.subnet_02_name, each.key) + subnet_02_ip = var.subnet_02_ip + subnet_02_region = var.subnet_02_region + #default_route_name = format("%s-%s","default-route",each.key) + depends_on = [module.gcp-project, google_project_service.project_services-com] +} + +resource "google_gke_hub_feature" "configmanagement_acm_feature" { + count = length(distinct(values(local.parsed_project_id))) + name = "configmanagement" + project = 
distinct(values(local.parsed_project_id))[count.index] + location = "global" + provider = google-beta + depends_on = [google_project_service.project_services-gkeh, google_project_service.project_services-anc, google_project_service.project_services-an, google_project_service.project_services-com, google_project_service.project_services-gkecon] +} + +module "gke" { + for_each = local.parsed_project_id + source = "./modules/cluster" + cluster_name = format("%s-%s", var.cluster_name, each.key) + network = module.create-vpc[each.key].vpc + subnet = module.create-vpc[each.key].subnet-1 + project_id = each.value + region = var.subnet_01_region + zone = "${var.subnet_01_region}-a" + master_auth_networks_ipcidr = var.subnet_01_ip + depends_on = [google_gke_hub_feature.configmanagement_acm_feature, google_project_service.project_services-con, google_project_service.project_services-com] + env = each.key +} +module "reservation" { + for_each = local.parsed_project_id + source = "./modules/vm-reservations" + cluster_name = module.gke[each.key].cluster_name + zone = "${var.subnet_01_region}-a" + project_id = each.value + depends_on = [module.gke] +} +module "node_pool-reserved" { + for_each = local.parsed_project_id + source = "./modules/node-pools" + node_pool_name = "reservation" + project_id = each.value + cluster_name = module.gke[each.key].cluster_name + region = "${var.subnet_01_region}" + taints = var.reserved_taints + resource_type = "reservation" + reservation_name = module.reservation[each.key].reservation_name + depends_on = [module.reservation] +} + +module "node_pool-ondemand" { + for_each = local.parsed_project_id + source = "./modules/node-pools" + node_pool_name = "ondemand" + project_id = each.value + cluster_name = module.gke[each.key].cluster_name + region = "${var.subnet_01_region}" + taints = var.ondemand_taints + resource_type = "ondemand" + depends_on = [module.gke] +} + +module "node_pool-spot" { + for_each = local.parsed_project_id + source = 
"./modules/node-pools" + node_pool_name = "spot" + project_id = each.value + cluster_name = module.gke[each.key].cluster_name + region = "${var.subnet_01_region}" + taints = var.spot_taints + resource_type = "spot" + depends_on = [module.gke] +} + +module "cloud-nat" { + for_each = local.parsed_project_id + source = "./modules/cloud-nat" + project_id = each.value + region = split("/", module.create-vpc[each.key].subnet-1)[3] + name = format("%s-%s", "nat-for-acm", each.key) + network = module.create-vpc[each.key].vpc + create_router = true + router = format("%s-%s", "router-for-acm", each.key) + depends_on = [module.create-vpc, google_project_service.project_services-com] +} + + + +//data "terraform_remote_state" "gke-clusters" { +// backend = "gcs" +// config = { +// bucket = var.lookup_state_bucket +// prefix = "02_gke" +// } +//} +// +//locals { +// parsed_gke_info = module.gke +// project_id_list = [for k,v in "${module.gke}" : v.gke_project_id] +//} + +//resource "google_gke_hub_feature" "configmanagement_acm_feature" { +// count = length(distinct(local.project_id_list)) +// name = "configmanagement" +// project = distinct(local.project_id_list)[count.index] +// location = "global" +// provider = google-beta +//} + +resource "google_gke_hub_membership" "membership" { + provider = google-beta + for_each = local.parsed_gke_info + project = each.value["gke_project_id"] + membership_id = each.value["cluster_name"] + endpoint { + gke_cluster { + resource_link = format("%s/%s", "//container.googleapis.com", each.value["cluster_id"]) + } + } + lifecycle { + ignore_changes = [ + labels + ] + } + depends_on = [google_gke_hub_feature.configmanagement_acm_feature, google_project_service.project_services-gkeh, google_project_service.project_services-gkecon] +} + +resource "github_repository" "acm_repo" { + name = var.configsync_repo_name + description = "Repo for Config Sync" + visibility = "private" + has_issues = false + has_projects = false + has_wiki = false + + 
allow_merge_commit = true + allow_squash_merge = true + allow_rebase_merge = true + delete_branch_on_merge = false + auto_init = true + vulnerability_alerts = true +} +//Create a branch for each env +resource "github_branch" "branch" { + for_each = local.parsed_gke_info + repository = split("/", github_repository.acm_repo.full_name)[1] + branch = each.key + depends_on = [github_repository.acm_repo] +} +//Set default branch as the lowest env +resource "github_branch_default" "default_branch" { + repository = split("/", github_repository.acm_repo.full_name)[1] + #branch = tostring(keys(local.parsed_gke_info)[0]) + branch = var.default_env + #rename = true + depends_on = [github_branch.branch] +} +#Protect branches other than the default branch +resource "github_branch_protection_v3" "branch_protection" { + for_each = length(keys(local.parsed_project_id)) > 1 ? local.parsed_gke_info_without_default_env : {} + repository = split("/", github_repository.acm_repo.full_name)[1] + branch = each.key + required_pull_request_reviews { + required_approving_review_count = 1 + require_code_owner_reviews = true + } + restrictions { + + } + + depends_on = [github_branch.branch] +} + +resource "google_gke_hub_feature_membership" "feature_member" { + provider = google-beta + for_each = local.parsed_gke_info + project = each.value["gke_project_id"] + location = "global" + feature = "configmanagement" + membership = google_gke_hub_membership.membership[each.key].membership_id + configmanagement { + version = "1.17.0" + config_sync { + source_format = "unstructured" + git { + sync_repo = "https://github.com/${github_repository.acm_repo.full_name}.git" + sync_branch = each.value["env"] + policy_dir = "manifests/clusters" + secret_type = "token" + } + } + policy_controller { + enabled = true + template_library_installed = true + referential_rules_enabled = true + } + } + + provisioner "local-exec" { + command = "${path.module}/create_cluster_yamls.sh ${var.github_org} 
${github_repository.acm_repo.full_name} ${var.github_user} ${var.github_email} ${each.value["env"]} ${each.value["cluster_name"]} ${index(keys(local.parsed_gke_info), each.key)}" + } + + depends_on = [google_project_service.project_services-gkecon, google_project_service.project_services-gkeh, google_project_service.project_services-an, google_project_service.project_services-anc] +} + +resource "null_resource" "create_git_cred_cms" { + for_each = var.secret_for_rootsync == 1 ? local.gke_project_map : {} + triggers = { + timestamp = timestamp() + } + provisioner "local-exec" { + command = "${path.module}/create_git_cred.sh ${each.key} ${each.value} ${var.github_user} config-management-system ${index(keys(local.gke_project_map), each.key)}" + } + depends_on = [google_gke_hub_feature_membership.feature_member, module.gke, module.node_pool-reserved, module.node_pool-ondemand, module.node_pool-spot, module.cloud-nat] +} + +resource "null_resource" "install_kuberay_operator" { + count = var.install_kuberay + triggers = { + timestamp = timestamp() + } + provisioner "local-exec" { + command = "${path.module}/install_kuberay_operator.sh ${github_repository.acm_repo.full_name} ${var.github_email} ${var.github_org} ${var.github_user}" + } + depends_on = [google_gke_hub_feature_membership.feature_member, null_resource.create_git_cred_cms] +} + +resource "null_resource" "create_namespace" { + count = var.create_namespace + triggers = { + timestamp = timestamp() + } + provisioner "local-exec" { + command = "${path.module}/create_namespace.sh ${github_repository.acm_repo.full_name} ${var.github_email} ${var.github_org} ${var.github_user} ${var.namespace}" + } + depends_on = [google_gke_hub_feature_membership.feature_member, null_resource.install_kuberay_operator] +} + +resource "null_resource" "create_git_cred_ns" { + count = var.create_namespace + triggers = { + timestamp = timestamp() + } + provisioner "local-exec" { + command = "${path.module}/create_git_cred.sh 
${local.parsed_gke_info[var.default_env].cluster_name} ${local.parsed_gke_info[var.default_env].gke_project_id} ${var.github_user} ${var.namespace}" + } + depends_on = [ google_gke_hub_feature_membership.feature_member, null_resource.create_namespace ] +} + +resource "null_resource" "install_ray_cluster" { + count = var.install_ray_in_ns + triggers = { + timestamp = timestamp() + } + provisioner "local-exec" { + command = "${path.module}/install_ray_cluster.sh ${github_repository.acm_repo.full_name} ${var.github_email} ${var.github_org} ${var.github_user} ${var.namespace}" + } + depends_on = [google_gke_hub_feature_membership.feature_member, null_resource.create_git_cred_ns] +} + +resource "null_resource" "manage_ray_ns" { + count = var.install_ray_in_ns + triggers = { + timestamp = timestamp() + } + provisioner "local-exec" { + command = "${path.module}/manage_ray_ns.sh ${github_repository.acm_repo.full_name} ${var.github_email} ${var.github_org} ${var.github_user} ${var.namespace}" + } + depends_on = [google_gke_hub_feature_membership.feature_member, null_resource.create_git_cred_ns, null_resource.install_ray_cluster] +} \ No newline at end of file diff --git a/ml-platform/manage_ray_ns.sh b/ml-platform/manage_ray_ns.sh new file mode 100755 index 000000000..021559fee --- /dev/null +++ b/ml-platform/manage_ray_ns.sh @@ -0,0 +1,43 @@ +#!/bin/bash +# +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+configsync_repo_name=${1} +github_email=${2} +github_org=${3} +github_user=${4} +namespace=${5} + +random=$(echo $RANDOM | md5sum | head -c 20; echo) +download_acm_repo_name="/tmp/$(echo ${configsync_repo_name} | awk -F "/" '{print $2}')-${random}" +git config --global user.name ${github_user} +git config --global user.email ${github_email} +git clone https://${github_user}:${TF_VAR_github_token}@github.com/${configsync_repo_name} ${download_acm_repo_name} +cd ${download_acm_repo_name}/manifests/clusters/kuberay +ns_exists=$(grep ${namespace} values.yaml | wc -l) +if [ "${ns_exists}" -ne 0 ]; then + echo "namespace already present in values.yaml" + exit 0 +fi + +sed -i "s/watchNamespace:/watchNamespace:\n - ${namespace}/g" values.yaml + +git add . +git config --global user.name ${github_user} +git config --global user.email ${github_email} +git commit -m "Installing ray cluster in ${namespace} namespace." +git push origin + +cd - +rm -rf ${download_acm_repo_name} \ No newline at end of file diff --git a/ml-platform/mlenv.auto.tfvars b/ml-platform/mlenv.auto.tfvars new file mode 100644 index 000000000..3a7fe5c74 --- /dev/null +++ b/ml-platform/mlenv.auto.tfvars @@ -0,0 +1,9 @@ +project_id = {"dev":"YOUR_PROJECT_ID"} +default_env = "dev" +github_user = "YOUR_GITHUB_USER" +github_email = "YOUR_GITHUB_EMAIL" +github_org = "YOUR_GITHUB_ORG" +#github_token = "DO NOT ADD TOKEN HERE. 
PASS IT AS `export TF_VAR_github_token="YOUR TOKEN"` ON CLOUDSHELL" +#folder_id = "YOUR_FOLDER_ID" +#org_id = "YOUR_GCP_ORG_ID" +#billing_account = "YOUR_BILLING_ACCOUNT" \ No newline at end of file diff --git a/ml-platform/02_gke/modules/cloud-nat/README.md b/ml-platform/modules/cloud-nat/README.md similarity index 99% rename from ml-platform/02_gke/modules/cloud-nat/README.md rename to ml-platform/modules/cloud-nat/README.md index e498d7958..6952d4e9f 100644 --- a/ml-platform/02_gke/modules/cloud-nat/README.md +++ b/ml-platform/modules/cloud-nat/README.md @@ -1,4 +1,3 @@ - # Terraform Google Cloud NAT Module This module handles opinionated Google Cloud Platform Cloud NAT creation and configuration. diff --git a/ml-platform/02_gke/modules/cloud-nat/main.tf b/ml-platform/modules/cloud-nat/main.tf similarity index 100% rename from ml-platform/02_gke/modules/cloud-nat/main.tf rename to ml-platform/modules/cloud-nat/main.tf diff --git a/ml-platform/02_gke/modules/cloud-nat/outputs.tf b/ml-platform/modules/cloud-nat/outputs.tf similarity index 99% rename from ml-platform/02_gke/modules/cloud-nat/outputs.tf rename to ml-platform/modules/cloud-nat/outputs.tf index acd7f8ce6..86bf7c39d 100644 --- a/ml-platform/02_gke/modules/cloud-nat/outputs.tf +++ b/ml-platform/modules/cloud-nat/outputs.tf @@ -31,3 +31,4 @@ output "router_name" { description = "Cloud NAT router name" value = local.router } + diff --git a/ml-platform/02_gke/modules/cloud-nat/variables.tf b/ml-platform/modules/cloud-nat/variables.tf similarity index 100% rename from ml-platform/02_gke/modules/cloud-nat/variables.tf rename to ml-platform/modules/cloud-nat/variables.tf diff --git a/ml-platform/modules/cloud-nat/versions.tf b/ml-platform/modules/cloud-nat/versions.tf new file mode 100644 index 000000000..a6e8142dd --- /dev/null +++ b/ml-platform/modules/cloud-nat/versions.tf @@ -0,0 +1,50 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use 
this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +//terraform { +// required_providers { +// +// google = { +// source = "hashicorp/google" +// #version = ">= 4.51, < 5.0" +// version = "4.72.1" +// } +// +// random = { +// source = "hashicorp/random" +// version = "~> 2.2" +// } +// } +// +//} +terraform { + required_providers { + google = { + source = "hashicorp/google" + version = "5.19.0" + } + google-beta = { + source = "hashicorp/google-beta" + version = "5.19.0" + } + github = { + source = "integrations/github" + version = "6.0.1" + } + random = { + source = "hashicorp/random" + version = "2.2" + } + } +} diff --git a/ml-platform/02_gke/modules/cluster/gke.tf b/ml-platform/modules/cluster/gke.tf similarity index 87% rename from ml-platform/02_gke/modules/cluster/gke.tf rename to ml-platform/modules/cluster/gke.tf index 418068752..b08e92b9b 100644 --- a/ml-platform/02_gke/modules/cluster/gke.tf +++ b/ml-platform/modules/cluster/gke.tf @@ -19,14 +19,15 @@ data "google_project" "project" { } resource "google_container_cluster" "gke_batch" { - provider = google-beta - name = var.cluster_name - project = var.project_id - location = var.region - network = var.network - subnetwork = var.subnet - node_locations = ["${var.region}-a", "${var.region}-b", "${var.region}-c"] - initial_node_count = 2 + provider = google-beta + deletion_protection = false + name = var.cluster_name + project = var.project_id + location = var.region + network = var.network + subnetwork = var.subnet + node_locations = ["${var.region}-a", "${var.region}-b", "${var.region}-c"] + 
initial_node_count = 2 workload_identity_config { workload_pool = "${var.project_id}.svc.id.goog" } @@ -104,7 +105,7 @@ resource "google_container_cluster" "gke_batch" { } } release_channel { - channel = "RAPID" + channel = "STABLE" } private_cluster_config { enable_private_nodes = true diff --git a/ml-platform/02_gke/modules/cluster/outputs.tf b/ml-platform/modules/cluster/outputs.tf similarity index 99% rename from ml-platform/02_gke/modules/cluster/outputs.tf rename to ml-platform/modules/cluster/outputs.tf index 57bd8a0de..b26d3be8e 100644 --- a/ml-platform/02_gke/modules/cluster/outputs.tf +++ b/ml-platform/modules/cluster/outputs.tf @@ -30,4 +30,4 @@ output "gke_project_id" { output "env" { value = var.env -} +} \ No newline at end of file diff --git a/ml-platform/02_gke/modules/cluster/variables.tf b/ml-platform/modules/cluster/variables.tf similarity index 99% rename from ml-platform/02_gke/modules/cluster/variables.tf rename to ml-platform/modules/cluster/variables.tf index 5d76462c4..6eccda35b 100644 --- a/ml-platform/02_gke/modules/cluster/variables.tf +++ b/ml-platform/modules/cluster/variables.tf @@ -55,4 +55,4 @@ variable "env" { type = string description = "environment" -} +} \ No newline at end of file diff --git a/ml-platform/02_gke/modules/node-pools/versions.tf b/ml-platform/modules/cluster/versions.tf similarity index 71% rename from ml-platform/02_gke/modules/node-pools/versions.tf rename to ml-platform/modules/cluster/versions.tf index fc374eab1..d4aada15b 100644 --- a/ml-platform/02_gke/modules/node-pools/versions.tf +++ b/ml-platform/modules/cluster/versions.tf @@ -12,15 +12,28 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+//terraform { +// required_providers { +// google-beta = { +// source = "hashicorp/google-beta" +// version = "4.72.1" +// } +// google = { +// source = "hashicorp/google" +// version = "4.72.1" +// } +// } +//} + terraform { required_providers { - google-beta = { - source = "hashicorp/google-beta" - version = "4.72.1" - } google = { source = "hashicorp/google" - version = "4.72.1" + version = "5.19.0" + } + google-beta = { + source = "hashicorp/google-beta" + version = "5.19.0" } } } diff --git a/ml-platform/02_gke/modules/network/README.md b/ml-platform/modules/network/README.md similarity index 99% rename from ml-platform/02_gke/modules/network/README.md rename to ml-platform/modules/network/README.md index 14e1591d8..6de9bdc13 100644 --- a/ml-platform/02_gke/modules/network/README.md +++ b/ml-platform/modules/network/README.md @@ -1,4 +1,3 @@ - ## Requirements | Name | Version | diff --git a/ml-platform/02_gke/modules/network/outputs.tf b/ml-platform/modules/network/outputs.tf similarity index 99% rename from ml-platform/02_gke/modules/network/outputs.tf rename to ml-platform/modules/network/outputs.tf index 13026f645..bf9d36dad 100644 --- a/ml-platform/02_gke/modules/network/outputs.tf +++ b/ml-platform/modules/network/outputs.tf @@ -25,4 +25,4 @@ output "subnet-1" { output "subnet-2" { value = google_compute_subnetwork.subnet-2.id description = "subnet2." -} +} \ No newline at end of file diff --git a/ml-platform/02_gke/modules/network/variables.tf b/ml-platform/modules/network/variables.tf similarity index 92% rename from ml-platform/02_gke/modules/network/variables.tf rename to ml-platform/modules/network/variables.tf index db344133d..c7c12296c 100644 --- a/ml-platform/02_gke/modules/network/variables.tf +++ b/ml-platform/modules/network/variables.tf @@ -16,28 +16,23 @@ variable "project_id" { description = "Id of the GCP project where VPC is to be created." type = string } - variable "network_name" { description = "Name of the VPC network." 
type = string } - variable "routing_mode" { description = "The network routing mode." type = string default = "GLOBAL" } - variable "subnet_01_name" { description = "Name of first subnet." type = string } - variable "subnet_01_ip" { description = "IP range of first subnet." type = string } - variable "subnet_01_region" { description = "Region of first subnet." type = string @@ -47,13 +42,15 @@ variable "subnet_02_name" { description = "Name of the second subnet." type = string } - variable "subnet_02_ip" { description = "IP range of second subnet." type = string } - variable "subnet_02_region" { description = "Region of second subnet." type = string } +//variable "default_route_name" { +// description = "Name of the default route to internet." +// type = string +//} diff --git a/ml-platform/02_gke/modules/vm-reservations/versions.tf b/ml-platform/modules/network/versions.tf similarity index 80% rename from ml-platform/02_gke/modules/vm-reservations/versions.tf rename to ml-platform/modules/network/versions.tf index fc374eab1..e2e5241f2 100644 --- a/ml-platform/02_gke/modules/vm-reservations/versions.tf +++ b/ml-platform/modules/network/versions.tf @@ -12,15 +12,20 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+//terraform { +// required_providers { +// google = { +// source = "hashicorp/google" +// version = ">= 4.28.0" +// } +// } +//} + terraform { required_providers { - google-beta = { - source = "hashicorp/google-beta" - version = "4.72.1" - } google = { source = "hashicorp/google" - version = "4.72.1" + version = "5.19.0" } } } diff --git a/ml-platform/02_gke/modules/network/vpc.tf b/ml-platform/modules/network/vpc.tf similarity index 85% rename from ml-platform/02_gke/modules/network/vpc.tf rename to ml-platform/modules/network/vpc.tf index a80166be5..37266b5d2 100644 --- a/ml-platform/02_gke/modules/network/vpc.tf +++ b/ml-platform/modules/network/vpc.tf @@ -36,3 +36,11 @@ resource "google_compute_subnetwork" "subnet-2" { network = google_compute_network.vpc-network.id private_ip_google_access = true } + +//resource "google_compute_route" "default-route" { +//name = var.default_route_name +//dest_range = "0.0.0.0/0" +//network = google_compute_network.vpc-network.id +//priority = 1000 +//next_hop_gateway = "default-internet-gateway" +//} diff --git a/ml-platform/02_gke/modules/node-pools/nodepools.tf b/ml-platform/modules/node-pools/nodepools.tf similarity index 90% rename from ml-platform/02_gke/modules/node-pools/nodepools.tf rename to ml-platform/modules/node-pools/nodepools.tf index 402e45695..72b07a239 100644 --- a/ml-platform/02_gke/modules/node-pools/nodepools.tf +++ b/ml-platform/modules/node-pools/nodepools.tf @@ -19,7 +19,14 @@ resource "google_container_node_pool" "node-pool" { location = var.region node_config { machine_type = var.machine_type - taint = var.taints + dynamic "taint" { + for_each = var.taints + content { + key = taint.value.key + value = taint.value.value + effect = taint.value.effect + } + } labels = { "resource-type" : var.resource_type } @@ -61,4 +68,4 @@ resource "google_container_node_pool" "node-pool" { network_config { enable_private_nodes = true } -} +} \ No newline at end of file diff --git 
a/ml-platform/02_gke/modules/node-pools/variables.tf b/ml-platform/modules/node-pools/variables.tf similarity index 98% rename from ml-platform/02_gke/modules/node-pools/variables.tf rename to ml-platform/modules/node-pools/variables.tf index 973d7a1fe..6a2f20e56 100644 --- a/ml-platform/02_gke/modules/node-pools/variables.tf +++ b/ml-platform/modules/node-pools/variables.tf @@ -16,19 +16,16 @@ variable "node_pool_name" { type = string description = "Name of the node pool" } - variable "project_id" { type = string description = "The GCP project where the resources will be created" default = "" } - variable "cluster_name" { type = string description = "GKE cluster name" default = "" } - variable "region" { type = string description = "The GCP zone where the reservation will be created" @@ -56,6 +53,7 @@ variable "resource_type" { default = "ondemand" } + variable "accelerator" { type = string description = "The GPU accelerator to use." @@ -67,7 +65,6 @@ variable "accelerator_count" { description = "The number of accelerators per machine." 
default = 2 } - variable "machine_reservation_count" { type = number description = "Number of machines reserved instances with GPUs" @@ -75,7 +72,7 @@ variable "machine_reservation_count" { } variable "autoscaling" { - type = map(any) + type = map default = { "total_min_node_count" : 0, "total_max_node_count" : 24, "location_policy" : "ANY" } } @@ -83,4 +80,4 @@ variable "reservation_name" { description = "reservation name to which the nodepool will be associated" type = string default = "" -} +} \ No newline at end of file diff --git a/ml-platform/02_gke/providers.tf b/ml-platform/modules/node-pools/versions.tf similarity index 71% rename from ml-platform/02_gke/providers.tf rename to ml-platform/modules/node-pools/versions.tf index fc374eab1..d4aada15b 100644 --- a/ml-platform/02_gke/providers.tf +++ b/ml-platform/modules/node-pools/versions.tf @@ -12,15 +12,28 @@ # See the License for the specific language governing permissions and # limitations under the License. +//terraform { +// required_providers { +// google-beta = { +// source = "hashicorp/google-beta" +// version = "4.72.1" +// } +// google = { +// source = "hashicorp/google" +// version = "4.72.1" +// } +// } +//} + terraform { required_providers { - google-beta = { - source = "hashicorp/google-beta" - version = "4.72.1" - } google = { source = "hashicorp/google" - version = "4.72.1" + version = "5.19.0" + } + google-beta = { + source = "hashicorp/google-beta" + version = "5.19.0" } } } diff --git a/ml-platform/01_gcp_project/modules/projects/outputs.tf b/ml-platform/modules/projects/outputs.tf similarity index 99% rename from ml-platform/01_gcp_project/modules/projects/outputs.tf rename to ml-platform/modules/projects/outputs.tf index 431fe53dd..e087e6c85 100644 --- a/ml-platform/01_gcp_project/modules/projects/outputs.tf +++ b/ml-platform/modules/projects/outputs.tf @@ -14,4 +14,4 @@ output "project_ids" { value = "${google_project.project_under_folder}" == {} ? 
"${google_project.project_under_org}" : "${google_project.project_under_folder}" -} +} \ No newline at end of file diff --git a/ml-platform/01_gcp_project/modules/projects/projects.tf b/ml-platform/modules/projects/projects.tf similarity index 99% rename from ml-platform/01_gcp_project/modules/projects/projects.tf rename to ml-platform/modules/projects/projects.tf index 76f7d1ef3..2b5c6b020 100644 --- a/ml-platform/01_gcp_project/modules/projects/projects.tf +++ b/ml-platform/modules/projects/projects.tf @@ -93,4 +93,4 @@ resource "google_project_service" "project_services-6" { disable_on_destroy = true disable_dependent_services = true depends_on = [google_project.project_under_folder, google_project.project_under_org] -} +} \ No newline at end of file diff --git a/ml-platform/01_gcp_project/modules/projects/variables.tf b/ml-platform/modules/projects/variables.tf similarity index 100% rename from ml-platform/01_gcp_project/modules/projects/variables.tf rename to ml-platform/modules/projects/variables.tf diff --git a/ml-platform/02_gke/modules/cluster/versions.tf b/ml-platform/modules/projects/versions.tf similarity index 80% rename from ml-platform/02_gke/modules/cluster/versions.tf rename to ml-platform/modules/projects/versions.tf index fc374eab1..e2e5241f2 100644 --- a/ml-platform/02_gke/modules/cluster/versions.tf +++ b/ml-platform/modules/projects/versions.tf @@ -12,15 +12,20 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+//terraform { +// required_providers { +// google = { +// source = "hashicorp/google" +// version = ">= 4.28.0" +// } +// } +//} + terraform { required_providers { - google-beta = { - source = "hashicorp/google-beta" - version = "4.72.1" - } google = { source = "hashicorp/google" - version = "4.72.1" + version = "5.19.0" } } } diff --git a/ml-platform/02_gke/modules/vm-reservations/outputs.tf b/ml-platform/modules/vm-reservations/outputs.tf similarity index 99% rename from ml-platform/02_gke/modules/vm-reservations/outputs.tf rename to ml-platform/modules/vm-reservations/outputs.tf index 11ffcc6d8..5a4562e1a 100644 --- a/ml-platform/02_gke/modules/vm-reservations/outputs.tf +++ b/ml-platform/modules/vm-reservations/outputs.tf @@ -14,4 +14,4 @@ output "reservation_name" { value = split("/", google_compute_reservation.machine_reservation.id)[5] -} +} \ No newline at end of file diff --git a/ml-platform/02_gke/modules/vm-reservations/reservations.tf b/ml-platform/modules/vm-reservations/reservations.tf similarity index 99% rename from ml-platform/02_gke/modules/vm-reservations/reservations.tf rename to ml-platform/modules/vm-reservations/reservations.tf index 03438d0f7..177b0d384 100644 --- a/ml-platform/02_gke/modules/vm-reservations/reservations.tf +++ b/ml-platform/modules/vm-reservations/reservations.tf @@ -27,4 +27,4 @@ resource "google_compute_reservation" "machine_reservation" { } } } -} +} \ No newline at end of file diff --git a/ml-platform/02_gke/modules/vm-reservations/variables.tf b/ml-platform/modules/vm-reservations/variables.tf similarity index 99% rename from ml-platform/02_gke/modules/vm-reservations/variables.tf rename to ml-platform/modules/vm-reservations/variables.tf index 7ca5e5af3..3a8e3482d 100644 --- a/ml-platform/02_gke/modules/vm-reservations/variables.tf +++ b/ml-platform/modules/vm-reservations/variables.tf @@ -17,19 +17,16 @@ variable "project_id" { description = "The GCP project where the resources will be created" default = "" } - 
variable "cluster_name" { type = string description = "GKE cluster name" default = "" } - variable "zone" { type = string description = "The GCP zone where the reservation will be created" default = "us-central1-a" } - variable "machine_type" { type = string description = "The machine type to use." @@ -47,7 +44,6 @@ variable "accelerator_count" { description = "The number of accelerators per machine." default = 2 } - variable "machine_reservation_count" { type = number description = "Number of machines reserved instances with GPUs" diff --git a/ml-platform/modules/vm-reservations/versions.tf b/ml-platform/modules/vm-reservations/versions.tf new file mode 100644 index 000000000..7f4362ad6 --- /dev/null +++ b/ml-platform/modules/vm-reservations/versions.tf @@ -0,0 +1,39 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +//terraform { +// required_providers { +// google-beta = { +// source = "hashicorp/google-beta" +// version = "4.72.1" +// } +// google = { +// source = "hashicorp/google" +// version = "4.72.1" +// } +// } +//} + +terraform { + required_providers { + google = { + source = "hashicorp/google" + version = "5.19.0" + } + google-beta = { + source = "hashicorp/google-beta" + version = "5.19.0" + } + } +} \ No newline at end of file diff --git a/ml-platform/01_gcp_project/outputs.tf b/ml-platform/outputs.tf similarity index 72% rename from ml-platform/01_gcp_project/outputs.tf rename to ml-platform/outputs.tf index 11352c942..f9f8ea6f3 100644 --- a/ml-platform/01_gcp_project/outputs.tf +++ b/ml-platform/outputs.tf @@ -12,6 +12,10 @@ # See the License for the specific language governing permissions and # limitations under the License. +//output "project_ids" { +// value = {for k, v in "${module.gcp-project.project_ids}" : k => v.project_id} +//} + output "project_ids" { - value = { for k, v in "${module.gcp-project.project_ids}" : k => v.project_id } -} + value = var.create_projects == 1 ? 
{ for k, v in "${module.gcp-project.project_ids}" : k => v.project_id } : "" +} \ No newline at end of file diff --git a/ml-platform/03_configsync/templates/acm-template/manifests/apps/.gitkeep b/ml-platform/templates/acm-template/manifests/apps/.gitkeep similarity index 100% rename from ml-platform/03_configsync/templates/acm-template/manifests/apps/.gitkeep rename to ml-platform/templates/acm-template/manifests/apps/.gitkeep diff --git a/ml-platform/03_configsync/templates/acm-template/manifests/clusters/.gitkeep b/ml-platform/templates/acm-template/manifests/clusters/.gitkeep similarity index 100% rename from ml-platform/03_configsync/templates/acm-template/manifests/clusters/.gitkeep rename to ml-platform/templates/acm-template/manifests/clusters/.gitkeep diff --git a/ml-platform/03_configsync/templates/acm-template/templates/_cluster_template/cluster.yaml b/ml-platform/templates/acm-template/templates/_cluster_template/cluster.yaml similarity index 100% rename from ml-platform/03_configsync/templates/acm-template/templates/_cluster_template/cluster.yaml rename to ml-platform/templates/acm-template/templates/_cluster_template/cluster.yaml diff --git a/ml-platform/03_configsync/templates/acm-template/templates/_cluster_template/config-selector.yaml b/ml-platform/templates/acm-template/templates/_cluster_template/config-selector.yaml similarity index 100% rename from ml-platform/03_configsync/templates/acm-template/templates/_cluster_template/config-selector.yaml rename to ml-platform/templates/acm-template/templates/_cluster_template/config-selector.yaml diff --git a/ml-platform/03_configsync/templates/acm-template/templates/_cluster_template/kuberay/kustomization.yaml b/ml-platform/templates/acm-template/templates/_cluster_template/kuberay/kustomization.yaml similarity index 100% rename from ml-platform/03_configsync/templates/acm-template/templates/_cluster_template/kuberay/kustomization.yaml rename to 
ml-platform/templates/acm-template/templates/_cluster_template/kuberay/kustomization.yaml diff --git a/ml-platform/03_configsync/templates/acm-template/templates/_cluster_template/kuberay/rayclusters.yaml b/ml-platform/templates/acm-template/templates/_cluster_template/kuberay/rayclusters.yaml similarity index 100% rename from ml-platform/03_configsync/templates/acm-template/templates/_cluster_template/kuberay/rayclusters.yaml rename to ml-platform/templates/acm-template/templates/_cluster_template/kuberay/rayclusters.yaml diff --git a/ml-platform/03_configsync/templates/acm-template/templates/_cluster_template/kuberay/rayjobs.yaml b/ml-platform/templates/acm-template/templates/_cluster_template/kuberay/rayjobs.yaml similarity index 100% rename from ml-platform/03_configsync/templates/acm-template/templates/_cluster_template/kuberay/rayjobs.yaml rename to ml-platform/templates/acm-template/templates/_cluster_template/kuberay/rayjobs.yaml diff --git a/ml-platform/03_configsync/templates/acm-template/templates/_cluster_template/kuberay/rayservices.yaml b/ml-platform/templates/acm-template/templates/_cluster_template/kuberay/rayservices.yaml similarity index 100% rename from ml-platform/03_configsync/templates/acm-template/templates/_cluster_template/kuberay/rayservices.yaml rename to ml-platform/templates/acm-template/templates/_cluster_template/kuberay/rayservices.yaml diff --git a/ml-platform/03_configsync/templates/acm-template/templates/_cluster_template/kuberay/rbac.yaml b/ml-platform/templates/acm-template/templates/_cluster_template/kuberay/rbac.yaml similarity index 100% rename from ml-platform/03_configsync/templates/acm-template/templates/_cluster_template/kuberay/rbac.yaml rename to ml-platform/templates/acm-template/templates/_cluster_template/kuberay/rbac.yaml diff --git a/ml-platform/03_configsync/templates/acm-template/templates/_cluster_template/kuberay/values.yaml b/ml-platform/templates/acm-template/templates/_cluster_template/kuberay/values.yaml 
similarity index 99% rename from ml-platform/03_configsync/templates/acm-template/templates/_cluster_template/kuberay/values.yaml rename to ml-platform/templates/acm-template/templates/_cluster_template/kuberay/values.yaml index 7226bf446..626a6cb2a 100644 --- a/ml-platform/03_configsync/templates/acm-template/templates/_cluster_template/kuberay/values.yaml +++ b/ml-platform/templates/acm-template/templates/_cluster_template/kuberay/values.yaml @@ -85,8 +85,7 @@ singleNamespaceInstall: true # The KubeRay operator will watch the custom resources in the namespaces listed in the "watchNamespace" parameter. watchNamespace: -# - ml-team -# - ds-team + # Environment variables env: diff --git a/ml-platform/03_configsync/templates/acm-template/templates/_cluster_template/kustomization.yaml b/ml-platform/templates/acm-template/templates/_cluster_template/kustomization.yaml similarity index 100% rename from ml-platform/03_configsync/templates/acm-template/templates/_cluster_template/kustomization.yaml rename to ml-platform/templates/acm-template/templates/_cluster_template/kustomization.yaml diff --git a/ml-platform/03_configsync/templates/acm-template/templates/_cluster_template/selector.yaml b/ml-platform/templates/acm-template/templates/_cluster_template/selector.yaml similarity index 100% rename from ml-platform/03_configsync/templates/acm-template/templates/_cluster_template/selector.yaml rename to ml-platform/templates/acm-template/templates/_cluster_template/selector.yaml diff --git a/ml-platform/03_configsync/templates/acm-template/templates/_cluster_template/team/kustomization.yaml b/ml-platform/templates/acm-template/templates/_cluster_template/team/kustomization.yaml similarity index 100% rename from ml-platform/03_configsync/templates/acm-template/templates/_cluster_template/team/kustomization.yaml rename to ml-platform/templates/acm-template/templates/_cluster_template/team/kustomization.yaml diff --git 
a/ml-platform/03_configsync/templates/acm-template/templates/_cluster_template/team/namespace.yaml b/ml-platform/templates/acm-template/templates/_cluster_template/team/namespace.yaml similarity index 97% rename from ml-platform/03_configsync/templates/acm-template/templates/_cluster_template/team/namespace.yaml rename to ml-platform/templates/acm-template/templates/_cluster_template/team/namespace.yaml index 832e04dc4..08474cb90 100644 --- a/ml-platform/03_configsync/templates/acm-template/templates/_cluster_template/team/namespace.yaml +++ b/ml-platform/templates/acm-template/templates/_cluster_template/team/namespace.yaml @@ -17,4 +17,4 @@ kind: Namespace metadata: name: NAMESPACE labels: - app: APP_NAME \ No newline at end of file + app: NAMESPACE \ No newline at end of file diff --git a/ml-platform/03_configsync/templates/acm-template/templates/_cluster_template/team/network-policy.yaml b/ml-platform/templates/acm-template/templates/_cluster_template/team/network-policy.yaml similarity index 100% rename from ml-platform/03_configsync/templates/acm-template/templates/_cluster_template/team/network-policy.yaml rename to ml-platform/templates/acm-template/templates/_cluster_template/team/network-policy.yaml diff --git a/ml-platform/03_configsync/templates/acm-template/templates/_cluster_template/team/rbac.yaml b/ml-platform/templates/acm-template/templates/_cluster_template/team/rbac.yaml similarity index 100% rename from ml-platform/03_configsync/templates/acm-template/templates/_cluster_template/team/rbac.yaml rename to ml-platform/templates/acm-template/templates/_cluster_template/team/rbac.yaml diff --git a/ml-platform/03_configsync/templates/acm-template/templates/_cluster_template/team/reposync.yaml b/ml-platform/templates/acm-template/templates/_cluster_template/team/reposync.yaml similarity index 99% rename from ml-platform/03_configsync/templates/acm-template/templates/_cluster_template/team/reposync.yaml rename to 
ml-platform/templates/acm-template/templates/_cluster_template/team/reposync.yaml index 191a5b7f0..73149d513 100644 --- a/ml-platform/03_configsync/templates/acm-template/templates/_cluster_template/team/reposync.yaml +++ b/ml-platform/templates/acm-template/templates/_cluster_template/team/reposync.yaml @@ -50,7 +50,7 @@ roleRef: name: cluster-admin apiGroup: rbac.authorization.k8s.io --- - +#END OF SINGLE ENV DECLARATION #ROOT_SOURCE/namespaces/NAMESPACE/repo-sync.yaml apiVersion: configsync.gke.io/v1beta1 kind: RepoSync diff --git a/ml-platform/03_configsync/templates/acm-template/templates/_namespace_template/app/fluentd_config.yaml b/ml-platform/templates/acm-template/templates/_namespace_template/app/fluentd_config.yaml similarity index 100% rename from ml-platform/03_configsync/templates/acm-template/templates/_namespace_template/app/fluentd_config.yaml rename to ml-platform/templates/acm-template/templates/_namespace_template/app/fluentd_config.yaml diff --git a/ml-platform/03_configsync/templates/acm-template/templates/_namespace_template/app/kustomization.yaml b/ml-platform/templates/acm-template/templates/_namespace_template/app/kustomization.yaml similarity index 100% rename from ml-platform/03_configsync/templates/acm-template/templates/_namespace_template/app/kustomization.yaml rename to ml-platform/templates/acm-template/templates/_namespace_template/app/kustomization.yaml diff --git a/ml-platform/03_configsync/templates/acm-template/templates/_namespace_template/app/serviceaccount.yaml b/ml-platform/templates/acm-template/templates/_namespace_template/app/serviceaccount.yaml similarity index 100% rename from ml-platform/03_configsync/templates/acm-template/templates/_namespace_template/app/serviceaccount.yaml rename to ml-platform/templates/acm-template/templates/_namespace_template/app/serviceaccount.yaml diff --git a/ml-platform/03_configsync/templates/acm-template/templates/_namespace_template/app/values.yaml 
b/ml-platform/templates/acm-template/templates/_namespace_template/app/values.yaml similarity index 100% rename from ml-platform/03_configsync/templates/acm-template/templates/_namespace_template/app/values.yaml rename to ml-platform/templates/acm-template/templates/_namespace_template/app/values.yaml diff --git a/ml-platform/02_gke/variables.tf b/ml-platform/variables.tf similarity index 56% rename from ml-platform/02_gke/variables.tf rename to ml-platform/variables.tf index 05765c043..157c49d6b 100644 --- a/ml-platform/02_gke/variables.tf +++ b/ml-platform/variables.tf @@ -12,12 +12,51 @@ # See the License for the specific language governing permissions and # limitations under the License. +variable "org_id" { + type = string + description = "The GCP org id" + default = null +} + +variable "env" { + type = set(string) + description = "List of environments" + default = ["dev"] +} + +variable "default_env" { + type = string + description = "Lowest environment" + default = "dev" +} + +variable "folder_id" { + type = string + description = "Folder Id where the GCP projects will be created" + default = null +} + +variable "billing_account" { + type = string + description = "GCP billing account" + default = null +} + +variable "project_name" { + type = string + description = "GCP project name" + default = null +} + +variable "create_projects" { + type = number + description = "Flag to create GCP projects" + default = 0 +} + variable "project_id" { - type = map(any) - description = "The GCP project where the resources will be created. It is a map with environments a skeys and project_ids s values" - default = {} - #Below is an example of not null project_id variable - #default = { "dev" : "project_id1", "staging" : "project_id2", "prod" : "project_id3" } + type = map + description = "The GCP project where the resources will be created.
It is a map with environments as keys and project_ids as values" } variable "network_name" { @@ -25,31 +64,26 @@ variable "network_name" { description = "VPC network where GKE cluster will be created" type = string } - variable "routing_mode" { default = "GLOBAL" description = "VPC routing mode." type = string } - variable "subnet_01_name" { default = "ml-vpc-subnet-01" description = "Name of the first subnet in the VPC network." type = string } - variable "subnet_01_ip" { default = "10.40.0.0/22" description = "CIDR of the first subnet." type = string } - variable "subnet_01_region" { default = "us-central1" description = "Region of the first subnet." type = string } - variable "subnet_01_description" { default = "subnet 01" description = "Description of the first subnet." @@ -60,37 +94,33 @@ variable "subnet_02_name" { description = "Name of the second subnet in the VPC network." type = string } - variable "subnet_02_ip" { default = "10.12.0.0/22" description = "CIDR of the second subnet." type = string } - variable "subnet_02_region" { default = "us-west2" description = "Region of the second subnet." type = string } - variable "subnet_02_description" { default = "subnet 02" description = "Description of the second subnet." type = string } - -variable "lookup_state_bucket" { - description = "GCS bucket to look up TF state from previous steps." - type = string - default = "YOUR_STATE_BUCKET" -} +// +//variable "lookup_state_bucket" { +// description = "GCS bucket to look up TF state from previous steps." +// type = string +// default = "YOUR_STATE_BUCKET" +//} variable "cluster_name" { description = "Name of the GKE cluster" default = "gke-ml" type = string } - variable "reserved_taints" { description = "Taints to be applied to the reserved node pool."
type = list(object({ @@ -132,3 +162,56 @@ variable "spot_taints" { effect = "NO_SCHEDULE" }] } + +variable "configsync_repo_name" { + type = string + description = "Name of the GitHub repo that will be synced to the cluster with Config sync." + default = "config-sync-repo" +} + +variable "github_user" { + description = "GitHub user name." + type = string +} +variable "github_email" { + description = "GitHub user email." + type = string +} +variable "github_org" { + type = string + description = "GitHub org." +} +variable "github_token" { + type = string + description = "GitHub token. It is a token with write permissions as it will create a repo in the GitHub org." +} + +variable "secret_for_rootsync" { + type = number + description = "Create git-cred in config-management-system namespace." + default = 1 +} + +variable "create_namespace" { + type = number + description = "Setup a namespace to demo." + default = 1 +} + +variable "namespace" { + type = string + description = "Name of the namespace to demo." + default = "ml-team" +} + +variable "install_kuberay" { + type = number + description = "Flag to install kuberay operator." + default = 1 +} + +variable "install_ray_in_ns" { + type = number + description = "Flag to install ray cluster in the namespace created with the demo." 
+ default = 1 +} \ No newline at end of file diff --git a/ml-platform/03_configsync/providers.tf b/ml-platform/versions.tf similarity index 81% rename from ml-platform/03_configsync/providers.tf rename to ml-platform/versions.tf index 6ba18fc39..4f0c767da 100644 --- a/ml-platform/03_configsync/providers.tf +++ b/ml-platform/versions.tf @@ -14,23 +14,24 @@ terraform { required_providers { - google-beta = { - source = "hashicorp/google-beta" - version = "4.72.1" - } google = { source = "hashicorp/google" - version = "4.72.1" + version = "5.19.0" } - kubernetes = { - source = "hashicorp/kubernetes" - version = "2.21.1" + google-beta = { + source = "hashicorp/google-beta" + version = "5.19.0" } github = { - source = "hashicorp/github" - version = ">= 4.3.0" + source = "integrations/github" + version = "6.0.1" + } + null = { + source = "hashicorp/null" + version = "3.2.2" } } + } provider "github" {