From 87917d74ac8a80c591f64f53c081f25e20c7cddd Mon Sep 17 00:00:00 2001
From: Carlos Timoteo
Date: Fri, 15 Nov 2024 16:54:25 -0500
Subject: [PATCH] Support property id in resources (#246)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* predicting for only the users with traffic in the past 72h - purchase propensity
* running inference only for users events in the past 72h
* including 72h users for all models predictions
* considering null values in TabWorkflow models
* deleting unused pipfile
* upgrading lib versions
* implementing reporting preprocessing as a new pipeline
* adding more code documentation
* adding important information on the main README.md and DEVELOPMENT.md
* adding schedule run name and more code documentation
* implementing a new scheduler using the vertex ai sdk & adding user_id to procedures for consistency
* adding more code documentation
* adding code doc to the python custom component
* adding more code documentation
* fixing aggregated predictions query
* removing unnecessary resources from deployment
* Writing MDS guide
* adding the MDS developer and troubleshooting documentation
* fixing deployment for activation pipelines and gemini dataset
* Update README.md
* Update README.md
* Update README.md
* Update README.md
* removing deprecated api
* fixing purchase propensity pipelines names
* adding extra condition for when there is not enough data for the window interval to be applied on backfill procedures
* adding more instructions for post deployment and fixing issues when GA4 export was configured for less than 10 days
* removing unnecessary comments
* adding the number of past days to process in the variables files
* adding comment about combining data from different ga4 export datasets to data store
* fixing small issues with feature engineering and ml pipelines
* fixing hyper parameter tuning for kmeans modeling
* fixing optuna parameters
* adding cloud shell image
* fixing the list of all possible users in the propensity training preparation tables
* additional guardrails for when there is not enough data
* adding more documentation
* adding more doc to feature store
* add feature store documentation
* adding ml pipelines docs
* adding ml pipelines docs
* adding more documentation
* adding user agent client info
* fixing scope of client info
* fix
* removing client_info from vertex components
* fixing versioning of tf submodules
* reconfiguring meta providers
* fixing issue 187
* chore(deps): upgrade terraform providers and modules version
* chore(deps): set the provider version
* chore: formatting
* fix: brand naming
* fix: typo
* fixing secrets issue
* implementing secrets region as tf variable
* implementing secrets region as tf variable
* last changes requested by lgrangeau
* documenting keys location better
* implementing vpc peering network
* Update README.md
* Rebase Main into Multi-property (#243)
* Update README.md
* ensure the build bucket is created in the specified region (#230)
* Update audience_segmentation_query_template.sqlx
* Update auto_audience_segmentation_query_template.sqlx
* Update churn_propensity_query_template.sqlx
* Update cltv_query_template.sqlx
* Update purchase_propensity_query_template.sqlx
* Restrict regions for GCP Cloud Build support (#241)
* Update README.md
* Move to uv (#242)
* add uv required project table segment in toml file
* switch to uv in terraform deployment
* switch to uv
* remove poetry usage from terraform
* format
* remove poetry
* Add files via upload

---------
Co-authored-by: Charlie Wang <2144018+kingman@users.noreply.github.com>
Co-authored-by: Mårten Lindblad

* supporting property id in the resources

---------

Co-authored-by: Carlos Timoteo
Co-authored-by: Laurent Grangeau
Co-authored-by: Charlie Wang <2144018+kingman@users.noreply.github.com>
Co-authored-by: Mårten Lindblad
---
 docs/data_store.md                              |  7 +-
 .../cloudshell/terraform-template.tfvars        |  5 +-
 infrastructure/terraform/main.tf                | 21 ++---
 .../terraform/modules/data-store/main.tf        | 79 +------------------
 .../terraform/modules/data-store/variables.tf   | 22 ++----
 .../dataform-workflow/dataform-workflow.tf      |  6 +-
 .../modules/dataform-workflow/scheduler.tf      |  4 +-
 .../dataform-workflow/service-account.tf        | 12 +--
 .../modules/dataform-workflow/variables.tf      |  2 +-
 .../terraform/terraform-sample.tfvars           |  6 +-
 infrastructure/terraform/variables.tf           | 26 +++---
 11 files changed, 46 insertions(+), 144 deletions(-)

diff --git a/docs/data_store.md b/docs/data_store.md
index 70a4b05e..f015d357 100644
--- a/docs/data_store.md
+++ b/docs/data_store.md
@@ -107,12 +107,11 @@ To deploy the Marketing Data Store, follow the pre-requisites and instructions i
 Next, after creating the Terraform variables file by making a copy from the template, set the Terraform variables to create the environments you need for Dataform.
 
 ```bash
-create_dev_environment = false
-create_staging_environment = false
-create_prod_environment = true
+deploy_dataform = true
+property_id = "PROPERTY_ID"
 ```
 
-When the `create_dev_environment` variable is set to `true`, a development environment will be created. When the `create_staging_environment` variable is set to `true`, a staging environment will be created. When the `create_prod_environment` variable is set to `true`, a production environment will be created.
+When the `deploy_dataform` variable is set to `true`, a Dataform workspace will be created.
 
 ![Dataform Repository](images/data_store_dataform_github_repository.png)
 After deploying the Marketing Data Store, the repository called `marketing_analytics` is created in Dataform.

diff --git a/infrastructure/cloudshell/terraform-template.tfvars b/infrastructure/cloudshell/terraform-template.tfvars
index 3f42716b..c38764da 100644
--- a/infrastructure/cloudshell/terraform-template.tfvars
+++ b/infrastructure/cloudshell/terraform-template.tfvars
@@ -17,10 +17,7 @@
 tf_state_project_id = "${MAJ_DEFAULT_PROJECT_ID}"
 google_default_region = "${MAJ_DEFAULT_REGION}"
 
-create_dev_environment = false
-create_staging_environment = false
-create_prod_environment = true
-
+deploy_dataform = true
 deploy_activation = true
 deploy_feature_store = true
 deploy_pipelines = true

diff --git a/infrastructure/terraform/main.tf b/infrastructure/terraform/main.tf
index 1136cb91..4bf297a2 100644
--- a/infrastructure/terraform/main.tf
+++ b/infrastructure/terraform/main.tf
@@ -72,7 +72,7 @@ locals {
   # The uv_run_alias is the alias of the uv run command.
   uv_run_alias = "${var.uv_cmd} run"
   # The mds_dataset_suffix is the suffix of the marketing data store dataset.
-  mds_dataset_suffix = var.create_staging_environment ? "staging" : var.create_dev_environment ? "dev" : "prod"
+  mds_dataset_suffix = var.property_id
   # The project_toml_file_path is the path to the project.toml file.
   project_toml_file_path = "${local.source_root_dir}/pyproject.toml"
   # The project_toml_content_hash is the hash of the project.toml file.
@@ -284,8 +284,7 @@ resource "null_resource" "check_iam_api" {
 # Create the data store module.
 # The data store module creates the marketing data store in BigQuery, creates the ETL pipeline in Dataform
 # for the marketing data from Google Ads and Google Analytics.
-# The data store is created only if the `create_prod_environment`, `create_staging_environment`
-# or `create_dev_environment` variable is set to true in the terraform.tfvars file.
+# The data store is created only if the `deploy_dataform` variable is set to true in the terraform.tfvars file.
 # The data store is created in the `data_project_id` project.
 module "data_store" {
   # The source directory of the data store module.
@@ -317,18 +316,10 @@ module "data_store" {
   dataform_github_repo  = var.dataform_github_repo
   dataform_github_token = var.dataform_github_token
 
-  # The create_dev_environment is set in the terraform.tfvars file.
-  # The create_dev_environment determines if the dev environment is created.
-  # When the value is true, the dev environment is created.
-  # The create_staging_environment is set in the terraform.tfvars file.
-  # The create_staging_environment determines if the staging environment is created.
-  # When the value is true, the staging environment is created.
-  # The create_prod_environment is set in the terraform.tfvars file.
-  # The create_prod_environment determines if the prod environment is created.
-  # When the value is true, the prod environment is created.
+  # The deploy_dataform determines if the Dataform environment is created.
+  # When the value is true, the Dataform environment is created.
+  deploy_dataform = var.deploy_dataform
+  property_id     = var.property_id
 
   # The dev_data_project_id is the project ID of where the dev datasets will created.
   #If not provided, data_project_id will be used.

diff --git a/infrastructure/terraform/modules/data-store/main.tf b/infrastructure/terraform/modules/data-store/main.tf
index 35f6011e..9704661b 100644
--- a/infrastructure/terraform/modules/data-store/main.tf
+++ b/infrastructure/terraform/modules/data-store/main.tf
@@ -29,90 +29,19 @@ provider "google" {
   region = var.google_default_region
 }
 
-# This module sets up a Dataform workflow environment for the "dev" environment.
-module "dataform-workflow-dev" {
-  # The count argument specifies how many instances of the module should be created.
-  # In this case, it's set to var.create_dev_environment ? 1 : 0, which means that
-  # the module will be created only if the var.create_dev_environment variable is set to `true`.
-  # Check the terraform.tfvars file for more information.
-  count = var.create_dev_environment ? 1 : 0
-  # the path to the Terraform module that will be used to create the Dataform workflow environment.
-  source = "../dataform-workflow"
-
-  project_id = null_resource.check_dataform_api.id != "" ? module.data_processing_project_services.project_id : data.google_project.data_processing.project_id
-  # The name of the Dataform workflow environment.
-  environment = "dev"
-  region = var.google_default_region
-  # The ID of the Dataform repository that will be used by the Dataform workflow environment.
-  dataform_repository_id = google_dataform_repository.marketing-analytics.id
-  # A list of tags that will be used to filter the Dataform files that are included in the Dataform workflow environment.
- includedTags = ["ga4"] - - source_ga4_export_project_id = var.source_ga4_export_project_id - source_ga4_export_dataset = var.source_ga4_export_dataset - ga4_incremental_processing_days_back = var.ga4_incremental_processing_days_back - source_ads_export_data = var.source_ads_export_data - destination_bigquery_project_id = length(var.dev_data_project_id) > 0 ? var.staging_data_project_id : var.data_project_id - destination_bigquery_dataset_location = length(var.dev_destination_data_location) > 0 ? var.dev_destination_data_location : var.destination_data_location - - # The daily schedule for running the Dataform workflow. - # Depending on the hour that your Google Analytics 4 BigQuery Export is set, - # you may have to change this to execute at a later time of the day. - # Observe that the GA4 BigQuery Export Schedule documentation - # https://support.google.com/analytics/answer/9358801?hl=en#:~:text=A%20full%20export%20of%20data,(see%20Streaming%20export%20below). - # Check https://crontab.guru/#0_5-23/4_*_*_* to see next execution times. - daily_schedule = "0 5-23/4 * * *" - time_zone = var.time_zone -} - -# This module sets up a Dataform workflow environment for the "staging" environment. -module "dataform-workflow-staging" { - # The count argument specifies how many instances of the module should be created. - # In this case, it's set to var.create_staging_environment ? 1 : 0, which means that - # the module will be created only if the var.create_staging_environment variable is set to `true`. - # Check the terraform.tfvars file for more information. - count = var.create_staging_environment ? 1 : 0 - # the path to the Terraform module that will be used to create the Dataform workflow environment. - source = "../dataform-workflow" - - project_id = null_resource.check_dataform_api.id != "" ? module.data_processing_project_services.project_id : data.google_project.data_processing.project_id - # The name of the Dataform workflow environment. - environment = "staging" - region = var.google_default_region - # The ID of the Dataform repository that will be used by the Dataform workflow environment. - dataform_repository_id = google_dataform_repository.marketing-analytics.id - # A list of tags that will be used to filter the Dataform files that are included in the Dataform workflow environment. - includedTags = ["ga4"] - - source_ga4_export_project_id = var.source_ga4_export_project_id - source_ga4_export_dataset = var.source_ga4_export_dataset - source_ads_export_data = var.source_ads_export_data - destination_bigquery_project_id = length(var.staging_data_project_id) > 0 ? var.staging_data_project_id : var.data_project_id - destination_bigquery_dataset_location = length(var.staging_destination_data_location) > 0 ? var.staging_destination_data_location : var.destination_data_location - - # The daily schedule for running the Dataform workflow. - # Depending on the hour that your Google Analytics 4 BigQuery Export is set, - # you may have to change this to execute at a later time of the day. - # Observe that the GA4 BigQuery Export Schedule documentation - # https://support.google.com/analytics/answer/9358801?hl=en#:~:text=A%20full%20export%20of%20data,(see%20Streaming%20export%20below). - # Check https://crontab.guru/#0_5-23/4_*_*_* to see next execution times. - daily_schedule = "0 5-23/4 * * *" - time_zone = var.time_zone -} - # This module sets up a Dataform workflow environment for the "prod" environment. 
module "dataform-workflow-prod" { # The count argument specifies how many instances of the module should be created. - # In this case, it's set to var.create_prod_environment ? 1 : 0, which means that - # the module will be created only if the var.create_prod_environment variable is set to `true`. + # In this case, it's set to var.deploy_dataform ? 1 : 0, which means that + # the module will be created only if the var.deploy_dataform variable is set to `true`. # Check the terraform.tfvars file for more information. - count = var.create_prod_environment ? 1 : 0 + count = var.deploy_dataform ? 1 : 0 # the path to the Terraform module that will be used to create the Dataform workflow environment. source = "../dataform-workflow" project_id = null_resource.check_dataform_api.id != "" ? module.data_processing_project_services.project_id : data.google_project.data_processing.project_id # The name of the Dataform workflow environment. - environment = "prod" + property_id = var.property_id region = var.google_default_region dataform_repository_id = google_dataform_repository.marketing-analytics.id diff --git a/infrastructure/terraform/modules/data-store/variables.tf b/infrastructure/terraform/modules/data-store/variables.tf index bd85ad53..bd11aab7 100644 --- a/infrastructure/terraform/modules/data-store/variables.tf +++ b/infrastructure/terraform/modules/data-store/variables.tf @@ -47,12 +47,6 @@ variable "dataform_github_token" { type = string } -variable "create_dev_environment" { - description = "Indicates that a development environment needs to be created" - type = bool - default = true -} - variable "dev_data_project_id" { description = "Project ID of where the dev datasets will created. If not provided, data_project_id will be used." type = string @@ -65,12 +59,6 @@ variable "dev_destination_data_location" { default = "" } -variable "create_staging_environment" { - description = "Indicates that a staging environment needs to be created" - type = bool - default = true -} - variable "staging_data_project_id" { description = "Project ID of where the staging datasets will created. If not provided, data_project_id will be used." type = string @@ -83,12 +71,18 @@ variable "staging_destination_data_location" { default = "" } -variable "create_prod_environment" { - description = "Indicates that a production environment needs to be created" +variable "deploy_dataform" { + description = "Indicates that a dataform workspace needs to be created" type = bool default = true } +variable "property_id" { + description = "Google Analytics 4 Property id to create an MDS for it" + type = string + default = "" +} + variable "prod_data_project_id" { description = "Project ID of where the prod datasets will created. If not provided, data_project_id will be used." type = string diff --git a/infrastructure/terraform/modules/dataform-workflow/dataform-workflow.tf b/infrastructure/terraform/modules/dataform-workflow/dataform-workflow.tf index a0d7d153..11914892 100644 --- a/infrastructure/terraform/modules/dataform-workflow/dataform-workflow.tf +++ b/infrastructure/terraform/modules/dataform-workflow/dataform-workflow.tf @@ -22,9 +22,9 @@ locals { # This resources creates a workflow that runs the Dataform incremental pipeline. resource "google_workflows_workflow" "dataform-incremental-workflow" { project = null_resource.check_workflows_api.id != "" ? 
-  name = "dataform-${var.environment}-incremental"
+  name = "dataform-${var.property_id}-incremental"
   region = var.region
-  description = "Dataform incremental workflow for ${var.environment} environment"
+  description = "Dataform incremental workflow for GA4 property ${var.property_id}"
   service_account = google_service_account.workflow-dataform.email
   # The source code includes the following steps:
   # Init: This step initializes the workflow by assigning the value of the dataform_repository_id variable to the repository variable.
@@ -49,7 +49,7 @@ main:
             defaultDatabase: ${var.destination_bigquery_project_id}
             defaultLocation: ${var.destination_bigquery_dataset_location}
             vars:
-              env: ${var.environment}
+              ga4_property_id: '${var.property_id}'
               ga4_export_project: ${var.source_ga4_export_project_id}
               ga4_export_dataset: ${var.source_ga4_export_dataset}
               ga4_incremental_processing_days_back: '${var.ga4_incremental_processing_days_back}'

diff --git a/infrastructure/terraform/modules/dataform-workflow/scheduler.tf b/infrastructure/terraform/modules/dataform-workflow/scheduler.tf
index ecb2cbd6..4c8ec0a8 100644
--- a/infrastructure/terraform/modules/dataform-workflow/scheduler.tf
+++ b/infrastructure/terraform/modules/dataform-workflow/scheduler.tf
@@ -15,8 +15,8 @@
 # This creates a Cloud Scheduler job that triggers the Dataform incremental workflow on a daily schedule.
 resource "google_cloud_scheduler_job" "daily-dataform-increments" {
   project = module.data_processing_project_services.project_id
-  name = "daily-dataform-${var.environment}"
-  description = "Daily Dataform ${var.environment} environment incremental update"
+  name = "daily-dataform-${var.property_id}"
+  description = "Daily Dataform incremental update for GA4 property ${var.property_id}"
   # The schedule attribute specifies the schedule for the job. In this case, the job is scheduled to run daily at the specified times.
   schedule = var.daily_schedule
   time_zone = var.time_zone

diff --git a/infrastructure/terraform/modules/dataform-workflow/service-account.tf b/infrastructure/terraform/modules/dataform-workflow/service-account.tf
index b89e91af..8c21dff2 100644
--- a/infrastructure/terraform/modules/dataform-workflow/service-account.tf
+++ b/infrastructure/terraform/modules/dataform-workflow/service-account.tf
@@ -19,13 +19,13 @@ resource "google_service_account" "scheduler" {
   ]
   project = null_resource.check_cloudscheduler_api.id != "" ? module.data_processing_project_services.project_id : var.project_id
-  account_id = "workflow-scheduler-${var.environment}"
-  display_name = "Service Account to schedule Dataform workflows in ${var.environment}"
+  account_id = "workflow-scheduler-${var.property_id}"
+  display_name = "Service Account to schedule Dataform workflows for property ${var.property_id}"
 }
 
 locals {
-  scheduler_sa = "workflow-scheduler-${var.environment}@${module.data_processing_project_services.project_id}.iam.gserviceaccount.com"
-  workflows_sa = "workflow-dataform-${var.environment}@${module.data_processing_project_services.project_id}.iam.gserviceaccount.com"
+  scheduler_sa = "workflow-scheduler-${var.property_id}@${module.data_processing_project_services.project_id}.iam.gserviceaccount.com"
+  workflows_sa = "workflow-dataform-${var.property_id}@${module.data_processing_project_services.project_id}.iam.gserviceaccount.com"
 }
 
 # Wait for the scheduler service account to be created
@@ -74,8 +74,8 @@ resource "google_service_account" "workflow-dataform" {
   ]
   project = null_resource.check_workflows_api.id != "" ? module.data_processing_project_services.project_id : var.project_id
-  account_id = "workflow-dataform-${var.environment}"
-  display_name = "Service Account to run Dataform workflows in ${var.environment}"
+  account_id = "workflow-dataform-${var.property_id}"
+  display_name = "Service Account to run Dataform workflows for property ${var.property_id}"
 }
 
 # Wait for the workflows service account to be created

diff --git a/infrastructure/terraform/modules/dataform-workflow/variables.tf b/infrastructure/terraform/modules/dataform-workflow/variables.tf
index 38a20fee..97d5dc73 100644
--- a/infrastructure/terraform/modules/dataform-workflow/variables.tf
+++ b/infrastructure/terraform/modules/dataform-workflow/variables.tf
@@ -22,7 +22,7 @@ variable "region" {
   type = string
 }
 
-variable "environment" {
+variable "property_id" {
   type = string
 }

diff --git a/infrastructure/terraform/terraform-sample.tfvars b/infrastructure/terraform/terraform-sample.tfvars
index 3595a9d0..71f440d7 100644
--- a/infrastructure/terraform/terraform-sample.tfvars
+++ b/infrastructure/terraform/terraform-sample.tfvars
@@ -16,10 +16,7 @@
 tf_state_project_id = "Google Cloud project where the terraform state file is stored"
 
-create_dev_environment = false
-create_staging_environment = false
-create_prod_environment = true
-
+deploy_dataform = true
 deploy_activation = true
 deploy_feature_store = true
 deploy_pipelines = true
@@ -28,6 +25,7 @@ deploy_monitoring = true
 #################### DATA VARIABLES #################################
 
 data_project_id = "Project id where the MDS datasets will be created"
+property_id = "Google Analytics 4 property id to identify a unique MDS deployment"
 destination_data_location = "BigQuery location (either regional or multi-regional) for the MDS BigQuery datasets."
data_processing_project_id = "Project id where the Dataform will be installed and run" source_ga4_export_project_id = "Project id which contains the GA4 export dataset" diff --git a/infrastructure/terraform/variables.tf b/infrastructure/terraform/variables.tf index cd1e84ae..9819041a 100644 --- a/infrastructure/terraform/variables.tf +++ b/infrastructure/terraform/variables.tf @@ -81,12 +81,6 @@ variable "pipelines_github_owner" { default = "temporarily unused" } -variable "create_dev_environment" { - description = "Indicates that a development environment needs to be created" - type = bool - default = true -} - variable "dev_data_project_id" { description = "Project ID of where the dev datasets will created. If not provided, data_project_id will be used." type = string @@ -99,12 +93,6 @@ variable "dev_destination_data_location" { default = "" } -variable "create_staging_environment" { - description = "Indicates that a staging environment needs to be created" - type = bool - default = true -} - variable "staging_data_project_id" { description = "Project ID of where the staging datasets will created. If not provided, data_project_id will be used." type = string @@ -117,10 +105,10 @@ variable "staging_destination_data_location" { default = "" } -variable "create_prod_environment" { - description = "Indicates that a production environment needs to be created" - type = bool - default = true +variable "property_id" { + description = "Google Analytics 4 Property ID to install the MDS" + type = string + default = "" } variable "prod_data_project_id" { @@ -189,6 +177,12 @@ variable "ga4_measurement_secret" { sensitive = true } +variable "deploy_dataform" { + description = "Toggler for activation module" + type = bool + default = false +} + variable "deploy_activation" { description = "Toggler for activation module" type = bool